上一节我们说了帧的接收,有收自然就有发,而前面很少谈到发送,这里我们就分析一下帧的发送.
参考内核2.6.32.60 net/core/dev.c
我们先看看设备无关层的经典发送接口 dev_queue_xmit. 这里先说一下上层是如何调用到这里的: 数据到达ip层准备发送时,会经过邻居子系统,由它查询arp表,找到ip和mac(以及出口设备)的对应关系,给数据包封装好帧头,然后再调用 dev_queue_xmit. 例如下面的 neigh_compat_output:
-
/* This function can be used in contexts, where only old dev_queue_xmit
-
worked, f.e. if you want to override normal output path (eql, shaper),
-
but resolution is not made yet.
-
*/
-
-
/*
 * Advance the skb data pointer by skb_network_offset(skb), build a
 * link-layer header with dev_hard_header(), then queue the skb through
 * dev_queue_xmit().  If building the header fails (< 0) and the device's
 * header_ops->rebuild() returns non-zero, returns 0 without queueing.
 */
int neigh_compat_output(struct sk_buff *skb)
-
{
-
struct net_device *dev = skb->dev;
-
-
__skb_pull(skb, skb_network_offset(skb));
-
-
if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
-
skb->len) < 0 &&
-
dev->header_ops->rebuild(skb))
-
return 0;
-
-
return dev_queue_xmit(skb);
-
}
这里不多说上层的流程.
-
/**
-
* dev_queue_xmit - transmit a buffer
-
* @skb: buffer to transmit
-
*
-
* Queue a buffer for transmission to a network device. The caller must
-
* have set the device and priority and built the buffer before calling
-
* this function. The function can be called from an interrupt.
-
*
-
* A negative errno code is returned on a failure. A success does not
-
* guarantee the frame will be transmitted as it may be dropped due
-
* to congestion or traffic shaping.
-
*
-
* -----------------------------------------------------------------------------------
-
* I notice this method can also return errors from the queue disciplines,
-
* including NET_XMIT_DROP, which is a positive value. So, errors can also
-
* be positive.
-
*
-
* Regardless of the return value, the skb is consumed, so it is currently
-
* difficult to retry a send to this method. (You can bump the ref count
-
* before sending to hold a reference for retry if you are careful.)
-
*
-
* When calling this method, interrupts MUST be enabled. This is because
-
* the BH enable code must have IRQs enabled so that it will not deadlock.
-
* --BLG
-
*/
-
/*
 * Device-independent transmit entry point.  After the optional
 * linearize / checksum fix-ups it picks the TX queue and that queue's
 * qdisc.  A qdisc with an enqueue hook is served by __dev_xmit_skb();
 * otherwise the skb is handed straight to dev_hard_start_xmit() under
 * the TX lock.
 */
int dev_queue_xmit(struct sk_buff *skb)
-
{
-
struct net_device *dev = skb->dev;
-
struct netdev_queue *txq;
-
struct Qdisc *q;
-
int rc = -ENOMEM;
-
-
/* GSO will handle the following emulations directly. */
-
if (netif_needs_gso(dev, skb)) // GSO/TSO (per the article): with a TSO-capable NIC the CPU passes the large packet to the card, which segments it and computes checksums in hardware (hardware checksum support is needed as well); such skbs skip the fix-ups below
-
goto gso;
-
-
if (skb_has_frags(skb) && // skb carries a fragment list: linearize it when the device lacks NETIF_F_FRAGLIST
-
!(dev->features & NETIF_F_FRAGLIST) &&
-
__skb_linearize(skb))
-
goto out_kfree_skb;
-
-
/* Fragmented skb is linearized if device does not support SG,
-
* or if at least one of fragments is in highmem and device
-
* does not support DMA from it.
-
*/
-
if (skb_shinfo(skb)->nr_frags &&
-
(!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-
__skb_linearize(skb))
-
goto out_kfree_skb;
-
-
/* If packet is not checksummed and device does not support
-
* checksumming for this protocol, complete checksumming here.
-
*/
-
if (skb->ip_summed == CHECKSUM_PARTIAL) { // checksum still pending: finish it in software via skb_checksum_help() when the device cannot checksum this skb
-
skb_set_transport_header(skb, skb->csum_start -
-
skb_headroom(skb));
-
if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
-
goto out_kfree_skb;
-
}
-
-
gso:
-
/* Disable soft irqs for various locks below. Also
-
* stops preemption for RCU.
-
*/
-
rcu_read_lock_bh();
-
-
txq = dev_pick_tx(dev, skb);
-
q = rcu_dereference(txq->qdisc);
-
-
#ifdef CONFIG_NET_CLS_ACT // article author's note: this option (QoS / traffic control) is enabled by default in their configuration
-
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
-
#endif
-
/* Queueing device: let the qdisc take the skb, then leave via "out". */
if (q->enqueue) {
-
rc = __dev_xmit_skb(skb, q, dev, txq);
-
goto out;
-
}
-
-
/* The device has no queue. Common case for software devices:
-
loopback, all the sorts of tunnels...
-
-
Really, it is unlikely that netif_tx_lock protection is necessary
-
here. (f.e. loopback and IP tunnels are clean ignoring statistics
-
counters.)
-
However, it is possible, that they rely on protection
-
made by us here.
-
-
Check this and shot the lock. It is not prone from deadlocks.
-
Either shot noqueue qdisc, it is even simpler 8)
-
*/
-
if (dev->flags & IFF_UP) {
-
int cpu = smp_processor_id(); /* ok because BHs are off */
-
-
/* Same CPU already owning the TX lock means we recursed into this device. */
if (txq->xmit_lock_owner != cpu) {
-
-
HARD_TX_LOCK(dev, txq, cpu);
-
-
if (!netif_tx_queue_stopped(txq)) {
-
rc = NET_XMIT_SUCCESS;
-
/* Zero return from the driver path: done, skip the drop below. */
if (!dev_hard_start_xmit(skb, dev, txq)) {
-
HARD_TX_UNLOCK(dev, txq);
-
goto out;
-
}
-
}
-
HARD_TX_UNLOCK(dev, txq);
-
if (net_ratelimit())
-
printk(KERN_CRIT "Virtual device %s asks to "
-
"queue packet!\n", dev->name);
-
} else {
-
/* Recursion is detected! It is possible,
-
* unfortunately */
-
if (net_ratelimit())
-
printk(KERN_CRIT "Dead loop on virtual device "
-
"%s, fix it urgently!\n", dev->name);
-
}
-
}
-
-
/* Every path that reaches this point drops the skb and reports -ENETDOWN. */
rc = -ENETDOWN;
-
rcu_read_unlock_bh();
-
-
out_kfree_skb:
-
kfree_skb(skb);
-
return rc;
-
out:
-
rcu_read_unlock_bh();
-
return rc;
-
}
我们这里说一下 q->enqueue, 它到底是非空还是空呢?
-
txq = dev_pick_tx(dev, skb);
-
q = rcu_dereference(txq->qdisc);
-
/*
 * Choose the TX queue for skb: use the driver's ndo_select_queue hook
 * when it is set, otherwise skb_tx_hash() when the device has more than
 * one real TX queue, otherwise index 0.  The index is recorded in the
 * skb and the matching netdev_queue is returned.
 */
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
-
struct sk_buff *skb)
-
{
-
const struct net_device_ops *ops = dev->netdev_ops;
-
u16 queue_index = 0;
-
-
if (ops->ndo_select_queue)
-
queue_index = ops->ndo_select_queue(dev, skb);
-
else if (dev->real_num_tx_queues > 1)
-
queue_index = skb_tx_hash(dev, skb);
-
-
skb_set_queue_mapping(skb, queue_index);
-
return netdev_get_tx_queue(dev, queue_index);
-
}
这里很明显是先选出发送队列txq(struct netdev_queue),再从 txq->qdisc 取得它的qdisc. 如果看过网卡驱动就知道,很少有驱动会去初始化 .ndo_select_queue,
而 dev->real_num_tx_queues 在前面文章中我们知道默认初始化为1. 所以这个函数的返回值是:
&dev->_tx[index]; // 而index明显是0, 也就是_tx数组的第一个元素.
那么 txq->qdisc 是什么呢? 回想设备注册函数中的dev_init_scheduler就明白了: 默认初始化的qdisc是noop_qdisc,
它定义在sch_generic.c中:
-
/*
 * Built-in placeholder qdisc (TCQ_F_BUILTIN) wired to noop_enqueue /
 * noop_dequeue and bound to noop_netdev_queue.
 */
struct Qdisc noop_qdisc = {
-
.enqueue = noop_enqueue,
-
.dequeue = noop_dequeue,
-
.flags = TCQ_F_BUILTIN,
-
.ops = &noop_qdisc_ops,
-
.list = LIST_HEAD_INIT(noop_qdisc.list),
-
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
-
.dev_queue = &noop_netdev_queue,
-
};
当然后来,在dev_open的时候,里面有
-
/*
-
* Wakeup transmit queue engine
-
*/
-
dev_activate(dev);
又重新初始化了.
-
/*
 * Attach the default qdiscs if the device still points at noop_qdisc,
 * then -- provided the carrier is up -- apply transition_one_qdisc to
 * every TX queue and to rx_queue.  The watchdog is started when
 * need_watchdog ends up non-zero.
 */
void dev_activate(struct net_device *dev)
-
{
-
int need_watchdog;
-
-
/* No queueing discipline is attached to device;
-
create default one i.e. pfifo_fast for devices,
-
which need queueing and noqueue_qdisc for
-
virtual interfaces
-
*/
-
-
if (dev->qdisc == &noop_qdisc)
-
attach_default_qdiscs(dev);
-
-
if (!netif_carrier_ok(dev))
-
/* Delay activation until next carrier-on event */
-
return;
-
-
need_watchdog = 0;
-
netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-
transition_one_qdisc(dev, &dev->rx_queue, NULL);
-
-
if (need_watchdog) {
-
dev->trans_start = jiffies;
-
dev_watchdog_up(dev);
-
}
-
}
我们看看 attach_default_qdiscs(dev):
-
/*
 * Install the device's default root qdisc.
 *
 * Non-multiqueue devices, or devices with tx_queue_len == 0, run
 * attach_one_default_qdisc over every TX queue and make dev->qdisc
 * point at queue 0's qdisc_sleeping (taking a reference).  Otherwise an
 * mq qdisc is created as root and, if creation succeeded, attached.
 */
static void attach_default_qdiscs(struct net_device *dev)
-
{
-
struct netdev_queue *txq;
-
struct Qdisc *qdisc;
-
-
txq = netdev_get_tx_queue(dev, 0);
-
-
if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
-
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
-
dev->qdisc = txq->qdisc_sleeping;
-
atomic_inc(&dev->qdisc->refcnt);
-
} else {
-
qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
-
if (qdisc) {
-
qdisc->ops->attach(qdisc);
-
dev->qdisc = qdisc;
-
}
-
}
-
}
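这里分两种情况: 非多队列设备(或 tx_queue_len 为0)通过 attach_one_default_qdisc 给每个发送队列挂上默认qdisc, 按前面 dev_activate 里的注释, 需要排队的设备用的是 pfifo_fast, 虚拟接口用的是 noqueue_qdisc; 多队列设备才会创建 mq_qdisc_ops. 所以对需要排队的设备来说, q->enqueue 不为空. 默认的 pfifo_fast 的入队函数是 sch_generic.c 中的 pfifo_fast_enqueue, 作为参考, 下面是最简单的fifo入队函数
pfifo_enqueue://sch_fifo.c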
-
/*
 * Packet-count FIFO enqueue: append skb to the tail while the queue
 * holds fewer than q->limit packets; otherwise return whatever
 * qdisc_reshape_fail() reports.
 */
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
-
{
-
struct fifo_sched_data *q = qdisc_priv(sch);
-
-
if (likely(skb_queue_len(&sch->q) < q->limit))
-
return qdisc_enqueue_tail(skb, sch);
-
-
return qdisc_reshape_fail(skb, sch);
-
}
所以会进入:
-
/*
 * Hand skb to qdisc q while holding the qdisc root lock.  Three cases:
 * the qdisc is deactivated (drop), the bypass conditions hold (transmit
 * directly), or the skb is enqueued and the qdisc is run.
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the result of
 * qdisc_enqueue_root().
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
-
struct net_device *dev,
-
struct netdev_queue *txq)
-
{
-
spinlock_t *root_lock = qdisc_lock(q);
-
int rc;
-
-
spin_lock(root_lock);
-
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { // qdisc is deactivated: drop the packet (the article notes this should not happen here, since dev_open activated it earlier)
-
kfree_skb(skb);
-
rc = NET_XMIT_DROP;
-
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && // bypass path: taken only when the qdisc allows bypass, its queue is empty, and this CPU wins the RUNNING bit
-
!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
-
/*
-
* This is a work-conserving queue; there are no old skbs
-
* waiting to be sent out; and the qdisc is not running -
-
* xmit the skb directly.
-
*/
-
__qdisc_update_bstats(q, skb->len);
-
/* Non-zero means more work is queued: keep running; else release RUNNING. */
if (sch_direct_xmit(skb, q, dev, txq, root_lock))
-
__qdisc_run(q);
-
else
-
clear_bit(__QDISC_STATE_RUNNING, &q->state);
-
-
rc = NET_XMIT_SUCCESS;
-
} else { // every other case: enqueue the skb, then run the qdisc
-
rc = qdisc_enqueue_root(skb, q);
-
qdisc_run(q);
-
}
-
spin_unlock(root_lock);
-
-
return rc;
-
}
qdisc_enqueue_root 会调用qdisc的enqueue把skb加入队列,然后调用qdisc_run. qdisc_run 先把qdisc的状态置为running(__QDISC_STATE_RUNNING),然后调用 __qdisc_run:
-
/*
 * Keep calling qdisc_restart() for as long as it returns non-zero.
 * If a reschedule is pending or jiffies has moved on since entry, the
 * remaining work is deferred through __netif_schedule().  Clears
 * __QDISC_STATE_RUNNING before returning.
 */
void __qdisc_run(struct Qdisc *q)
-
{
-
unsigned long start_time = jiffies;
-
-
while (qdisc_restart(q)) {
-
/*
-
* Postpone processing if
-
* 1. another process needs the CPU;
-
* 2. we've been doing it for too long.
-
*/
-
if (need_resched() || jiffies != start_time) {
-
__netif_schedule(q);
-
break;
-
}
-
}
-
-
clear_bit(__QDISC_STATE_RUNNING, &q->state);
-
}
这里先说下while内的代码. 看注释就明白: 如果有别的进程需要CPU,或者已经处理得太久(jiffies已经变化),就推迟后续的处理,调用__netif_schedule 把这个qdisc挂起来稍后再发。
-
/*
 * Schedule q for deferred transmission, unless its
 * __QDISC_STATE_SCHED bit was already set.
 */
void __netif_schedule(struct Qdisc *q)
-
{
-
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
-
__netif_reschedule(q);
-
}
-
/*
 * With local interrupts disabled, push q onto the head of this CPU's
 * softnet_data output_queue list and raise NET_TX_SOFTIRQ.
 */
static inline void __netif_reschedule(struct Qdisc *q)
-
{
-
struct softnet_data *sd;
-
unsigned long flags;
-
-
local_irq_save(flags);
-
sd = &__get_cpu_var(softnet_data);
-
/* Link q in front of the current list head. */
q->next_sched = sd->output_queue;
-
sd->output_queue = q;
-
raise_softirq_irqoff(NET_TX_SOFTIRQ);
-
local_irq_restore(flags);
-
}
看到上面的代码我们是否明白了点什么? 对,这里就是触发发送软中断(NET_TX_SOFTIRQ)的地方: 先把qdisc挂到本cpu的 output_queue 链表上,再raise软中断,随后由下面的 net_tx_action 来处理. 当然关于rx软中断前面napi机制里已经说的很清楚了.
-
/*
 * TX softirq work for the current CPU, in two phases:
 *  1. free every skb parked on softnet_data.completion_queue;
 *  2. walk softnet_data.output_queue and run each scheduled qdisc.
 * Both lists are detached with local interrupts disabled and then
 * processed with interrupts enabled again.
 */
static void net_tx_action(struct softirq_action *h)
-
{
-
struct softnet_data *sd = &__get_cpu_var(softnet_data);
-
-
if (sd->completion_queue) { // skbs that were already transmitted but whose buffers have not been freed yet
-
struct sk_buff *clist;
-
-
local_irq_disable();
-
clist = sd->completion_queue;
-
sd->completion_queue = NULL;
-
local_irq_enable();
-
-
while (clist) {
-
struct sk_buff *skb = clist;
-
clist = clist->next;
-
-
WARN_ON(atomic_read(&skb->users));
-
__kfree_skb(skb);
-
}
-
}
-
-
if (sd->output_queue) {
-
struct Qdisc *head;
-
-
local_irq_disable();
-
head = sd->output_queue;
-
sd->output_queue = NULL;
-
local_irq_enable();
-
-
while (head) {
-
struct Qdisc *q = head;
-
spinlock_t *root_lock;
-
-
head = head->next_sched;
-
-
root_lock = qdisc_lock(q);
-
/* Got the root lock: clear SCHED and run the qdisc now. */
if (spin_trylock(root_lock)) {
-
smp_mb__before_clear_bit();
-
clear_bit(__QDISC_STATE_SCHED,
-
&q->state);
-
qdisc_run(q);
-
spin_unlock(root_lock);
-
} else {
-
/* Lock is busy: requeue the qdisc unless it has been deactivated, in which case only SCHED is cleared. */
if (!test_bit(__QDISC_STATE_DEACTIVATED,
-
&q->state)) {
-
__netif_reschedule(q);
-
} else {
-
smp_mb__before_clear_bit();
-
clear_bit(__QDISC_STATE_SCHED,
-
&q->state);
-
}
-
}
-
}
-
}
-
}
在处理延迟的output_queue时,最后又调用到qdisc_run. 本质上都会调用 qdisc_restart, 直到队列中的数据发送完(或者需要再次推迟).
-
/*
-
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
-
*
-
* __QDISC_STATE_RUNNING guarantees only one CPU can process
-
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
-
* this queue.
-
*
-
* netif_tx_lock serializes accesses to device driver.
-
*
-
* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
-
* if one is grabbed, another must be free.
-
*
-
* Note, that this procedure can be called by a watchdog timer
-
*
-
* Returns to the caller:
-
* 0 - queue is empty or throttled.
-
* >0 - queue is not empty.
-
*
-
*/
-
/*
 * Dequeue one skb from q and transmit it through sch_direct_xmit() on
 * the TX queue recorded in the skb's queue mapping.  Returns 0 when
 * nothing could be dequeued, otherwise sch_direct_xmit()'s result.
 */
static inline int qdisc_restart(struct Qdisc *q)
-
{
-
struct netdev_queue *txq;
-
struct net_device *dev;
-
spinlock_t *root_lock;
-
struct sk_buff *skb;
-
-
/* Dequeue packet */
-
skb = dequeue_skb(q);
-
if (unlikely(!skb))
-
return 0;
-
-
root_lock = qdisc_lock(q);
-
dev = qdisc_dev(q);
-
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
-
-
return sch_direct_xmit(skb, q, dev, txq, root_lock);
-
}
这里我们看到了dequeue函数的调用,它就是qdisc生效的地方,具体不多说. 然后是直接调用 sch_direct_xmit 发送:
-
/*
-
* Transmit one skb, and handle the return status as required. Holding the
-
* __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
-
* function.
-
*
-
* Returns to the caller:
-
* 0 - queue is empty or throttled.
-
* >0 - queue is not empty.
-
*/
-
/*
 * Drops root_lock, tries to pass skb to the driver under the TX lock
 * (only if the TX queue is neither stopped nor frozen), re-takes
 * root_lock and converts the driver's answer into "is there more work"
 * for the caller.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
-
struct net_device *dev, struct netdev_queue *txq,
-
spinlock_t *root_lock)
-
{
-
/* Stays NETDEV_TX_BUSY when the driver is not called at all. */
int ret = NETDEV_TX_BUSY;
-
-
/* And release qdisc */
-
spin_unlock(root_lock);
-
-
HARD_TX_LOCK(dev, txq, smp_processor_id());
-
if (!netif_tx_queue_stopped(txq) &&
-
!netif_tx_queue_frozen(txq))
-
ret = dev_hard_start_xmit(skb, dev, txq);
-
HARD_TX_UNLOCK(dev, txq);
-
-
spin_lock(root_lock);
-
-
switch (ret) {
-
case NETDEV_TX_OK:
-
/* Driver sent out skb successfully */
-
ret = qdisc_qlen(q);
-
break;
-
-
case NETDEV_TX_LOCKED:
-
/* Driver try lock failed */
-
ret = handle_dev_cpu_collision(skb, txq, q);
-
break;
-
-
default:
-
/* Driver returned NETDEV_TX_BUSY - requeue skb */
-
if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
-
printk(KERN_WARNING "BUG %s code %d qlen %d\n",
-
dev->name, ret, q->q.qlen);
-
-
ret = dev_requeue_skb(skb, q);
-
break;
-
}
-
-
/* A stopped or frozen TX queue always reports "no more work" (0). */
if (ret && (netif_tx_queue_stopped(txq) ||
-
netif_tx_queue_frozen(txq)))
-
ret = 0;
-
-
return ret;
-
}
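这个函数有几个工作:
1.在发送队列没有被停止/冻结的情况下,调用 dev_hard_start_xmit 交给网卡驱动直接发送出去
2.根据第一步的返回值继续处理: 发送成功(NETDEV_TX_OK)则返回qdisc队列中剩余的报文数
3.如果是NETDEV_TX_LOCKED(驱动没有拿到锁),就调用 handle_dev_cpu_collision 做冲突处理
4.其他情况(如NETDEV_TX_BUSY)则通过 dev_requeue_skb 把报文重新入队,等待下次发送.
返回值非0时, __qdisc_run 里的循环会继续, 如此反复, 直到数据报文发送成功或者被丢弃.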
-
/*
 * Pass skb to the driver's ndo_start_xmit.
 *
 * A single skb (skb->next == NULL) is first copied to any registered
 * taps and, if the device needs software GSO, segmented.  The resulting
 * segment chain hanging off skb->next is then sent one segment at a
 * time in the "gso" loop.  Returns the driver's return code, or
 * NETDEV_TX_OK once the chain is drained or the skb was freed.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-
struct netdev_queue *txq)
-
{
-
const struct net_device_ops *ops = dev->netdev_ops;
-
int rc;
-
-
if (likely(!skb->next)) {
-
if (!list_empty(&ptype_all))
-
dev_queue_xmit_nit(skb, dev); // packet taps (sniffers)
-
-
if (netif_needs_gso(dev, skb)) {
-
if (unlikely(dev_gso_segment(skb)))
-
goto out_kfree_skb;
-
if (skb->next)
-
goto gso;
-
}
-
-
/*
-
* If device doesnt need skb->dst, release it right now while
-
* its hot in this cpu cache
-
*/
-
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
-
skb_dst_drop(skb);
-
-
rc = ops->ndo_start_xmit(skb, dev); // hand the skb to the NIC driver
-
if (rc == NETDEV_TX_OK)
-
txq_trans_update(txq);
-
/*
-
* TODO: if skb_orphan() was called by
-
* dev->hard_start_xmit() (for example, the unmodified
-
* igb driver does that; bnx2 doesn't), then
-
* skb_tx_software_timestamp() will be unable to send
-
* back the time stamp.
-
*
-
* How can this be prevented? Always create another
-
* reference to the socket before calling
-
* dev->hard_start_xmit()? Prevent that skb_orphan()
-
* does anything in dev->hard_start_xmit() by clearing
-
* the skb destructor before the call and restoring it
-
* afterwards, then doing the skb_orphan() ourselves?
-
*/
-
return rc;
-
}
-
-
gso:
-
do {
-
/* Unlink the first segment from the chain before sending it. */
struct sk_buff *nskb = skb->next;
-
-
skb->next = nskb->next;
-
nskb->next = NULL;
-
-
/*
-
* If device doesnt need nskb->dst, release it right now while
-
* its hot in this cpu cache
-
*/
-
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
-
skb_dst_drop(nskb);
-
-
rc = ops->ndo_start_xmit(nskb, dev); // hand the segment to the NIC driver
-
/* Driver refused: put the segment back at the head of the chain. */
if (unlikely(rc != NETDEV_TX_OK)) {
-
nskb->next = skb->next;
-
skb->next = nskb;
-
return rc;
-
}
-
txq_trans_update(txq);
-
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
-
return NETDEV_TX_BUSY;
-
} while (skb->next);
-
-
skb->destructor = DEV_GSO_CB(skb)->destructor;
-
-
out_kfree_skb:
-
kfree_skb(skb);
-
return NETDEV_TX_OK;
-
}
这个函数在调用驱动发送之前,如果设置了嗅探器(ptype_all链表非空),会先调用dev_queue_xmit_nit给嗅探器克隆一份报文,这个在前面接收的时候也说到过. 它遍历的是ptype_all链表,也就是以ETH_P_ALL类型注册的协议处理函数.
-
/*
-
* Support routine. Sends outgoing frames to any network
-
* taps currently in use.
-
*/
-
-
/*
 * Deliver a clone of the outgoing skb to every ptype_all handler that
 * is bound to this device or to no device, skipping the handler whose
 * socket the skb came from.  The walk stops at the first failed clone.
 */
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
-
{
-
struct packet_type *ptype;
-
-
#ifdef CONFIG_NET_CLS_ACT
-
if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
-
net_timestamp(skb);
-
#else
-
net_timestamp(skb);
-
#endif
-
-
rcu_read_lock();
-
list_for_each_entry_rcu(ptype, &ptype_all, list) {
-
/* Never send packets back to the socket
-
* they originated from - MvS (miquels@drinkel.ow.org)
-
*/
-
if ((ptype->dev == dev || !ptype->dev) &&
-
(ptype->af_packet_priv == NULL ||
-
(struct sock *)ptype->af_packet_priv != skb->sk)) {
-
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
if (!skb2)
-
break;
-
-
/* skb->nh should be correctly
-
set by sender, so that the second statement is
-
just protection against buggy protocols.
-
*/
-
skb_reset_mac_header(skb2);
-
-
if (skb_network_header(skb2) < skb2->data ||
-
skb2->network_header > skb2->tail) {
-
if (net_ratelimit())
-
printk(KERN_CRIT "protocol %04x is "
-
"buggy, dev %s\n",
-
skb2->protocol, dev->name);
-
skb_reset_network_header(skb2);
-
}
-
-
skb2->transport_header = skb2->network_header;
-
/* Mark the clone as outgoing before handing it to the handler. */
skb2->pkt_type = PACKET_OUTGOING;
-
ptype->func(skb2, skb->dev, ptype, skb->dev);
-
}
-
}
-
rcu_read_unlock();
-
}
说了这么多,我们看看没有队列的设备,
-
/* The device has no queue. Common case for software devices:
-
loopback, all the sorts of tunnels...
-
-
Really, it is unlikely that netif_tx_lock protection is necessary
-
here. (f.e. loopback and IP tunnels are clean ignoring statistics
-
counters.)
-
However, it is possible, that they rely on protection
-
made by us here.
-
-
Check this and shot the lock. It is not prone from deadlocks.
-
Either shot noqueue qdisc, it is even simpler 8)
-
*/
-
if (dev->flags & IFF_UP) {
当然设备必须是开启的,up状态,典型的例子就是回环设备. 它没有队列的处理,而是直接把包发送出去,如果发送失败也不会重新发送,而是直接丢弃了.
这里我们只是大致说了下帧从ip层到驱动层的发送,没有涉及ip以上和网卡驱动的具体函数. 网卡驱动可以具体看驱动实例即可.关于流量控制这里只是简单说了一下调用流程,
具体分析还需要结合tc命令以及iptables详细分析. 到这里已经和帧的接收圆了起来.
阅读(4681) | 评论(0) | 转发(0) |