Category: System Operations
2010-06-09 10:53:49
Traffic control consists of several parts: the traffic-control algorithms, normally implemented in net/sched/sch_*.c, with FIFO as the default; these are classic black boxes that expose only two operations to the outside, enqueue and dequeue; the handling of the traffic-control data structures; and the user-space control interface, which is implemented over rtnetlink.
The kernel code quoted below is from version 2.6.19.2.
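To make the black-box interface concrete, here is an abridged sketch of the qdisc operations table as I recall it from include/net/sch_generic.h in the 2.6.x kernels (field order and details may differ slightly from the exact source); each queuing discipline only has to fill in these callbacks, with enqueue and dequeue as the essential pair and the remaining hooks driven through the rtnetlink control interface:
/* include/net/sch_generic.h (abridged sketch, 2.6.x, quoted from memory) */
struct Qdisc_ops
{
	struct Qdisc_ops	*next;
	char			id[IFNAMSIZ];	/* algorithm name, e.g. "pfifo_fast" */
	int			priv_size;	/* size of the qdisc's private data */

	/* the black-box interface: enqueue a packet, dequeue the next one */
	int		(*enqueue)(struct sk_buff *, struct Qdisc *);
	struct sk_buff *(*dequeue)(struct Qdisc *);
	int		(*requeue)(struct sk_buff *, struct Qdisc *);
	unsigned int	(*drop)(struct Qdisc *);

	/* management hooks, exercised via the rtnetlink control interface */
	int		(*init)(struct Qdisc *, struct rtattr *arg);
	void		(*reset)(struct Qdisc *);
	void		(*destroy)(struct Qdisc *);
	int		(*change)(struct Qdisc *, struct rtattr *arg);
	int		(*dump)(struct Qdisc *, struct sk_buff *);

	struct module	*owner;
};
From user space these hooks are reached through rtnetlink messages, typically sent by the tc utility (for example, tc qdisc show dev eth0).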
2. Control entry
2.1 Control entry
The Linux traffic-control functionality is exposed as attributes of the network device, which shows that it sits at the lowest layer of processing and is already independent of the upper protocol stack:
/* include/linux/netdevice.h */
struct net_device
{
......
/*
* Cache line mostly used on queue transmit path (qdisc)
*/
/* device queue lock */
spinlock_t queue_lock ____cacheline_aligned_in_smp;
// The qdisc used when transmitting packets
struct Qdisc *qdisc;
// Saves the active qdisc while the device is stopped
struct Qdisc *qdisc_sleeping;
// List of the qdiscs attached to this device
struct list_head qdisc_list;
// Maximum queue length
unsigned long tx_queue_len; /* Max frames per queue allowed */
/* Partially transmitted GSO packet. */
struct sk_buff *gso_skb;
/* ingress path synchronizer */
// Lock for ingress traffic control
spinlock_t ingress_lock;
// The qdisc used on the receive (ingress) path
struct Qdisc *qdisc_ingress;
......

2.1.2 Output traffic control
When output traffic control runs, all upper-layer processing has already finished and the packet has been handed to the network device for transmission; the traffic-control work happens at transmit time, and the egress entry function is dev_queue_xmit(). For ingress traffic control, the packet has only just been received from the device and has not yet been passed up the stack; ingress traffic control is not mandatory and is not performed by default. Its entry function is ing_filter(), which is called from netif_receive_skb():
/* net/core/dev.c */
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct Qdisc *q;
int rc = -ENOMEM;
......
/* Updates of qdisc are serialized by queue_lock.
* The struct Qdisc which is pointed to by qdisc is now a
* rcu structure - it may be accessed without acquiring
* a lock (but the structure may be stale.) The freeing of the
* qdisc will be deferred until it's known that there are no
* more references to it.
*
 * If the qdisc has an enqueue function, we still need to
 * hold the queue_lock before calling it, since queue_lock
 * also serializes access to the device queue.
*/
// Get the device's qdisc pointer; no lock is needed here, the pointer is dereferenced under RCU protection
q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
// If the qdisc's enqueue function is non-NULL, the packet will be enqueued.
// Physical network devices use the FIFO qdisc by default, so enqueue is non-NULL;
// only virtual (logical) devices may leave it NULL.
if (q->enqueue) {
/* Grab device queue */
// Take the device queue lock
spin_lock(&dev->queue_lock);
// With queue_lock held, dev->qdisc can now be accessed directly
q = dev->qdisc;
if (q->enqueue) {
// Enqueue the packet into the qdisc
rc = q->enqueue(skb, q);
// Run the qdisc: dequeue packets and hand them to the driver
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
spin_unlock(&dev->queue_lock);
}
......
}
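Since a physical device defaults to a FIFO qdisc, the q->enqueue() called above boils down, in the default case, to appending the packet to a simple queue until a limit is reached. The following is a simplified sketch modeled on the pfifo logic in net/sched/sch_fifo.c of the 2.6.x kernels; the struct name fifo_sched_data_sketch and the exact choice of the qdisc_enqueue_tail()/qdisc_drop() helpers are my own rendering, not the verbatim kernel source:
/* simplified sketch of a pfifo-style enqueue (modeled on net/sched/sch_fifo.c) */
struct fifo_sched_data_sketch
{
	u32 limit;	/* maximum number of queued packets */
};

static int pfifo_enqueue_sketch(struct sk_buff *skb, struct Qdisc *sch)
{
	struct fifo_sched_data_sketch *q = qdisc_priv(sch);

	// below the limit: append to sch->q and update the queue statistics
	if (likely(skb_queue_len(&sch->q) < q->limit))
		return qdisc_enqueue_tail(skb, sch);

	// over the limit: drop the packet and account it in the drop counters
	return qdisc_drop(skb, sch);
}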
// Dequeue side: run the qdisc only if the device queue is not stopped and no other
// CPU is already running it (the __LINK_STATE_QDISC_RUNNING bit serializes this)
static inline void qdisc_run(struct net_device *dev)
{
if (!netif_queue_stopped(dev) && !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
__qdisc_run(dev);
}
/* net/sched/sch_generic.c */
void __qdisc_run(struct net_device *dev)
{
// If the qdisc is noop_qdisc, enqueued packets are simply dropped, so there is nothing to run
if (unlikely(dev->qdisc == &noop_qdisc))
goto out;
while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
/* NOTHING */;
out:
clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
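For reference, the noop_qdisc callbacks (quoted from memory from net/sched/sch_generic.c, so treat the exact code as approximate) show why it amounts to dropping: its enqueue frees the packet immediately and its dequeue never returns anything:
/* net/sched/sch_generic.c (sketch, quoted from memory) */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	// free the packet immediately: nothing is ever queued
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	// there is never anything to send
	return NULL;
}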
/* Kick device.
   Note, that this procedure can be called by a watchdog timer, so
   that we do not check dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/
static inline int qdisc_restart(struct net_device *dev)
{
struct Qdisc *q = dev->qdisc;
struct sk_buff *skb;
/* Dequeue packet */
// Dequeue a packet: resume a partially sent GSO packet first if one is pending
if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
unsigned nolock = (dev->features & NETIF_F_LLTX);
dev->gso_skb = NULL;
......
}
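For the default FIFO qdisc, the q->dequeue() used above simply removes the packet at the head of the queue. A minimal sketch, assuming the qdisc_dequeue_head() helper from include/net/sch_generic.h in the 2.6.x kernels:
/* simplified sketch of a pfifo-style dequeue */
static struct sk_buff *pfifo_dequeue_sketch(struct Qdisc *sch)
{
	// remove and return the packet at the head of sch->q, or NULL if the queue is empty
	return qdisc_dequeue_head(sch);
}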