关于帧的接收,其实在前面Napi机制中,我们已经能够明白大致流程,或者说网卡驱动的流程. 但是这里仍要再说一下,注意一些细节,并系统地勾画一下整个画面.
其实这个流程挺乏味的,单线程,剧情单一你懂的~.~ .
我们就从中断说起,网卡由硬件MAC 和PHY构成,记得以前看x86中断机制的时候,记得里面讲过8259A中断控制器芯片,用来连接外设,以供cpu处理中断.关于中断的初始化,在系统启动的时候,甚至是汇编层面代码,初始化中断向量表等.当然在后来的init/main.c ,start_kernel依然有对中断的处理函数.
参考内核 2.6.32.61
-
/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 */
void __irq_entry do_IRQ(unsigned int irq)
{
	irq_enter();			/* enter hard-IRQ context */
	__DO_IRQ_SMTC_HOOK(irq);	/* MIPS-specific SMTC hook (arch detail) */
	generic_handle_irq(irq);	/* look up the descriptor for @irq and run the registered handler */
	irq_exit();			/* leave hard-IRQ context; may run pending softirqs */
}
这是来自arch/mips/kernel/irq.c中的代码,(硬件架构是mips)
我们看到
generic_handle_irq(irq);它会根据irq查询中断向量表找到当初我们网卡驱动初始化时注册的中断例程
-
/*
 * request_irq() - register an interrupt handler (include/linux/interrupt.h).
 * NOTE(review): the leading return type of the definition was lost when this
 * snippet was pasted into the article; it simply forwards to
 * request_threaded_irq() with no threaded handler (NULL).
 */
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
	    const char *name, void *dev)
{
	return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
include/linux/interrupt.h 中断注册函数api.
我们来看irq_exit:
-
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
	account_system_vtime(current);
	trace_hardirq_exit();
	sub_preempt_count(IRQ_EXIT_OFFSET);
	/* Only run softirqs if we have fully left interrupt context
	 * (i.e. this was not a nested interrupt) and some are pending. */
	if (!in_interrupt() && local_softirq_pending())
		invoke_softirq();

	rcu_irq_exit();
#ifdef CONFIG_NO_HZ
	/* Make sure that timer wheel updates are propagated */
	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
		tick_nohz_stop_sched_tick(0);
#endif
	preempt_enable_no_resched();
}
-
/*
 * If the architecture guarantees that IRQs are already disabled when
 * irq_exit() runs, we can call __do_softirq() directly; otherwise go
 * through do_softirq(), which disables local interrupts itself first.
 */
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
# define invoke_softirq()	__do_softirq()
#else
# define invoke_softirq()	do_softirq()
#endif
-
/*
 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
 * and we fall back to softirqd after that.
 *
 * This number has been established via experimentation.
 * The two things to balance is latency against fairness -
 * we want to handle softirqs as soon as possible, but they
 * should not be able to lock up the box.
 */
#define MAX_SOFTIRQ_RESTART 10

DEFINE_TRACE(softirq_raise);

/*
 * Core softirq dispatch loop: walk the pending bitmask and invoke the
 * action of every raised softirq (e.g. NET_RX_SOFTIRQ -> net_rx_action).
 * Runs with bottom halves disabled; hardware interrupts are re-enabled
 * while the individual handlers run.
 */
asmlinkage void __do_softirq(void)
{
	struct softirq_action *h;
	__u32 pending;
	int max_restart = MAX_SOFTIRQ_RESTART;
	int cpu;

	pending = local_softirq_pending();
	account_system_vtime(current);

	__local_bh_disable((unsigned long)__builtin_return_address(0));
	lockdep_softirq_enter();

	cpu = smp_processor_id();
restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable();

	h = softirq_vec;

	do {
		if (pending & 1) {	/* this softirq number is raised */
			int prev_count = preempt_count();
			kstat_incr_softirqs_this_cpu(h - softirq_vec);

			trace_softirq_entry(h, softirq_vec);
			h->action(h);	/* e.g. net_rx_action() for NET_RX_SOFTIRQ */
			trace_softirq_exit(h, softirq_vec);
			/* A handler that returns with an unbalanced preempt
			 * count would wedge this CPU; complain and repair. */
			if (unlikely(prev_count != preempt_count())) {
				printk(KERN_ERR "huh, entered softirq %td %s %p"
				       "with preempt_count %08x,"
				       " exited with %08x?\n", h - softirq_vec,
				       softirq_to_name[h - softirq_vec],
				       h->action, prev_count, preempt_count());
				preempt_count() = prev_count;
			}

			rcu_bh_qs(cpu);
		}
		h++;
		pending >>= 1;
	} while (pending);

	local_irq_disable();

	/* New softirqs may have been raised while the handlers ran:
	 * loop again, but at most MAX_SOFTIRQ_RESTART times ... */
	pending = local_softirq_pending();
	if (pending && --max_restart)
		goto restart;

	/* ... after that, defer the remainder to the ksoftirqd thread. */
	if (pending)
		wakeup_softirqd();

	lockdep_softirq_exit();

	account_system_vtime(current);
	_local_bh_enable();
}
__do_softirq它最终会调用接收软中断 net_rx_action ,关于软中断初始化是在 net_dev_init里
-
/* From net_dev_init(): register the network TX/RX softirq handlers. */
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
这里处理了大部分的软中断,但是还有一些不能及时处理,就需要wakeup_softirqd即唤醒 ksoftirqd这个守护进程,后续处理.它同样也是调用net_rx_action
我们回过头看看驱动里注册的XXX_isr :
-
/*
 * Example NAPI-style interrupt handler: the ISR does almost nothing -
 * it masks the queue's RX interrupt and schedules NAPI polling, leaving
 * the real packet work to net_rx_action() in softirq context.
 */
void rxqueue_isr(BL_CPU_RX_QUEUE_ID_DTE queue_id)
{
	bl_api_ctrl_cpu_rx_queue_interrupt(queue_id,
			CE_BL_INTERRUPT_ACTION_DISABLE);	/* mask further RX interrupts */
	napi_schedule(&global_napi);	/* queue this device for softirq polling */
	return;
}
这里只是随便举个例子,重点是里面的napi_schedule(&global_napi); 我们看到isr几乎什么都没做,就马上退出了.
struct napi_struct global_napi;
-
/**
 * napi_schedule - schedule NAPI poll
 * @n: napi context
 *
 * Schedule NAPI poll routine to be called if it is not already
 * running.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (napi_schedule_prep(n))	/* only schedule if not already running or being disabled */
		__napi_schedule(n);
}
-
/**
 * napi_schedule_prep - check if napi can be scheduled
 * @n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running. This is used as a condition variable to
 * ensure only one NAPI poll instance runs. We also make
 * sure there is no pending NAPI disable.
 */
static inline int napi_schedule_prep(struct napi_struct *n)
{
	/* test_and_set_bit() is atomic: only one caller can win SCHED */
	return !napi_disable_pending(n) &&
		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}
上面这个函数的注释说的很清晰.主要判断napi的状态
-
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	trace_net_napi_schedule(n);

	local_irq_save(flags);
	/* Queue this NAPI context on the current CPU's poll list ... */
	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
	/* ... and raise NET_RX_SOFTIRQ so net_rx_action() will service it. */
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}
而这个函数就是开启软中断,并把收到帧的设备加入到cpu接收poll链表.
cpu接收队列的定义和初始化也是在net_dev_init里。这里不多说,struct softnet_data .我们就看看软中断的例程吧
-
/*
 * NET_RX_SOFTIRQ handler: walk this CPU's softnet poll list and call
 * each device's ->poll() until the list is empty, the packet budget is
 * spent, or the 2-jiffy time window expires.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {	/* poll queued devices until the list drains */
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))	/* time/budget cap so other tasks can run */
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi(). Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call. Therefore we avoid
		 * accidently calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			trace_net_napi_poll(n);
			work = n->poll(n, weight);	/* driver poll, e.g. rxqueue_poll() */
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight. In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, list);	/* round-robin: requeue at the tail */
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of time/budget: record the squeeze and re-raise the softirq
	 * so the remaining devices get polled on the next pass. */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
这个函数很明显,获取cpu接收poll链表.并查询处理,直到空.当然有时候这个链表或许很长,总不能让它一直执行吧,那其他进程什么的,难道喝西北风饿死么?!
我们看到代码有这样一句:
-
/* If softirq window is exhuasted then punt.
-
* Allow this to run for 2 jiffies since which will allow
-
* an average latency of 1.5/HZ.
-
*/
-
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
-
goto softnet_break;
然后就是调用poll函数 :
work = n->poll(n, weight);
它一般就是从dma或者队列缓冲区里读取数据包 ,到内存.然后传递到上层.
poll注册:
netif_napi_add(dummy_dev, &global_napi, rxqueue_poll, 128);
这里也仅仅是一个接口实例.仅供参考.
-
/*
 * Example NAPI poll callback (registered via netif_napi_add() with
 * weight 128).  Drains up to @budget packets from the device queue and
 * only completes NAPI / re-enables the RX interrupt once the queue has
 * been observed empty.
 */
int rxqueue_poll(struct napi_struct *napi, int budget)
{
	int rx_packet_cnt = 0;
	static int empty_count = 0;
	bl_api_ctrl_cpu_rx_queue_interrupt(param_queue_id,	/* acknowledge/clear the pending interrupt */
			CE_BL_INTERRUPT_ACTION_CLEAR);
	/* Under heavy traffic the queue can hold more than @budget packets;
	 * we then return rx_packet_cnt == budget (== weight) and stay on
	 * the CPU poll list for another round. */
	while (rx_packet_cnt < budget)
	{
		if (netdev_read_packet())	/* pull one packet and pass it up; non-zero means queue empty */
		{
			empty_count++;
			break;
		}
		rx_packet_cnt++;
	}
	/* Queue drained (seen empty more than once): finish the NAPI
	 * cycle and unmask the RX interrupt again. */
	if(rx_packet_cnt < budget && empty_count > 1)
	{
		empty_count = 0;
		napi_complete(napi);
		bl_api_ctrl_cpu_rx_queue_interrupt(param_queue_id,	/* re-enable RX interrupts */
			CE_BL_INTERRUPT_ACTION_ENABLE);
	}
	return rx_packet_cnt;
}
在队列满的时候,即poll返回值等于weight时,由于这个时候关闭了中断,所以在这里停留的时间越久,相对丢包就越多.也是网络性能的一个参考点。
这里面会把数据帧传给netif_receive_skb
-
/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *null_or_bond;
	int ret = NET_RX_DROP;
	__be16 type;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);	/* stamp arrival time if the driver did not */

	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
		return NET_RX_SUCCESS;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	trace_net_dev_receive(skb);

	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;

	null_or_orig = NULL;
	orig_dev = skb->dev;
	master = ACCESS_ONCE(orig_dev->master);
	if (master) {
		if (skb_bond_should_drop(skb, master))
			null_or_orig = orig_dev; /* deliver only exact match */
		else
			skb->dev = master;
	}

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT	/* ingress qdisc (traffic control) handling */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* ptype_all taps (e.g. AF_PACKET sniffers such as tcpdump) get a
	 * copy of every frame before protocol dispatch. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT	/* ingress qdisc enqueue/filter */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);	/* bridging */
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);	/* macvlan */
	if (!skb)
		goto out;

	/*
	 * Make sure frames received on VLAN interfaces stacked on
	 * bonding interfaces still make their way to any base bonding
	 * device that may have registered for a specific ptype. The
	 * handler may have to adjust skb->dev and orig_dev.
	 */
	null_or_bond = NULL;
	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
		null_or_bond = vlan_dev_real_dev(skb->dev);
	}

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && (ptype->dev == null_or_orig ||	/* look up the L3 protocol handler (e.g. ip_rcv) by ethertype */
		    ptype->dev == skb->dev || ptype->dev == orig_dev ||
		    ptype->dev == null_or_bond)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);	/* deliver to the last matching handler */
	} else {
		kfree_skb(skb);	/* nobody registered for this protocol: drop */
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
我们先看两个链表的查询ptype_all 和ptype_base. 前者是为嗅探做准备的,比如tcpdump工具分析包.后者就是具体的协议,真正发送给上层协议的.比如ip_rcv等.
对于ptype_all我们看看dev_add_pack就明白了:
-
/*******************************************************************************
-
-
Protocol management and registration routines
-
-
*******************************************************************************/
-
-
/*
-
* Add a protocol ID to the list. Now that the input handler is
-
* smarter we can dispense with all the messy stuff that used to be
-
* here.
-
*
-
* Protocol handlers, mangling input packets,
-
* MUST BE last in hash buckets and checking protocol handlers
-
* MUST start from promiscuous ptype_all chain in net_bh.
-
* It is true now, do not change it.
-
* Explanation follows: if protocol handler, mangling packet, will
-
* be the first on list, it is not able to sense, that packet
-
* is cloned and should be copied-on-write, so that it will
-
* change it and subsequent readers will get broken packet.
-
* --ANK (980803)
-
*/
-
-
/**
-
* dev_add_pack - add packet handler
-
* @pt: packet type declaration
-
*
-
* Add a protocol handler to the networking stack. The passed &packet_type
-
* is linked into kernel lists and may not be freed until it has been
-
* removed from the kernel lists.
-
*
-
* This call does not sleep therefore it can not
-
* guarantee all CPU's that are in middle of receiving packets
-
* will see the new packet type (until the next received packet).
-
*/
-
-
/*
 * dev_add_pack(): ETH_P_ALL handlers go on the ptype_all list (used by
 * sniffers such as AF_PACKET/tcpdump); every other protocol type is
 * hashed into ptype_base[] for normal L3 dispatch (e.g. ip_rcv).
 */
void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}

EXPORT_SYMBOL(dev_add_pack);
我们看到只有协议类型是ETH_P_ALL才会添加到ptype_all链表.
流量控制的核心代码在net/sched中.之前我们说过,当设备open时会调用dev_activate激活qdisc.
然后我们看#ifdef CONFIG_NET_CLS_ACT 的部分,这里是处理入口队列的部分,如果配置了入口队列规则或者其他,就会深入处理.大部分的功能发挥在了出口队列,流量控制tc--qos.
我们来看第一个判断:skb->tc_verd & TC_NCLS 默认情况下是没有人赋值skb->tc_verd 所以与的结果肯定是0 .
-
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	struct netdev_queue *rxq;
	int result = TC_ACT_OK;
	struct Qdisc *q;

	/* Guard against a packet bouncing between redirects forever. */
	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING
		       "Redir loop detected Dropping packet (%d->%d)\n",
		       skb->iif, dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	rxq = &dev->rx_queue;

	q = rxq->qdisc;
	/* noop_qdisc is the default: no ingress policy configured, so
	 * nothing to do and the packet passes through untouched. */
	if (q != &noop_qdisc) {
		spin_lock(qdisc_lock(q));
		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
			result = qdisc_enqueue_root(skb, q);
		spin_unlock(qdisc_lock(q));
	}

	return result;
}
-
-
/*
 * handle_ing(): run the ingress qdisc/filters over @skb.  Returns the
 * skb if processing should continue, or NULL if the filters dropped or
 * stole it.
 */
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
	if (skb->dev->rx_queue.qdisc == &noop_qdisc)	/* no ingress qdisc configured: fast path out */
		goto out;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);	/* flush pending sniffer delivery first */
		*pt_prev = NULL;
	} else {
		/* Huh? Why does turning on AF_PACKET affect this? */
		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
	}

	switch (ing_filter(skb)) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
		kfree_skb(skb);
		return NULL;
	}

out:
	skb->tc_verd = 0;
	return skb;
}
-
#endif
我们看ing_filter里q != &noop_qdisc这个判断,默认它们是相等的,我们应该记得以前讲过qdisc的初始化.默认就是noop_qdisc.所以默认的情况就是什么也不做,就返回了,当然
这里没有深入讨论,以后会深入分析入口队列流量控制的应用.
再接着就是判断是不是属于桥handle_bridge 。关于bridge也需要单独分析.至少到这里整个流程我想大家都明白了吧.
阅读(3607) | 评论(0) | 转发(0) |