rtl8139_interrupt (the interrupt handler)

The NIC may raise an interrupt when it has received data, when a transmit has completed, or when a transmit or receive error occurs. The interrupt handler reads the NIC's interrupt status register to determine which event happened and then calls the corresponding handling routine.
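For context, a handler like this is installed when the interface is opened. The sketch below approximates the call made in rtl8139_open() in the 2.6-era driver; the exact flag name (SA_SHIRQ, later renamed IRQF_SHARED) depends on the kernel version and should be treated as an assumption here.

/* Sketch: registering the shared interrupt handler when the device is opened. */
retval = request_irq(dev->irq, rtl8139_interrupt, SA_SHIRQ, dev->name, dev);
if (retval)
        return retval;          /* could not get the (shared) interrupt line */
/* ... */
/* and the matching teardown in rtl8139_close(): */
free_irq(dev->irq, dev);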

/* The interrupt handler does all of the Rx thread work and cleans up
after the Tx thread. */

static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance,
struct pt_regs *regs)
{
struct net_device *dev = (struct net_device *) dev_instance;
struct rtl8139_private *tp = dev->priv;
void *ioaddr = tp->mmio_addr;
u16 status, ackstat;
int link_changed = 0; /* avoid bogus "uninit" warning */
int handled = 0;
spin_lock (&tp->lock);
/* Read the interrupt status register. */

status = RTL_R16 (IntrStatus);
/* shared irq? */
if (unlikely((status & rtl8139_intr_mask) == 0))
goto out;
handled = 1;
/* h/w no longer present (hotplug?) or major error, bail */
if (unlikely(status == 0xFFFF))
goto out;
/* close possible race's with dev_close */
if (unlikely(!netif_running(dev))) {
RTL_W16 (IntrMask, 0);
goto out;
}
/* Acknowledge all of the current interrupt sources ASAP, but
first get an additional status bit from CSCR. */

if (unlikely(status & RxUnderrun))
link_changed = RTL_R16 (CSCR) & CSCR_LinkChangeBit;
ackstat = status & ~(RxAckBits | TxErr);
if (ackstat)
RTL_W16 (IntrStatus, ackstat);
/* Receive packets are processed by poll routine. If not running start
it now. */

/*
 * If the receive bits in the status register are set, enter the receive path.
 * Under NAPI the driver first writes rtl8139_norx_intr_mask to the interrupt
 * mask register to disable further receive interrupts.
 * netif_rx_schedule_prep() checks that the device is up and sets the
 * __LINK_STATE_RX_SCHED flag in dev->state; __netif_rx_schedule() then adds
 * the device's poll routine to the softirq poll queue, so that when the
 * softirq is scheduled it calls rtl8139_poll() to do the polling.  When
 * polling completes, __LINK_STATE_RX_SCHED is cleared again.  The flag mainly
 * keeps redundant poll requests out of the softirq queue: interrupts have a
 * higher priority than softirqs, so if __netif_rx_schedule() were called here
 * unconditionally, another receive interrupt could easily arrive before the
 * softirq runs and queue a second poll request, even though the first poll
 * will usually have processed everything, including the data announced by
 * those later interrupts.
 */
if (status & RxAckBits){
if (netif_rx_schedule_prep(dev)) {
/* device is up and not yet scheduled: disable receive interrupts ... */
RTL_W16_F (IntrMask, rtl8139_norx_intr_mask);
/* ... and add the device's poll routine to the poll queue */
__netif_rx_schedule (dev);
}
}
/* Check uncommon events with one test. */
/* If any of the error bits in the status register are set, call the error handler. */
if (unlikely(status & (PCIErr | PCSTimeout | RxUnderrun | RxErr)))
rtl8139_weird_interrupt (dev, tp, ioaddr,status, link_changed);
/* When a packet has been transmitted, or a transmit error occurred,
   call rtl8139_tx_interrupt(). */
if (status & (TxOK | TxErr)) {
rtl8139_tx_interrupt (dev, tp, ioaddr);
if (status & TxErr)
RTL_W16 (IntrStatus, TxErr);
}
out:
spin_unlock (&tp->lock);
DPRINTK ("%s: exiting interrupt, intr_status=%#4.4x.\n",
dev->name, RTL_R16 (IntrStatus));
return IRQ_RETVAL(handled);
}

Receive interrupt handling in NAPI mode

When data "flows in" from the wire at the RJ-45 port, the NIC places it in its internal FIFO and at the same time DMAs it into the receive ring buffer, then raises an interrupt. The interrupt handler enters the receive path: it first disables receive interrupts and hooks the poll routine into the softirq scheduling queue; rtl8139_poll is then called during softirq processing.
In NAPI mode __netif_rx_schedule() is called directly, and what gets submitted is __netif_rx_schedule(netdev), i.e. the driver's own net_device structure, not the backlog_dev of the per-CPU queue.
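For reference, the contract that net_rx_action() (shown further down) expects from a driver's dev->poll routine can be summarised by the skeleton below. This is a simplified sketch of the old 2.6 interface only; my_hw_receive_one() is a hypothetical placeholder for the driver's real per-packet receive work.

static int example_poll(struct net_device *dev, int *budget)
{
        int quota = min(*budget, dev->quota);   /* never exceed either limit */
        int work_done = 0;

        while (work_done < quota && my_hw_receive_one(dev))  /* hypothetical helper */
                work_done++;

        *budget    -= work_done;        /* charge the softirq-wide budget */
        dev->quota -= work_done;        /* and this device's own quota    */

        if (work_done < quota) {        /* ring drained before the quota ran out */
                netif_rx_complete(dev); /* clear __LINK_STATE_RX_SCHED, leave the poll list */
                /* driver-specific: re-enable receive interrupts here */
                return 0;               /* done, do not poll again */
        }
        return 1;                       /* more work pending, stay on the poll list */
}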

netif_rx_schedule_prep (netdevice.h)

/* Test if receive needs to be scheduled but only if up */
static inline int netif_rx_schedule_prep(struct net_device *dev)
{
return netif_running(dev) && __netif_rx_schedule_prep(dev);
}

netif_running(netdevice.h) 

static inline int netif_running(const struct net_device *dev)
{
return test_bit(__LINK_STATE_START, &dev->state);
}

__netif_rx_schedule_prep(netdevice.h)

/* Test if receive needs to be scheduled */
static inline int __netif_rx_schedule_prep(struct net_device *dev)
{
return !test_and_set_bit(__LINK_STATE_RX_SCHED, &dev->state);
}

__netif_rx_schedule (dev) (dev.c) (the NAPI softirq request)

__netif_rx_schedule(dev) adds the device's poll routine to the softirq scheduling queue.

/* Add interface to tail of rx poll list. This assumes that _prep has
* already been called and returned 1.
*/

static inline void __netif_rx_schedule(struct net_device *dev)
{
unsigned long flags;
local_irq_save(flags);
dev_hold(dev);
list_add_tail(&dev->poll_list,
&__get_cpu_var(softnet_data).poll_list);
if (dev->quota < 0) /* quota overdrawn by the previous round: replenish it by one weight */
dev->quota += dev->weight;
else /* otherwise start the new round with a full quota */
dev->quota = dev->weight;
__raise_softirq_irqoff(NET_RX_SOFTIRQ); /* mark NET_RX_SOFTIRQ pending so net_rx_action will run */

local_irq_restore(flags);
}

__raise_softirq_irqoff(src/include/linux/interrupt.h)

#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
#define or_softirq_pending(x) (local_softirq_pending() |= (x))
#define local_softirq_pending() read_pda(__softirq_pending)
#define read_pda(field) pda_from_op("mov",field)
#define pda_from_op(op,field) ({ \
typeof(_proxy_pda.field) ret__; \
switch (sizeof(_proxy_pda.field)) { \
case 2: \
asm(op "w %%gs:%c1,%0" : \
"=r" (ret__) : \
"i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
break; \
case 4: \
asm(op "l %%gs:%c1,%0": \
"=r" (ret__): \
"i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
break; \
case 8: \
asm(op "q %%gs:%c1,%0": \
"=r" (ret__) : \
"i" (pda_offset(field)), \
"m" (_proxy_pda.field)); \
break; \
default: \
__bad_pda_field(); \
} \
ret__; })

Registration of the NET_RX_SOFTIRQ softirq

In net_dev_init() in dev.c:
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/

/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/

static int __init net_dev_init(void)
{
/* ... */
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
/* ... */
}
NET_TX_SOFTIRQ and NET_RX_SOFTIRQ are defined in interrupt.h:
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
frequency threaded job scheduling. For almost all the purposes
tasklets are more than enough. F.e. all serial device BHs et
al. should be converted to tasklets, not to softirqs.
*/

enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
#ifdef CONFIG_HIGH_RES_TIMERS
HRTIMER_SOFTIRQ,
#endif
};

net_rx_action, the handler for the NET_RX_SOFTIRQ softirq

static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
int budget = netdev_max_backlog;
local_irq_disable();
/* Loop over the devices queued on the softirq poll list. */
while (!list_empty(&queue->poll_list)) {
struct net_device *dev;
/* A single softirq run must not poll for too long. */
if (budget <= 0 || jiffies - start_time > 1)
goto softnet_break;
local_irq_enable();
dev = list_entry(queue->poll_list.next,
struct net_device, poll_list);
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
} else {
dev_put(dev);
local_irq_disable();
}
}
out:
local_irq_enable();
return;
softnet_break:
__get_cpu_var(netdev_rx_stat).time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}

rtl8139_poll

static int rtl8139_poll(struct net_device *dev, int *budget)
{
struct rtl8139_private *tp = dev->priv;
void *ioaddr = tp->mmio_addr;
int orig_budget = min(*budget, dev->quota);
int done = 1;
spin_lock(&tp->rx_lock);
if (likely(RTL_R16(IntrStatus) & RxAckBits)) {
int work_done;
/* rtl8139_rx() copies the received data out of tp's ring buffer and
   passes it up to the protocol layers. */
work_done = rtl8139_rx(dev, tp, orig_budget);
if (likely(work_done > 0)) {
*budget -= work_done;
dev->quota -= work_done;
done = (work_done < orig_budget);
}
}
if (done) {
/*
* Order is important since data can get interrupted
* again when we think we are done.
*/

/* Polling is finished: re-enable receive interrupts. */

local_irq_disable();
RTL_W16_F(IntrMask, rtl8139_intr_mask);
__netif_rx_complete(dev);
local_irq_enable();
}
spin_unlock(&tp->rx_lock);
return !done;
}

rtl8139_rx: copying data from the NIC receive buffer into the kernel network stack

rtl8139_rx copies the data out of the NIC's receive ring buffer. Each packet in the ring is stored as a 4-byte header (holding the receive status and the frame length) followed by the frame data:
| header (status, length) | data | header (status, length) | data | ...
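As an illustrative fragment only (it simply mirrors the reads performed a few lines further down in rtl8139_rx): each ring entry begins with a little-endian 32-bit header whose low 16 bits are the receive status and whose high 16 bits are the frame length including the 4-byte CRC, followed by the frame data itself.

u32 rx_status = le32_to_cpu(*(u32 *) (rx_ring + ring_offset));
u16 status    = rx_status & 0xffff;     /* RxStatusOK / error bits            */
u16 rx_size   = rx_status >> 16;        /* frame length including 4-byte CRC  */
u16 pkt_size  = rx_size - 4;            /* payload length handed to the stack */
/* the frame data itself starts at rx_ring + ring_offset + 4 */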

static int rtl8139_rx(struct net_device *dev, struct rtl8139_private
*tp,
int budget)
{
void *ioaddr = tp->mmio_addr;
int received = 0;
unsigned char *rx_ring = tp->rx_ring;
/* The NIC keeps writing data into the receive ring buffer; the CPU has to
 * keep track of how far it has read by itself.  tp->cur_rx records where the
 * previous read stopped, so copying continues from there. */
unsigned int cur_rx = tp->cur_rx;
DPRINTK ("%s: In rtl8139_rx(), current %4.4x BufAddr %4.4x,"
" free to %4.4x, Cmd %2.2x.\n", dev->name, cur_rx,
RTL_R16 (RxBufAddr),
RTL_R16 (RxBufPtr), RTL_R8 (ChipCmd));
/* Poll the ChipCmd register: as long as the NIC has not set the RxBufEmpty
 * bit, there is received data in the ring buffer waiting to be processed. */
while (netif_running(dev) && received < budget
&& (RTL_R8 (ChipCmd) & RxBufEmpty) == 0) {
u32 ring_offset = cur_rx % RX_BUF_LEN;
u32 rx_status;
unsigned int rx_size;
unsigned int pkt_size;
struct sk_buff *skb;
u16 status;
rmb();
/* read size+status of next frame from DMA ring buffer */
rx_status = le32_to_cpu (*(u32 *) (rx_ring + ring_offset));
rx_size = rx_status >> 16;
pkt_size = rx_size - 4;
if (netif_msg_rx_status(tp))
printk(KERN_DEBUG "%s: rtl8139_rx() status %4.4x,
size %4.4x,"

" cur %4.4x.\n", dev->name, rx_status,
rx_size, cur_rx);
#if RTL8139_DEBUG > 2
{
int i;
DPRINTK ("%s: Frame contents ", dev->name);
for (i = 0; i < 70; i++)
printk (" %2.2x",
rx_ring[ring_offset + i]);
printk (".\n");
}
#endif
/* Packet copy from FIFO still in progress.
* Theoretically, this should never happen
* since EarlyRx is disabled.
*/

/* This can happen when EarlyRx is enabled: part of a complete packet has
 * already been DMAed into memory while the rest is still in the NIC's
 * internal FIFO, i.e. the NIC's DMA transfer is still in progress. */
if (unlikely(rx_size == 0xfff0)) {
tp->xstats.early_rx++;
goto done;
}
/* If Rx err or invalid rx_size/rx_status received
* (which happens if we get lost in the ring),
* Rx process gets reset, so we abort any further
* Rx processing.
*/

if (unlikely((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
(rx_size < 8) ||
(!(rx_status & RxStatusOK)))) {
rtl8139_rx_err (rx_status, dev, tp, ioaddr);
return -1;
}
/* Malloc up new buffer, compatible with net-2e. */
/* Omit the four octet CRC from the length. */
/* Copy the data into an SKB. */
skb = dev_alloc_skb (pkt_size + 2);
if (likely(skb)) {
skb->dev = dev;
skb_reserve (skb, 2); /* 16 byte align the IP fields. */
#if RX_BUF_IDX == 3
wrap_copy(skb, rx_ring, ring_offset+4, pkt_size);
#else
eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size,
0);
#endif
skb_put (skb, pkt_size);
/* Determine the packet's protocol. */
skb->protocol = eth_type_trans (skb, dev);
dev->last_rx = jiffies;
/* Update the statistics; these are the numbers shown by ifconfig. */
tp->stats.rx_bytes += pkt_size;
tp->stats.rx_packets++;
/* Notify the upper protocol layers that a packet has arrived. */
netif_receive_skb (skb);
} else {
if (net_ratelimit())
printk (KERN_WARNING
"%s: Memory squeeze, dropping packet.\n",
dev->name);
tp->stats.rx_dropped++;
}
received++;
cur_rx = (cur_rx + rx_size + 4 + 3) & ~3;
RTL_W16 (RxBufPtr, (u16) (cur_rx - 16));
/* Clear out errors and receive interrupts */
status = RTL_R16 (IntrStatus) & RxAckBits;
if (likely(status != 0)) {
if (unlikely(status & (RxFIFOOver | RxOverflow))) {
tp->stats.rx_errors++;
if (status & RxFIFOOver)
tp->stats.rx_fifo_errors++;
}
RTL_W16_F (IntrStatus, RxAckBits);
}
}
done:
#if RTL8139_DEBUG > 1
DPRINTK ("%s: Done rtl8139_rx(), current %4.4x BufAddr %4.4x,"
" free to %4.4x, Cmd %2.2x.\n", dev->name, cur_rx,
RTL_R16 (RxBufAddr),
RTL_R16 (RxBufPtr), RTL_R8 (ChipCmd));
#endif
tp->cur_rx = cur_rx;
return received;
}

__netif_rx_complete(src/include/linux/netdevice.h)

/* same as netif_rx_complete, except that local_irq_save(flags)
* has already been issued
*/

static inline void __netif_rx_complete(struct net_device *dev)
{
BUG_ON(!test_bit(__LINK_STATE_RX_SCHED, &dev->state));
list_del(&dev->poll_list);
smp_mb__before_clear_bit();
clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
}

Receive interrupt handling in non-NAPI mode (using the 3c59x NIC as an
example; virtual NICs and the like still use this path)

The non-NAPI top-half receive path can be summarised as follows: allocate an SKB of a suitable length for the newly arrived frame, copy the received data out of the NIC, link the SKB into the list in the current CPU's softnet_data, and finally trigger the interrupt bottom half to continue processing. That bottom-half processing is handled uniformly through the virtual device backlog_dev, as the sketch after this paragraph illustrates.
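Stripped of hardware details, that top half boils down to something like the sketch below; legacy_rx_one_frame() and its arguments are illustrative names, not a real driver function.

static void legacy_rx_one_frame(struct net_device *dev,
                                const void *hw_buf, unsigned int pkt_len)
{
        struct sk_buff *skb = dev_alloc_skb(pkt_len + 2);

        if (!skb)
                return;                         /* a real driver bumps stats.rx_dropped here */
        skb_reserve(skb, 2);                    /* align the IP header */
        memcpy(skb_put(skb, pkt_len), hw_buf, pkt_len);  /* copy the frame out of the NIC */
        skb->protocol = eth_type_trans(skb, dev);        /* also sets skb->dev */
        netif_rx(skb);                          /* queue on this CPU's softnet_data backlog */
}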
Take 3c59x (EtherLinkXL.c: A 3Com EtherLink PCI III/XL ethernet driver for linux) as an example. Its interrupt handler, vortex_interrupt(), inspects the status register and acts accordingly:
if (status & RxComplete)
vortex_rx(dev);
When the interrupt indicates that packets are waiting to be received, the interrupt routine calls vortex_rx(dev) to receive them (shown below with only the core parts kept):

The receive interrupt handler (provided by the NIC driver)

static int vortex_rx(struct net_device *dev)
{
struct vortex_private *vp = netdev_priv(dev);
void __iomem *ioaddr = vp->ioaddr;
int i;
short rx_status;
if (vortex_debug > 5)
printk(KERN_DEBUG "vortex_rx(): status %4.4x,
rx_status %4.4x.\n"
,
ioread16(ioaddr+EL3_STATUS), ioread16(ioaddr+RxStatus));
while ((rx_status = ioread16(ioaddr + RxStatus)) > 0) {
if (rx_status & 0x4000) { /* Error, update stats. */
unsigned char rx_error = ioread8(ioaddr + RxErrors);
if (vortex_debug > 2)
printk(KERN_DEBUG " Rx error: status %2.2x.\n", rx_error);
vp->stats.rx_errors++;
if (rx_error & 0x01) vp->stats.rx_over_errors++;
if (rx_error & 0x02) vp->stats.rx_length_errors++;
if (rx_error & 0x04) vp->stats.rx_frame_errors++;
if (rx_error & 0x08) vp->stats.rx_crc_errors++;
if (rx_error & 0x10) vp->stats.rx_length_errors++;
} else {
/* The packet length: up to 4.5K!. */
int pkt_len = rx_status & 0x1fff;
struct sk_buff *skb;
/* Allocate an SKB for the newly arrived packet, with pkt_len + 5 bytes of data space. */
skb = dev_alloc_skb(pkt_len + 5);
if (vortex_debug > 4)
printk(KERN_DEBUG "Receiving packet size %d
status %4.4x.\n"
,
pkt_len, rx_status);
if (skb != NULL) {
skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */
/* 'skb_put()' points to the start of sk_buff data area. */
/* The received data is copied (or DMAed) from the NIC into this SKB's data area. */
if (vp->bus_master &&
! (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)) {
dma_addr_t dma = pci_map_single(VORTEX_PCI(vp),
skb_put(skb, pkt_len),
pkt_len, PCI_DMA_FROMDEVICE);
iowrite32(dma, ioaddr + Wn7_MasterAddr);
iowrite16((skb->len + 3) & ~3, ioaddr + Wn7_MasterLen);
iowrite16(StartDMAUp, ioaddr + EL3_CMD);
while (ioread16(ioaddr + Wn7_MasterStatus) & 0x8000)
;
pci_unmap_single(VORTEX_PCI(vp), dma, pkt_len,
PCI_DMA_FROMDEVICE);
} else {
ioread32_rep(ioaddr + RX_FIFO,
skb_put(skb, pkt_len),
(pkt_len + 3) >> 2);
}
iowrite16(RxDiscard, ioaddr + EL3_CMD); /* Pop top Rx packet. */

skb->protocol = eth_type_trans(skb, dev);
/* Call netif_rx(skb) to process the data further. */
netif_rx(skb);
dev->last_rx = jiffies;
vp->stats.rx_packets++;
/* Wait a limited time to go to next packet. */
for (i = 200; i >= 0; i--)
if ( ! (ioread16(ioaddr + EL3_STATUS) & CmdInProgress))
break;
continue;
} else if (vortex_debug > 0)
printk(KERN_NOTICE "%s: No memory to allocate a sk_buff of "
"size %d.\n", dev->name, pkt_len);
vp->stats.rx_dropped++;
}
issue_and_wait(dev, RxDiscard);
}
return 0;
}


In 2.6 kernels many drivers already use NAPI by default. A driver that does not use NAPI calls netif_rx() from its receive interrupt instead of scheduling its own poll routine with netif_rx_schedule(). A side-by-side sketch follows.
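Side by side, the two interrupt-time choices look roughly like the fragments below. Both are condensed sketches rather than complete handlers, and disable_rx_interrupts() is a hypothetical stand-in for the driver-specific register write that masks receive interrupts.

/* NAPI driver (e.g. 8139too): schedule its own poll routine */
if (netif_rx_schedule_prep(dev)) {
        disable_rx_interrupts(dev);     /* hypothetical helper: mask rx irqs in hardware */
        __netif_rx_schedule(dev);       /* the softirq will do the copying in dev->poll  */
}

/* Non-NAPI driver (e.g. 3c59x): build the skb in the ISR and hand it over directly */
netif_rx(skb);                          /* lands on softnet_data.input_pkt_queue */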

netif_rx (src/net/core/dev.c)

/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_CN_LOW (low congestion)
* NET_RX_CN_MOD (moderate congestion)
* NET_RX_CN_HIGH (high congestion)
* NET_RX_DROP (packet was dropped)
*
*/

int netif_rx(struct sk_buff *skb)
{
struct softnet_data *queue;
unsigned long flags;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
if (!skb->tstamp.tv64)
net_timestamp(skb);
/*
* The code is rearranged so that the path is the most
* short when CPU is congested, but is still operating.
*/

local_irq_save(flags);
queue = &__get_cpu_var(softnet_data);
__get_cpu_var(netdev_rx_stat).total++;
/*
 * int netdev_max_backlog __read_mostly = 1000;
 *
 * When the number of skbs already queued exceeds netdev_max_backlog, newly
 * received packets are dropped outright.  The default defined in the stack
 * is 1000 and can be changed via /proc/sys/net/core/netdev_max_backlog.
 *
 * If the queue has not reached that limit, the new skb is appended to it.
 * Before appending we must make sure processing of the queue has been
 * started: if the queue is currently empty, netif_rx_schedule() is called
 * first to start it, and then the skb is queued.
 *
 * Note that softnet_data is per-CPU, not per-device, so packets received by
 * several network devices may be waiting in the same queue.
 */

if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
if (queue->input_pkt_queue.qlen) {
enqueue:
dev_hold(skb->dev);
/* Append this SKB to the CPU's input_pkt_queue. */
__skb_queue_tail(&queue->input_pkt_queue, skb);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/*
 * netif_rx_schedule() is called here for the virtual device backlog_dev, and
 * only when queue->input_pkt_queue.qlen == 0: if the queue is not empty, the
 * bottom half has already been triggered by an earlier interrupt and will
 * keep processing the remaining packets, so there is no need to trigger it
 * again here.
 */
netif_rx_schedule(&queue->backlog_dev);
goto enqueue;
}
__get_cpu_var(netdev_rx_stat).dropped++;
local_irq_restore(flags);
kfree_skb(skb);
return NET_RX_DROP;
}

netif_rx_schedule(backlog_dev)(src/include/linux/netdevice.h)

/* Try to reschedule poll. Called by irq handler. */
static inline void netif_rx_schedule(struct net_device *dev)
{
/* netif_rx_schedule() links a NIC that has packets waiting into the softnet_data poll_list. */
if (netif_rx_schedule_prep(dev))
__netif_rx_schedule(dev);
}

__netif_rx_schedule(backlog_dev)(dev.c)
The poll function of backlog_dev: process_backlog (src/net/core/dev.c)

static int process_backlog(struct net_device *backlog_dev, int *budget)
{
int work = 0;
int quota = min(backlog_dev->quota, *budget);
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
backlog_dev->weight = weight_p;
for (;;) {
struct sk_buff *skb;
struct net_device *dev;
local_irq_disable();
/* Dequeue one SKB from the input queue. */
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb)
goto job_done;
local_irq_enable();
dev = skb->dev; /* the device this packet arrived on */
/* Hand the packet up to the protocol layers. */
netif_receive_skb(skb);
dev_put(dev);
work++;
if (work >= quota || jiffies - start_time > 1)
break;
}
backlog_dev->quota -= work;
*budget -= work;
return -1;
job_done:
backlog_dev->quota -= work;
*budget -= work;
list_del(&backlog_dev->poll_list);
smp_mb__before_clear_bit();
netif_poll_enable(backlog_dev);
local_irq_enable();
return 0;
}

netif_receive_skb (dev.c): passing packets up from the link layer

This is a helper used from the poll routines to process a received frame. Its main job is to hand an SKB to every registered protocol handler; each protocol type is described by a packet_type structure. Bridging is also handled here.
The core of netif_receive_skb() lies in two list traversals. One walks the ptype_all list, whose entries are sniffers registered with the kernel, and each of them is given a copy of the packet; the other walks ptype_base, which holds the concrete protocol types.
Suppose eth1 receives an IP packet. A copy is first delivered to each of the two packet_type entries on the ptype_all list, both handled by packet_rcv; then, while walking the hash table keyed by protocol type, a copy is delivered to the ETH_P_IP entry, which is handled by ip_rcv. If other IP-layer protocols are also registered on that chain, each of them gets a copy as well. The delivery itself is done by deliver_skb(skb, pt_prev, orig_dev):
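For reference, a protocol or sniffer ends up on those lists by registering a packet_type with dev_add_pack(). The sketch below uses made-up names (example_sniff_rcv, example_sniffer) and registers for ETH_P_ALL so that it would land on the ptype_all list walked in the code below; a concrete protocol such as IP registers ETH_P_IP and lands on ptype_base instead.

static int example_sniff_rcv(struct sk_buff *skb, struct net_device *dev,
                             struct packet_type *pt, struct net_device *orig_dev)
{
        /* a real sniffer would inspect the frame here */
        kfree_skb(skb);
        return 0;
}

static struct packet_type example_sniffer = {
        .type = __constant_htons(ETH_P_ALL),    /* ETH_P_IP would go to ptype_base */
        .func = example_sniff_rcv,
};

/* hooked into the lists, e.g. at module init, with: dev_add_pack(&example_sniffer); */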

int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
int ret = NET_RX_DROP;
__be16 type;
static int count;
/* if we've gotten here through NAPI, check netpoll */
if (skb->dev->poll && netpoll_rx(skb))
return NET_RX_DROP;
/*
 * In 2.4 the skb timestamp was recorded unconditionally, so the time a packet
 * entered the system could be read directly after getting the skb.  In 2.6
 * recording the timestamp is optional, presumably because many network
 * applications never use the skb's internal time and setting it adds
 * overhead; a static flag, netstamp_needed, controls whether timestamps are
 * recorded.
 */
if (!skb->tstamp.tv64)
net_timestamp(skb); /* set the timestamp */

if (!skb->iif)
skb->iif = skb->dev->ifindex;
orig_dev = skb_bond(skb);
if (!orig_dev)
return NET_RX_DROP;
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
/*
 * deliver_skb() is only a wrapper: it invokes the func handler of the given
 * packet_type.  For ETH_P_IP, as seen above, that handler is ip_rcv.
 * First walk the ptype_all list, whose entries match every Ethernet type.
 */
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL; /* noone else should process this after*/
} else {
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
}
ret = ing_filter(skb);
if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
kfree_skb(skb);
goto out;
}
skb->tc_verd = 0;
ncls:
#endif
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
{
count++ ;
printk(KERN_INFO "skb==NULL Count=%d",count);
goto out;
}
type = skb->protocol;
/* Then walk the hash chain for this specific protocol type. */
list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/

ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}


At this point the link-layer reception of an Ethernet frame is complete; what follows next is network-layer processing.