Chinaunix首页 | 论坛 | 博客
  • 博客访问: 253421
  • 博文数量: 88
  • 博客积分: 1429
  • 博客等级:
  • 技术积分: 523
  • 用 户 组: 普通用户
  • 注册时间: 2010-01-18 15:31
文章分类

全部博文(88)

文章存档

2017年(2)

2016年(24)

2013年(1)

2012年(24)

2011年(15)

2010年(22)

我的朋友

分类: LINUX

2016-09-25 01:02:09

原文地址:网卡驱动的收发包流程 作者:随风去

分析网络协议栈的代码,如果不看驱动代码的话,总是感觉没有完全落到实处。
由于手头上只有rtl8169的网卡设备,为了方便调试和分析,因此选择该款设备对应的驱动进行分析:r8169.ko,内核代码还是基于3.1.3
对于协议栈而言:
1)发包的最后步骤为调用驱动注册的ndo_start_xmit钩子函数,具体如何实现由驱动完成
2)收包由中断触发,驱动负责skb的生成以及其他操作如校验等,然后传给协议栈
但是这些都只是模糊的概念,具体如何操作,如卸载功能、NAPI如何在驱动层次实现没有完全清晰的了解。
因此希望通过分析r8169的驱动代码,对更底层的逻辑有更好的掌握。
要分析驱动的代码,数据手册必不可少,特别是寄存器以及描述符的说明。

发包流程:

对于驱动而言,发包流程可以简单地总结为:根据skb填充好发送描述符,然后通知硬件进行发包
具体代码如下:

  1. static const struct net_device_ops rtl8169_netdev_ops = {
  2.     .ndo_open        = rtl8169_open,
  3.     .ndo_stop        = rtl8169_close,
  4.     .ndo_get_stats        = rtl8169_get_stats,
  5.     .ndo_start_xmit        = rtl8169_start_xmit,  //r8169对应的发包函数
  6.     .ndo_tx_timeout        = rtl8169_tx_timeout,
  7.     .ndo_validate_addr    = eth_validate_addr,
  8.     .ndo_change_mtu        = rtl8169_change_mtu,
  9.     .ndo_fix_features    = rtl8169_fix_features,
  10.     .ndo_set_features    = rtl8169_set_features,
  11.     .ndo_set_mac_address    = rtl_set_mac_address,
  12.     .ndo_do_ioctl        = rtl8169_ioctl,
  13.     .ndo_set_multicast_list    = rtl_set_rx_mode,
  14. #ifdef CONFIG_NET_POLL_CONTROLLER
  15.     .ndo_poll_controller    = rtl8169_netpoll,
  16. #endif

  17. }
要看懂发包函数的代码,需要先查看r8169的数据手册,对发送描述符以及一些寄存器有一定的了解:
由于r8169支持TSO卸载功能,因此其发送描述符分成两种格式:




ps:网上没有找到完全对应的数据手册,因此名字以及具体的格式可能有一定的差别,但是基本上影响不大

有了上面的说明,就可以看懂r8169的发包流程了

  /*
   * Transmit entry point (ndo_start_xmit) of the r8169 driver: fills the Tx
   * descriptor(s) for one skb and tells the NIC to start sending.
   *
   * @skb: packet to transmit
   * @dev: net device whose Tx ring is used
   *
   * Returns NETDEV_TX_OK when the skb was queued, and also when it was freed
   * and counted in tx_dropped because DMA mapping failed or
   * rtl8169_xmit_frags() returned a negative value. Returns NETDEV_TX_BUSY,
   * after stopping the queue, when the ring-full check fails or the current
   * descriptor is still owned by the NIC.
   */
  1. static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
  2.                  struct net_device *dev)
  3. {
  4.     struct rtl8169_private *tp = netdev_priv(dev);
  5.     unsigned int entry = tp->cur_tx % NUM_TX_DESC;
  6.     struct TxDesc *txd = tp->TxDescArray + entry;
  7.     void __iomem *ioaddr = tp->mmio_addr;
  8.     struct device *d = &tp->pci_dev->dev;
  9.     dma_addr_t mapping;
  10.     u32 status, len;
  11.     u32 opts[2];
  12.     int frags;
  13.     // First make sure enough Tx descriptors are free: a skbuff with nr_frags needs nr_frags+1 entries in the tx queue
  14.     if (unlikely(TX_BUFFS_AVAIL(tp) < skb_shinfo(skb)->nr_frags)) {  // article author's note: this check looks off by one (should add 1) and is said to be fixed in newer code. NOTE(review): depends on how TX_BUFFS_AVAIL is defined, which is not shown here - confirm
  15.         netif_err(tp, drv, dev, "BUG! Tx Ring full when queue awake!\n");
  16.         goto err_stop_0;
  17.     }
  18.     // By the definition of DescOwn: when set, the descriptor is owned by the NIC and awaiting transmission, i.e. it is not free and the driver cannot use it
  19.     // Life cycle: initialized to 0, set to 1 once the driver has filled the descriptor; after the NIC has sent it, an interrupt fires, resources are released and the bit goes back to 0
  20.     if (unlikely(le32_to_cpu(txd->opts1) & DescOwn))
  21.         goto err_stop_0;
  22.     // Handle the linear part of the skb first; it takes one Tx descriptor
  23.     len = skb_headlen(skb);
  24.     mapping = dma_map_single(d, skb->data, len, DMA_TO_DEVICE);  // DMA-map the linear buffer
  25.     if (unlikely(dma_mapping_error(d, mapping))) {
  26.         if (net_ratelimit())
  27.             netif_err(tp, drv, dev, "Failed to map TX DMA!\n");
  28.         goto err_dma_0;
  29.     }

  30.     tp->tx_skb[entry].len = len;
  31.     txd->addr = cpu_to_le64(mapping);

  32.     opts[1] = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));
  33.     opts[0] = DescOwn;
  34.     // skb_shinfo(skb)->gso_size decides which Tx descriptor format is used
  35.     rtl8169_tso_csum(tp, skb, opts);

  36.     frags = rtl8169_xmit_frags(tp, skb, opts); // one Tx descriptor is set up per frag
  37.     if (frags < 0)
  38.         goto err_dma_1;
  39.     else if (frags) // skb has frags: it spans >= 2 Tx descriptors, so FirstFrag and LastFrag sit on different descriptors
  40.         opts[0] |= FirstFrag;
  41.     else { // no frags: a single Tx descriptor carries both FirstFrag and LastFrag
  42.         opts[0] |= FirstFrag | LastFrag;
  43.         tp->tx_skb[entry].skb = skb;
  44.     }

        /*
         * NOTE(review): opts[1] already went through cpu_to_le32() when it
         * was initialised above and is converted again here - confirm this
         * is intended on big-endian hosts.
         */
  45.     txd->opts2 = cpu_to_le32(opts[1]);

  46.     wmb();

  47.     /* Anti gcc 2.95.3 bugware (sic) */
  48.     status = opts[0] | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
  49.     txd->opts1 = cpu_to_le32(status);

  50.     tp->cur_tx += frags + 1;

  51.     wmb();

  52.     RTL_W8(TxPoll, NPQ); // set the NPQ bit of the TxPoll register to tell the hardware it may transmit

  53.     // Flow control: if Tx descriptors are running short, stop the queue for now; it is woken again once resources have been released
  54.     if (TX_BUFFS_AVAIL(tp) < MAX_SKB_FRAGS) {
  55.         netif_stop_queue(dev);
  56.         smp_rmb();
  57.         if (TX_BUFFS_AVAIL(tp) >= MAX_SKB_FRAGS)
  58.             netif_wake_queue(dev);
  59.     }

  60.     return NETDEV_TX_OK;

  61. err_dma_1:
  62.     rtl8169_unmap_tx_skb(d, tp->tx_skb + entry, txd);
  63. err_dma_0:
  64.     dev_kfree_skb(skb);
  65.     dev->stats.tx_dropped++;
  66.     return NETDEV_TX_OK;

  67. err_stop_0:
  68.     netif_stop_queue(dev);
  69.     dev->stats.tx_dropped++;
  70.     return NETDEV_TX_BUSY;
  71. }
由于两种发送描述符的结构不一样,因此分别赋值,开启TSO的话,指定TD_LSO标记,让硬件进行分包和校验
不开启TSO的话,根据传输类型,指定校验标记,让硬件完成校验功能。
  1. /*
  2.  * Set the offload bits in the Tx descriptor option words for one skb.
  3.  *
  4.  * @tp:   driver private data; tp->txd_version selects the tx_desc_info entry
  5.  * @skb:  packet being queued for transmission
  6.  * @opts: the two option words being assembled for the Tx descriptor
  7.  *
  8.  * With a non-zero gso_size, TD_LSO is set in opts[0] and the MSS (capped at
  9.  * TD_MSS_MAX) is shifted into the option word chosen by opts_offset.
  10.  * Otherwise, for CHECKSUM_PARTIAL packets, the TCP or UDP checksum flag is
  11.  * OR-ed into that same word; any other IP protocol triggers a one-time
  12.  * warning. Packets matching neither case leave opts untouched.
  13.  */
  14. static inline void rtl8169_tso_csum(struct rtl8169_private *tp,
  15.                  struct sk_buff *skb, u32 *opts)
  16. {
  17.     const struct rtl_tx_desc_info *desc_info = &tx_desc_info[tp->txd_version];
  18.     u32 mss = skb_shinfo(skb)->gso_size;
  19.     int idx = desc_info->opts_offset;

  20.     if (mss) {
  21.         /* Segmentation offload: the hardware splits the packet. */
  22.         opts[0] |= TD_LSO;
  23.         opts[idx] |= min(mss, TD_MSS_MAX) << desc_info->mss_shift;
  24.         return;
  25.     }

  26.     if (skb->ip_summed != CHECKSUM_PARTIAL)
  27.         return;

  28.     /* Checksum offload: pick the flag matching the transport protocol. */
  29.     switch (ip_hdr(skb)->protocol) {
  30.     case IPPROTO_TCP:
  31.         opts[idx] |= desc_info->checksum.tcp;
  32.         break;
  33.     case IPPROTO_UDP:
  34.         opts[idx] |= desc_info->checksum.udp;
  35.         break;
  36.     default:
  37.         WARN_ON_ONCE(1);
  38.         break;
  39.     }
  40. }
从上面可以看到,驱动是如何实现卸载功能的(分片以及校验功能),和协议栈是如何交互的:
1)根据参数skb_shinfo(skb)->gso_size,决定使用的发送描述符的类型;反过来,上层协议栈肯定是感知到底层(网卡)支持分片卸载的情况下才会把大包传下来的。
2)上层没有计算校验和(skb->ip_summed == CHECKSUM_PARTIAL)的话,由硬件计算校验和;反过来,上层协议栈肯定是感知到底层支持校验和卸载的情况下才不自己计算校验和的。

收包流程

和发包这种近似同步的流程不同,收包是异步的,由中断触发。直观上发包也应该有中断,即硬件发完一个包后触发一个中断,由驱动释放相应的资源。
但是目前很多驱动使用了变通的方法:在收包中断的处理流程中一并完成发包资源的释放,r8169驱动就是这么做的。
目前主流网卡驱动都使用了NAPI的机制,即中断和轮询相结合的方法来提高性能。
在网卡open的时候申请中断号:

  1. retval = request_irq(dev->irq, rtl8169_interrupt,
  2.              (tp->features & RTL_FEATURE_MSI) ? 0 : IRQF_SHARED,
  3.              dev->name, dev)


在分析中断函数前,需要知道r8169对应的中断寄存器的说明:

  1.     SYSErr        = 0x8000,
  2.     PCSTimeout    = 0x4000,
  3.     SWInt        = 0x0100,
  4.     TxDescUnavail    = 0x0080,
  5.     RxFIFOOver    = 0x0040,
  6.     LinkChg        = 0x0020,
  7.     RxOverflow    = 0x0010,
  8.     TxErr        = 0x0008,
  9.     TxOK        = 0x0004,
  10.     RxErr        = 0x0002,
  11.     RxOK        = 0x0001,

  1. static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
  2. {
  3.     struct net_device *dev = dev_instance;
  4.     struct rtl8169_private *tp = netdev_priv(dev);
  5.     void __iomem *ioaddr = tp->mmio_addr;
  6.     int handled = 0;
  7.     int status;

  8.     /* loop handling interrupts until we have no new ones or
  9.      * we hit a invalid/hotplug case.
  10.      */
  11.     status = RTL_R16(IntrStatus);
  12.     while (status && status != 0xffff) {  //status的某一位为1表示有相应的中断发生
  13.         handled = 1;

  14.         /* Handle all of the error cases first. These will reset
  15.          * the chip, so just exit the loop.
  16.          */
  17.         if (unlikely(!netif_running(dev))) {
  18.             rtl8169_hw_reset(tp);
  19.             break;
  20.         }

  21.         if (unlikely(status & SYSErr)) { //发送错误的一些处理流程
  22.             rtl8169_pcierr_interrupt(dev);
  23.             break;
  24.         }

  25.         if (status & LinkChg)
  26.             __rtl8169_check_link_status(dev, tp, ioaddr, true);

  27.         /* We need to see the lastest version of tp->intr_mask to
  28.          * avoid ignoring an MSI interrupt and having to wait for
  29.          * another event which may never come.
  30.          */
  31.         smp_rmb();
  32.         if (status & tp->intr_mask & tp->napi_event)
  33.             RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event); //IntrMask的某一位置1表示使能该中断,处理过程中先关闭同样的中断
  34.             tp->intr_mask = ~tp->napi_event;  //处理完后再全部打开,具体看rtl8169_poll函数

  35.             if (likely(napi_schedule_prep(&tp->napi))) //和协议栈的NAPI机制交互,触发收包软中断,调用r8169的poll函数:rtl8169_poll
  36.                 __napi_schedule(&tp->napi);   
  37.             else
  38.                 netif_info(tp, intr, dev,
  39.                      "interrupt %04x in poll\n", status);
  40.         }

  41.         /* We only get a new MSI interrupt when all active irq
  42.          * sources on the chip have been acknowledged. So, ack
  43.          * everything we
收包软中断net_rx_action主要是调用各个NAPI对应的poll函数。
接下来看r8169的poll函数,该函数除了完成收包任务外,还附带完成发送相关的资源释放任务
  1. /*
  2.  * NAPI poll callback of the r8169 driver: receives packets within @budget
  3.  * and, on the same pass, runs the Tx completion handler.
  4.  *
  5.  * @napi:   NAPI context embedded in struct rtl8169_private
  6.  * @budget: upper bound on the Rx work for this call
  7.  *
  8.  * Returns the work count reported by rtl8169_rx_interrupt(). When that count
  9.  * is below @budget, polling is completed and IntrMask is rewritten with
  10.  * tp->intr_event.
  11.  */
  12. static int rtl8169_poll(struct napi_struct *napi, int budget)
  13. {
  14.     struct rtl8169_private *tp;
  15.     struct net_device *netdev;
  16.     void __iomem *ioaddr; /* keep this name: RTL_W16 presumably uses it implicitly - confirm */
  17.     int rx_done;

  18.     tp = container_of(napi, struct rtl8169_private, napi);
  19.     netdev = tp->dev;
  20.     ioaddr = tp->mmio_addr;

  21.     rx_done = rtl8169_rx_interrupt(netdev, tp, ioaddr, (u32) budget);
  22.     rtl8169_tx_interrupt(netdev, tp, ioaddr);

  23.     /* Budget used up: stay in polling mode and leave the mask alone. */
  24.     if (rx_done >= budget)
  25.         return rx_done;

  26.     napi_complete(napi);

  27.     /*
  28.      * The new tp->intr_mask must be visible to other CPUs before the
  29.      * chip can raise the next interrupt; otherwise an MSI interrupt
  30.      * could be lost and we might sit waiting for a retransmit timeout.
  31.      * The posted write to IntrMask is safe: it reaches the chip
  32.      * eventually and nothing is lost until it does.
  33.      */
  34.     tp->intr_mask = 0xffff;
  35.     wmb();
  36.     RTL_W16(IntrMask, tp->intr_event);

  37.     return rx_done;
  38. }
要看懂收包流程同样需要结合数据手册:
收包和发包相反,其接收描述符相关的内容由硬件赋值好,驱动主要根据该描述符构造skb数据结构然后传给协议栈。
为了使硬件能够正确地使用接收描述符,驱动要先准备好相应的资源:
初始化后的描述符如下:

硬件赋值好的描述符如下,这种描述符是驱动用于构造skb的:


  /*
   * Receive path run from the poll routine: walks the Rx ring starting at
   * tp->cur_rx and handles at most min(rx_left, budget) descriptors.
   *
   * @dev:    net device, used for statistics and eth_type_trans()
   * @tp:     driver private data (Rx ring, buffers, NAPI context)
   * @ioaddr: MMIO base passed in by the caller
   * @budget: maximum number of descriptors to process
   *
   * For each descriptor no longer owned by the NIC: error frames and
   * fragmented frames are only counted, good frames are copied into a new skb
   * and passed to napi_gro_receive(). In every case the descriptor is given
   * back via rtl8169_mark_to_asic().
   *
   * Returns the number of ring entries consumed (cur_rx advance); tp->cur_rx
   * and tp->dirty_rx are both moved forward by that amount.
   */
  1. static int rtl8169_rx_interrupt(struct net_device *dev,
  2.                 struct rtl8169_private *tp,
  3.                 void __iomem *ioaddr, u32 budget)
  4. {
  5.     unsigned int cur_rx, rx_left;
  6.     unsigned int count;

  7.     cur_rx = tp->cur_rx;
  8.     rx_left = NUM_RX_DESC + tp->dirty_rx - cur_rx; // article author's note: dirty_rx always equals cur_rx, so this term adds nothing; newer driver versions drop the field
  9.     rx_left = min(rx_left, budget); // article author's note: effectively min(budget, NUM_RX_DESC)

  10.     for (; rx_left > 0; rx_left--, cur_rx++) {
  11.         unsigned int entry = cur_rx % NUM_RX_DESC;
  12.         struct RxDesc *desc = tp->RxDescArray + entry;
  13.         u32 status;

  14.         rmb();
  15.         status = le32_to_cpu(desc->opts1) & tp->opts1_mask;

  16.         if (status & DescOwn) // per the datasheet: the descriptor still belongs to the NIC, i.e. no data has been received into it yet
  17.             break;
  18.         if (unlikely(status & RxRES)) {  // summary error flag
  19.             netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
  20.                  status);
  21.             dev->stats.rx_errors++;
  22.             if (status & (RxRWT | RxRUNT))
  23.                 dev->stats.rx_length_errors++;
  24.             if (status & RxCRC)
  25.                 dev->stats.rx_crc_errors++;
  26.             if (status & RxFOVF) {
  27.                 rtl8169_schedule_work(dev, rtl8169_reset_task);
  28.                 dev->stats.rx_fifo_errors++;
  29.             }
  30.             rtl8169_mark_to_asic(desc, rx_buf_sz);
  31.         } else {
  32.             struct sk_buff *skb;
  33.             dma_addr_t addr = le64_to_cpu(desc->addr);
  34.             int pkt_size = (status & 0x00001FFF) - 4; // low 13 bits of status minus 4 (presumably the trailing CRC - TODO confirm)

  35.             /*
  36.              * The driver does not support incoming fragmented
  37.              * frames. They are seen as a symptom of over-mtu
  38.              * sized frames.
  39.              */
  40.             if (unlikely(rtl8169_fragmented_frame(status))) {
  41.                 dev->stats.rx_dropped++;
  42.                 dev->stats.rx_length_errors++;
  43.                 rtl8169_mark_to_asic(desc, rx_buf_sz);
  44.                 continue; // skips the workaround at the end of the loop body; the for-increment still advances cur_rx
  45.             }

  46.             skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
  47.                          tp, pkt_size, addr);
  48.             rtl8169_mark_to_asic(desc, rx_buf_sz); // the data has been copied out, so the Rx descriptor can be handed back to the NIC
  49.             if (!skb) {
  50.                 dev->stats.rx_dropped++;
  51.                 continue;
  52.             }
  53.             // per the article author: when the hardware has already verified the checksum, this sets skb->ip_summed = CHECKSUM_UNNECESSARY
  54.             rtl8169_rx_csum(skb, status);
  55.             skb_put(skb, pkt_size);
  56.             skb->protocol = eth_type_trans(skb, dev); // derive the protocol from the packet's L2 header

  57.             rtl8169_rx_vlan_tag(desc, skb);

  58.             napi_gro_receive(&tp->napi, skb); // hand the packet to the protocol stack through GRO

  59.             dev->stats.rx_bytes += pkt_size;
  60.             dev->stats.rx_packets++;
  61.         }

  62.         /* Work around for AMD platform. */
  63.         if ((desc->opts2 & cpu_to_le32(0xfffe000)) &&
  64.          (tp->mac_version == RTL_GIGA_MAC_VER_05)) {
  65.             desc->opts2 = 0;
  66.             cur_rx++;
  67.         }
  68.     }

  69.     count = cur_rx - tp->cur_rx;
  70.     tp->cur_rx = cur_rx;

  71.     tp->dirty_rx += count;

  72.     return count;
  73. }
驱动负责解析L2的头,根据mac地址的值以及协议类型设置skb的分量:
  /*
   * Classify a received Ethernet frame and determine its protocol ID.
   *
   * @skb: received packet; on entry skb->data points at the Ethernet header
   * @dev: receiving net device
   *
   * Sets skb->dev, records the MAC header position, pulls ETH_HLEN bytes off
   * the front of the skb, and sets skb->pkt_type to PACKET_BROADCAST,
   * PACKET_MULTICAST or PACKET_OTHERHOST when the destination address calls
   * for it.
   *
   * Returns the protocol in network byte order: ETH_P_DSA or ETH_P_TRAILER for
   * devices using those tags, the frame's h_proto when it is >= 1536,
   * ETH_P_802_3 when the payload starts with 0xFFFF, otherwise ETH_P_802_2.
   */
  1. __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
  2. {
  3.     struct ethhdr *eth;

  4.     skb->dev = dev;
  5.     skb_reset_mac_header(skb);
  6.     skb_pull_inline(skb, ETH_HLEN);
  7.     eth = eth_hdr(skb);

  8.     if (unlikely(is_multicast_ether_addr(eth->h_dest))) {  // set pkt_type from the destination MAC address
  9.         if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
  10.             skb->pkt_type = PACKET_BROADCAST;
  11.         else
  12.             skb->pkt_type = PACKET_MULTICAST;
  13.     }

  14.     /*
  15.      * This ALLMULTI check should be redundant by 1.4
  16.      * so don't forget to remove it.
  17.      *
  18.      * Seems, you forgot to remove it. All silly devices
  19.      * seems to set IFF_PROMISC.
  20.      */

  21.     else if (1 /*dev->flags&IFF_PROMISC */ ) {
  22.         if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr)))
  23.             skb->pkt_type = PACKET_OTHERHOST;
  24.     }

  25.     /*
  26.      * Some variants of DSA tagging don't have an ethertype field
  27.      * at all, so we check here whether one of those tagging
  28.      * variants has been configured on the receiving interface,
  29.      * and if so, set skb->protocol without looking at the packet.
  30.      */
  31.     if (netdev_uses_dsa_tags(dev))
  32.         return htons(ETH_P_DSA);
  33.     if (netdev_uses_trailer_tags(dev))
  34.         return htons(ETH_P_TRAILER);

  35.     if (ntohs(eth->h_proto) >= 1536)
  36.         return eth->h_proto;   // e.g. IP is 0x0800, ARP is 0x0806
  37.     /*
  38.      * This is a magic hack to spot IPX packets. Older Novell breaks
  39.      * the protocol design and runs IPX over 802.3 without an 802.2 LLC
  40.      * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
  41.      * won't work for fault tolerant netware but does for the rest.
  42.      */
  43.     if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF)
  44.         return htons(ETH_P_802_3);

  45.     /*
  46.      * Real 802.2 LLC
  47.      */
  48.     return htons(ETH_P_802_2);
  49. }
看驱动收包后,在发给上层协议栈前skb的具体情况:



再看一下发包后资源释放的流程:
  1. static void rtl8169_tx_interrupt(struct net_device *dev,
  2.                  struct rtl8169_private *tp,
  3.                  void __iomem *ioaddr)
  4. {
  5.     unsigned int dirty_tx, tx_left;

  6.     dirty_tx = tp->dirty_tx;
  7.     smp_rmb();
  8.     tx_left = tp->cur_tx - dirty_tx;

  9.     while (tx_left > 0) {
  10.         unsigned int entry = dirty_tx % NUM_TX_DESC;
  11.         struct ring_info *tx_skb = tp->tx_skb + entry;
  12.         u32 status;

  13.         rmb();
  14.         status = le32_to_cpu(tp->TxDescArray[entry].opts1);
  15.         if (status & DescOwn)  //DescOwn为1表示硬件还没发送完该包,发送完后硬件会把该位置0
  16.             break;

  17.         rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
  18.                  tp->TxDescArray + entry);
  19.         if (status & LastFrag) {  //如果一个skb对应的所有发送描述符都可以释放的话,该skb也可以释放了
  20.             dev->stats.tx_packets++;
  21.             dev->stats.tx_bytes += tx_skb->skb->len;
  22.             dev_kfree_skb(tx_skb->skb);
  23.             tx_skb->skb = NULL;
  24.         }
  25.         dirty_tx++;
  26.         tx_left--;
  27.     }

  28.     if (tp->dirty_tx != dirty_tx) {
  29.         tp->dirty_tx = dirty_tx;
  30.         smp_wmb();
  31.         if (netif_queue_stopped(dev) && //如果之前由于发送描述符不够导致队列关闭的话,重新判断
  32.          (TX_BUFFS_AVAIL(tp) >= MAX_SKB_FRAGS)) {
  33.             netif_wake_queue(dev);
  34.         }
  35.         /*
  36.          * 8168 hack: TxPoll requests are lost when the Tx packets are
  37.          * too close. Let
网卡驱动的收包流程可以总结为:根据接收描述符的内容构造skb,拷贝数据,赋值相应的分量,然后调用napi_gro_receive发给上层协议栈。


阅读(10080) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~