Chinaunix首页 | 论坛 | 博客
  • 博客访问: 86380
  • 博文数量: 15
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 210
  • 用 户 组: 普通用户
  • 注册时间: 2014-01-05 15:27
文章分类

全部博文(15)

文章存档

2014年(15)

我的朋友

分类: LINUX

2014-03-21 00:01:28

GRO(Generic Receive Offload)是一种接收方向的优化机制,让数据包尽可能在底层合并,给上层协议栈呈现一个大的数据包
驱动调用napi_gro_receive触发GRO流程

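napi_gro_receive首先调用skb_gro_reset_offset初始化GRO相关的字段,然后由napi_skb_finish根据__napi_gro_receive的返回值做不同的处理:如果数据包已被合并(GRO_MERGED/GRO_MERGED_FREE)或者被保留(GRO_HELD),则不需要往上层协议栈送;否则(GRO_NORMAL)通过netif_receive_skb上报。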
  /*
   * napi_skb_finish - act on the GRO verdict `ret` for `skb`.
   *
   * GRO_NORMAL hands the skb to netif_receive_skb(); GRO_DROP and
   * GRO_MERGED_FREE free it; GRO_HELD and GRO_MERGED leave it alone.
   * Returns `ret`, changed to GRO_DROP if netif_receive_skb() returned
   * non-zero.
   */
  1. gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
  2. {
  3.     switch (ret) {
  4.     case GRO_NORMAL: // not aggregated: deliver to the stack now
  5.         if (netif_receive_skb(skb))
  6.             ret = GRO_DROP; // non-zero return is reported to the caller as a drop
  7.         break;

  8.     case GRO_DROP:
  9.     case GRO_MERGED_FREE:
  10.         kfree_skb(skb); // both verdicts release this skb here
  11.         break;

  12.     case GRO_HELD:
  13.     case GRO_MERGED:
  14.         break; // nothing to do for these verdicts in this function
  15.     }

  16.     return ret;
  17. }

  /*
   * skb_gro_reset_offset - reset the per-skb GRO bookkeeping.
   *
   * Clears data_offset, frag0 and frag0_len in NAPI_GRO_CB(skb). When the
   * mac header sits at skb->tail and the first page fragment is not in
   * high memory, frag0/frag0_len are pointed at that first fragment.
   */
  18.  void skb_gro_reset_offset(struct sk_buff *skb)
  19.  {
  20.      NAPI_GRO_CB(skb)->data_offset = 0;
  21.      NAPI_GRO_CB(skb)->frag0 = NULL;
  22.      NAPI_GRO_CB(skb)->frag0_len = 0;
  23.  
  24.      if (skb->mac_header == skb->tail &&  // strongly driver dependent; per the original author, r8169 never takes this path
  25.       !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
  26.          NAPI_GRO_CB(skb)->frag0 =
  27.              page_address(skb_shinfo(skb)->frags[0].page) +
  28.              skb_shinfo(skb)->frags[0].page_offset; // address of the first fragment's data
  29.          NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
  30.      }
  31.  }
  32.  
  /*
   * napi_gro_receive - GRO entry point (per the article, called by the driver).
   *
   * Resets the skb's GRO offsets, runs __napi_gro_receive() and passes its
   * verdict to napi_skb_finish(), whose result is returned.
   */
  33.  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  34.  {
  35.      skb_gro_reset_offset(skb);
  36.  
  37.      return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
  38.  }
GRO的判断分层进行,首先是判断L2层相关的内容,然后是L3、L4,先看L2层的判断:

  /*
   * __napi_gro_receive - L2 stage of the GRO flow matching.
   *
   * For every skb `p` on napi->gro_list, sets NAPI_GRO_CB(p)->same_flow to
   * whether device, vlan_tci and the L2 header all match `skb`, and clears
   * NAPI_GRO_CB(p)->flush. Then continues with dev_gro_receive() and
   * returns its result.
   */
  1. static inline gro_result_t
  2. __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  3. {
  4.     struct sk_buff *p;

  5.     for (p = napi->gro_list; p; p = p->next) {
  6.         unsigned long diffs; // stays zero only if every compared field matches

  7.         diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; // the net device must be the same
  8.         diffs |= p->vlan_tci ^ skb->vlan_tci;
  9.         diffs |= compare_ether_header(skb_mac_header(p),  // the L2 header must be the same (14 bytes, per the original author)
  10.                      skb_gro_mac_header(skb));
  11.         NAPI_GRO_CB(p)->same_flow = !diffs;
  12.         NAPI_GRO_CB(p)->flush = 0;
  13.     }

  14.     return dev_gro_receive(napi, skb);
  15. }
L2层处理完后调用dev_gro_receive,这个涉及L3层的处理,需要相应的L3层协议提供相应的钩子函数,IP协议满足:

  /* Packet-type registration for ETH_P_IP; supplies the L3 GSO/GRO hooks. */
  1. static struct packet_type ip_packet_type __read_mostly = {
  2.     .type = cpu_to_be16(ETH_P_IP),
  3.     .func = ip_rcv,
  4.     .gso_send_check = inet_gso_send_check,
  5.     .gso_segment = inet_gso_segment,
  6.     .gro_receive = inet_gro_receive,   // called through ptype->gro_receive in dev_gro_receive
  7.     .gro_complete = inet_gro_complete,
  8. };
后面还会涉及L4的操作,同样需要相应的协议提供钩子函数,TCP满足,而UDP没有注册类似的函数:

  /* TCP's net_protocol registration; supplies the L4 GSO/GRO hooks. */
  1. static const struct net_protocol tcp_protocol = {
  2.     .handler =    tcp_v4_rcv,
  3.     .err_handler =    tcp_v4_err,
  4.     .gso_send_check = tcp_v4_gso_send_check,
  5.     .gso_segment =    tcp_tso_segment,
  6.     .gro_receive =    tcp4_gro_receive,   // called through ops->gro_receive in inet_gro_receive
  7.     .gro_complete =    tcp4_gro_complete,
  8.     .no_policy =    1,
  9.     .netns_ok =    1,
  10. };
看一下具体的流程:
  /*
   * dev_gro_receive - run the L3 GRO hook for `skb` and decide its fate.
   *
   * Looks up the packet_type matching skb->protocol that provides a
   * gro_receive hook and calls it. Returns:
   *   GRO_MERGED_FREE / GRO_MERGED - skb was merged into a same-flow entry
   *                                  (which one depends on NAPI_GRO_CB(skb)->free);
   *   GRO_HELD   - skb was put at the head of napi->gro_list as a new entry;
   *   GRO_NORMAL - GRO is not applicable; the caller delivers skb normally.
   * If the hook returns an entry pointer, that entry is unlinked from the
   * list and finished with napi_gro_complete().
   */
  1. enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  2. {
  3.     struct sk_buff **pp = NULL;
  4.     struct packet_type *ptype;
  5.     __be16 type = skb->protocol;
  6.     struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
  7.     int same_flow;
  8.     int mac_len;
  9.     enum gro_result ret;

  10.     if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))  // GRO can be toggled at runtime with ethtool (per the original author); when it is off, return GRO_NORMAL
  11.         goto normal;

  12.     if (skb_is_gso(skb) || skb_has_frag_list(skb)) // GSO skbs and skbs with a frag list bypass GRO
  13.         goto normal;

  14.     rcu_read_lock();
  15.     list_for_each_entry_rcu(ptype, head, list) {
  16.         if (ptype->type != type || ptype->dev || !ptype->gro_receive)  // per the original author, only ip_packet_type qualifies
  17.             continue;

  18.         skb_set_network_header(skb, skb_gro_offset(skb));
  19.         mac_len = skb->network_header - skb->mac_header;
  20.         skb->mac_len = mac_len;
  21.         NAPI_GRO_CB(skb)->same_flow = 0;
  22.         NAPI_GRO_CB(skb)->flush = 0;
  23.         NAPI_GRO_CB(skb)->free = 0;

  24.         pp = ptype->gro_receive(&napi->gro_list, skb); // inet_gro_receive for IPv4
  25.         break;
  26.     }
  27.     rcu_read_unlock();

  28.     if (&ptype->list == head) // the loop ran to the end without finding a handler
  29.         goto normal;

  30.     same_flow = NAPI_GRO_CB(skb)->same_flow;
  31.     ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

  32.     if (pp) { // the hook returned an entry: unlink it from the list and complete it
  33.         struct sk_buff *nskb = *pp;

  34.         *pp = nskb->next;
  35.         nskb->next = NULL;
  36.         napi_gro_complete(nskb);
  37.         napi->gro_count--;
  38.     }

  39.     if (same_flow) // skb was merged; return without the header pull below
  40.         goto ok;

  41.     if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) // must not be held, or the list is full
  42.         goto normal;

  43.     napi->gro_count++; // hold skb as a new entry at the head of gro_list
  44.     NAPI_GRO_CB(skb)->count = 1;
  45.     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
  46.     skb->next = napi->gro_list;
  47.     napi->gro_list = skb;
  48.     ret = GRO_HELD;

  49. pull:
  50.     if (skb_headlen(skb) < skb_gro_offset(skb)) { // linear area is shorter than the GRO offset
  51.         int grow = skb_gro_offset(skb) - skb_headlen(skb);

  52.         BUG_ON(skb->end - skb->tail < grow);

  53.         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); // copy the missing bytes from frag0 into the linear area

  54.         skb->tail += grow;
  55.         skb->data_len -= grow;

  56.         skb_shinfo(skb)->frags[0].page_offset += grow; // the first fragment shrinks by the copied bytes
  57.         skb_shinfo(skb)->frags[0].size -= grow;

  58.         if (unlikely(!skb_shinfo(skb)->frags[0].size)) { // first fragment is now empty: release it and shift the rest down
  59.             put_page(skb_shinfo(skb)->frags[0].page);
  60.             memmove(skb_shinfo(skb)->frags,
  61.                 skb_shinfo(skb)->frags + 1,
  62.                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
  63.         }
  64.     }

  65. ok:
  66.     return ret;

  67. normal:
  68.     ret = GRO_NORMAL;
  69.     goto pull;
  70. }


  /*
   * inet_gro_receive - IPv4 (L3) stage of GRO.
   *
   * Locates the IP header of `skb`, looks up the L4 protocol's gro_receive
   * hook, refines same_flow/flush on every entry of the list at `*head`,
   * then advances the GRO offset past the IP header and calls the L4 hook.
   * Returns whatever the L4 hook returned, or NULL on an early exit.
   * `flush` starts at 1, so every early exit marks skb to be flushed.
   */
  1. static struct sk_buff **inet_gro_receive(struct sk_buff **head,
  2.                      struct sk_buff *skb)
  3. {
  4.     const struct net_protocol *ops;
  5.     struct sk_buff **pp = NULL;
  6.     struct sk_buff *p;
  7.     const struct iphdr *iph;
  8.     unsigned int hlen;
  9.     unsigned int off;
  10.     unsigned int id;
  11.     int flush = 1;
  12.     int proto;

  13.     off = skb_gro_offset(skb);  
  14.     hlen = off + sizeof(*iph);
  15.     iph = skb_gro_header_fast(skb, off);
  16.     if (skb_gro_header_hard(skb, hlen)) {
  17.         iph = skb_gro_header_slow(skb, hlen, off);
  18.         if (unlikely(!iph))
  19.             goto out;
  20.     }  // iph now points at this skb's IP header, located at GRO offset off

  21.     proto = iph->protocol & (MAX_INET_PROTOS - 1); // index into inet_protos[]

  22.     rcu_read_lock();
  23.     ops = rcu_dereference(inet_protos[proto]);
  24.     if (!ops || !ops->gro_receive) // the L4 protocol has no GRO hook
  25.         goto out_unlock;

  26.     if (*(u8 *)iph != 0x45) // first header byte: only version 4 with IHL 5 (no IP options) is accepted
  27.         goto out_unlock;

  28.     if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) // presumably non-zero means a bad header checksum - TODO confirm
  29.         goto out_unlock;

  30.     id = ntohl(*(__be32 *)&iph->id); // 32-bit word starting at the id field
  31.     flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); // the low 16 bits of the first header word are the total length: flush if it differs from skb_gro_len(skb) or if the packet is fragmented
  32.     id >>= 16; // keep only the IP id

  33.     for (p = *head; p; p = p->next) {
  34.         struct iphdr *iph2;

  35.         if (!NAPI_GRO_CB(p)->same_flow)
  36.             continue;

  37.         iph2 = ip_hdr(p);

  38.         if ((iph->protocol ^ iph2->protocol) |     // at the IP layer, same flow requires the same L4 protocol, tos, source and destination address
  39.          (iph->tos ^ iph2->tos) |
  40.          ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
  41.          ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
  42.             NAPI_GRO_CB(p)->same_flow = 0;
  43.             continue;
  44.         }

  45.         /* All fields must match except length and checksum. */ // flush is only updated for entries that are still the same flow
  46.         NAPI_GRO_CB(p)->flush |=
  47.             (iph->ttl ^ iph2->ttl) |
  48.             ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); // the id must equal the held entry's id plus its count

  49.         NAPI_GRO_CB(p)->flush |= flush;
  50.     }

  51.     NAPI_GRO_CB(skb)->flush |= flush;
  52.     skb_gro_pull(skb, sizeof(*iph)); //NAPI_GRO_CB(skb)->data_offset += len;
  53.     skb_set_transport_header(skb, skb_gro_offset(skb)); //skb->transport_header = skb->data + offset;

  54.     pp = ops->gro_receive(head, skb); // L4 stage, e.g. tcp4_gro_receive

  55. out_unlock:
  56.     rcu_read_unlock();

  57. out:
  58.     NAPI_GRO_CB(skb)->flush |= flush;

  59.     return pp;
  60. }

  /*
   * tcp_gro_receive - TCP (L4) stage of GRO.
   *
   * Validates the TCP header of `skb`, searches the list at `*head` for a
   * same-flow entry and, if nothing forces a flush, merges skb into it via
   * skb_gro_receive(). Returns a pointer to the list slot of an entry that
   * should be flushed, or NULL. `flush` starts at 1, so the early `goto out`
   * exits mark skb itself to be flushed.
   */
  1. struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
  2. {
  3.     struct sk_buff **pp = NULL;
  4.     struct sk_buff *p;
  5.     struct tcphdr *th;
  6.     struct tcphdr *th2;
  7.     unsigned int len;
  8.     unsigned int thlen;
  9.     __be32 flags;
  10.     unsigned int mss = 1;
  11.     unsigned int hlen;
  12.     unsigned int off;
  13.     int flush = 1;
  14.     int i;

  15.     off = skb_gro_offset(skb);
  16.     hlen = off + sizeof(*th);
  17.     th = skb_gro_header_fast(skb, off);
  18.     if (skb_gro_header_hard(skb, hlen)) {
  19.         th = skb_gro_header_slow(skb, hlen, off);
  20.         if (unlikely(!th))
  21.             goto out;  // otherwise th points at the TCP header, located at GRO offset off
  22.     }

  23.     thlen = th->doff * 4; // doff counts 32-bit words
  24.     if (thlen < sizeof(*th))
  25.         goto out;

  26.     hlen = off + thlen; // re-check now that the full header length (with options) is known
  27.     if (skb_gro_header_hard(skb, hlen)) {
  28.         th = skb_gro_header_slow(skb, hlen, off);
  29.         if (unlikely(!th))
  30.             goto out;
  31.     }

  32.     skb_gro_pull(skb, thlen); // NAPI_GRO_CB(skb)->data_offset += len; per the original author, the offset now covers the L3 + L4 headers

  33.     len = skb_gro_len(skb);  //skb->len - NAPI_GRO_CB(skb)->data_offset; i.e. len is the payload with the headers removed
  34.     flags = tcp_flag_word(th);

  35.     for (; (p = *head); head = &p->next) {
  36.         if (!NAPI_GRO_CB(p)->same_flow)
  37.             continue;

  38.         th2 = tcp_hdr(p);

  39.         if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {  // compares the 32-bit word starting at source; NOTE(review): in the standard TCP header this covers both source and destination port - confirm
  40.             NAPI_GRO_CB(p)->same_flow = 0;
  41.             continue;
  42.         }

  43.         goto found;
  44.     }

  45.     goto out_check_final; // no same-flow entry; p is NULL here

  46. found:
  47.     flush = NAPI_GRO_CB(p)->flush;
  48.     flush |= (__force int)(flags & TCP_FLAG_CWR);
  49.     flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
  50.          ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); // any other flag-word difference forces a flush
  51.     flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
  52.     for (i = sizeof(*th); i < thlen; i += 4)  // the option area must be identical, otherwise flush
  53.         flush |= *(u32 *)((u8 *)th + i) ^
  54.              *(u32 *)((u8 *)th2 + i);

  55.     mss = skb_shinfo(p)->gso_size;

  56.     flush |= (len - 1) >= mss; // len is unsigned: true for len == 0 (wraps) and for len > mss
  57.     flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); // seq must follow directly after the held data

  58.     if (flush || skb_gro_receive(head, skb)) {
  59.         mss = 1;
  60.         goto out_check_final;
  61.     }

  62.     p = *head;
  63.     th2 = tcp_hdr(p);
  64.     tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); // carry FIN/PSH over to the held entry

  65. out_check_final:
  66.     flush = len < mss;
  67.     flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
  68.                     TCP_FLAG_RST | TCP_FLAG_SYN |
  69.                     TCP_FLAG_FIN));

  70.     if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
  71.         pp = head; // hand back the list slot of the entry to flush

  72. out:
  73.     NAPI_GRO_CB(skb)->flush |= flush;

  74.     return pp;
  75. }









阅读(4138) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~