Chinaunix首页 | 论坛 | 博客
  • 博客访问: 86735
  • 博文数量: 15
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 210
  • 用 户 组: 普通用户
  • 注册时间: 2014-01-05 15:27
文章分类

全部博文(15)

文章存档

2014年(15)

我的朋友

分类: LINUX

2014-03-01 16:30:48

IP头限制一个包最大为64K,但实际上一般无法一次发送这么大的包:以太网有MTU,限制了一次发包的大小,
MTU一般为1500字节。因此发包过程涉及分片(packet fragmentation),该流程在L3层完成。
TCP把类似的操作提到了L4层,引入了MSS的概念,MSS一般是根据MTU计算出来的。
随着网卡速度的提升,网络收发包会占用大量cpu,为了尽量减少cpu的使用,网卡集成了很多之前软件完成的功能,即卸载功能
首先看关闭所有卸载功能、发送的包大于1500字节的情况。

例子:负载为25000字节
__ip_append_data函数中涉及skb包的分配,length为25000+L4=25008
  1. if (!skb)
  2.         goto alloc_new_skb;

  3.     while (length > 0) {
  4.         /* Check if the remaining data fits into current packet. */
  5.         copy = mtu - skb->len;
  6.         if (copy < length)
  7.             copy = maxfraglen - skb->len;
  8.         if (copy <= 0) {
  9.             char *data;
  10.             unsigned int datalen;
  11.             unsigned int fraglen;
  12.             unsigned int fraggap;
  13.             unsigned int alloclen;
  14.             struct sk_buff *skb_prev;
  15. alloc_new_skb:
  16.             skb_prev = skb;
  17.             if (skb_prev)
  18.                 fraggap = skb_prev->len - maxfraglen;
  19.             else
  20.                 fraggap = 0;

  21.             /*
  22.              * If remaining data exceeds the mtu,
  23.              * we know we need more fragment(s).
  24.              */
  25.             datalen = length + fraggap;
  26.             if (datalen > mtu - fragheaderlen)         
  27.                 datalen = maxfraglen - fragheaderlen; //maxfraglen=1500,fragheaderlen=20,datalen=1480
  28.             fraglen = datalen + fragheaderlen;

  29.             if ((flags & MSG_MORE) &&
  30.              !(rt->dst.dev->features&NETIF_F_SG))
  31.                 alloclen = mtu;
  32.             else
  33.                 alloclen = fraglen;  //alloclen=1500

  34.             alloclen += exthdrlen;

  35.             /* The last fragment gets additional space at tail.
  36.              * Note, with MSG_MORE we overallocate on fragments,
  37.              * because we have no idea what fragment will be
  38.              * the last.
  39.              */
  40.             if (datalen == length + fraggap)
  41.                 alloclen += rt->dst.trailer_len;

  42.             if (transhdrlen) {
  43.                 skb = sock_alloc_send_skb(sk,
  44.                         alloclen + hh_len + 15,   //第一个skb分配1500+16+15,在__alloc_skb,该大小会和L1 cacheline对齐(64),即变成1536
  45.                         (flags & MSG_DONTWAIT), &err);
  46.             } else {
  47.                 skb = NULL;
  48.                 if (atomic_read(&sk->sk_wmem_alloc) <=
  49.                  2 * sk->sk_sndbuf)
  50.                     skb = sock_wmalloc(sk,  
  51.                              alloclen + hh_len + 15, 1,
  52.                              sk->sk_allocation);
  53.                 if (unlikely(skb == NULL))
  54.                     err = -ENOBUFS;
  55.                 else
  56.                     /* only the initial fragment is
  57.                      time stamped */
  58.                     cork->tx_flags = 0;
  59.             }
  60.             if (skb == NULL)
  61.                 goto error;

  62.             /*
  63.              *    Fill in the control structures
  64.              */
  65.             skb->ip_summed = csummode;
  66.             skb->csum = 0;
  67.             skb_reserve(skb, hh_len);
  68.             skb_shinfo(skb)->tx_flags = cork->tx_flags;

  69.             /*
  70.              *    Find where to start putting bytes.
  71.              */
  72.             data = skb_put(skb, fraglen + exthdrlen); //skb->len+=1500
  73.             skb_set_network_header(skb, exthdrlen);
  74.             skb->transport_header = (skb->network_header +
  75.                          fragheaderlen);
  76.             data += fragheaderlen + exthdrlen;

  77.             if (fraggap) {
  78.                 skb->csum = skb_copy_and_csum_bits(
  79.                     skb_prev, maxfraglen,
  80.                     data + transhdrlen, fraggap, 0);
  81.                 skb_prev->csum = csum_sub(skb_prev->csum,
  82.                              skb->csum);
  83.                 data += fraggap;
  84.                 pskb_trim_unique(skb_prev, maxfraglen);
  85.             }

  86.             copy = datalen - transhdrlen - fraggap; //copy=MTU-L3-L4=1472
  87.                                                     //第二个以后的分片不再区别L4,每次拷贝1480字节
  88.             if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
  89.                 err = -EFAULT;
  90.                 kfree_skb(skb);
  91.                 goto error;
  92.             }

  93.             offset += copy;
  94.             length -= datalen - fraggap;
  95.             transhdrlen = 0;
  96.             exthdrlen = 0;
  97.             csummode = CHECKSUM_NONE;

  98.             /*
  99.              * Put the packet on the pending queue.
  100.              */
  101.             __skb_queue_tail(queue, skb);
  102.             continue;
  103.         }
分配多个skb,拷贝数据,注意length包含L4的头,第二个开始的skb不再区别L4头。


其中涉及SG卸载功能的地方有:

  1. if ((flags & MSG_MORE) &&
  2.      !(rt->dst.dev->features&NETIF_F_SG))
  3.         alloclen = mtu;
  4. else
  5.         alloclen = fraglen
即有SG功能的话,每次都是按需分配;如果没有SG功能,同时又带有MSG_MORE标志的话,则按MTU大小进行分配。这么做的原因在于SG可以支持数据包对应的内存不连续,后续追加的数据可以放到分页中;而没有SG时,后续数据只能通过skb_put追加到skb的线性缓冲区里,所以要预先留足空间。

  1.         if (!(rt->dst.dev->features&NETIF_F_SG)) {
  2.             unsigned int off;

  3.             off = skb->len;
  4.             if (getfrag(from, skb_put(skb, copy),
  5.                     offset, copy, off, skb) < 0) {
  6.                 __skb_trim(skb, off);
  7.                 err = -EFAULT;
  8.                 goto error;
  9.             }
  10.         } else {
  11.             int i = skb_shinfo(skb)->nr_frags;
  12.             skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
  13.             struct page *page = cork->page;
  14.             int off = cork->off;
  15.             unsigned int left;

  16.             if (page && (left = PAGE_SIZE - off) > 0) {
  17.                 if (copy >= left)
  18.                     copy = left;
  19.                 if (page != frag->page) {
  20.                     if (i == MAX_SKB_FRAGS) {
  21.                         err = -EMSGSIZE;
  22.                         goto error;
  23.                     }
  24.                     get_page(page);
  25.                     skb_fill_page_desc(skb, i, page, off, 0);
  26.                     frag = &skb_shinfo(skb)->frags[i];
  27.                 }
  28.             } else if (i < MAX_SKB_FRAGS) {
  29.                 if (copy > PAGE_SIZE)
  30.                     copy = PAGE_SIZE;
  31.                 page = alloc_pages(sk->sk_allocation, 0);
  32.                 if (page == NULL) {
  33.                     err = -ENOMEM;
  34.                     goto error;
  35.                 }
  36.                 cork->page = page;
  37.                 cork->off = 0;

  38.                 skb_fill_page_desc(skb, i, page, 0, 0);
  39.                 frag = &skb_shinfo(skb)->frags[i];
  40.             } else {
  41.                 err = -EMSGSIZE;
  42.                 goto error;
  43.             }
  44.             if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
  45.                 err = -EFAULT;
  46.                 goto error;
  47.             }
  48.             cork->off += copy;
  49.             frag->size += copy;
  50.             skb->len += copy;
  51.             skb->data_len += copy;
  52.             skb->truesize += copy;
  53.             atomic_add(copy, &sk->sk_wmem_alloc);
  54.         }
  55.         offset += copy;
  56.         length -= copy;
  57.     }
可以看到不管支不支持SG,都涉及内存的拷贝,只不过拷贝的目的地不一样而已。
理论上SG使得每个SKB支持16个页,大小为64K,但是实际上__ip_append_data分片的规格是MTU,大于MTU就会重新分配一个SKB
函数__ip_make_skb除了填充L3头外,还对skb的组织进行了修改:


所有的大小都会在第一个skb中体现,即本例中skb->len=25028,即25000+L3+L4
后续调用ip_finish_output进行发送,这里涉及另外一个卸载功能:GSO

  /*
   * ip_finish_output - last IPv4 output step before ip_finish_output2().
   * @skb: packet to transmit
   *
   * Sends the skb directly, or hands it to ip_fragment() first when it is
   * longer than the MTU reported by ip_skb_dst_mtu() and is not a GSO packet.
   * Returns whatever the chosen output function returns.
   */
  1. static int ip_finish_output(struct sk_buff *skb)
  2. {
  3. #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
  4.     /* Policy lookup after SNAT yielded a new policy */
  5.     if (skb_dst(skb)->xfrm != NULL) {
  6.         IPCB(skb)->flags |= IPSKB_REROUTED;
  7.         return dst_output(skb);  // restart output through the dst entry, flagged as rerouted
  8.     }
  9. #endif
  10.     if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))  // too big and not GSO: fragment in software
  11.         return ip_fragment(skb, ip_finish_output2);
  12.     else
  13.         return ip_finish_output2(skb);
  14. }
如果包的长度大于MTU,并且该skb不是GSO包(skb_is_gso为假),则先调用ip_fragment进行分片,否则直接调用ip_finish_output2。

  /*
   * ip_fragment - split an IP datagram into fragments and pass each to @output.
   * @skb:    datagram to fragment
   * @output: callback invoked once per fragment
   *
   * Fast path: if the skb carries a frag_list whose members pass the geometry
   * checks below, each member gets a copy of the IP header pushed in front of
   * its data and is sent as-is. Slow path: a new skb is allocated for every
   * fragment, header and payload are copied into it, and the original skb is
   * freed at the end.
   *
   * Returns 0 on success, -EMSGSIZE when DF is set and local_df is not,
   * -ENOMEM when a fragment cannot be allocated, or the error from @output.
   */
  1. int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
  2. {
  3.     struct iphdr *iph;
  4.     int ptr;
  5.     struct net_device *dev;
  6.     struct sk_buff *skb2;
  7.     unsigned int mtu, hlen, left, len, ll_rs;
  8.     int offset;
  9.     __be16 not_last_frag;
  10.     struct rtable *rt = skb_rtable(skb);
  11.     int err = 0;

  12.     dev = rt->dst.dev;

  13.     /*
  14.      *    Point into the IP datagram header.
  15.      */

  16.     iph = ip_hdr(skb);

  17.     if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {  // DF set and not overridden: refuse to fragment
  18.         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
  19.         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
  20.              htonl(ip_skb_dst_mtu(skb)));
  21.         kfree_skb(skb);
  22.         return -EMSGSIZE;
  23.     }

  24.     /*
  25.      *    Setup starting values.
  26.      */

  27.     hlen = iph->ihl * 4;
  28.     mtu = dst_mtu(&rt->dst) - hlen;    /* Size of data space */
  29. #ifdef CONFIG_BRIDGE_NETFILTER
  30.     if (skb->nf_bridge)
  31.         mtu -= nf_bridge_mtu_reduction(skb);
  32. #endif
  33.     IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

  34.     /* When frag_list is given, use it. First, check its validity:
  35.      * some transformers could create wrong frag_list or break existing
  36.      * one, it is not prohibited. In this case fall back to copying.
  37.      *
  38.      * LATER: this step can be merged to real generation of fragments,
  39.      * we can switch to copy when see the first bad fragment.
  40.      */
  41.     if (skb_has_frag_list(skb)) {
  42.         struct sk_buff *frag, *frag2;
  43.         int first_len = skb_pagelen(skb);  // skb_pagelen covers the linear buffer plus page fragments, but not frag_list

  44.         if (first_len - hlen > mtu ||  // in this example the skb has no page fragments, only a frag_list, so first_len = mtu + hlen
  45.          ((first_len - hlen) & 7) ||  // none of these conditions hold here, so the fast path is taken
  46.          ip_is_fragment(iph) ||
  47.          skb_cloned(skb))
  48.             goto slow_path;

  49.         skb_walk_frags(skb, frag) {  // validate every skb on the frag_list
  50.             /* Correct geometry. */
  51.             if (frag->len > mtu ||
  52.              ((frag->len & 7) && frag->next) ||
  53.              skb_headroom(frag) < hlen)
  54.                 goto slow_path_clean;

  55.             /* Partially cloned skb? */
  56.             if (skb_shared(frag))
  57.                 goto slow_path_clean;

  58.             BUG_ON(frag->sk);
  59.             if (skb->sk) {
  60.                 frag->sk = skb->sk;
  61.                 frag->destructor = sock_wfree;
  62.             }
  63.             skb->truesize -= frag->truesize;  // move the accounting from the head skb to the fragment itself
  64.         }

  65.         /* Everything is OK. */

  66.         err = 0;
  67.         offset = 0;
  68.         frag = skb_shinfo(skb)->frag_list;
  69.         skb_frag_list_init(skb);
  70.         skb->data_len = first_len - skb_headlen(skb);  // data_len = 0 in this example
  71.         skb->len = first_len;            // len = 1500 in this example
  72.         iph->tot_len = htons(first_len);
  73.         iph->frag_off = htons(IP_MF);
  74.         ip_send_check(iph);

  75.         for (;;) {
  76.             /* Prepare header of the next frame,
  77.              * before previous one went down. */
  78.             if (frag) {
  79.                 frag->ip_summed = CHECKSUM_NONE;
  80.                 skb_reset_transport_header(frag);
  81.                 __skb_push(frag, hlen);
  82.                 skb_reset_network_header(frag);
  83.                 memcpy(skb_network_header(frag), iph, hlen);  // copy the IP header into the fragment
  84.                 iph = ip_hdr(frag);
  85.                 iph->tot_len = htons(frag->len);
  86.                 ip_copy_metadata(frag, skb);
  87.                 if (offset == 0)
  88.                     ip_options_fragment(frag);
  89.                 offset += skb->len - hlen;
  90.                 iph->frag_off = htons(offset>>3);  // the offset field is stored in units of 8 bytes
  91.                 if (frag->next != NULL)
  92.                     iph->frag_off |= htons(IP_MF);
  93.                 /* Ready, complete checksum */
  94.                 ip_send_check(iph);
  95.             }

  96.             err = output(skb);  // calls ip_finish_output2; only the first skb carries an L4 header, the others do not

  97.             if (!err)
  98.                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
  99.             if (err || !frag)
  100.                 break;

  101.             skb = frag;  // advance to the next fragment and unlink it from the list
  102.             frag = skb->next;
  103.             skb->next = NULL;
  104.         }

  105.         if (err == 0) {
  106.             IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
  107.             return 0;
  108.         }

  109.         while (frag) {  // output failed: free the fragments that were never sent
  110.             skb = frag->next;
  111.             kfree_skb(frag);
  112.             frag = skb;
  113.         }
  114.         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
  115.         return err;

  116. slow_path_clean:
  117.         skb_walk_frags(skb, frag2) {  // undo the sk/destructor/truesize changes made before the failing fragment
  118.             if (frag2 == frag)
  119.                 break;
  120.             frag2->sk = NULL;
  121.             frag2->destructor = NULL;
  122.             skb->truesize += frag2->truesize;
  123.         }
  124.     }

  125. slow_path:
  126.     left = skb->len - hlen;        /* Space per frame */
  127.     ptr = hlen;        /* Where to start from */

  128.     /* for bridged IP traffic encapsulated inside f.e. a vlan header,
  129.      * we need to make room for the encapsulating header
  130.      */
  131.     ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

  132.     /*
  133.      *    Fragment the datagram.
  134.      */

  135.     offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
  136.     not_last_frag = iph->frag_off & htons(IP_MF);

  137.     /*
  138.      *    Keep copying data until we run out.
  139.      */

  140.     while (left > 0) {
  141.         len = left;
  142.         /* IF: it doesn't fit, use 'mtu' - the data space left */
  143.         if (len > mtu)
  144.             len = mtu;
  145.         /* IF: we are not sending up to and including the packet end
  146.          then align the next start on an eight byte boundary */
  147.         if (len < left)    {
  148.             len &= ~7;
  149.         }
  150.         /*
  151.          *    Allocate buffer.
  152.          */

  153.         if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
  154.             NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
  155.             err = -ENOMEM;
  156.             goto fail;
  157.         }

  158.         /*
  159.          *    Set up data on packet
  160.          */

  161.         ip_copy_metadata(skb2, skb);
  162.         skb_reserve(skb2, ll_rs);
  163.         skb_put(skb2, len + hlen);
  164.         skb_reset_network_header(skb2);
  165.         skb2->transport_header = skb2->network_header + hlen;

  166.         /*
  167.          *    Charge the memory for the fragment to any owner
  168.          *    it might possess
  169.          */

  170.         if (skb->sk)
  171.             skb_set_owner_w(skb2, skb->sk);

  172.         /*
  173.          *    Copy the packet header into the new buffer.
  174.          */

  175.         skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

  176.         /*
  177.          *    Copy a block of the IP datagram.
  178.          */
  179.         if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
  180.             BUG();
  181.         left -= len;

  182.         /*
  183.          *    Fill in the new header fields.
  184.          */
  185.         iph = ip_hdr(skb2);
  186.         iph->frag_off = htons((offset >> 3));

  187.         /* ANK: dirty, but effective trick. Upgrade options only if
  188.          * the segment to be fragmented was THE FIRST (otherwise,
  189.          * options are already fixed) and make it ONCE
  190.          * on the initial skb, so that all the following fragments
  191.          * will inherit fixed options.
  192.          */
  193.         if (offset == 0)
  194.             ip_options_fragment(skb);

  195.         /*
  196.          *    Added AC : If we are fragmenting a fragment that's not the
  197.          *         last fragment then keep MF on each bit
  198.          */
  199.         if (left > 0 || not_last_frag)
  200.             iph->frag_off |= htons(IP_MF);
  201.         ptr += len;
  202.         offset += len;

  203.         /*
  204.          *    Put this fragment into the sending queue.
  205.          */
  206.         iph->tot_len = htons(len + hlen);

  207.         ip_send_check(iph);

  208.         err = output(skb2);
  209.         if (err)
  210.             goto fail;

  211.         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
  212.     }
  213.     kfree_skb(skb);  // every byte has been copied into new skbs, so the original is released
  214.     IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
  215.     return err;

  216. fail:
  217.     kfree_skb(skb);
  218.     IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
  219.     return err;
  220. }
可以看到fast流程就是给frag_list中每个还没有L3头的skb加上L3头,然后逐个调用ip_finish_output2发送。



阅读(3047) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~