Chinaunix首页 | 论坛 | 博客
  • 博客访问: 537494
  • 博文数量: 120
  • 博客积分: 3030
  • 博客等级: 中校
  • 技术积分: 1445
  • 用 户 组: 普通用户
  • 注册时间: 2006-03-05 01:00
文章存档

2011年(1)

2009年(2)

2008年(32)

2007年(33)

2006年(52)

我的朋友

分类: LINUX

2008-03-26 11:14:52

ip数据包分片函数ip_fragment分析(上)
2007-11-30 17:38:14

 

/*

 *    This IP datagram is too large to be sent in one piece.  Break it up into

 *    smaller pieces (each of size equal to IP header plus

 *    a block of the data of the original IP data part) that will yet fit in a

 *    single device frame, and queue such a frame for sending.

 *
 *    skb:    the datagram to fragment.  On every path in this function it is
 *            either passed to output() or released with kfree_skb().
 *    output: callback invoked once per fragment; a nonzero return value stops
 *            the fragmentation and is returned to the caller.
 *
 *    Two strategies are used.  If skb carries a frag_list whose buffers
 *    already have a suitable geometry, each buffer is turned into a fragment
 *    in place (fast path).  Otherwise every fragment is freshly allocated and
 *    the data is copied into it (slow_path).
 *
 *    Returns 0 on success, -EMSGSIZE when IP_DF is set and skb->local_df is
 *    clear, -ENOMEM when alloc_skb() fails in the slow path, or the nonzero
 *    value returned by output().
 */

 

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))

{

       struct iphdr *iph;// IP header currently being worked on

       int raw = 0;

       int ptr;

       struct net_device *dev;// output device; NOTE(review): assigned below but never read in this function

       struct sk_buff *skb2;// freshly allocated fragment (slow path only)

       unsigned int mtu, hlen, left, len, ll_rs;

       int offset;// byte offset of the current fragment's data

       int not_last_frag;// nonzero if the incoming header already had IP_MF set

       struct rtable *rt = (struct rtable*)skb->dst;

       int err = 0;// return value; stays 0 unless something fails

 

       dev = rt->u.dst.dev;// device taken from the route attached to skb

 

       /*

        *    Point into the IP datagram header.

        */

 

       iph = skb->nh.iph;// start from the IP header of the original skb

 

       // If the header has IP_DF set and skb->local_df is clear, fragmentation is
       // not allowed: send an ICMP "fragmentation needed" carrying the path MTU,
       // drop the packet and fail.
       if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {

              icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,

                       htonl(dst_pmtu(&rt->u.dst)));

              kfree_skb(skb);// the packet cannot be sent, release it

              return -EMSGSIZE;// packet is too large and may not be fragmented

       }

 

       /*

        *    Setup starting values.

        */

 

       hlen = iph->ihl * 4;// IP header length in bytes

       mtu = dst_pmtu(&rt->u.dst) - hlen;      /* Size of data space */

       //mtu here is the maximum payload per fragment: the path MTU minus the IP header

 

       /* When frag_list is given, use it. First, check its validity:

        * some transformers could create wrong frag_list or break existing

        * one, it is not prohibited. In this case fall back to copying.

        *

        * LATER: this step can be merged to real generation of fragments,

        * we can switch to copy when see the first bad fragment.

        */

          //skbuff.h defines: #define skb_shinfo(SKB)((struct skb_shared_info *)((SKB)->end))

       //A struct skb_shared_info sits at the end of the buffer's data area.

       //It holds additional information about the data block and starts at the address the end pointer refers to.

       //Its definition is:

              //     struct skb_shared_info {

              //     atomic_t         dataref;

                 //     unsigned int     nr_frags;

                 //     unsigned short   tso_size;

                 //     unsigned short   tso_seqs;

                 //     struct sk_buff   *frag_list;

                 //     skb_frag_t       frags[MAX_SKB_FRAGS];

              //     };

/***********************************

Background from the original annotation (not verifiable from this function alone):
dataref is the number of "users" of the data block. nr_frags, frag_list and frags are used to store IP fragments.

skb_is_nonlinear tests whether a buffer is fragmented, and skb_linearize can merge the fragments into a single buffer.

Merging involves copying data and can seriously hurt performance.

 

Note: sk_buff has no pointer to the skb_shared_info structure. To reach it, use the skb_shinfo macro quoted above,

which simply returns the end pointer cast to struct skb_shared_info *.

********************************/

 

 

       if (skb_shinfo(skb)->frag_list) {

              struct sk_buff *frag;// cursor over the frag_list chain

              int first_len = skb_pagelen(skb);

 

              //skb_pagelen(skb) is the length of the data held by skb itself; its definition:

              //     static inline int skb_pagelen(const struct sk_buff *skb)

              //     {

                  //     int i, len = 0;

              //     for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)

               //     len += skb_shinfo(skb)->frags[i].size;

                  //     return len + skb_headlen(skb);

              //     }

/********************************************

skb_pagelen first sums the sizes in skb_shinfo(skb)->frags and then adds skb_headlen().

The result is the length of the data that belongs to this skb itself, excluding the buffers chained on

skb_shinfo(skb)->frag_list -- i.e. the length of the first skb, hence the name first_len.

**********************************************/

 

              if (first_len - hlen > mtu ||

                  ((first_len - hlen) & 7) ||

                  (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||

                  skb_cloned(skb))

                     goto slow_path;

       //The head buffer can only be reused as the first fragment if its payload

       //fits in mtu, its payload length is a multiple of 8, the datagram is not

       //already a fragment (no MF bit, zero offset) and the skb is not cloned.

       //Otherwise fall back to slow_path, which copies the data.

       //The for loop below walks the sk_buff chain on frag_list and applies the

       //equivalent checks to every chained buffer: it must fit in mtu, have a

       //length that is a multiple of 8 unless it is the last one, have at least

       //hlen bytes of headroom for the IP header, and must not be shared.

 

              for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {

                     /* Correct geometry. */

                     if (frag->len > mtu ||

                         ((frag->len & 7) && frag->next) ||

                         skb_headroom(frag) < hlen)

                         goto slow_path;

 

                     /* Partially cloned skb? */

                     if (skb_shared(frag))

                            goto slow_path;

              }

 

              /* Everything is OK. Generate! */

              //initialise state for the fast path

              err = 0;

              offset = 0;// the first fragment starts at offset 0

              frag = skb_shinfo(skb)->frag_list;

                     //frag now points at the first buffer of frag_list

              skb_shinfo(skb)->frag_list = NULL;// detach the chain from the head skb

              skb->data_len = first_len - skb_headlen(skb);// head now accounts only for its own non-linear data

              skb->len = first_len;// head length no longer includes the chained buffers

              iph->tot_len = htons(first_len);// total length of the first fragment

              iph->frag_off |= htons(IP_MF);// more fragments follow the head

              ip_send_check(iph);// recompute the header checksum

 

              for (;;) {

                     /* Prepare header of the next frame,

                      * before previous one went down. */

                     if (frag) {

                     //a further buffer exists: turn it into a complete IP fragment

                            frag->ip_summed = CHECKSUM_NONE;

                            frag->h.raw = frag->data;

                            frag->nh.raw = __skb_push(frag, hlen);

                            memcpy(frag->nh.raw, iph, hlen);

                                   //copy the IP header of the previous fragment

                            iph = frag->nh.iph;// iph now refers to the new fragment's header

                            iph->tot_len = htons(frag->len);

                            ip_copy_metadata(frag, skb);

       //ip_copy_metadata: presumably copies the remaining per-packet skb settings from skb to frag -- TODO confirm

                            if (offset == 0)// still 0 only while preparing the first chained buffer

                                   ip_options_fragment(frag);// NOTE(review): presumably rewrites the IP options for non-first fragments -- confirm; later buffers copy this header

                            offset += skb->len - hlen;// advance by the payload length of the fragment about to be sent

                            iph->frag_off = htons(offset>>3);

       //frag_off stores the offset in units of 8 bytes, so the byte offset is shifted right by 3 (divided by 8)

                            if (frag->next != NULL)// not the last buffer: set the MF bit

                                   iph->frag_off |= htons(IP_MF);

                            /* Ready, complete checksum */

                            ip_send_check(iph);

                     }

 

                     err = output(skb);// hand the current fragment to the output callback

 

                     if (err || !frag)// stop on error or when no buffer is left

                            break;

 

                     skb = frag;// the prepared buffer becomes the current fragment

                     frag = skb->next;// advance to the following buffer

                     skb->next = NULL;// unlink skb from the rest of the chain

              }// end of for (;;)

///////////////////////////////////////

ip数据包分片函数ip_fragment分析(下)
2007-11-30 17:39:28

//every fragment was sent: increment the IPSTATS_MIB_FRAGOKS counter and return success

              if (err == 0) {

                     IP_INC_STATS(IPSTATS_MIB_FRAGOKS);

                     return 0;

              }

//otherwise output() failed and the loop was left early: free the buffers that were not sent yet

              while (frag) {

                     skb = frag->next;

                     kfree_skb(frag);

                     frag = skb;

              }

              IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);

              return err;

       }// end of if (skb_shinfo(skb)->frag_list)

 

 

//slow path: build every fragment by allocating a new skb and copying from the original

slow_path:

       left = skb->len - hlen;           /* Space per frame */

       //payload bytes still to be sent

       ptr = raw + hlen;           /* Where to start from */

       //offset within skb at which copying starts (just past the IP header)

 

#ifdef CONFIG_BRIDGE_NETFILTER

       /* for bridged IP traffic encapsulated inside f.e. a vlan header,

        * we need to make room for the encapsulating header */

       ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));

       mtu -= nf_bridge_pad(skb);

#else

       ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);

#endif

       /*

        *    Fragment the datagram.

        */

 

       offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;

//extract the offset field and multiply by 8 to get this packet's own byte offset

       not_last_frag = iph->frag_off & htons(IP_MF);

//remember whether the MF bit was already set on the incoming packet

 

       /*

        *    Keep copying data until we run out.

        */

 

//emit one fragment per iteration

       while(left > 0) {

              len = left;

              /* IF: it doesn't fit, use 'mtu' - the data space left */

       //if more than mtu bytes remain, send mtu bytes; otherwise send the rest (last fragment)

              if (len > mtu)

                     len = mtu;

              /* IF: we are not sending upto and including the packet end

                 then align the next start on an eight byte boundary */

              if (len < left)   {

//when len == left this is the final fragment, which need not be a multiple of 8 bytes
//NOTE(review): if mtu were below 8, len would become 0 here and left would never shrink -- presumably callers guarantee a sane MTU; confirm

                     len &= ~7;// otherwise round down to a multiple of 8 bytes

              }

              /*

               *    Allocate buffer.

               */

//allocate skb2 with room for the payload, the IP header and the link-layer reserve

              if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {

                     NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));

                     err = -ENOMEM;

                     goto fail;

              }

 

              /*

               *    Set up data on packet

               */

//fill in the new buffer: network header at data, transport header hlen bytes later

              ip_copy_metadata(skb2, skb);

              skb_reserve(skb2, ll_rs);

              skb_put(skb2, len + hlen);

              skb2->nh.raw = skb2->data;

              skb2->h.raw = skb2->data + hlen;

 

              /*

               *    Charge the memory for the fragment to any owner

               *    it might possess

               */

 

              if (skb->sk)

                     skb_set_owner_w(skb2, skb->sk);

 

              /*

               *    Copy the packet header into the new buffer.

               */

              //copy the IP header

              memcpy(skb2->nh.raw, skb->data, hlen);

 

              /*

               *    Copy a block of the IP datagram.

               */

              //copy len payload bytes starting at offset ptr of the original skb

              if (skb_copy_bits(skb, ptr, skb2->h.raw, len))

                     BUG();

              left -= len;// bytes remaining after this fragment

 

              /*

               *    Fill in the new header fields.

               */

              iph = skb2->nh.iph;// header of the new fragment

              iph->frag_off = htons((offset >> 3));// store the offset in 8-byte units

 

              /* ANK: dirty, but effective trick. Upgrade options only if

               * the segment to be fragmented was THE FIRST (otherwise,

               * options are already fixed) and make it ONCE

               * on the initial skb, so that all the following fragments

               * will inherit fixed options.

               */

              if (offset == 0)

                     ip_options_fragment(skb);

 

              /*

               *    Added AC : If we are fragmenting a fragment that's not the

               *              last fragment then keep MF on each bit

               */

              if (left > 0 || not_last_frag)// not the last fragment

                     iph->frag_off |= htons(IP_MF);

              ptr += len;

              offset += len;

 

              /*

               *    Put this fragment into the sending queue.

               */

 

              IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

 

              iph->tot_len = htons(len + hlen);

 

              ip_send_check(iph);

 

              err = output(skb2);

              if (err)

                     goto fail;

       }// end of while

       kfree_skb(skb);

       IP_INC_STATS(IPSTATS_MIB_FRAGOKS);

       return err;

 

//failure: release the original skb, count the failure and return the error

fail:

       kfree_skb(skb);

       IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);

       return err;

}

 

 

阅读(6284) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~