/* * sk_buff 完全注释 * 金伟注释 blog -- http://jinweidavid.cublog.cn 转载请注明 */
/* * 本文件取自linux 2.6.13内核的skbuff.c */
/* * Routines having to do with the 'struct sk_buff' memory handlers. * * Authors: Alan Cox * Florian La Roche * * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. * Alan Cox : Changed buffer format. * Alan Cox : destructor hook for AF_UNIX etc. * Linus Torvalds : Better skb_clone. * Alan Cox : Added skb_copy. * Alan Cox : Added all the changed routines Linus * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field * Andi Kleen : slabified it. * Robert Olsson : Removed skb_head_pool * * NOTE: * The __skb_ routines should be called with interrupts * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc). * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */
/* * The functions in this file will not compile correctly with gcc 2.4.x */
#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> #endif #include <linux/string.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/highmem.h>
#include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> #include <net/xfrm.h>
#include <asm/uaccess.h> #include <asm/system.h>
/*
 * Slab cache for the struct sk_buff heads themselves. In the code visible
 * here, alloc_skb(), alloc_skb_from_cache() and skb_clone() allocate from
 * it and kfree_skbmem() returns heads to it.
 */
static kmem_cache_t *skbuff_head_cache;
/* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always * reliable. */
/** * skb_over_panic - private function * @skb: buffer * @sz: size * @here: address * * Out of line support code for skb_put(). Not user callable. */ void skb_over_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%p end:%p dev:%s\n", here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, skb->dev ? skb->dev->name : ""); BUG(); }
/** * skb_under_panic - private function * @skb: buffer * @sz: size * @here: address * * Out of line support code for skb_push(). Not user callable. */
void skb_under_panic(struct sk_buff *skb, int sz, void *here) { printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " "data:%p tail:%p end:%p dev:%s\n", here, skb->len, sz, skb->head, skb->data, skb->tail, skb->end, skb->dev ? skb->dev->name : ""); BUG(); }
/* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. * */
/** * alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of size bytes. The object has a reference count of one. * The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) { struct sk_buff *skb; u8 *data;
/* Get the HEAD */ /* 从cache缓冲池中获取内存 */ skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA); if (!skb) goto out;
/* Get the DATA. Size must match skb_add_mtu(). */
/* 对其size */ size = SKB_DATA_ALIGN(size);
/* 分配的缓冲长度包含skb_shared_info的长度 */ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); if (!data) goto nodata; /* * offsetof是一个编译器宏或者是自定义的宏,用于计算member在struct中的偏移量。 * 把在truesize前面的field全部清零。 */ memset(skb, 0, offsetof(struct sk_buff, truesize)); /* truesize是广义SKB的大小,包含了4个部分的长度:skb自身,header,page frags,frag list */ skb->truesize = size + sizeof(struct sk_buff); /* users初始化成1 */ atomic_set(&skb->users, 1);
/* 初始化所有数据指针 */ skb->head = data; skb->data = data; skb->tail = data; skb->end = data + size; /* * skb_shinfo是个宏,#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) * 所以用这个宏的时候必须等skb->end已经初始化。 * skb_shinfo 接在skb->end指向的内存空间后面。 */
/* 初始化skb_shared_info结构体 */ atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; skb_shinfo(skb)->tso_size = 0; skb_shinfo(skb)->tso_segs = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; nodata: kmem_cache_free(skbuff_head_cache, skb); skb = NULL; goto out; }
/**
 *	alloc_skb_from_cache	-	allocate a network buffer
 *	@cp: kmem_cache from which to allocate the data area
 *           (object size must be big enough for @size bytes + skb overheads)
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and
 *	tail room of size bytes. The object has a reference count of one.
 *	The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
				     unsigned int size,
				     int gfp_mask)
{
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;

	/* Get the HEAD from the skbuff slab cache; __GFP_DMA is masked off for it. */
	skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);
	if (!skb)
		return NULL;

	/*
	 * Get the DATA. The only difference from alloc_skb() is that the
	 * data area is taken from the caller's cache @cp instead of kmalloc().
	 */
	size = SKB_DATA_ALIGN(size);
	data = kmem_cache_alloc(cp, gfp_mask);
	if (!data) {
		/* No data area: give the head back and report failure. */
		kmem_cache_free(skbuff_head_cache, skb);
		return NULL;
	}

	/* Zero every field laid out before truesize; the rest is set below. */
	memset(skb, 0, offsetof(struct sk_buff, truesize));
	skb->truesize = size + sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);

	/* Empty buffer: head, data and tail all start at the beginning. */
	skb->head = data;
	skb->data = data;
	skb->tail = data;
	skb->end  = data + size;

	/* The shared info is reached through skb->end, which is now valid. */
	shinfo = skb_shinfo(skb);
	atomic_set(&shinfo->dataref, 1);
	shinfo->nr_frags  = 0;
	shinfo->tso_size  = 0;
	shinfo->tso_segs  = 0;
	shinfo->frag_list = NULL;

	return skb;
}
/*
 * Detach the frag_list chain from @skb and call kfree_skb() on every
 * buffer in it.
 *
 * The first element is dereferenced unconditionally, so the caller must
 * only call this when frag_list is non-NULL (skb_release_data() checks
 * before calling).
 */
static void skb_drop_fraglist(struct sk_buff *skb)
{
	struct sk_buff *next = skb_shinfo(skb)->frag_list;
	struct sk_buff *cur;

	skb_shinfo(skb)->frag_list = NULL;

	/* Walk forward, fetching the successor before the current one is released. */
	do {
		cur = next;
		next = cur->next;
		kfree_skb(cur);
	} while (next);
}
/*
 * Take an extra reference (skb_get()) on every buffer chained on the
 * frag_list of @skb. An empty list is a no-op.
 */
static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *frag = skb_shinfo(skb)->frag_list;

	while (frag) {
		skb_get(frag);
		frag = frag->next;
	}
}
/*
 * Release the data area of @skb: drop page references for the page
 * fragments, free the frag_list chain and kfree() the linear buffer.
 *
 * For a cloned skb the data is only released when subtracting this skb's
 * contribution from the shared dataref counter brings it to zero.
 */
void skb_release_data(struct sk_buff *skb)
{
	int i;

	if (skb->cloned) {
		/* A nohdr skb subtracts (1 << SKB_DATAREF_SHIFT) + 1 instead of 1. */
		int drop = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;

		/* Non-zero result: the data area is still referenced elsewhere. */
		if (atomic_sub_return(drop, &skb_shinfo(skb)->dataref))
			return;
	}

	/* Page fragments. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		put_page(skb_shinfo(skb)->frags[i].page);

	/* Chained fragment buffers. */
	if (skb_shinfo(skb)->frag_list)
		skb_drop_fraglist(skb);

	/* Linear area. */
	kfree(skb->head);
}
/*
 *	Free an skbuff by memory without cleaning the state.
 *
 *	Releases the data area through skb_release_data() and then returns
 *	the sk_buff head itself to skbuff_head_cache.
 */
void kfree_skbmem(struct sk_buff *skb)
{
	skb_release_data(skb);
	kmem_cache_free(skbuff_head_cache, skb);
}
/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 *
 *	In effect this wraps kfree_skbmem(): it first drops the references
 *	and state the skb holds, then frees the memory.
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* The skb must not still be linked on a list when it is freed. */
	BUG_ON(skb->list != NULL);

	/* Drop the references held through this skb. */
	dst_release(skb->dst);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	/* Run the destructor hook if one is set; warn when that happens in hard-IRQ context. */
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#ifdef CONFIG_NETFILTER
	nf_conntrack_put(skb->nfct);
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(skb->nf_bridge);
#endif
#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
	skb->tc_classid = 0;
#endif
#endif

	/* Finally release the data area and the head itself. */
	kfree_skbmem(skb);
}
/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
{
	/* Only a new head is allocated; the data area is shared with @skb. */
	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);

	if (!n)
		return NULL;

	/* C(x): copy member x from the original to the clone. */
#define C(x) n->x = skb->x

	/* The clone is not on any list and not owned by any socket. */
	n->next = n->prev = NULL;
	n->list = NULL;
	n->sk = NULL;
	C(stamp);
	C(dev);
	C(real_dev);
	C(h);
	C(nh);
	C(mac);
	C(dst);
	/* The clone holds its own reference on the shared dst entry. */
	dst_clone(skb->dst);
	C(sp);
#ifdef CONFIG_INET
	secpath_get(skb->sp);
#endif
	memcpy(n->cb, skb->cb, sizeof(skb->cb));
	C(len);
	C(data_len);
	C(csum);
	C(local_df);
	/* The new skb is a clone and starts with nohdr cleared. */
	n->cloned = 1;
	n->nohdr = 0;
	C(pkt_type);
	C(ip_summed);
	C(priority);
	C(protocol);
	C(security);
	/* The destructor hook is not inherited. */
	n->destructor = NULL;
#ifdef CONFIG_NETFILTER
	C(nfmark);
	C(nfcache);
	C(nfct);
	nf_conntrack_get(skb->nfct);
	C(nfctinfo);
#ifdef CONFIG_NETFILTER_DEBUG
	C(nf_debug);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	C(nf_bridge);
	nf_bridge_get(skb->nf_bridge);
#endif
#endif /*CONFIG_NETFILTER*/
#if defined(CONFIG_HIPPI)
	C(private);
#endif
#ifdef CONFIG_NET_SCHED
	C(tc_index);
#ifdef CONFIG_NET_CLS_ACT
	/*
	 * Build the clone's verdict word step by step. Each step must start
	 * from n->tc_verd: starting every step from skb->tc_verd would
	 * overwrite the previous results and leave only the last macro
	 * applied.
	 */
	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
	C(input_dev);
	C(tc_classid);
#endif

#endif
	C(truesize);
	/* The clone starts with a single user. */
	atomic_set(&n->users, 1);
	C(head);
	C(data);
	C(tail);
	C(end);

	/* One more head now references the shared data area. */
	atomic_inc(&(skb_shinfo(skb)->dataref));
	/* The original is marked as cloned too, since its data is now shared. */
	skb->cloned = 1;

	return n;
}
/*
 * Copy the per-packet header state of @old into @new.
 *
 * @new must already have its data pointer positioned: the h/nh/mac
 * header pointers are rebased by the distance between the two data
 * pointers. @new is left on no list, owned by no socket, with no
 * destructor and with a user count of 1.
 */
static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	/*
	 *	Shift between the two data areas in bytes
	 */
	/* Needed below to relocate the protocol header pointers into new's buffer. */
	unsigned long offset = new->data - old->data;

	new->list	= NULL;
	new->sk		= NULL;
	new->dev	= old->dev;
	new->real_dev	= old->real_dev;
	new->priority	= old->priority;
	new->protocol	= old->protocol;
	new->dst	= dst_clone(old->dst);
#ifdef CONFIG_INET
	new->sp		= secpath_get(old->sp);
#endif
	/* Rebase the header pointers using the offset computed above. */
	new->h.raw	= old->h.raw + offset;
	new->nh.raw	= old->nh.raw + offset;
	new->mac.raw	= old->mac.raw + offset;

	/* Copy the control block. */
	memcpy(new->cb, old->cb, sizeof(old->cb));

	new->local_df	= old->local_df;
	new->pkt_type	= old->pkt_type;
	new->stamp	= old->stamp;
	new->destructor = NULL;
	new->security	= old->security;
#ifdef CONFIG_NETFILTER
	new->nfmark	= old->nfmark;
	new->nfcache	= old->nfcache;
	new->nfct	= old->nfct;
	nf_conntrack_get(old->nfct);
	new->nfctinfo	= old->nfctinfo;
#ifdef CONFIG_NETFILTER_DEBUG
	new->nf_debug	= old->nf_debug;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	new->nf_bridge	= old->nf_bridge;
	nf_bridge_get(old->nf_bridge);
#endif
#endif
#ifdef CONFIG_NET_SCHED
#ifdef CONFIG_NET_CLS_ACT
	new->tc_verd = old->tc_verd;
#endif
	new->tc_index	= old->tc_index;
#endif
	/* The new skb starts with a single user. */
	atomic_set(&new->users, 1);

	/* Carry over the TSO fields kept in the shared info. */
	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
}
/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
{
	int headerlen = skb->data - skb->head;
	struct sk_buff *n;

	/*
	 *	Allocate the copy buffer: the size of the original linear
	 *	area plus data_len, since everything is copied into the new
	 *	buffer's linear area.
	 */
	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
	if (!n)
		return NULL;

	/* Set the data pointer: keep the same headroom as the original. */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	/*
	 * skb_copy_bits() is given an offset relative to skb->data, so a
	 * negative headerlen starts the copy at the original head. Headroom
	 * and all skb->len bytes of data land at n->head.
	 */
	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	n->csum	     = skb->csum;
	n->ip_summed = skb->ip_summed;

	/* Finally copy the remaining header state. */
	copy_skb_header(n, skb);
	return n;
}
/** * pskb_copy - create copy of an sk_buff with private head. * @skb: buffer to copy * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when * the caller wishes to modify only header of &sk_buff and needs * private copy of the header to alter. Returns %NULL on failure * or the pointer to the buffer on success. * The returned buffer has a reference count of 1. */
struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) { /* * Allocate the copy buffer */ /* 分配一个新的skb_buff n,它的线性区长度是和原skb长度一样 */ struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
if (!n) goto out;
/* Set the data pointer */ /* 预留head到data之间的空隙 */ skb_reserve(n, skb->data - skb->head);
/* Set the tail pointer and length */ /* 准备向n放数据,试放数据长度是skb的header section的长度 */ skb_put(n, skb_headlen(skb));
/* Copy the bytes */ /* 拷贝有效负载,长度是n->len。上面skb_put中已经把n->len赋值成skb_headlen(skb) * 所以这里拷贝线性区域的长度。 */ memcpy(n->data, skb->data, n->len);
/* 复制skb本身信息到n */ n->csum = skb->csum; n->ip_summed = skb->ip_summed;
n->data_len = skb->data_len; n->len = skb->len;
/* 把skb中page frags的指针复制到n的page frags。 */ if (skb_shinfo(skb)->nr_frags) { int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; get_page(skb_shinfo(n)->frags[i].page); } skb_shinfo(n)->nr_frags = i; }
/* 把skb中frag_list地址复制到n的frag_list */ if (skb_shinfo(skb)->frag_list) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); }
/* 把skb的本身复制到n的本身 */ copy_skb_header(n, skb); out: return n; }
/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if &nhead and &ntail are zero)
 *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success or error,
 *	if expansion failed. In the last case, &sk_buff is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 *
 *	Note that the sk_buff structure itself is kept; only its data area
 *	is replaced, and the old one is released via skb_release_data().
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
{
	int i;
	u8 *data;
	/* Current linear area size plus the requested extra head and tail room. */
	int size = nhead + (skb->end - skb->head) + ntail;
	long off;

	if (skb_shared(skb))
		BUG();

	/* Align the size. */
	size = SKB_DATA_ALIGN(size);

	/* Allocate the new data area, with room for the shared info after it. */
	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
	if (!data)
		goto nodata;

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void. */
	/* Everything from head up to tail goes nhead bytes into the new buffer. */
	memcpy(data + nhead, skb->head, skb->tail - skb->head);
	/* The shared info is copied to its place after the new data area. */
	memcpy(data + size, skb->end, sizeof(struct skb_shared_info));

	/*
	 * The copied shared info points at the same pages and frag_list, so
	 * take extra references on them before the old data area is released
	 * below.
	 */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		get_page(skb_shinfo(skb)->frags[i].page);

	if (skb_shinfo(skb)->frag_list)
		skb_clone_fraglist(skb);

	/* Release the old data area. */
	skb_release_data(skb);

	/* Distance each pointer into the old buffer must move (address arithmetic only). */
	off = (data + nhead) - skb->head;

	skb->head     = data;
	skb->end      = data + size;
	skb->data    += off;
	skb->tail    += off;
	skb->mac.raw += off;
	skb->h.raw   += off;
	skb->nh.raw  += off;
	/* The skb now has its own data area: clear cloned/nohdr and reset dataref to 1. */
	skb->cloned   = 0;
	skb->nohdr    = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nodata:
	return -ENOMEM;
}
/* Make private copy of skb with writable head and some headroom */
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) { struct sk_buff *skb2; /* 计算现在要求的headroom 和原来headroom之间的差值 */ int delta = headroom - skb_headroom(skb); /* 如果现在要求的headroom没有原来的headroom大,那说明原来的header section可以用, * 所以只要用pskb_copy复制一份skb结构体和它的线性区域就可以了。 */ if (delta <= 0) skb2 = pskb_copy(skb, GFP_ATOMIC); else { /* 如果要求的headroom比原来的headroom大的话,clone一个skb */ skb2 = skb_clone(skb, GFP_ATOMIC); /* 把新clone的skb用pskb_expand_head扩大headroom */ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { kfree_skb(skb2); skb2 = NULL; } } return skb2; }
/** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head * @newtailroom: new free bytes at tail * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data and while doing so * allocate additional space. * * This is used when the caller wishes to modify the data and needs a * private copy of the data to alter as well as more space for new fields. * Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * You must pass %GFP_ATOMIC as the allocation priority if this function * is called from an interrupt. * * BUG ALERT: ip_summed is not copied. Why does this work? Is it used * only by netfilter in the cases when checksum is recalculated? --ANK */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, int gfp_mask) { /* * Allocate the copy buffer */ /* 分配一个新的skb结构体,header section长度是原来的skb所有数据长度加上新的skb要求的headroom * 和要求的tailroom。目的是把原来的SKB线性化。 */ struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask); int head_copy_len, head_copy_off;
if (!n) return NULL;
/* 新的sk_buff n的headroom长度为newheadroom */ skb_reserve(n, newheadroom);
/* Set the tail pointer and length */ /* 设置tail指针和n->len */ skb_put(n, skb->len);
/* 设置head_copy_len 为老的skb的headroom */ head_copy_len = skb_headroom(skb); head_copy_off = 0; /* 如果新的headroom比老的headroom小, * 拷贝长度就为新的headroom的长度。 */ if (newheadroom <= head_copy_len) head_copy_len = newheadroom; else head_copy_off = newheadroom - head_copy_len;
/* Copy the linear header and data. */ /* offset为原来skb->data-head_copy_len */ if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)) BUG();
/* 拷贝skb结构体到n结构体 */ copy_skb_header(n, skb);
return n; }
/** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return NULL in out of memory cases. */ struct sk_buff *skb_pad(struct sk_buff *skb, int pad) { struct sk_buff *nskb; /* If the skbuff is non linear tailroom is always zero.. */ /* 如果需要pad的长度比skb_tailroom小的话, * 就直接从skb->data+skb->len,开始清零. */ if (skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); return skb; } /* 如果需要pad的长度比tailroom长的话,就skb_copy_expand */ nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); /* 释放原来的SKB */ kfree_skb(skb); /* 清零 */ if (nskb) memset(nskb->data+nskb->len, 0, pad); return nskb; }
/* 待续未完... (To be continued: the annotated listing stops here.) */