TCP/IP源码学习(44)——kernel重组IPv4分片代码(1)-自由人

自由人_SZzhihuaxie.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

自由人_SZ

博客访问： 1717399
博文数量： 511
博客积分： 967
博客等级：准尉
技术积分： 2560
用户组：普通用户
注册时间： 2012-07-06 14:19

文章分类

全部博文（511）

zynq（34）
rootfs（12）
ffmpeg（12）
uio（5）
bluetooth（1）
rt-thread（4）
linux 文件（5）
powerpc（1）
调试（7）
Netfilter（13）
性能（10）
安全代码思考（12）
算法（18）
linux-tcpip（70）
机器视觉（2）
应用（27）
linux 应用（78）
linux 启动（32）
bootloader（3）
Linux 内核（62）
未分配的博文（103）

文章存档

2016年（11）

2015年（61）

2014年（257）

2013年（63）

2012年（119）

我的朋友

相关博文

TCP/IP源码学习(44)——kernel重组IPv4分片代码(1)

分类： LINUX

2014-05-17 22:57:36

原文地址：TCP/IP源码学习(44)——kernel重组IPv4分片代码(1) 作者：GFree_Wind

作者：gfree.wind@gmail.com
博客：blog.focus-linux.net linuxfocus.blog.chinaunix.net

本文的copyleft归gfree.wind@gmail.com所有，使用GPL发布，可以自由拷贝，转载。但转载请保持文档的完整性，注明原作者及原链接，严禁用于任何商业用途。

======================================================================================================

在前面的netfilter代码学习的过程中，正好碰到了kernel处理IPv4分片的函数。那么就这个线索继续下去吧。

ip_defrag是kernel中用于处理收到的IP分片的函数：

/*
 * ip_defrag - feed one received IPv4 fragment into reassembly.
 * @skb:  the received IP fragment.
 * @user: identifies the caller; see enum ip_defrag_users. For fragments
 *        delivered locally this is IP_DEFRAG_LOCAL_DELIVER.
 *
 * Returns whatever ip_frag_queue() returns when a fragment queue was found
 * or created, or -ENOMEM (after freeing @skb) when no queue is available.
 */
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct net *net;
	struct ipq *qp;
	int ret;

	/* Resolve the network namespace from the skb's device, falling back
	 * to the device of its dst entry when skb->dev is not set. */
	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);

	/*
	 * Start by cleaning up the memory: every fragment that has not been
	 * reassembled yet is held in memory, so once usage exceeds the high
	 * threshold some of it has to be reclaimed first.
	 */
	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
		ip_evictor(net);

	/* Look up (or create) the queue header for this fragment. */
	qp = ip_find(net, ip_hdr(skb), user);
	if (qp == NULL) {
		/* No queue could be found or created: treat as out of memory. */
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* Add the fragment to its queue under the queue lock; the callee
	 * reassembles the datagram when that becomes possible. */
	spin_lock(&qp->q.lock);
	ret = ip_frag_queue(qp, skb);
	spin_unlock(&qp->q.lock);

	/* Drop the reference obtained from ip_find(). */
	ipq_put(qp);
	return ret;
}

接下来看ip_find：

/*
 * ip_find - locate (or create) the IPv4 fragment queue for a fragment.
 * @net:  network namespace the fragment belongs to.
 * @iph:  IP header of the fragment.
 * @user: caller identity, stored in the lookup key.
 *
 * Returns the struct ipq that embeds the queue handed back by
 * inet_frag_find(), or NULL (after logging) when that call returns NULL.
 */
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
	struct inet_frag_queue *q;
	struct ip4_create_arg arg;
	unsigned int hash;

	arg.iph = iph;
	arg.user = user;

	/*
	 * The read lock on ip4_frags.lock is taken here but is NOT released
	 * in this function: inet_frag_find() drops it on every one of its
	 * return paths. The article's author notes a dislike for this
	 * acquire-here / release-in-callee style and wonders why the kernel
	 * uses it.
	 */
	read_lock(&ip4_frags.lock);

	/*
	 * The hash is computed from the IP header's identifier, source
	 * address, destination address and protocol. These four values
	 * normally identify a fragment queue uniquely, although behind NAT
	 * devices different queues could end up mixed together. According to
	 * the article, the random value ip4_frags.rnd is also folded into
	 * the hash (ipqhashfn's body is not shown here -- confirm).
	 */
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
	if (q == NULL)
		goto out_nomem;

	/* The object actually maintained is a struct ipq; recover its
	 * address from the embedded member q. */
	return container_of(q, struct ipq, q);

out_nomem:
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
	return NULL;
}

然后inet_frag_find：

/*
 * inet_frag_find - find the fragment queue matching @key, or create one.
 * @nf:   per-namespace fragment state the queue must belong to.
 * @f:    protocol-specific fragment descriptor (hash table, lock, callbacks).
 * @key:  opaque lookup key passed to f->match() and to the create path.
 * @hash: bucket index into f->hash[].
 *
 * Must be entered with f->lock read-held by the caller; the lock is
 * released here on every return path. On a hit the queue's refcnt is
 * incremented before it is returned. On a miss the result of
 * inet_frag_create() is returned.
 *
 * __releases() below is an annotation macro: it expands either to nothing
 * or to an attribute extension, which is why it may legally sit between the
 * declarator and the function body.
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_queue *q;
	struct hlist_node *n;

	hlist_for_each_entry(q, n, &f->hash[hash], list) {
		/* Same namespace and the match callback accepts the key:
		 * this is the right fragment queue. */
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			read_unlock(&f->lock);
			return q;
		}
	}
	read_unlock(&f->lock);

	/*
	 * No matching fragment queue was found, so a new one has to be
	 * created. inet_frag_create() allocates a new queue node,
	 * recomputes its hash and inserts it into the hash table.
	 */
	return inet_frag_create(nf, f, key);
}

IPv4的匹配函数很简单：

/*
 * ip4_frag_match - match callback for IPv4 fragment queues.
 * @q: candidate queue (embedded in a struct ipq).
 * @a: lookup key, a struct ip4_create_arg.
 *
 * Returns 1 when the queue's identifier, source address, destination
 * address, protocol and owner (user) all equal those in the key, else 0.
 * Because the owner takes part in the comparison, different owners keep
 * separate fragment queues that do not affect each other.
 */
static int ip4_frag_match(struct inet_frag_queue *q, void *a)
{
	struct ip4_create_arg *want = a;
	struct ipq *queue = container_of(q, struct ipq, q);

	if (queue->id != want->iph->id)
		return 0;
	if (queue->saddr != want->iph->saddr)
		return 0;
	if (queue->daddr != want->iph->daddr)
		return 0;
	if (queue->protocol != want->iph->protocol)
		return 0;

	return queue->user == want->user;
}

大致看一下inet_frag_create：

/*
 * inet_frag_create - allocate a new fragment queue and publish it.
 * @nf:  per-namespace fragment state.
 * @f:   protocol-specific fragment descriptor.
 * @arg: opaque key forwarded to the allocation and insertion steps.
 *
 * Returns NULL when inet_frag_alloc() fails; otherwise returns whatever
 * inet_frag_intern() returns for the freshly allocated queue.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);

	return fresh ? inet_frag_intern(nf, fresh, f, arg) : NULL;
}

/*
 * inet_frag_alloc - allocate and initialise a new fragment queue.
 * @nf:  per-namespace fragment state the queue is accounted to.
 * @f:   protocol-specific descriptor supplying qsize and the callbacks.
 * @arg: opaque key handed to the protocol constructor.
 *
 * Returns the new queue with refcnt set to 1, or NULL when the
 * allocation fails.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *q;

	q = kzalloc(f->qsize, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	/*
	 * Both IPv4 and IPv6 fragments have to be supported, so the
	 * protocol-specific initialisation goes through a callback, which
	 * also keeps those details out of this generic code. According to
	 * the article, the IPv4 callback is ip4_frag_init (not shown here).
	 */
	f->constructor(q, arg);

	/* Account the new queue in the memory usage statistics. */
	atomic_add(f->qsize, &nf->mem);

	/* Fragment queues rely on a timer to clean up expired fragment
	 * state; the expiry callback receives the queue as its argument. */
	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);
	q->net = nf;

	return q;
}

真正将新的分片队列添加到hash表中的函数是inet_frag_intern：

/*
 * inet_frag_intern - insert a freshly allocated queue into the hash table.
 * @nf:    per-namespace fragment state (timeout, LRU list, queue count).
 * @qp_in: the newly allocated queue to insert.
 * @f:     protocol-specific descriptor (lock, hash table, callbacks).
 * @arg:   opaque key passed to f->match() for the SMP re-check.
 *
 * Takes f->lock for writing and releases it on every return path.
 * Returns @qp_in once it has been linked in. On CONFIG_SMP builds, if an
 * equivalent queue is already present in the bucket, that existing queue
 * is returned instead (with its refcnt incremented) and @qp_in is marked
 * INET_FRAG_COMPLETE and released via inet_frag_put().
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
/* Loop cursor; only needed for the SMP re-check below. */
struct hlist_node *n;
#endif
unsigned int hash;
write_lock(&f->lock);
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunately the qp_in can be used to get one.
*/
/* As the comment above says: by the time the lock is held the random
* seed may have changed, so the hash value is computed again. */
hash = f->hashfn(qp_in);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
* promoted read lock to write lock.
*/
/*
* On SMP another CPU may well have inserted this queue already, so the
* bucket has to be checked again before inserting.
*/
hlist_for_each_entry(qp, n, &f->hash[hash], list) {
if (qp->net == nf && f->match(qp, arg)) {
/*
* Another CPU really did add the queue: take a reference on the
* existing one, then flag and drop our own copy.
* NOTE(review): the article's author has not examined this closely
* and guesses that INET_FRAG_COMPLETE is set to keep the queue from
* being deleted; its exact effect is not visible here -- confirm
* against inet_frag_put() / the destroy path.
*/
atomic_inc(&qp->refcnt);
write_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
/* Arm (modify) the queue timer to fire nf->timeout jiffies from now.
* An extra reference is taken when mod_timer() returns 0 -- presumably
* meaning the timer was not already pending, so the armed timer owns a
* reference; confirm against the mod_timer() documentation. */
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);

/* Add the new queue node to the hash table (taking one more reference),
* append it to the namespace LRU list and bump the queue count. */

atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &f->hash[hash]);
list_add_tail(&qp->lru_list, &nf->lru_list);
nf->nqueues++;
write_unlock(&f->lock);
return qp;
}

未完待续。。。

阅读(788) | 评论(0) | 转发(0) |

上一篇：64位x86的函数调用栈布局

下一篇：TCP/IP源码学习(45)——kernel重组IPv4分片代码(2)

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6