TCP/IP源码学习(44)——kernel重组IPv4分片代码(1)-GFree_Wind-ChinaUnix博客

Chinaunix首页 | 论坛 | 博客

linux开发专注者(坚持原创)linuxfocus.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

博客访问： 8173004
博文数量： 159
博客积分： 10424
博客等级：少将
技术积分： 14615
用户组：普通用户
注册时间： 2010-07-14 12:45

个人简介

啦啦啦~~~

文章分类

全部博文（159）

编写安全无错代码（11）
Linux（66）

TCP/IP源码（39）

内核I/O（0）

应用编程（7）

netfilter源码学（8）

ULK学习笔记（0）

驱动学习（0）

内核启动（1）

内核杂项（5）

shell（1）
C/C++（17）

代码优化（0）

C99标准学习笔记（4）

编译与链接（3）

避免Bug(我犯的错（3）

有趣的问题（1）

代码重构（1）

代码风格（2）

基础概念（1）
开源代码学习（8）

netmap（0）

Linux命令源代码（0）

zeromq（5）

glibc源码学习（3）
调试技巧（8）
并行编程（1）
软件工程（4）

经验之谈（1）

设计模式（3）
数据结构与算法（11）

算法（7）

数据结构（4）
网络设备开发（0）
Networks（9）

学习笔记（1）
计算机体系结构（0）
代码分享（1）
Light TCP proxy（1）
资料（0）

2012系统架构师大（0）
函数式编程（3）

Haskell（3）
职业发展（6）

我的思考（1）

优秀书目（5）
转载（1）
数据库（1）

sqlite（1）
其它（11）

职场（2）

随笔（7）
未分配的博文（0）

文章存档

2015年（5）

2014年（1）

2013年（5）

2012年（10）

2011年（116）

2010年（22）

我的朋友

最近访客

推荐博文

相关博文

TCP/IP源码学习(44)——kernel重组IPv4分片代码(1)

分类： LINUX

2011-12-19 22:36:40

作者：gfree.wind@gmail.com
博客：blog.focus-linux.net linuxfocus.blog.chinaunix.net

本文的copyleft归gfree.wind@gmail.com所有，使用GPL发布，可以自由拷贝，转载。但转载请保持文档的完整性，注明原作者及原链接，严禁用于任何商业用途。

======================================================================================================

在前面学习netfilter代码的过程中，正好碰到了kernel处理IPv4分片的函数。那么就顺着这条线索继续分析下去吧。

ip_defrag是kernel用于处理收到的IP分片的函数：

/*
 * ip_defrag - hand one received IPv4 fragment to the reassembly code.
 * @skb:  the IP fragment that was received.
 * @user: identifies the caller; see enum ip_defrag_users. For fragments
 *        received for local delivery the value is IP_DEFRAG_LOCAL_DELIVER.
 *
 * Returns the result of ip_frag_queue() when a fragment queue could be
 * found or created. Otherwise the skb is freed and -ENOMEM is returned.
 */
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct net *net;
	struct ipq *qp;
	int ret;

	/* Pick the network namespace from the skb's device, else from its dst. */
	if (skb->dev)
		net = dev_net(skb->dev);
	else
		net = dev_net(skb_dst(skb)->dev);

	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);

	/*
	 * Start by cleaning up the memory: every fragment that has not been
	 * reassembled yet is kept in memory, so once the usage exceeds the
	 * configured high threshold some of it has to be reclaimed.
	 */
	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
		ip_evictor(net);

	/* Look up (or create) the queue this fragment belongs to. */
	qp = ip_find(net, ip_hdr(skb), user);
	if (qp == NULL) {
		/* No queue could be found or created: treated as out of memory. */
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
		kfree_skb(skb);
		return -ENOMEM;
	}

	/*
	 * Add the fragment to the queue under the queue lock; if possible
	 * ip_frag_queue() also performs the reassembly.
	 */
	spin_lock(&qp->q.lock);
	ret = ip_frag_queue(qp, skb);
	spin_unlock(&qp->q.lock);

	ipq_put(qp);
	return ret;
}

接下来看ip_find：

/*
 * ip_find - find, or create, the IPv4 fragment queue for a packet header.
 * @net:  network namespace whose fragment state is searched.
 * @iph:  IP header of the fragment.
 * @user: caller identity, stored in the lookup key.
 *
 * Returns the struct ipq that embeds the queue returned by
 * inet_frag_find(), or NULL (after logging) when that returns NULL.
 */
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
	struct ip4_create_arg arg;
	struct inet_frag_queue *fq;
	unsigned int bucket;

	arg.iph = iph;
	arg.user = user;

	/*
	 * The read lock on ip4_frags.lock is taken here but is NOT released
	 * in this function: inet_frag_find() drops it. Acquire and release
	 * therefore live in two different functions, which makes the locking
	 * harder to follow.
	 */
	read_lock(&ip4_frags.lock);

	/*
	 * The bucket is derived from the header's identifier, source address,
	 * destination address and protocol. These four values normally
	 * identify a fragment queue uniquely, although NAT devices can cause
	 * different queues to be mixed together.
	 * NOTE(review): ipqhashfn() is said to also mix in the random value
	 * ip4_frags.rnd; its body is not shown here -- confirm.
	 */
	bucket = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

	fq = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, bucket);
	if (fq != NULL) {
		/*
		 * The object actually maintained is a struct ipq; recover its
		 * address from the embedded member q.
		 */
		return container_of(fq, struct ipq, q);
	}

	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
	return NULL;
}

然后inet_frag_find：

/*
 * inet_frag_find - search one hash chain for a matching fragment queue.
 * @nf:   per-namespace fragment state the queue must belong to.
 * @f:    protocol fragment descriptor (hash table, lock, match callback).
 * @key:  opaque key handed to f->match() and to inet_frag_create().
 * @hash: index of the hash chain to walk.
 *
 * Must be entered with f->lock read-held; the lock is released before
 * returning on every path. On a hit the queue's refcnt is incremented and
 * the queue is returned; on a miss the result of inet_frag_create() is
 * returned.
 *
 * The __releases() line between the declarator and the body looks odd, but
 * it is a macro that expands either to nothing or to an attribute
 * extension, so writing it there is legal.
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_queue *cur;
	struct inet_frag_queue *found = NULL;
	struct hlist_node *pos;

	hlist_for_each_entry(cur, pos, &f->hash[hash], list) {
		/*
		 * A queue is the right one only if it belongs to the same
		 * namespace state and the match callback accepts it.
		 */
		if (cur->net != nf || !f->match(cur, key))
			continue;

		/* Take the reference while the read lock is still held. */
		atomic_inc(&cur->refcnt);
		found = cur;
		break;
	}
	read_unlock(&f->lock);

	if (found != NULL)
		return found;

	/*
	 * No matching queue exists, so a new one has to be created:
	 * inet_frag_create() allocates a queue node and inserts it into the
	 * hash table.
	 */
	return inet_frag_create(nf, f, key);
}

IPv4的匹配函数很简单：

/*
 * ip4_frag_match - IPv4 match callback for fragment queue lookup.
 * @q: candidate queue (embedded in a struct ipq).
 * @a: lookup key, a struct ip4_create_arg.
 *
 * Returns 1 when the queue's identifier, source address, destination
 * address, protocol and owner all equal those of the key, otherwise 0.
 * Because the owner takes part in the comparison, different owners keep
 * separate fragment queues that do not affect each other.
 */
static int ip4_frag_match(struct inet_frag_queue *q, void *a)
{
	struct ip4_create_arg *want = a;
	struct ipq *have = container_of(q, struct ipq, q);

	if (have->id != want->iph->id)
		return 0;
	if (have->saddr != want->iph->saddr)
		return 0;
	if (have->daddr != want->iph->daddr)
		return 0;
	if (have->protocol != want->iph->protocol)
		return 0;

	return have->user == want->user;
}

大致看一下inet_frag_create：

/*
 * inet_frag_create - allocate a new fragment queue and publish it.
 * @nf:  per-namespace fragment state the queue will belong to.
 * @f:   protocol fragment descriptor.
 * @arg: opaque key forwarded to the allocation and insertion helpers.
 *
 * Returns NULL when inet_frag_alloc() fails, otherwise whatever
 * inet_frag_intern() returns for the freshly allocated queue.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);

	return fresh ? inet_frag_intern(nf, fresh, f, arg) : NULL;
}

/*
 * inet_frag_alloc - allocate and initialise one fragment queue.
 * @nf:  per-namespace fragment state; its memory counter is charged.
 * @f:   protocol fragment descriptor (object size, constructor, expiry).
 * @arg: opaque key passed to the protocol constructor.
 *
 * Returns the zeroed, initialised queue with refcnt set to 1, or NULL when
 * the allocation fails.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *fq = kzalloc(f->qsize, GFP_ATOMIC);

	if (fq != NULL) {
		/*
		 * IPv4 and IPv6 fragments are both supported, so the
		 * protocol-specific setup goes through a callback, which also
		 * keeps those details out of this function. For IPv4 the
		 * callback is ip4_frag_init.
		 */
		f->constructor(fq, arg);

		/* Account the new queue in the memory usage statistics. */
		atomic_add(f->qsize, &nf->mem);

		/* Fragment queues rely on a timer to clean up expired state. */
		setup_timer(&fq->timer, f->frag_expire, (unsigned long)fq);

		spin_lock_init(&fq->lock);
		atomic_set(&fq->refcnt, 1);
		fq->net = nf;
	}

	return fq;
}

新的分片队列的真正的添加函数

/*
 * inet_frag_intern - insert a freshly allocated queue into the hash table.
 * @nf:    per-namespace fragment state (timeout, LRU list, queue count).
 * @qp_in: the newly allocated queue to insert.
 * @f:     protocol fragment descriptor (lock, hash table, callbacks).
 * @arg:   opaque key handed to f->match() for the SMP re-check.
 *
 * Takes f->lock for writing and releases it before returning. Returns
 * qp_in once it has been inserted. On SMP builds, if an equivalent queue is
 * already present in the chain, that existing queue is returned (with its
 * refcnt incremented) and the reference on qp_in is dropped instead.
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
struct hlist_node *n;
#endif
unsigned int hash;
write_lock(&f->lock);
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunately the qp_in can be used to get one.
*/
/* In other words: by the time the lock is held the random seed may have changed, so the hash value is computed again. */
hash = f->hashfn(qp_in);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
* promoted read lock to write lock.
*/
/*
On SMP another CPU may well have added this queue already, so the chain
has to be searched once more.
*/
hlist_for_each_entry(qp, n, &f->hash[hash], list) {
if (qp->net == nf && f->match(qp, arg)) {
/*
Another CPU really did add the queue: take a reference on the existing
entry, flag the redundant qp_in and drop our reference on it.
NOTE(review): INET_FRAG_COMPLETE is set on qp_in right before
inet_frag_put(); presumably it marks the unused queue as finished so it
can be released -- confirm against inet_frag_put()/inet_frag_destroy().
*/
atomic_inc(&qp->refcnt);
write_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
/* (Re)arm the expiry timer; when mod_timer() returns 0 an extra reference is taken (presumably the one owned by the now-pending timer -- confirm). */
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);

/* Add the new queue node to the hash table and to the LRU list; one more reference is taken before it is linked in. */

atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &f->hash[hash]);
list_add_tail(&qp->lru_list, &nf->lru_list);
nf->nqueues++;
write_unlock(&f->lock);
return qp;
}

未完待续。。。

阅读(7439) | 评论(2) | 转发(10) |

0

上一篇：netfilter源码学习(3)——框架hook处理(3)

下一篇：TCP/IP源码学习(45)——kernel重组IPv4分片代码(2)

给主人留下些什么吧！~~

GFree_Wind2011-12-22 21:48:35

crazyhadoop: ip_find 和ip_frag_find 可以合并成一个函数啊，要不还把那个read_lock分散到了另一个函数里，感觉有点怪啊.....

没错。现在这样，lock和unlock在不同的地方，看起来感觉很不好。

回复 | 举报

crazyhadoop2011-12-22 21:44:35

ip_find 和ip_frag_find 可以合并成一个函数啊，要不还把那个read_lock分散到了另一个函数里，感觉有点怪啊

回复 | 举报

关于我们 | 关于IT168 | 联系方式 | 广告合作 | 法律声明 | 免费注册

Copyright 2001-2010 ChinaUnix.net All Rights Reserved 北京皓辰网域网络信息技术有限公司. 版权所有

感谢所有关心和支持过ChinaUnix的朋友们