Netfilter源码分析[转载]-自由人

自由人_SZzhihuaxie.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

自由人_SZ

博客访问： 1722992
博文数量： 511
博客积分： 967
博客等级：准尉
技术积分： 2560
用户组：普通用户
注册时间： 2012-07-06 14:19

文章分类

全部博文（511）

zynq（34）
rootfs（12）
ffmpeg（12）
uio（5）
bluetooth（1）
rt-thread（4）
linux 文件（5）
powerpc（1）
调试（7）
Netfilter（13）
性能（10）
安全代码思考（12）
算法（18）
linux-tcpip（70）
机器视觉（2）
应用（27）
linux 应用（78）
linux 启动（32）
bootloader（3）
Linux 内核（62）
未分配的博文（103）

文章存档

2016年（11）

2015年（61）

2014年（257）

2013年（63）

2012年（119）

我的朋友

相关博文

Netfilter源码分析[转载]

分类： LINUX

2014-05-20 22:29:46

原文地址：Netfilter源码分析[转载] 作者：flyboy

一、主函数

init为初始化函数，主要完成表的注册，然后再注册与表相对应的HOOK
//初始化函数为init：
module_init(init);

/*
 * Module entry point for the filter table.
 *
 * Validates the `forward` setting, writes the matching verdict into the
 * FORWARD entry of the table template, registers the filter table and then
 * its three netfilter hooks (ipt_ops[0..2]).
 *
 * Returns the result of the last registration call. On a negative result,
 * every hook registered so far and the table itself are unregistered again
 * before returning.
 */
static int __init init(void)
{
	const int hook_count = 3;	/* ipt_ops[0], ipt_ops[1], ipt_ops[2] */
	int status;
	int idx;

	if (forward < 0 || forward > NF_MAX_VERDICT) {
		printk("iptables forward must be 0 or 1\n");
		return -EINVAL;
	}

	/* Entry 1 is the FORWARD hook */
	initial_table.entries[1].target.verdict = -forward - 1;

	/* Register the filter table first; nothing to undo if this fails. */
	status = ipt_register_table(&packet_filter);
	if (status < 0)
		return status;

	/* Register the hooks in order, stopping at the first failure. */
	for (idx = 0; idx < hook_count; idx++) {
		status = nf_register_hook(&ipt_ops[idx]);
		if (status < 0)
			goto unwind;
	}
	return status;

unwind:
	/* idx is the hook that failed: undo idx-1 .. 0, then the table. */
	while (--idx >= 0)
		nf_unregister_hook(&ipt_ops[idx]);
	ipt_unregister_table(&packet_filter);
	return status;
}

回复于：2005-12-16 12:52:40

二、表的注册
表的注册由函数ipt_register_table来完成，
ipt_register_table(&packet_filter);
其参数packet_filter包含了待注册表的各个参数：
/*
 * Descriptor of the "filter" table passed to ipt_register_table().
 * The initialisers are positional, in struct ipt_table field order:
 * list, name, table (template), valid_hooks, lock, private, me.
 */
static struct ipt_table packet_filter
= { { NULL, NULL }, "filter", &initial_table.repl,
    FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL, THIS_MODULE };

可见，内核中，表是以结构struct ipt_table来表示的：
/* In-kernel representation of one iptables table. */
struct ipt_table
{
struct list_head list;
/* Linkage used to build and maintain the list of tables */
char name[IPT_TABLE_MAXNAMELEN];
/* Table name such as "filter" or "nat"; to support automatic module loading,
   the module that provides the table should be named iptable_'name'.o */
struct ipt_replace *table;
/* Initialisation template of the table; initially initial_table.repl */
unsigned int valid_hooks;
/* Bit vector of the hooks this table is attached to */
rwlock_t lock;
/* Read-write lock, initially unlocked */
struct ipt_table_info *private;
/* The table's actual data area */
struct module *me;
/* Owning module; NULL when the table is not defined in a module */
};

对照这一结构分析，filter表的初始化为：
链表：{ NULL, NULL }
表名："filter"
初始化模板：&initial_table.repl
当前表所影响的Hook：FILTER_VALID_HOOKS /*#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))*/
读写锁：RW_LOCK_UNLOCKED，即为打开状态
数据区： NULL
是否在模块中定义：THIS_MODULE，见如下宏定义：

/*
 * Fallback definition of THIS_MODULE, used only when it is not already
 * defined: the address of __this_module when MODULE is defined, NULL
 * otherwise.
 */
#ifndef THIS_MODULE
#ifdef MODULE
#define THIS_MODULE (&__this_module)
#else
#define THIS_MODULE (NULL)
#endif
#endif

先来看维护表的链表的结构：
/* Doubly linked list linkage: pointers to the next and the previous node. */
struct list_head {
struct list_head *next, *prev;
};
很简单，它是一个双向链表。

另一个重要的东东就是表的模板和数据区。表模板保存了该表初始化时各个默认HOOK所包含的规则等信息，它被初始化为
&initial_table.repl；而数据区struct ipt_table_info *private初始为空。注册时，ipt_register_table()函数会用模板（table成员，即repl）的
内容去填充private成员。

struct ipt_table_info是实际描述表的数据结构（net/ipv4/netfilter/ip_tables.c）：
/* The data that actually describes a table's rule set. */
struct ipt_table_info
{
unsigned int size;
/* Size of the table */
unsigned int number;
/* Number of rules in the table */
unsigned int initial_entries;
/* Initial number of rules, used for module use counting */
unsigned int hook_entry[NF_IP_NUMHOOKS];
/* Per-hook offset of the first rule, relative to the entries member below */
unsigned int underflow[NF_IP_NUMHOOKS];
/* Per-hook upper-bound (underflow) offset paired with hook_entry; per the
   original note, both are 0 for a hook when no rules have been entered */
char entries[0] ____cacheline_aligned;
/* Start of the rule data */
};

再来看模板的定义，这个结构很简单，不过长了点：

/*
 * Template the filter table is built from: an ipt_replace header, three
 * standard entries (LOCAL_IN, FORWARD, LOCAL_OUT) and a terminating ERROR
 * entry. All initialisers are positional.
 */
static struct
{
struct ipt_replace repl;
struct ipt_standard entries[3];
struct ipt_error term;
} initial_table __initdata
/* repl: name, valid hooks, 4 entries (3 standard + 1 error), total size,
   then hook_entry[] and underflow[] (both place each hook at the offset of
   its own standard entry), and finally no old counters */
= { { "filter", FILTER_VALID_HOOKS, 4,
      sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
      { [NF_IP_LOCAL_IN] 0,
[NF_IP_FORWARD] sizeof(struct ipt_standard),
[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
      { [NF_IP_LOCAL_IN] 0,
[NF_IP_FORWARD] sizeof(struct ipt_standard),
[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
      0, NULL, { } },
    {
    /* Each standard entry below ends in the verdict value -NF_ACCEPT - 1 */
    /* LOCAL_IN */
    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
0,
sizeof(struct ipt_entry),
sizeof(struct ipt_standard),
0, { 0, 0 }, { } },
      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
-NF_ACCEPT - 1 } },
    /* FORWARD */
    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
0,
sizeof(struct ipt_entry),
sizeof(struct ipt_standard),
0, { 0, 0 }, { } },
      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
-NF_ACCEPT - 1 } },
    /* LOCAL_OUT */
    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
0,
sizeof(struct ipt_entry),
sizeof(struct ipt_standard),
0, { 0, 0 }, { } },
      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
-NF_ACCEPT - 1 } }
    },
    /* ERROR */
    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
0,
sizeof(struct ipt_entry),
sizeof(struct ipt_error),
0, { 0, 0 }, { } },
      { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
  { } },
"ERROR"
      }
    }
};

结构长了点，我们先来关心注册表时的初始值：
&initial_table.repl
这是一个struct ipt_replace结构，该结构做为初始化模版被使用，同样用户通过系统调用更换
表时也要用到这个结构。定义如下：

/* The argument to IPT_SO_SET_REPLACE. */
struct ipt_replace
{
/* Table name. */
char name[IPT_TABLE_MAXNAMELEN];

/* Hooks this table is attached to. */
unsigned int valid_hooks;

/* Number of entries */
unsigned int num_entries;

/* Total size of new entries */
unsigned int size;

/* Per-hook offset of the first rule, relative to the entries member below */
unsigned int hook_entry[NF_IP_NUMHOOKS];

/* Per-hook upper-bound (underflow) offset paired with hook_entry; per the
   original note, both are 0 for a hook when no rules have been entered */
unsigned int underflow[NF_IP_NUMHOOKS];

/* Information about old entries: */
/* Number of counters (must be equal to current number of entries). */
unsigned int num_counters;
/* The old entries' counters. */
struct ipt_counters *counters;

/* The entries (hang off end: not really an array). */
struct ipt_entry entries[0];
};

对照结构，可以分析各个成员的初始化值了：
char name[IPT_TABLE_MAXNAMELEN]="filter";
unsigned int valid_hooks=FILTER_VALID_HOOKS;
unsigned int num_entries=4;
unsigned int size=sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error);
unsigned int hook_entry[NF_IP_NUMHOOKS]={ [NF_IP_LOCAL_IN] 0,
[NF_IP_FORWARD] sizeof(struct ipt_standard),
[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 };
unsigned int underflow[NF_IP_NUMHOOKS]={ [NF_IP_LOCAL_IN] 0,
[NF_IP_FORWARD] sizeof(struct ipt_standard),
[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 };
unsigned int num_counters=0;
struct ipt_counters *counters=NULL;
struct ipt_entry entries[0]={ };

了解了这些结构后，再来看表的注册函数：

/*
 * Register a table: build its private rule data from the template in
 * table->table, validate it with translate_table(), install it with
 * replace_table() and link the table into ipt_tables.
 *
 * Returns 0 on success. On failure returns a negative errno value
 * (-ENOMEM, -EEXIST, or whatever translate_table(), down_interruptible()
 * or replace_table() reported) after freeing newinfo and dropping the
 * module use count again.
 */
int ipt_register_table(struct ipt_table *table)
{
int ret;
struct ipt_table_info *newinfo;
static struct ipt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };

/* MOD_INC_USE_COUNT increments the module use counter, mainly to keep the
module from being removed unexpectedly; MOD_DEC_USE_COUNT is the matching
decrement */
MOD_INC_USE_COUNT;

/* Allocate rule space for every CPU */
newinfo = vmalloc(sizeof(struct ipt_table_info)
  + SMP_ALIGN(table->table->size) * smp_num_cpus);
/* Allocation failed */
if (!newinfo) {
ret = -ENOMEM;
MOD_DEC_USE_COUNT;
return ret;
}

/* Copy the template rules into the first CPU's area of the new table */
memcpy(newinfo->entries, table->table->entries, table->table->size);

/* translate_table() bounds-checks every rule of the table described by
newinfo, fills in the hook_entry and underflow values of that
ipt_table_info, and finally copies the rules to the other CPUs' areas */
ret = translate_table(table->name, table->valid_hooks,
      newinfo, table->table->size,
      table->table->num_entries,
      table->table->hook_entry,
      table->table->underflow);
if (ret != 0) {
vfree(newinfo);
MOD_DEC_USE_COUNT;
return ret;
}

ret = down_interruptible(&ipt_mutex);
if (ret != 0) {
vfree(newinfo);
MOD_DEC_USE_COUNT;
return ret;
}

/* A table of this name is already registered: free the new data and drop
the module use count */
if (list_named_find(&ipt_tables, table->name)) {
ret = -EEXIST;
goto free_unlock;
}

/* Swap the table data in: private first points at the empty bootstrap info
so that replace_table() has an old table (with number == 0) to replace */
table->private = &bootstrap;
if (!replace_table(table, 0, newinfo, &ret))
goto free_unlock;

duprintf("table->private->number = %u\n",
table->private->number);

/* Remember the initial rule count */
table->private->initial_entries = table->private->number;

table->lock = RW_LOCK_UNLOCKED;
/* Add the table to the list of tables */
list_prepend(&ipt_tables, table);

unlock:
up(&ipt_mutex);
return ret;

free_unlock:
vfree(newinfo);
MOD_DEC_USE_COUNT;
goto unlock;
}

呵呵，初次看table的注册，有点头大，因为它不光是netfilter，还涉及到linux内核中的内存管理、
信号量设置等等，不过其实注册也就完成两件事：初始化表，将表添加进表的链表。

回复于：2005-12-16 12:53:14

表的注册中涉及到的重要函数

表注册函数中，主要涉及到的重要函数有：
translate_table
list_named_find
list_prepend

1、translate_table
/*
 * translate_table() - validate the rules copied into newinfo and finish
 * setting newinfo up.
 *
 * name:         table name, handed to check_entry()
 * valid_hooks:  bit mask of the hooks this table is attached to
 * newinfo:      table info whose entries[] already holds the rules
 * size:         size of the rule data
 * number:       number of rules the rule data is expected to contain
 * hook_entries: per-hook entry offsets, relative to newinfo->entries
 * underflows:   per-hook underflow offsets paired with hook_entries
 *
 * Bounds-checks every rule, records hook_entry[]/underflow[] in newinfo,
 * rejects rule loops, runs the per-rule check and finally copies the rules
 * into the areas of the remaining CPUs.
 *
 * Returns 0 on success; -EINVAL when the rule count is wrong or a valid
 * hook has no entry/underflow offset; -ELOOP when mark_source_chains()
 * fails; otherwise the non-zero value returned by a rule callback.
 */
static int
translate_table(const char *name,
		unsigned int valid_hooks,
		struct ipt_table_info *newinfo,
		unsigned int size,
		unsigned int number,
		const unsigned int *hook_entries,
		const unsigned int *underflows)
{
	unsigned int i;
	int ret;

	newinfo->size = size;
	newinfo->number = number;

	/* Start every hook at an impossible offset so that hooks never
	   matched by a rule can be detected below. */
	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
		newinfo->hook_entry[i] = 0xFFFFFFFF;
		newinfo->underflow[i] = 0xFFFFFFFF;
	}

	duprintf("translate_table: size %u\n", newinfo->size);
	i = 0;
	/* Walk all rules and check their offsets; the work is done by
	   check_entry_size_and_hooks() via IPT_ENTRY_ITERATE, and the last
	   argument, i, comes back as the number of rules visited. */
	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				check_entry_size_and_hooks,
				newinfo,
				newinfo->entries,
				newinfo->entries + size,
				hook_entries, underflows, &i);
	if (ret != 0)
		return ret;

	/* The counted number of rules differs from the announced one. */
	if (i != number) {
		duprintf("translate_table: %u not %u entries\n",
			 i, number);
		return -EINVAL;
	}

	/* The walk above filled in hook_entry[]/underflow[] for every offset
	   that matched a rule; anything still at the impossible value was
	   never matched. */
	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
		/* Only check hooks this table is attached to. */
		if (!(valid_hooks & (1 << i)))
			continue;
		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
			duprintf("Invalid hook entry %u %u\n",
				 i, hook_entries[i]);
			return -EINVAL;
		}
		if (newinfo->underflow[i] == 0xFFFFFFFF) {
			duprintf("Invalid underflow %u %u\n",
				 i, underflows[i]);
			return -EINVAL;
		}
	}

	/* Make sure the new table contains no rule loops. */
	if (!mark_source_chains(newinfo, valid_hooks))
		return -ELOOP;

	/* Run the per-rule check on every rule; i counts the rules that
	   have been processed so far. */
	i = 0;
	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				check_entry, name, size, &i);

	/* A check failed: run cleanup_entry over the rules, then bail out. */
	if (ret != 0) {
		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
				  cleanup_entry, &i);
		return ret;
	}

	/* Give every other CPU its own complete copy of the rules. */
	for (i = 1; i < smp_num_cpus; i++) {
		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
		       newinfo->entries,
		       SMP_ALIGN(newinfo->size));
	}

	return ret;
}

函数的核心处理，是调用了IPT_ENTRY_ITERATE，我在《iptables源码分析》中已提过，这个宏用来遍历每一个规则，然后
调用其第三个参数（函数指针）进行处理，前两个参数分别表示规则的起始位置和规则总大小，后面的参数则视情况而定。
再来看一次：
/*
 * Walk the rules stored in `entries` (`size` bytes in total), advancing by
 * each rule's next_offset, and call fn(entry, args...) on every rule.
 * fn returns 0 to continue iteration; the first non-zero return stops the
 * walk and becomes the value of the macro. The value is 0 when every rule
 * was visited.
 */
#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
({ \
unsigned int __i; \
int __ret = 0; \
struct ipt_entry *__entry; \
\
for (__i = 0; __i < (size); __i += __entry->next_offset) { \
__entry = (void *)(entries) + __i; \
\
__ret = fn(__entry , ## args); \
if (__ret != 0) \
break; \
} \
__ret; \
})

对应地，函数的第一次宏的调用，
ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
check_entry_size_and_hooks,
newinfo,
newinfo->entries,
newinfo->entries + size,
hook_entries, underflows, &i);
遍历到每一项规则后，就调用check_entry_size_and_hooks继续处理。

/*
 * Per-rule callback for the first IPT_ENTRY_ITERATE pass of
 * translate_table().
 *
 * Rejects a rule (-EINVAL) that is misaligned, whose fixed header would
 * reach or pass `limit`, or whose next_offset is smaller than an entry
 * header plus a target header. For an accepted rule it records any
 * hook_entry/underflow offset that points exactly at this rule, clears the
 * rule's counters and comefrom, bumps *i and returns 0.
 */
static inline int
check_entry_size_and_hooks(struct ipt_entry *e,
			   struct ipt_table_info *newinfo,
			   unsigned char *base,
			   unsigned char *limit,
			   const unsigned int *hook_entries,
			   const unsigned int *underflows,
			   unsigned int *i)
{
	unsigned char *rule = (unsigned char *)e;
	unsigned int hook;

	/* The rule must start on an ipt_entry alignment boundary ... */
	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0) {
		duprintf("Bad offset %p\n", e);
		return -EINVAL;
	}
	/* ... and its fixed header must end before the limit. */
	if (rule + sizeof(struct ipt_entry) >= limit) {
		duprintf("Bad offset %p\n", e);
		return -EINVAL;
	}

	/* Too short: not even room for an entry plus a target header. */
	if (e->next_offset
	    < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
		duprintf("checking: element %p size %u\n",
			 e, e->next_offset);
		return -EINVAL;
	}

	/* Record every hook whose entry/underflow offset is this rule. */
	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
		if (rule - base == hook_entries[hook])
			newinfo->hook_entry[hook] = hook_entries[hook];
		if (rule - base == underflows[hook])
			newinfo->underflow[hook] = underflows[hook];
	}

	/* FIXME: underflows must be unconditional, standard verdicts
	   < 0 (not IPT_RETURN). --RR */

	/* Clear counters and comefrom */
	e->counters = ((struct ipt_counters) { 0, 0 });
	e->comefrom = 0;

	/* One more rule seen. */
	++*i;
	return 0;
}

2、replace_table
前面说过，表中以struct ipt_table_info *private;表示实际数据区。但是在初始化赋值的时候，被设为
NULL，而表的初始变量都以模版的形式，放在struct ipt_replace *table;中。
注册函数一开始，就声明了：
struct ipt_table_info *newinfo;
然后对其分配了空间，将模板中的初值拷贝了进来。所以replace_table要做的工作，主要就是把newinfo中的
值传递给table结构中的private成员。

其函数原型如下：

/*
 * Install newinfo as table->private under the table's write lock.
 *
 * num_counters must equal the rule count of the table currently installed
 * (table->private->number); otherwise nothing is changed, *error is set to
 * -EAGAIN and NULL is returned. On success the previous private info is
 * returned and its initial_entries value is carried over to newinfo.
 */
static struct ipt_table_info *
replace_table(struct ipt_table *table,
      unsigned int num_counters,
      struct ipt_table_info *newinfo,
      int *error)
{
struct ipt_table_info *oldinfo;

#ifdef CONFIG_NETFILTER_DEBUG
{
struct ipt_entry *table_base;
unsigned int i;

/* Debug only: stamp a marker value into comefrom of the entry found at
   TABLE_OFFSET(newinfo, i) for every CPU index
   (presumably the first rule of each per-CPU copy; TABLE_OFFSET is not
   shown here) */
for (i = 0; i < smp_num_cpus; i++) {
table_base =
(void *)newinfo->entries
+ TABLE_OFFSET(newinfo, i);

table_base->comefrom = 0xdead57ac;
}
}
#endif

/* Do the substitution. */
write_lock_bh(&table->lock);
/* Check inside lock: is the old number correct? */
if (num_counters != table->private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, table->private->number);
write_unlock_bh(&table->lock);
*error = -EAGAIN;
return NULL;
}
oldinfo = table->private;
table->private = newinfo;
newinfo->initial_entries = oldinfo->initial_entries;
write_unlock_bh(&table->lock);

return oldinfo;
}

3、list_named_find

在注册函数中，调用
/* Don't autoload: we'd eat our tail... */
if (list_named_find(&ipt_tables, table->name)) {
ret = -EEXIST;
goto free_unlock;
}
来检查当前表是否已被注册过了。可见，第一个参数为链表首部，第二个参数为当前表名。
其原型如下：
/* Find this named element in the list: expands to LIST_FIND with
   __list_cmp_name as the comparison function, so the value is a void *
   to the first element whose name matches, or NULL. */
#define list_named_find(head, name) \
LIST_FIND(head, __list_cmp_name, void *, name)

/* Return pointer to first true entry, if any, or NULL.  A macro
   required to allow inlining of cmpfn.
   The walk starts at head->next and follows ->next; it ends either when
   cmpfn(element, args...) returns non-zero or when it arrives back at
   head, in which case the result is NULL. The result is cast to `type`. */
#define LIST_FIND(head, cmpfn, type, args...) \
({ \
const struct list_head *__i = (head); \
\
ASSERT_READ_LOCK(head); \
do { \
__i = __i->next; \
if (__i == (head)) { \
__i = NULL; \
break; \
} \
} while (!cmpfn((const type)__i , ## args)); \
(type)__i; \
})

前面提过，表是一个双向链表，在宏当中，以while进行循环，以__i = __i->next;
进行遍历，然后调用比较函数进行比较，传递过来的比较函数是__list_cmp_name。

比较函数很简单：
/*
 * Name comparison used by list_named_find().
 *
 * `i` points at a list element; the element's name string is read from
 * the bytes immediately following a struct list_head at the start of the
 * element. Returns 1 when that string equals `name`, 0 otherwise.
 */
static inline int __list_cmp_name(const void *i, const char *name)
{
	const char *element_name = (const char *)i + sizeof(struct list_head);

	return !strcmp(name, element_name);
}

4、list_prepend
当所有的初始化工作结束，就调用list_prepend来构建链表了。
/* Prepend: insert `new` directly after `head`, i.e. at the front of the
   list. */
static inline void
list_prepend(struct list_head *head, void *new)
{
ASSERT_WRITE_LOCK(head); /* presumably asserts that the caller holds the write lock; the macro is not shown here */
list_add(new, head); /* link this node into the list right after head */
}
list_add就是一个构建双向链表的过程：
/* Insert `new` immediately after `head`, i.e. between head and the node
   that currently follows it. */
static __inline__ void list_add(struct list_head *new, struct list_head *head)
{
__list_add(new, head, head->next);
}

/* Link `new` between two nodes: `prev` becomes its predecessor and `next`
   its successor. Neither prev nor next is checked; the four pointer stores
   are the whole operation. */
static __inline__ void __list_add(struct list_head * new,
struct list_head * prev,
struct list_head * next)
{
next->prev = new;
new->next = next;
new->prev = prev;
prev->next = new;
}

回复于：2005-12-16 12:54:08

三、Hook的注册

如果你对Netfilter的hook的注册还不了解的话，推荐先到网上搜搜《深入Linux网络核心堆栈》bioforge
看看先。(本节中有部份文字引自该文)

注册一个hook函数是围绕nf_hook_ops数据结构的一个非常简单的操作，nf_hook_ops数据结构在linux/netfilter.h中定义，
该数据结构的定义如下：
/* Describes one hook function to be registered with nf_register_hook(). */
struct nf_hook_ops
{
/* List linkage; must stay the first member because nf_register_hook() and
   nf_iterate() cast a struct list_head * straight to struct nf_hook_ops * */
struct list_head list;

/* User fills in from here down. */
/* Function called when the hook fires */
nf_hookfn *hook;
/* Protocol family; first index into nf_hooks */
int pf;
/* Hook number; second index into nf_hooks */
int hooknum;
/* Hooks are ordered in ascending priority. */
int priority;
};

该数据结构中的list成员用于维护Netfilter hook的列表。
hook成员是一个指向nf_hookfn类型的函数的指针，该函数是这个hook被调用时执行的函数。
nf_hookfn同样在linux/netfilter.h中定义。
pf这个成员用于指定协议族。有效的协议族在linux/socket.h中列出，但对于IPv4我们希望使用协议族PF_INET。
hooknum这个成员用于指定安装的这个函数对应的具体的hook类型:
NF_IP_PRE_ROUTING    在完整性校验之后，选路确定之前
NF_IP_LOCAL_IN        在选路确定之后，且数据包的目的是本地主机
NF_IP_FORWARD        目的地是其它主机的数据包
NF_IP_LOCAL_OUT        来自本机进程的数据包在其离开本地主机的过程中
NF_IP_POST_ROUTING    在数据包离开本地主机“上线”之前

最后，priority这个成员用于指定在执行的顺序中，这个hook函数应当被放在什么地方。
对于IPv4，可用的值在linux/netfilter_ipv4.h的nf_ip_hook_priorities枚举中定义。

针对HOOK的注册，在初始化函数中有：
/* Register table */
ret = ipt_register_table(&packet_filter);
if (ret < 0)
return ret;

/* Register hooks */
ret = nf_register_hook(&ipt_ops[0]);
if (ret < 0)
goto cleanup_table;

ret = nf_register_hook(&ipt_ops[1]);
if (ret < 0)
goto cleanup_hook0;

ret = nf_register_hook(&ipt_ops[2]);
if (ret < 0)
goto cleanup_hook1;
可见，注册是通过nf_register_hook函数来完成，每一个Hook的相关信息，都在ipt_ops结构数组中，它的成员变量前面已做分析，
来看看它的初始化值：
/*
 * The three hooks of the filter table, in the order init() registers them:
 * [0] LOCAL_IN and [1] FORWARD use ipt_hook, [2] LOCAL_OUT uses
 * ipt_local_out_hook. All are PF_INET hooks with priority NF_IP_PRI_FILTER.
 * Positional initialisers in struct nf_hook_ops order:
 * list, hook, pf, hooknum, priority.
 */
static struct nf_hook_ops ipt_ops[]
= { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_FILTER },
    { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER },
    { { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT,
NF_IP_PRI_FILTER }
};

对照结构各成员变量的含义，可见，filter表上总共设置了NF_IP_LOCAL_IN、NF_IP_FORWARD、NF_IP_LOCAL_OUT三个HOOK点，用熟了iptables三条
链的话，对这三个东东应该是刻骨铭心了。协议簇是PF_INET，链表节点初始化为{ NULL, NULL }，处理函数前两个为ipt_hook，后一个为ipt_local_out_hook，
优先级均为NF_IP_PRI_FILTER。

hook的注册，是通过nf_register_hook来完成的，它也是一个维护双向链表的过程，值得注意的是，注册的钩子函数，全部是放在全局变量
nf_hooks中，它是一个二维数组，函数一开始先遍历它，找到合适的地方，再将当前节点插入之。（我们可以想像，将来调用钩子函数时，就
是一个查找nf_hooks数组成员的过程）
/*
 * nf_register_hook() - add a hook to the chain nf_hooks[reg->pf][reg->hooknum].
 *
 * The chain is kept in ascending priority order: the new hook is inserted
 * in front of the first existing hook whose priority is greater than
 * reg->priority, or at the tail when there is none. The insertion is done
 * while holding the BR_NETPROTO_LOCK write lock.
 *
 * Always returns 0.
 */
int nf_register_hook(struct nf_hook_ops *reg)
{
	struct list_head *i;

	br_write_lock_bh(BR_NETPROTO_LOCK);
	/* Find the first hook on this chain with a higher priority value.
	   The cast relies on `list` being the first member of nf_hook_ops. */
	for (i = nf_hooks[reg->pf][reg->hooknum].next;
	     i != &nf_hooks[reg->pf][reg->hooknum];
	     i = i->next) {
		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
			break;
	}
	/* Insert after i's predecessor, i.e. directly in front of i. */
	list_add(&reg->list, i->prev);
	br_write_unlock_bh(BR_NETPROTO_LOCK);
	return 0;
}

通过表的注册、HOOK的注册，准备工作基本上就完成了，其它表的注册和Hook的注册，都是一样的，可以对照分析，没有必要再详述了。
不过注册也只是准备工作。重要的事情是对数据包的处理，对于filter来说，就是包过滤，对于nat来讲，就是地址转换。

四、数据包过滤

1、钩子函数
以中转包过滤为例（FORWARD），注册的时候，向内核注册了一个ipt_hook的钩子函数。
/* Hook function of the filter table: passes the packet to ipt_do_table()
   with the packet_filter table and returns its verdict. okfn is not used. */
static unsigned int
ipt_hook(unsigned int hook, //hook number
struct sk_buff **pskb, //the packet
const struct net_device *in, //interface the packet came in on
const struct net_device *out, //interface the packet leaves through
int (*okfn)(struct sk_buff *)) //default continuation function
{
return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
}

转向到了ipt_do_table。也就是说，如果向内核挂了钩，中转的数据，将进入ipt_do_table函数。

2、钩子函数被调用
钩子函数被注册了，但是内核是如何调用它的呢？
在内核源码的net/ipv4目录下，对应于input/output/forward，分别有ip_input.c、ip_output.c、ip_forward.c。同样继续以forward为例，
（关于linux堆栈处理数据包流程的各个函数的作用等，这里就不进一步详述，请参考其它相关资料）。

对于转发的数据，将进入ip_forward.c中的ip_forward函数，当处理完成后，在最后一句，可以看到：
return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2,
       ip_forward_finish);
事实上，你在linux的每一个数据转发的"关节"的函数处，都可以发现这个宏的调用，它就是调用我们注册的钩子，其最后一个参数为
下一步处理的函数，即，如果有钩子函数，则处理完所有的钩子函数后，调用这个函数继续处理，如果没有注册任何钩子，则直接调用
此函数。

/* This is gross, but inline doesn't cut it for avoiding the function
   call in fast path: gcc doesn't inline (needs value tracking?). --RR */
/* With CONFIG_NETFILTER_DEBUG, NF_HOOK is simply nf_hook_slow. Otherwise
   it calls okfn(skb) directly when no hook is registered on
   nf_hooks[pf][hook], and goes through nf_hook_slow() only when the chain
   is non-empty. */
#ifdef CONFIG_NETFILTER_DEBUG
#define NF_HOOK nf_hook_slow
#else
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \
(list_empty(&nf_hooks[(pf)][(hook)]) \
? (okfn)(skb) \
: nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn)))
#endif

先粗略看看这个宏，okfn，我们已讲过，它是下一步要处理的函数，这里先调用
list_empty函数检查nf_hooks是否为空，为空则表示没有Hook注册，则直接调用
okfn继续处理。如果不为空，则转入nf_hook_slow函数：

/*
 * Run the hooks registered on nf_hooks[pf][hook] for skb and act on the
 * resulting verdict.
 *
 * Returns -ENOMEM when a non-linear skb cannot be linearized (the skb is
 * freed), the return value of okfn(skb) on NF_ACCEPT, -EPERM on NF_DROP
 * (the skb is freed), and 0 for any other verdict.
 */
int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
struct net_device *indev,
struct net_device *outdev,
int (*okfn)(struct sk_buff *))
{
struct list_head *elem;
unsigned int verdict;
int ret = 0;

/* This stopgap cannot be removed until all the hooks are audited. */
if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->ip_summed == CHECKSUM_HW) {
if (outdev == NULL) {
skb->ip_summed = CHECKSUM_NONE;
} else {
skb_checksum_help(skb);
}
}

/* We may already have this, but read-locks nest anyway */
br_read_lock_bh(BR_NETPROTO_LOCK);

#ifdef CONFIG_NETFILTER_DEBUG
if (skb->nf_debug & (1 << hook)) {
printk("nf_hook: hook %i already set.\n", hook);
nf_dump_skb(pf, skb);
}
skb->nf_debug |= (1 << hook);
#endif

/* The protocol family and hook number were given at the NF_HOOK call site,
so the hook chain is found by direct indexing. elem starts at the chain head;
every node on the chain is a struct nf_hook_ops whose ->hook is the function
that gets called */
elem = &nf_hooks[pf][hook];
/* Walk the chain, calling each hook function, and get back the verdict */
verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev,
     outdev, &elem, okfn);
if (verdict == NF_QUEUE) {
NFDEBUG("nf_hook: Verdict = QUEUE.\n");
nf_queue(skb, elem, pf, hook, indev, outdev, okfn);
}
/* On accept, continue processing with okfn; on drop, free the packet.
Other verdicts take no action here */
switch (verdict) {
case NF_ACCEPT:
ret = okfn(skb);
break;

case NF_DROP:
kfree_skb(skb);
ret = -EPERM;
break;
}

br_read_unlock_bh(BR_NETPROTO_LOCK);
return ret;
}

再来看nf_iterate:

/*
 * Call the hook functions on the chain `head` in list order, starting with
 * the node after *i. *i is advanced as the walk proceeds, so on an early
 * return it points at the hook that produced the verdict.
 *
 * Returns NF_QUEUE, NF_STOLEN or NF_DROP as soon as a hook returns one of
 * them, and NF_ACCEPT once the end of the chain is reached.
 */
static unsigned int nf_iterate(struct list_head *head,
       struct sk_buff **skb,
       int hook,
       const struct net_device *indev,
       const struct net_device *outdev,
       struct list_head **i,
       int (*okfn)(struct sk_buff *))
{
/* Walk every hook function registered on this chain */
for (*i = (*i)->next; *i != head; *i = (*i)->next) {
struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
/* This is where the hook function is actually invoked */
switch (elem->hook(hook, skb, indev, outdev, okfn)) {
case NF_QUEUE:
return NF_QUEUE;

case NF_STOLEN:
return NF_STOLEN;

case NF_DROP:
return NF_DROP;

case NF_REPEAT:
/* Step back one node so the loop increment lands on this hook again */
*i = (*i)->prev;
break;

/* Without CONFIG_NETFILTER_DEBUG, NF_ACCEPT and any other value simply fall
   out of the switch and the walk moves on to the next hook */
#ifdef CONFIG_NETFILTER_DEBUG
case NF_ACCEPT:
break;

default:
NFDEBUG("Evil return from %p(%u).\n",
elem->hook, hook);
#endif
}
}
return NF_ACCEPT;
}

解释一下各个返回值：

NF_DROP                丢弃该数据包（由nf_hook_slow释放）
NF_ACCEPT            接受该数据包，继续调用后面的hook函数，最终交给okfn处理
NF_STOLEN            该数据包已被hook函数接管，netfilter不再继续处理，也不释放它
NF_QUEUE            将该数据包排队（通过nf_queue，通常交给用户空间处理）
NF_REPEAT            再次调用该hook函数

这样，最终关心的还是每一个注册的函数，这样又回到本节开头所说的ipt_do_table……

文章出处

阅读(725) | 评论(0) | 转发(0) |

上一篇：超强的Linux中断分析

下一篇：Linux netfilter源码分析（一）

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6