/* Netfilter hook itself. */ unsigned int ip_conntrack_in(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; int set_reply = 0; int ret;
/* Previously seen (loopback or untracked)? Ignore. */ if ((*pskb)->nfct) { // 判断当前数据包是否已被检查过了
CONNTRACK_STAT_INC(ignore); return NF_ACCEPT; }
/* Never happen */ if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { // 分片包当会在前一个Hook中被处理,
// 所以并不会触发该条件
if (net_ratelimit()) { printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", (*pskb)->nh.iph->protocol, hooknum); } return NF_DROP; }
/* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ if ((*pskb)->pkt_type == PACKET_BROADCAST) { printk("Broadcast packet!\n"); return NF_ACCEPT; } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) == htonl(0x000000FF)) { printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", NIPQUAD((*pskb)->nh.iph->saddr), NIPQUAD((*pskb)->nh.iph->daddr), (*pskb)->sk, (*pskb)->pkt_type); } #endif
proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); // 根据当前数据包的协议,查找与之相应的struct ip_conntrack_protocol结构
/* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter * core what to do with the packet. */ if (proto->error != NULL && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { CONNTRACK_STAT_INC(error); CONNTRACK_STAT_INC(invalid); return -ret; }
if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { 在全局的连接表中,查找与当前包相匹配的连接结构,返回的是struct ip_conntrack *类型指针,它用于描述一个数据包的连接状态 /* Not valid part of a connection */ CONNTRACK_STAT_INC(invalid); return NF_ACCEPT; }
if (IS_ERR(ct)) { /* Too stressed to deal. */ CONNTRACK_STAT_INC(drop); return NF_DROP; }
IP_NF_ASSERT((*pskb)->nfct);
ret = proto->packet(ct, *pskb, ctinfo); 如果注册了相应的协议的ip_conntrack_protocol结构,则在这里调用其中的packet函数做一些检查 if (ret < 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do*/ nf_conntrack_put((*pskb)->nfct); (*pskb)->nfct = NULL; CONNTRACK_STAT_INC(invalid); return -ret; }
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret; } struct ip_conntrack_protocol * __ip_conntrack_proto_find(u_int8_t protocol) { return ip_ct_protos[protocol]; }
#define MAX_IP_CT_PROTO 256 extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct ip_conntrack * resolve_normal_ct(struct sk_buff *skb, struct ip_conntrack_protocol *proto, int *set_reply, unsigned int hooknum, enum ip_conntrack_info *ctinfo) 是连接跟踪中最重要的函数之一,它的主要功能就是判断数据包在连接跟踪表是否存在,如果不存在,则为数据包分配相应的连接跟踪节点空间并初始化,然后设置连接状态 { struct ip_conntrack_tuple tuple; struct ip_conntrack_tuple_hash *h; struct ip_conntrack *ct;
IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, // 将skb转化为tuple
&tuple,proto)) return NULL;
/* look for tuple match */ h = ip_conntrack_find_get(&tuple, NULL); // 查看数据包对应的tuple
if (!h) { // 在连接跟踪表ip_conntrack_hash中是否存在
h = init_conntrack(&tuple, proto, skb); // 如果不存在,初始化之
if (!h) return NULL; if (IS_ERR(h)) return (void *)h; } ct = tuplehash_to_ctrack(h); // 根据hash表节点,取得数据包对应的连接跟踪结构
/* It exists; we have (non-exclusive) reference. */ if (DIRECTION(h) == IP_CT_DIR_REPLY) { *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; // 确定连接的状态
/* Please set reply bit if this packet OK */ *set_reply = 1; } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { DEBUGP("ip_conntrack_in: normal packet for %p\n", ct); *ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { DEBUGP("ip_conntrack_in: related packet for %p\n", ct); *ctinfo = IP_CT_RELATED; } else { DEBUGP("ip_conntrack_in: new packet for %p\n", ct); *ctinfo = IP_CT_NEW; } *set_reply = 0; } skb->nfct = &ct->ct_general; // 设置skb中与连接相关的值
skb->nfctinfo = *ctinfo; return ct; } /* Find a connection corresponding to a tuple. */ struct ip_conntrack_tuple_hash * ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { struct ip_conntrack_tuple_hash *h;
read_lock_bh(&ip_conntrack_lock); h = __ip_conntrack_find(tuple, ignored_conntrack); if (h) atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); read_unlock_bh(&ip_conntrack_lock);
return h; }
struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { struct ip_conntrack_tuple_hash *h; unsigned int hash = hash_conntrack(tuple); // tuple对应的hash表入口即为
// ip_conntrack_hash[hash],也就是链表的首节点
ASSERT_READ_LOCK(&ip_conntrack_lock); list_for_each_entry(h, &ip_conntrack_hash[hash], list) { if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {// 在这种情况下ignored_conntrack是NULL
CONNTRACK_STAT_INC(found); return h; } CONNTRACK_STAT_INC(searched); }
return NULL; } static inline int conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { ASSERT_READ_LOCK(&ip_conntrack_lock); return tuplehash_to_ctrack(i) != ignored_conntrack && ip_ct_tuple_equal(tuple, &i->tuple); } static inline int ip_ct_tuple_equal(const struct ip_conntrack_tuple *t1, const struct ip_conntrack_tuple *t2) 分为“来源”和“目的”进行比较 { return ip_ct_tuple_src_equal(t1, t2) && ip_ct_tuple_dst_equal(t1, t2); }
static inline int ip_ct_tuple_src_equal(const struct ip_conntrack_tuple *t1, const struct ip_conntrack_tuple *t2) 比较“来源” { return t1->src.ip == t2->src.ip && t1->src.u.all == t2->src.u.all; }
static inline int ip_ct_tuple_dst_equal(const struct ip_conntrack_tuple *t1, const struct ip_conntrack_tuple *t2) 比较“目的” { return t1->dst.ip == t2->dst.ip && t1->dst.u.all == t2->dst.u.all && t1->dst.protonum == t2->dst.protonum; } ※ 这里的比较,除了IP地址之外,并没有直接比较“端口”,这是因为像ICMP协议这样的并没有“端口”协议,struct ip_conntrack_tuple 结构中,与协议相关的,如端口等,都定义成union类型,这样,就可以直接使用u.all,而不用再去管TCP,UDP还是ICMP了。 int ip_ct_get_tuple(const struct iphdr *iph, const struct sk_buff *skb, unsigned int dataoff, struct ip_conntrack_tuple *tuple, const struct ip_conntrack_protocol *protocol) { /* Never happen */ if (iph->frag_off & htons(IP_OFFSET)) { printk("ip_conntrack_core: Frag of proto %u.\n", iph->protocol); return 0; }
tuple->src.ip = iph->saddr; // 设置来源、目的地址
tuple->dst.ip = iph->daddr; tuple->dst.protonum = iph->protocol; // 设置协议号
tuple->dst.dir = IP_CT_DIR_ORIGINAL; enum ip_conntrack_dir { IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY, IP_CT_DIR_MAX };
return protocol->pkt_to_tuple(skb, dataoff, tuple); // 设置协议特殊的处理,下面以tcp为例说明
}
static int tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct ip_conntrack_tuple *tuple) { struct tcphdr _hdr, *hp;
/* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) return 0;
tuple->src.u.tcp.port = hp->source; // 设置tuple中相关协议的源,目的端口
tuple->dst.u.tcp.port = hp->dest;
return 1; } /* Allocate a new conntrack: we return -ENOMEM if classification * failed due to stress. Otherwise it really is unclassifiable */ static struct ip_conntrack_tuple_hash * init_conntrack(struct ip_conntrack_tuple *tuple, struct ip_conntrack_protocol *protocol, struct sk_buff *skb) { struct ip_conntrack *conntrack; struct ip_conntrack_tuple repl_tuple; // 应答方向的tuple
struct ip_conntrack_expect *exp;
if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { // 根据当前tuple,
DEBUGP("Can't invert tuple.\n"); // 计算出应答方向的repl_tuple
return NULL; }
conntrack = ip_conntrack_alloc(tuple, &repl_tuple); if (conntrack == NULL || IS_ERR(conntrack)) return (struct ip_conntrack_tuple_hash *)conntrack;
if (!protocol->new(conntrack, skb)) { // 根据协议不同,比如icmp_new()
ip_conntrack_free(conntrack); return NULL; }
write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple);
if (exp) { DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", conntrack, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &conntrack->status); conntrack->master = exp->master; #ifdef CONFIG_IP_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; #endif #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) /* this is ugly, but there is no other place where to put it */ conntrack->nat.masq_index = exp->master->nat.masq_index; #endif #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { // 如果没有找到exp = find_expectation(tuple);
conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
CONNTRACK_STAT_INC(new); }
/* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
write_unlock_bh(&ip_conntrack_lock);
if (exp) { if (exp->expectfn) exp->expectfn(conntrack, exp); ip_conntrack_expect_put(exp); }
return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; } int ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig, const struct ip_conntrack_protocol *protocol) { inverse->src.ip = orig->dst.ip; inverse->dst.ip = orig->src.ip; inverse->dst.protonum = orig->dst.protonum; inverse->dst.dir = !orig->dst.dir;
return protocol->invert_tuple(inverse, orig); // 这里根据协议的不同调用各自的函数
}
static int tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct ip_conntrack_tuple *tuple) { struct tcphdr _hdr, *hp;
/* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) return 0;
tuple->src.u.tcp.port = hp->source; // 因为是tcp包,所以填充tuple中tcp的相关信息
tuple->dst.u.tcp.port = hp->dest;
return 1; } struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack;
if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; }
/* We don't want any race condition at early drop stage */ atomic_inc(&ip_conntrack_count);
if (ip_conntrack_max && atomic_read(&ip_conntrack_count) > ip_conntrack_max) { unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ if (!early_drop(&ip_conntrack_hash[hash])) { atomic_dec(&ip_conntrack_count); if (net_ratelimit()) printk(KERN_WARNING "ip_conntrack: table full, dropping" " packet.\n"); return ERR_PTR(-ENOMEM); } }
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); atomic_dec(&ip_conntrack_count); return ERR_PTR(-ENOMEM); }
memset(conntrack, 0, sizeof(*conntrack)); // 设置ip_conntrack
atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout;
return conntrack; } /* If an expectation for this connection is found, it gets delete from * global list then returned. */ static struct ip_conntrack_expect * find_expectation(const struct ip_conntrack_tuple *tuple) 从全局链表ip_conntrack_expect_list里面查找期望连接。 { struct ip_conntrack_expect *i;
list_for_each_entry(i, &ip_conntrack_expect_list, list) { /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) // 设置位掩码比较
&& is_confirmed(i->master)) { if (i->flags & IP_CT_EXPECT_PERMANENT) { atomic_inc(&i->use); return i; } else if (del_timer(&i->timeout)) { ip_ct_unlink_expect(i); return i; } } } return NULL; }
static inline int ip_ct_tuple_mask_cmp(const struct ip_conntrack_tuple *t, const struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *mask) { return !(((t->src.ip ^ tuple->src.ip) & mask->src.ip) || ((t->dst.ip ^ tuple->dst.ip) & mask->dst.ip) || ((t->src.u.all ^ tuple->src.u.all) & mask->src.u.all) || ((t->dst.u.all ^ tuple->dst.u.all) & mask->dst.u.all) || ((t->dst.protonum ^ tuple->dst.protonum) & mask->dst.protonum)); } static struct ip_conntrack_helper * __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } { .hook = ip_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, // INT_MAX - 2,
}, { .hook = ip_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, // INT_MAX - 2,
},
static unsigned int ip_conntrack_help(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo;
/* This is where we call the helper: as the packet goes out. */ ct = ip_conntrack_get(*pskb, &ctinfo); // 找到skb关联的ip_conntrack
if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { unsigned int ret; ret = ct->helper->help(pskb, ct, ctinfo); // 调用各个协议注册的helper的help函数
if (ret != NF_ACCEPT) // 下面以tftp为例(tftp_heaper)来说明help函数
return ret; } return NF_ACCEPT; }
/* Return conntrack_info and tuple hash for given skb. */ static inline struct ip_conntrack * ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { *ctinfo = skb->nfctinfo; return (struct ip_conntrack *)skb->nfct; // skb的nfct成员指向了ip_conntrack。哪里设置的?
} static int tftp_help(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { struct tftphdr _tftph, *tfh; struct ip_conntrack_expect *exp; unsigned int ret = NF_ACCEPT;
tfh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), sizeof(_tftph), &_tftph); if (tfh == NULL) return NF_ACCEPT;
switch (ntohs(tfh->opcode)) { /* RRQ and WRQ works the same way */ case TFTP_OPCODE_READ: case TFTP_OPCODE_WRITE: DEBUGP(""); DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
exp = ip_conntrack_expect_alloc(ct); // 先创建了一个ip_conntrack_expect
if (exp == NULL) return NF_DROP;
exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; //将应答方向的tuple结构附值给exp->tuple
exp->mask.src.ip = 0xffffffff; exp->mask.src.u.udp.port = 0; exp->mask.dst.ip = 0xffffffff; exp->mask.dst.u.udp.port = 0xffff; exp->mask.dst.protonum = 0xff; exp->expectfn = NULL; exp->flags = 0;
DEBUGP("expect: "); DUMP_TUPLE(&exp->tuple); DUMP_TUPLE(&exp->mask); if (ip_nat_tftp_hook) ret = ip_nat_tftp_hook(pskb, ctinfo, exp); else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; ip_conntrack_expect_put(exp); break; case TFTP_OPCODE_DATA: case TFTP_OPCODE_ACK: DEBUGP("Data/ACK opcode\n"); break; case TFTP_OPCODE_ERROR: DEBUGP("Error opcode\n"); break; default: DEBUGP("Unknown opcode\n"); } return NF_ACCEPT; } int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) { struct ip_conntrack_expect *i; int ret;
DEBUGP("ip_conntrack_expect_related %p\n", related_to); DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
write_lock_bh(&ip_conntrack_lock); list_for_each_entry(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { ret = 0; goto out; } } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } }
/* Will be over limit? */ if (expect->master->helper->max_expected && expect->master->expecting >= expect->master->helper->max_expected) evict_oldest_expect(expect->master);
ip_conntrack_expect_insert(expect); ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: write_unlock_bh(&ip_conntrack_lock); return ret; } /* The protocol-specific manipulable parts of the tuple: always in network order! */ union ip_conntrack_manip_proto { /* Add other protocols here. */ u_int16_t all;
struct { __be16 port; } tcp; struct { u_int16_t port; } udp; struct { u_int16_t id; } icmp; struct { u_int16_t port; } sctp; struct { __be16 key; /* key is 32bit, pptp only uses 16 */ } gre; };
/*
 * The manipulable part of the tuple: an IP address plus the
 * protocol-specific field held in union ip_conntrack_manip_proto.
 * Used as the `src' member of struct ip_conntrack_tuple.
 */
struct ip_conntrack_manip
{
	u_int32_t ip;				/* IP address */
	union ip_conntrack_manip_proto u;	/* per-protocol port/id/key */
};
/*
 * This contains the information to distinguish a connection.
 *
 * `src' is the manipulable half; `dst' holds the fixed half together with
 * the protocol number and the direction of the tuple.
 */
struct ip_conntrack_tuple
{
	struct ip_conntrack_manip src;

	/* These are the parts of the tuple which are fixed. */
	struct {
		u_int32_t ip;
		union {
			/* Add other protocols here. */
			u_int16_t all;

			struct {
				u_int16_t port;
			} tcp;
			struct {
				u_int16_t port;
			} udp;
			struct {
				u_int8_t type, code;
			} icmp;
			struct {
				u_int16_t port;
			} sctp;
			struct {
				__be16 key;	/* key is 32bit,
						 * pptp only uses 16 */
			} gre;
		} u;

		/* The protocol. */
		u_int8_t protonum;

		/* The direction (for tuplehash) */
		u_int8_t dir;
	} dst;
};

/* Walkthrough note carried over from the original text: ip_conntrack_in */
/* Call path: ip_conntrack_local -> ip_conntrack_in */
/*
 * Netfilter hook registrations for IPv4 connection tracking.
 *
 * Each entry binds a handler to a hook point at a given priority:
 * defragmentation and tracking on PRE_ROUTING and LOCAL_OUT, then the
 * helper and confirm handlers on POST_ROUTING and LOCAL_IN.
 */
static struct nf_hook_ops ip_conntrack_ops[] = {
	{
		/* Defragment first, before tracking packets in either
		 * direction. */
		.hook = ip_conntrack_defrag,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_PRE_ROUTING,
		.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
	},
	{
		.hook = ip_conntrack_in,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_PRE_ROUTING,
		.priority = NF_IP_PRI_CONNTRACK, /* -200 per the original notes */
	},
	{
		.hook = ip_conntrack_defrag,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_OUT,
		.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
	},
	{
		.hook = ip_conntrack_local,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_OUT,
		.priority = NF_IP_PRI_CONNTRACK,
	},
	{
		.hook = ip_conntrack_help,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_POST_ROUTING,
		.priority = NF_IP_PRI_CONNTRACK_HELPER,
	},
	{
		.hook = ip_conntrack_help,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_IN,
		.priority = NF_IP_PRI_CONNTRACK_HELPER,
	},
	{
		.hook = ip_confirm,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_POST_ROUTING,
		.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
	},
	{
		.hook = ip_confirm,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_IP_LOCAL_IN,
		.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
	},
};
|