一、先介绍第四层协议的注册(net/ipv4/af_inet.c):
通过结构体
net_protocol:
struct net_protocol {
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
int (*gso_send_check)(struct sk_buff *skb);
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
int features);
unsigned int no_policy:1,
netns_ok:1;
};
第四层协议添加不同的回调函数。如TCP协议:
static struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,//IP层通过这个回调函数把数据传到tcp。
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.no_policy = 1,
.netns_ok = 1,
};
然后通过下面一个函数在内核初始化的时候加载。
static int __init inet_init(void)
{...
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
>>>
}
二、ip层交付:
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
.........
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
真正的处理函数在 ip_rcv_finish(), 而ip_rcv一般只做一些检查工作。
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb->dst == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes+=skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
rt = skb->rtable;
if (rt->rt_type == RTN_MULTICAST)
IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
else if (rt->rt_type == RTN_BROADCAST)
IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
然后:
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
struct net *net;
net = dev_net(dev);
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif);
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
if (((rth->fl.fl4_dst ^ daddr) |
(rth->fl.fl4_src ^ saddr) |
(rth->fl.iif ^ iif) |
rth->fl.oif |
(rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
net_eq(dev_net(rth->u.dst.dev), net) &&
rth->rt_genid == atomic_read(&rt_genid)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
skb->rtable = rth;
return 0;
}
RT_CACHE_STAT_INC(in_hlist_search);
}
rcu_read_unlock();
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
network acquires a lot of useless route cache entries, sort of
SDR messages from all the world. Now we try to get rid of them.
Really, provided software IP multicast filter is organized
reasonably (at least, hashed), it does not result in a slowdown
comparing with route cache reject entries.
Note, that multicast routers are not affected, because
route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev;
rcu_read_lock();
if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
int our = ip_check_mc(in_dev, daddr, saddr,
ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
|| (!ipv4_is_local_multicast(daddr) &&
IN_DEV_MFORWARD(in_dev))
#endif
) {
rcu_read_unlock();
return ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
}
}
rcu_read_unlock();
return -EINVAL;
}
return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
....
rth->u.dst.input= ip_local_deliver;
....
}
再后来到
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))//分包重组
return 0;
}
return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
int ret;
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);//这里便是tcp协议注册的时候的处理回调函数
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
数据包的基本流程 ip_rcv -->ip_rcv_finish-->ip_route_input --> ip_route_input_slow,NF_IP_PRE_ROUTING挂在ip_rcv与ip_rcv_finish之间。ip_route_input 决定包的走向,是转发,发往本地,还是丢弃。
ingress IP流量是由ip_rcv_finish处理。这个函数根据路由表来判断将报文送往本地还是丢弃。决策是由ip_route_input函数来完成,它首先检查路由缓存,当缓存查找失败时检查路由表(ip_route_input_slow)。ip_route_input_slow函数对dst->input和dst->ouput函数指针可以创建三对主要的组合:
● 如果报文被转发,函数将dst->input初始化为ip_forward,将dst->output初始化为ip_output。所以dst_input将调用ip_forward,而在ip_forward的结尾处间接调用dst_output,即ip_output。这是图35-3中的case (1)。
● 如果报文被送往本地,函数将dst->input初始化为ip_local_deliver。此时不需要初始化dst->output,但它还是被初始化为ip_rt_error,当被调用时打印出一条错误消息,这样做有助于检测在处理送往本地报文时dst->output是否被错误调用。
● 如果根据路由表得出目的地址不可达,dst->input被初始化为ip_error,这将生成一个ICMP消息,消息类型依赖于路由查找返回的结果。因为ip_error将skb buffer释放,所以不需要初始化dst->output,因为即使在犯错情况下它也不会被调用。
阅读(1568) | 评论(0) | 转发(0) |