内核IP层传输数据到上层-zimang-ChinaUnix博客

行到水穷处坐看云起时

首页　| 　博文目录　| 　关于我

zimang

博客访问： 1825757
博文数量： 306
博客积分： 3133
博客等级：中校
技术积分： 3932
用户组：普通用户
注册时间： 2009-04-19 16:50

文章分类

全部博文（306）

存储（2）

大数据处理（1）
算法（2）
服务器（6）
计算机基础（7）
无线（60）
数据库（1）
web开发（7）
Qt（8）
音视频（5）
C/C++（19）
其他（18）
Linux system（60）
Embeded system（13）
Linux Network（58）
Linux kernel（33）
未分配的博文（7）

文章存档

2018年（7）

2017年（18）

2016年（39）

2015年（35）

2014年（52）

2013年（39）

2012年（22）

2011年（29）

2010年（53）

2009年（12）

我的朋友

相关博文

内核IP层传输数据到上层

分类： LINUX

2009-07-17 12:07:50

一、先介绍第四层协议的注册（net/ipv4/af_inet.c）：
通过结构体 net_protocol：
/*
 * Per-protocol callback table that a layer-4 protocol registers with the
 * IPv4 layer (see inet_add_protocol() calls in inet_init()).
 */
struct net_protocol {
    /* Receive entry point; ip_local_deliver_finish() calls it with the skb. */
    int            (*handler)(struct sk_buff *skb);
    /* Error callback; NOTE(review): caller and meaning of 'info' are not
     * visible in this excerpt - confirm against the ICMP code. */
    void            (*err_handler)(struct sk_buff *skb, u32 info);
    /* GSO hooks; NOTE(review): their callers are not shown here. */
    int            (*gso_send_check)(struct sk_buff *skb);
    struct sk_buff     *(*gso_segment)(struct sk_buff *skb,
                     int features);
    /* no_policy: when clear, ip_local_deliver_finish() runs
     *            xfrm4_policy_check() before calling handler.
     * netns_ok:  when set, handler may be called for namespaces other
     *            than init_net. */
    unsigned int        no_policy:1,
                netns_ok:1;
};

第四层协议添加不同的回调函数。如TCP协议：
/* Callback table TCP registers with the IPv4 layer under IPPROTO_TCP. */
static struct net_protocol tcp_protocol = {
    .handler =    tcp_v4_rcv,// the IP layer hands received data up to TCP through this callback
    .err_handler =    tcp_v4_err,
    .gso_send_check = tcp_v4_gso_send_check,
    .gso_segment =    tcp_tso_segment,
    /* Skip the xfrm policy check in ip_local_deliver_finish(). */
    .no_policy =    1,
    /* Allow delivery in network namespaces other than init_net. */
    .netns_ok =    1,
};
然后通过下面一个函数在内核初始化的时候加载。
/*
 * Excerpt of the IPv4 init routine: registers the UDP and TCP callback
 * tables with the IP layer. The rest of the function is elided in this
 * excerpt.
 */
static int __init inet_init(void)
{...
    /* A failed registration is only logged; no error is returned here. */
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
        printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
>>>
}

二、ip层交付：
/*
 * Entry point for received IPv4 packets (header validation is elided in
 * this excerpt). On success the packet is passed to the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish() as the continuation.
 * The error labels below all end in NET_RX_DROP.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    struct iphdr *iph;
.........

    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

    return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
         ip_rcv_finish);

    /* NOTE(review): the gotos that reach these labels are in the elided part. */
inhdr_error:
    /* Count a header error, then fall through and free the skb. */
    IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}

真正的处理函数在 ip_rcv_finish（），而ip_rcv一般只做一些检查工作。

/*
 * Continuation passed to NF_HOOK() by ip_rcv(). Looks up the input route
 * if the skb has none, handles IP options, updates multicast/broadcast
 * counters and hands the packet to dst_input().
 *
 * Returns the result of dst_input(), or NET_RX_DROP after freeing the skb
 * when routing or option processing fails.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    /*
     *    Initialise the virtual path cache for the packet. It describes
     *    how the packet travels inside Linux networking.
     */
    if (skb->dst == NULL) {
        int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
                     skb->dev);
        if (unlikely(err)) {
            /* Only these two errors bump a counter; any error drops. */
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
            goto drop;
        }
    }

#ifdef CONFIG_NET_CLS_ROUTE
    /*
     * Per-CPU accounting keyed by the route's tclassid: the low byte
     * indexes the "o_" counters, bits 16-23 index the "i_" counters.
     */
    if (unlikely(skb->dst->tclassid)) {
        struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
        u32 idx = skb->dst->tclassid;
        st[idx&0xFF].o_packets++;
        st[idx&0xFF].o_bytes+=skb->len;
        st[(idx>>16)&0xFF].i_packets++;
        st[(idx>>16)&0xFF].i_bytes+=skb->len;
    }
#endif

    /* ihl > 5 means the header is longer than 20 bytes, i.e. has options. */
    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    /* NOTE(review): skb->rtable is set by ip_route_input(); presumably it
     * aliases skb->dst - confirm in struct sk_buff. */
    rt = skb->rtable;
    if (rt->rt_type == RTN_MULTICAST)
        IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
    else if (rt->rt_type == RTN_BROADCAST)
        IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);

    /* Presumably dispatches through the route's input handler
     * (e.g. ip_local_deliver set by ip_route_input_slow()). */
    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}

然后：
/*
 * Find the input route for a received packet.
 *
 * First searches the route cache hash chain; on a hit the entry is stored
 * in skb->rtable and 0 is returned. On a miss, multicast destinations are
 * handled by ip_route_input_mc() (or rejected with -EINVAL), and all other
 * destinations go to ip_route_input_slow().
 */
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
         u8 tos, struct net_device *dev)
{
    struct rtable * rth;
    unsigned    hash;
    int iif = dev->ifindex;
    struct net *net;

    net = dev_net(dev);
    tos &= IPTOS_RT_MASK;
    hash = rt_hash(daddr, saddr, iif);

    /* Walk the hash chain under RCU read-side protection. */
    rcu_read_lock();
    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
     rth = rcu_dereference(rth->u.dst.rt_next)) {
        /*
         * The OR of the XORs is zero only when dst, src, iif and tos
         * all match and the entry's oif is 0. Mark, namespace and
         * generation id must match as well.
         */
        if (((rth->fl.fl4_dst ^ daddr) |
         (rth->fl.fl4_src ^ saddr) |
         (rth->fl.iif ^ iif) |
         rth->fl.oif |
         (rth->fl.fl4_tos ^ tos)) == 0 &&
         rth->fl.mark == skb->mark &&
         net_eq(dev_net(rth->u.dst.dev), net) &&
         rth->rt_genid == atomic_read(&rt_genid)) {
            /* Cache hit: record the use, attach the route and return. */
            dst_use(&rth->u.dst, jiffies);
            RT_CACHE_STAT_INC(in_hit);
            rcu_read_unlock();
            skb->rtable = rth;
            return 0;
        }
        RT_CACHE_STAT_INC(in_hlist_search);
    }
    rcu_read_unlock();

    /* Multicast recognition logic is moved from route cache to here.
     The problem was that too many Ethernet cards have broken/missing
     hardware multicast filters :-( As result the host on multicasting
     network acquires a lot of useless route cache entries, sort of
     SDR messages from all the world. Now we try to get rid of them.
     Really, provided software IP multicast filter is organized
     reasonably (at least, hashed), it does not result in a slowdown
     comparing with route cache reject entries.
     Note, that multicast routers are not affected, because
     route cache entry is created eventually.
     */
    if (ipv4_is_multicast(daddr)) {
        struct in_device *in_dev;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
            int our = ip_check_mc(in_dev, daddr, saddr,
                ip_hdr(skb)->protocol);
            /* Accept if ip_check_mc() says it is ours, or (with
             * CONFIG_IP_MROUTE) if it is non-local multicast and the
             * device has multicast forwarding enabled. */
            if (our
#ifdef CONFIG_IP_MROUTE
             || (!ipv4_is_local_multicast(daddr) &&
                IN_DEV_MFORWARD(in_dev))
#endif
             ) {
                rcu_read_unlock();
                return ip_route_input_mc(skb, daddr, saddr,
                             tos, dev, our);
            }
        }
        rcu_read_unlock();
        /* Multicast that is neither ours nor forwarded is rejected. */
        return -EINVAL;
    }
    /* Cache miss for a non-multicast destination: do the full lookup. */
    return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

/*
 * Slow-path input route lookup, called by ip_route_input() on a cache
 * miss. Only the line relevant to local delivery is shown; the rest of
 * the function is elided in this excerpt.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
             u8 tos, struct net_device *dev)
{
....
    /* Set ip_local_deliver as the route's input handler.
     * NOTE(review): the condition selecting this branch is elided. */
    rth->u.dst.input= ip_local_deliver;
....
}
再后来到
/*
 * Deliver an IP packet to the local host.
 *
 * Fragments are first handed to ip_defrag(); if it returns non-zero the
 * function returns 0 without going any further. Otherwise the packet is
 * passed to the NF_INET_LOCAL_IN hook with ip_local_deliver_finish() as
 * the continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /* The packet is a fragment if MF is set or the fragment offset is non-zero. */
    int needs_reassembly =
        (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) != 0;

    /* Reassemble fragments; stop here while ip_defrag() reports non-zero. */
    if (needs_reassembly && ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
        return 0;

    return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
         ip_local_deliver_finish);
}

/*
 * Continuation passed to NF_HOOK() by ip_local_deliver(). Strips the IP
 * header, then dispatches the packet to the layer-4 handler registered in
 * inet_protos[] for its protocol number. Always returns 0.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);

    __skb_pull(skb, ip_hdrlen(skb));

    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);

    rcu_read_lock();
    {
        int protocol = ip_hdr(skb)->protocol;
        int hash, raw;
        struct net_protocol *ipprot;

    resubmit:
        /* NOTE(review): 'raw' is presumably non-zero when a raw socket
         * took the packet - confirm in raw_local_deliver(). */
        raw = raw_local_deliver(skb, protocol);

        /* Look up the registered handler table for this protocol. */
        hash = protocol & (MAX_INET_PROTOS - 1);
        ipprot = rcu_dereference(inet_protos[hash]);
        /* Outside init_net, only protocols flagged netns_ok are used. */
        if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) {
            int ret;

            if (!ipprot->no_policy) {
                /* Policy check failed: free the skb and leave. */
                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    kfree_skb(skb);
                    goto out;
                }
                nf_reset(skb);
            }
            ret = ipprot->handler(skb);// this is the receive callback supplied when the protocol (e.g. TCP) was registered
            /* A negative return is treated as a request to re-dispatch
             * the skb as protocol number -ret. */
            if (ret < 0) {
                protocol = -ret;
                goto resubmit;
            }
            IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
        } else {
            /* No usable handler for this protocol. */
            if (!raw) {
                /* raw is 0 as well: count an unknown protocol and send
                 * ICMP protocol-unreachable, if policy allows. */
                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
                    icmp_send(skb, ICMP_DEST_UNREACH,
                         ICMP_PROT_UNREACH, 0);
                }
            } else
                IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
            kfree_skb(skb);
        }
    }
out:
    rcu_read_unlock();

    return 0;
}

数据包的基本流程：ip_rcv --> ip_rcv_finish --> ip_route_input --> ip_route_input_slow。NF_INET_PRE_ROUTING 钩子挂在 ip_rcv 与 ip_rcv_finish 之间。ip_route_input 决定包的走向：是转发、发往本地，还是丢弃。

ingress IP流量由ip_rcv_finish处理。这个函数根据路由表来判断将报文送往本地、转发还是丢弃。决策由ip_route_input函数来完成，它首先检查路由缓存，当缓存查找失败时再检查路由表（ip_route_input_slow）。ip_route_input_slow函数对dst->input和dst->output函数指针可以创建三对主要的组合：

● 如果报文被转发，函数将dst->input初始化为ip_forward，将dst->output初始化为ip_output。所以dst_input将调用ip_forward，而在ip_forward的结尾处间接调用dst_output，即ip_output。这是图35-3中的case (1)。

● 如果报文被送往本地，函数将dst->input初始化为ip_local_deliver。此时不需要初始化dst->output，但它还是被初始化为ip_rt_bug，当被调用时打印出一条错误消息，这样做有助于检测在处理送往本地报文时dst->output是否被错误调用。

● 如果根据路由表得出目的地址不可达，dst->input被初始化为ip_error，这将生成一个ICMP消息，消息类型依赖于路由查找返回的结果。由于ip_error会释放skb缓冲区，因此不需要初始化dst->output——即使在出错的情况下它也不会被调用。

阅读(1597) | 评论(0) | 转发(0) |

上一篇：解决libevent 例子程序运行错误

下一篇：解决Fedora10下安装vmtools gcc版本不对问题

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6