首先弹出IP报头然后复位transport_header的位置
结构图如下
然后进入到raw_local_deliver中
raw_local_deliver在/net/ipv4/raw.c中
int raw_local_deliver(struct sk_buff *skb, int protocol) { int hash; struct sock *raw_sk;
//计算协议类型的哈希值 hash = protocol & (RAW_HTABLE_SIZE - 1); //取得对应的sock raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); /* If there maybe a raw socket we must check - if not we * don't care less */ //检测sock是否为空 //将skb发送到上层处理 if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) raw_sk = NULL; return raw_sk != NULL; }
|
还记得raw_v4_hashinfo么?~ 回顾一下最前面sock的结构图吧 = 3=)/
继续大步往上层走,来到raw_v4_input
raw_v4_input在/net/ipv4/raw.c中
static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; int delivered = 0; struct net *net;
//锁上raw_v4_hashinfo read_lock(&raw_v4_hashinfo.lock); //取得队列元素 head = &raw_v4_hashinfo.ht[hash]; //检测队列元素是否为空 if (hlist_empty(head)) goto out;
net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); while (sk) { delivered = 1; //检测协议是否为ICMP if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { //克隆一个skb struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ //检测克隆是否成功 if (clone) //成功则将sk和skb传递到上一层 raw_rcv(sk, clone); } //寻找下一个匹配的sock sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); } out: read_unlock(&raw_v4_hashinfo.lock); return delivered; }
|
__raw_v4_lookup负责匹配sock
__raw_v4_lookup在/net/ipv4/raw.c中
/*
 * __raw_v4_lookup - find the next socket, starting at @sk, that matches.
 * @net:   namespace the socket must belong to
 * @sk:    list entry to start from (may be NULL per the caller's loop)
 * @num:   compared with inet->num; raw_v4_input() passes the IP protocol
 *         number here
 * @raddr: compared with the socket's daddr when that field is nonzero
 * @laddr: compared with the socket's rcv_saddr when that field is nonzero
 * @dif:   compared with sk_bound_dev_if when that field is nonzero;
 *         raw_v4_input() passes skb->dev->ifindex
 *
 * A zero daddr / rcv_saddr / sk_bound_dev_if on the socket acts as a
 * wildcard. Returns the matching socket, or NULL when the list is
 * exhausted.
 */
static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
		unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
	struct hlist_node *node;

	/* Walk the list from @sk onward. */
	sk_for_each_from(sk, node) {
		struct inet_sock *inet = inet_sk(sk);

		/* All of: same namespace, same num, and each of the three
		 * optional socket fields either unset or equal to the
		 * corresponding argument.
		 */
		if (net_eq(sock_net(sk), net) && inet->num == num &&
		    !(inet->daddr && inet->daddr != raddr) &&
		    !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
			goto found; /* gotcha */
	}
	sk = NULL;
found:
	return sk;
}
|
我不明白sk_bound_dev_if这个参数的用途............ 请大家赐教 T ^T
回到raw_v4_input中
现在要克隆skb, skb_clone负责这个任务
skb_clone在/net/core/skbuff.c中
/*
 * skb_clone - obtain a second sk_buff header for @skb and fill it in via
 * __skb_clone().
 * @skb:      buffer to clone
 * @gfp_mask: allocation flags used if a new header must be allocated
 *
 * Returns the clone, or NULL if a header had to be allocated and the
 * allocation failed.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n;

	/* Candidate clone header: the sk_buff slot directly after @skb.
	 * It is only inspected when skb->fclone == SKB_FCLONE_ORIG.
	 * NOTE(review): presumably ORIG buffers are allocated as a pair
	 * followed by a reference counter - confirm against the allocator.
	 */
	n = skb + 1;
	if (skb->fclone == SKB_FCLONE_ORIG &&
	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
		/* The atomic counter sits right after the second slot. */
		atomic_t *fclone_ref = (atomic_t *) (n + 1);
		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(fclone_ref);
	} else {
		/* Otherwise take a fresh header from the slab cache. */
		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	/* Copy the header fields across. */
	return __skb_clone(n, skb);
}
|
我们在发送ICMP中所申请的skb是没有克隆标志的,所以这里会进入else中,从缓冲区中分配一个新的skb
__skb_clone执行具体的拷贝任务
__skb_clone在/net/core/skbuff.c中
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x
//初始化队列指针 n->next = n->prev = NULL; //初始化sock指针 n->sk = NULL; //拷贝所有信息层的信息 __copy_skb_header(n, skb); C(len); C(data_len); C(mac_len); n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; //初始化回收函数 n->destructor = NULL; C(iif); C(tail); C(end); C(head); C(data); C(truesize); atomic_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; return n; #undef C }
|
__skb_clone主要拷贝数据方面的内容,各种数据指针和数据长度
回到raw_v4_input,克隆成功后便进入到raw_rcv
raw_rcv在/net/ipv4/raw.c中
int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } nf_reset(skb); //将skb的data指针指向网络层头部 skb_push(skb, skb->data - skb_network_header(skb)); raw_rcv_skb(sk, skb); return 0; }
|
主要是完成 skb_push(skb, skb->data - skb_network_header(skb))这个任务
执行完成后的结构图如下
然后到raw_rcv_skb
raw_rcv_skb在/net/ipv4/raw.c中
static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ //发送sk和skb到上一层 if (sock_queue_rcv_skb(sk, skb) < 0) { //增加发送失败计数器 atomic_inc(&sk->sk_drops); //释放skb kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; }
|
很简单,调用sock_queue_rcv_skb
sock_queue_rcv_skb在/net/core/sock.c中
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err = 0; int skb_len;
/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK */ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) { err = -ENOMEM; goto out; } err = sk_filter(sk, skb); if (err) goto out; if (!sk_rmem_schedule(sk, skb->truesize)) { err = -ENOBUFS; goto out; } skb->dev = NULL; //关联skb与sk skb_set_owner_r(skb, sk); /* Cache the SKB length before we tack it onto the receive * queue. Once it is added it no longer belongs to us and * may be freed by other threads of control pulling packets * from the queue. */ //设置数据长度 skb_len = skb->len; //把该skb添加到sock的接受队列上 skb_queue_tail(&sk->sk_receive_queue, skb); //检测sock是否处于死亡状态 if (!sock_flag(sk, SOCK_DEAD)) //提交sock到上一层 sk->sk_data_ready(sk, skb_len); out: return err; }
|
sk_filter和sk_rmem_schedule的内容不是很明白 T ^T 继续请大家指教
把skb挂接到sock的sk_receive_queue队列之后就调用sk->sk_data_ready
sk->sk_data_ready为sock_def_readable
sock_def_readable在/net/core/sock.c中
static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) //唤醒sock的sk_sleep wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&sk->sk_callback_lock); }
|
唤醒sk_sleep!!! 终于在这里唤醒了读取啊~ 如果这时候sk_sleep在睡眠的话就会被唤醒,从而拿到需要的数据
raw这边走完了,别急,还有ICMP那边呢
回到ip_local_deliver_finish中
继续往下走,来到ipprot = rcu_dereference(inet_protos[hash]),这里会根据哈希值拿到协议
我们当然是要ICMP协议的结构icmp_protocol了
icmp_protocol的结构如下
/*
 * Protocol descriptor for ICMP: its handler is icmp_rcv(), and the
 * no_policy and netns_ok flags are both set.
 */
static struct net_protocol icmp_protocol = {
	.handler	= icmp_rcv,
	.no_policy	= 1,
	.netns_ok	= 1,
};
|
继续往下走,来到ret = ipprot->handler(skb),在这里运行协议的handler函数,也就是icmp_rcv
icmp_rcv在/net/ipv4/icmp.c中
int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb->rtable;
//检测安全 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { int nh; if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & XFRM_STATE_ICMP)) goto drop; if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) goto drop; nh = skb_network_offset(skb); skb_set_network_header(skb, sizeof(*icmph)); if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) goto drop;
skb_set_network_header(skb, nh); } //增加ICMP包累积计数器 ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); //检测效验和模式 switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (!csum_fold(skb->csum)) break; /* fall through */ case CHECKSUM_NONE: skb->csum = 0; if (__skb_checksum_complete(skb)) goto error; } //检测数据空间是否满足icmph包的大小 //满足则推出ICMP包结构 if (!pskb_pull(skb, sizeof(*icmph))) goto error; //取得icmp结构 icmph = icmp_hdr(skb); //增加icmp包的类型的计数器 ICMPMSGIN_INC_STATS_BH(icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. */ //检测类型是否超出范围 if (icmph->type > NR_ICMP_TYPES) goto error; /* * Parse the ICMP message */ //检测是否为广播或者多播 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { struct net *net; net = dev_net(rt->u.dst.dev); /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently * discarded if to broadcast/multicast. */ if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && net->ipv4.sysctl_icmp_echo_ignore_broadcasts) { goto error; } if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) { goto error; } } //递交skb给相应的icmp包类型处理函数 icmp_pointers[icmph->type].handler(skb); drop: kfree_skb(skb); return 0; error: ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); goto drop; }
|
xfrm4_policy_check是安全检测模块的,跳过
先看一下我们现在skb的结构,虽然在raw处理中改变了结构,不过呢个是克隆体,不会影响到我们这边的skb
然后执行pskb_pull(skb, sizeof(*icmph)),弹出ICMP报头
执行完后数据结构如下
为什么这里data不和tail重合呢? 是因为我们在发送的时候把数据大小设置成了64个字节,超过了ICMP包的大小,所以这里是不会到尾端的
到最后的icmp_pointers[icmph->type].handler(skb),我们这里ICMP包类型为8,请求回显,呢么就是到icmp_echo中
icmp_echo在/net/ipv4/icmp.c中
static void icmp_echo(struct sk_buff *skb) { struct net *net;
net = dev_net(skb->dst->dev); //检测是否忽略回显 if (!net->ipv4.sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; //复制icmp包信息 icmp_param.data.icmph = *icmp_hdr(skb); //设置icmp包为回显应答 icmp_param.data.icmph.type = ICMP_ECHOREPLY; icmp_param.skb = skb; icmp_param.offset = 0; icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); //发送icmp包 icmp_reply(&icmp_param, skb); } }
|
设置完后来到icmp_reply
icmp_reply在/net/ipv4/icmp.c中
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; struct rtable *rt = skb->rtable; struct net *net = dev_net(rt->u.dst.dev); struct sock *sk = icmp_sk(net); struct inet_sock *inet = inet_sk(sk); __be32 daddr;
if (ip_options_echo(&icmp_param->replyopts, skb)) return; if (icmp_xmit_lock(sk)) return; //初始化效验和 icmp_param->data.icmph.checksum = 0; //设置服务类型 inet->tos = ip_hdr(skb)->tos; //设置发送地址 daddr = ipc.addr = rt->rt_src; //初始化ip_options为NULL ipc.opt = NULL; //检测是否有ip_options选项 if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; } { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = rt->rt_spec_dst, .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); //查找路由 if (ip_route_output_key(net, &rt, &fl)) goto out_unlock; } if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) //发送icmp包 icmp_push_reply(icmp_param, &ipc, rt); //释放路由结构 ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); }
|
继续来到icmp_push_reply
icmp_push_reply在/net/ipv4/icmp.c中
static void icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt) { struct sock *sk; struct sk_buff *skb;
//取得sock结构 sk = icmp_sk(dev_net(rt->u.dst.dev)); //复制数据到skb中 if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) //清空sk下的所有skb ip_flush_pending_frames(sk); //检测发送队列是否为空 else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; //历遍所有skb skb_queue_walk(&sk->sk_write_queue, skb1) { //计算累积效验和 csum = csum_add(csum, skb1->csum); } csum = csum_partial_copy_nocheck((void *)&icmp_param->data, (char *)icmph, icmp_param->head_len, csum);
//设置效验和 icmph->checksum = csum_fold(csum); //设置效验和模式 skb->ip_summed = CHECKSUM_NONE; //发送skb ip_push_pending_frames(sk); } }
|
ip_push_pending_frames,终于把ICMP包发送出去了,然后icmp_rcv又收到了一个ICMP包
不过这次的包类型为0,是回显应答
而类型0的处理函数是icmp_discard
icmp_discard里面是个空函数,什么都不干,到此ICMP包的发送就完成了
大家有没有注意到一个问题呢,就是ICMP处理函数收到ICMP包的话,RAW也会收到ICMP包
呢么在一次PING本机中会有2个ICMP包,一个是请求回显,一个是回显应答,呢么RAW层也会收到2个ICMP包
所以大家在写PING程序的时候一定不要忘记判断ICMP包的类型啊
如果不判断的话,则PING本机收到的第一个ICMP包一定是自己发出去的类型8的请求回显包,而不是类型0的回显应答包
好,现在让我们返回到__skb_recv_datagram中
现在收到了skb后就会返回到raw_recvmsg中拷贝数据
数据的拷贝由skb_copy_datagram_iovec来完成
skb_copy_datagram_iovec在/net/core/datagram.c中
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, struct iovec *to, int len) { int start = skb_headlen(skb);
int i, copy = start - offset; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; //拷贝数据 if (memcpy_toiovec(to, skb->data + offset, copy)) goto fault; //检测是否有剩余数据未拷贝 if ((len -= copy) == 0) return 0; offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; BUG_TRAP(start <= offset + len); end = start + skb_shinfo(skb)->frags[i].size; if ((copy = end - offset) > 0) { int err; u8 *vaddr; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = frag->page; if (copy > len) copy = len; vaddr = kmap(page); err = memcpy_toiovec(to, vaddr + frag->page_offset + offset - start, copy); kunmap(page); if (err) goto fault; if (!(len -= copy)) return 0; offset += copy; } start = end; } if (skb_shinfo(skb)->frag_list) { struct sk_buff *list = skb_shinfo(skb)->frag_list; for (; list; list = list->next) { int end; BUG_TRAP(start <= offset + len); end = start + list->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_datagram_iovec(list, offset - start, to, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } } if (!len) return 0; fault: return -EFAULT; }
|
我们拷贝的数据一次完成,大小刚好,所以不会到下面的for循环和if中的
然后是memcpy_toiovec
memcpy_toiovec在/net/core/iovec.c中
int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) { //检测是否还有剩余数据未拷贝 while (len > 0) { if (iov->iov_len) { //取小的为准 int copy = min_t(unsigned int, iov->iov_len, len); //拷贝数据 if (copy_to_user(iov->iov_base, kdata, copy)) return -EFAULT; //增加数据量计数器 kdata += copy; //减少剩余数据量 len -= copy; //减少用户空间数据缓冲量 iov->iov_len -= copy; //增加地址 iov->iov_base += copy; } //移动到下一个iovec结构 iov++; } return 0; }
|
这里的拷贝也是一次完成,到这里~ 所有的4个部分就都完成了
用户层也终于拿到了数据,不过请注意哈,这个数据是包含IP层的,所以在PING程序中分析收到的ICMP包前,一定要先取得IP层数据的大小,跳过IP层才能拿到ICMP包数据的起始地址
笔记就到这里了,不知道大家是否也对TCP/IP协议栈有了一份自己的理解呢?~ = 3=)/