在前一篇博文中,分析了数据包在IP层接收过程,如果是发给本机,最终数据包会送给L4来处理。下面以UDP协议为例来分析L4的处理过程:
1. udp_rcv是封装函数,直接调用__udp4_lib_rcv函数来处理,那么我们来看看这个函数:
-
/*
-
* All we need to do is get the socket, and then do a checksum.
-
*/
-
-
int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
-
int is_udplite)
-
{
-
struct sock *sk; //这个结构体很重要,它和socket结构体相关联,也就是说根据一个就可以得到另一个
-
struct udphdr *uh = skb->h.uh; //从skb结构体中取得源端口号和目的端口号
-
unsigned short ulen;
-
struct rtable *rt = (struct rtable*)skb->dst;
-
__be32 saddr = skb->nh.iph->saddr; //从skb结构体中取得源IP地址和目的IP地址
-
__be32 daddr = skb->nh.iph->daddr;
-
-
/*
-
* Validate the packet.
-
*/
-
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
-
goto drop; /* No space for header. */
-
-
ulen = ntohs(uh->len);
-
if (ulen > skb->len)
-
goto short_packet;
-
-
if(! is_udplite ) { /* UDP validates ulen. */
-
-
if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
-
goto short_packet;
-
uh = skb->h.uh;
-
-
udp4_csum_init(skb, uh);
-
-
} else { /* UDP-Lite validates cscov. */
-
if (udplite4_csum_init(skb, uh))
-
goto csum_error;
-
}
-
-
if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) //如果是L3广播或组播报文,进入相应的处理
-
return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable);
-
-
sk = __udp4_lib_lookup(saddr, uh->source, daddr, uh->dest,
-
skb->dev->ifindex, udptable ); //这是这个函数所做的主要工作之一:根据目的端口号,找到应用层创建的socket
-
-
if (sk != NULL) {
-
int ret = udp_queue_rcv_skb(sk, skb); //如果找到这个socket,就把skb挂入到此socket的接收队列中
-
sock_put(sk);
-
-
/* a return value > 0 means to resubmit the input, but
-
* it wants the return to be -protocol, or 0
-
*/
-
if (ret > 0) //在这里这个数据包从网卡芯片往协议栈送的过程就算结束了
-
return -ret;
-
return 0;
-
}
-
-
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
-
goto drop;
-
nf_reset(skb);
-
-
/* No socket. Drop packet silently, if checksum is wrong */
-
if (udp_lib_checksum_complete(skb))
-
goto csum_error;
-
-
UDP_INC_STATS_BH(UDP_MIB_NOPORTS, is_udplite);
-
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); //如果挂入失败的话,就给源主机发送目标不可达ICMP报文
-
-
/*
-
* Hmm. We got an UDP packet to a port to which we
-
* don't wanna listen. Ignore it.
-
*/
-
kfree_skb(skb); //释放掉,此skb,over...
-
return(0);
-
-
short_packet:
-
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
-
is_udplite? "-Lite" : "",
-
NIPQUAD(saddr),
-
ntohs(uh->source),
-
ulen,
-
skb->len,
-
NIPQUAD(daddr),
-
ntohs(uh->dest));
-
goto drop;
-
-
csum_error:
-
/*
-
* RFC1122: OK. Discards the bad packet silently (as far as
-
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
-
*/
-
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
-
is_udplite? "-Lite" : "",
-
NIPQUAD(saddr),
-
ntohs(uh->source),
-
NIPQUAD(daddr),
-
ntohs(uh->dest),
-
ulen);
-
drop:
-
UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
-
kfree_skb(skb);
-
return(0);
-
}
一百多行啊,呵呵!
函数开头的注释正确地归纳了这个函数的处理过程:取得相应的socket,并做校验和等检查。其实还应该加一句:把skb挂入该socket的接收队列中。
2. 从上面的代码可以看出,此过程比较简单。分析一下根据端口号找socket的过程和将skb挂入socket接收队列的过程:
-
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
-
* harder than this. -DaveM
-
*/
-
static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
-
__be32 daddr, __be16 dport,
-
int dif, struct hlist_head udptable[])
-
{
-
struct sock *sk, *result = NULL;
-
struct hlist_node *node;
-
unsigned short hnum = ntohs(dport);
-
int badness = -1;
-
-
read_lock(&udp_hash_lock); //这个过程得加锁
-
sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { //udptable这个哈希数组在bind绑定端口号的时候已经构建好了,在这里就是用端口号来
-
//从hlist链表中取得sock结构
-
struct inet_sock *inet = inet_sk(sk);
-
-
if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { //找到了bind了相同端口号的socket
-
int score = (sk->sk_family == PF_INET ? 1 : 0);
-
if (inet->rcv_saddr) { //在bind的时候绑定了自己本身的IP地址,判断对端发送数据包中的目的IP地址是否和自己匹配
-
if (inet->rcv_saddr != daddr)
-
continue;
-
score+=2;
-
}
-
if (inet->daddr) {
-
if (inet->daddr != saddr) //看socket端的目的地址和数据包的源地址
-
continue;
-
score+=2;
-
}
-
if (inet->dport) {
-
if (inet->dport != sport) //看socket端的目的端口和数据包的源端口
-
continue;
-
score+=2;
-
}
-
if (sk->sk_bound_dev_if) { //看绑定的接口 ?
-
if (sk->sk_bound_dev_if != dif)
-
continue;
-
score+=2;
-
}
-
if(score == 9) {
-
result = sk;
-
break;
-
} else if(score > badness) {
-
result = sk;
-
badness = score;
-
}
-
}
-
}
-
if (result)
-
sock_hold(result);
-
read_unlock(&udp_hash_lock);
-
return result;
-
}
从上面的代码中我们看到:通过数据包的目的端口号,从udp_hash中找到bind了相同端口号的socket,然后用score计分,从中选出一个最佳的socket(呵呵)。如果分值一样,由于只有score > badness时才会更新result,所以保留的是哈希链表中先遍历到的那个socket;而新bind的socket是插在链表头部的,因此取得的就是最后bind的socket。
现在我们知道了,可以建立多个socket来bind相同的端口号(但是得用setsockopt设置socket属性为SO_REUSEADDR,否则会bind失败)。如果这些socket属性一样(score分值一样),那么只有最后bind的socket有效,也就是说接收到的数据包会传给这个socket,其他socket接收不到skb。
-
int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
-
{
-
struct udp_sock *up = udp_sk(sk);
-
int rc;
-
-
/*
-
* Charge it to the socket, dropping if the queue is full.
-
*/
-
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
-
goto drop;
-
nf_reset(skb);
-
-
if (up->encap_type) {
-
/*
-
* This is an encapsulation socket, so let's see if this is
-
* an encapsulated packet.
-
* If it's a keepalive packet, then just eat it.
-
* If it's an encapsulateed packet, then pass it to the
-
* IPsec xfrm input and return the response
-
* appropriately. Otherwise, just fall through and
-
* pass this up the UDP socket.
-
*/
-
int ret;
-
-
ret = udp_encap_rcv(sk, skb);
-
if (ret == 0) {
-
/* Eat the packet .. */
-
kfree_skb(skb);
-
return 0;
-
}
-
if (ret < 0) {
-
/* process the ESP packet */
-
ret = xfrm4_rcv_encap(skb, up->encap_type);
-
UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
-
return -ret;
-
}
-
/* FALLTHROUGH -- it's a UDP Packet */
-
}
-
-
/*
-
* UDP-Lite specific tests, ignored on UDP sockets
-
*/
-
if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
-
-
/*
-
* MIB statistics other than incrementing the error count are
-
* disabled for the following two types of errors: these depend
-
* on the application settings, not on the functioning of the
-
* protocol stack as such.
-
*
-
* RFC 3828 here recommends (sec 3.3): "There should also be a
-
* way ... to ... at least let the receiving application block
-
* delivery of packets with coverage values less than a value
-
* provided by the application."
-
*/
-
if (up->pcrlen == 0) { /* full coverage was set */
-
LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
-
"%d while full coverage %d requested\n",
-
UDP_SKB_CB(skb)->cscov, skb->len);
-
goto drop;
-
}
-
/* The next case involves violating the min. coverage requested
-
* by the receiver. This is subtle: if receiver wants x and x is
-
* greater than the buffersize/MTU then receiver will complain
-
* that it wants x while sender emits packets of smaller size y.
-
* Therefore the above ...()->partial_cov statement is essential.
-
*/
-
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
-
LIMIT_NETDEBUG(KERN_WARNING
-
"UDPLITE: coverage %d too small, need min %d\n",
-
UDP_SKB_CB(skb)->cscov, up->pcrlen);
-
goto drop;
-
}
-
}
-
-
if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
-
if (__udp_lib_checksum_complete(skb))
-
goto drop;
-
skb->ip_summed = CHECKSUM_UNNECESSARY;
-
}
-
-
if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { //在这里挂入的,这个函数里面有文章...
-
/* Note that an ENOMEM error is charged twice */
-
if (rc == -ENOMEM)
-
UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
-
goto drop;
-
}
-
-
UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
-
return 0;
-
-
drop:
-
UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
-
kfree_skb(skb);
-
return -1;
-
}
-
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
-
{
-
int err = 0;
-
int skb_len;
-
-
/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
-
number of warnings when compiling with -W --ANK
-
*/
-
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= //sk_rmem_alloc是对接收的skb大小的累加和,当接收到skb时,sk_rmem_alloc增加,当从队列中取出并释放skb时,sk_rmem_alloc减少
-
(unsigned)sk->sk_rcvbuf) { //sk_rcvbuf 这个是接收缓冲区的大小,我们可以通过setsockopt进行设置。我们看到当从接收队列取包的速度小于接收到包的时候,我们
-
//适当增加sk_rcvbuf这个缓冲区的大小就一定程度上减少丢包。
-
-
err = -ENOMEM;
-
goto out;
-
}
-
-
err = sk_filter(sk, skb);
-
if (err)
-
goto out;
-
-
skb->dev = NULL;
-
skb_set_owner_r(skb, sk); //这个函数是对sk_rmem_alloc字段的操作
-
-
/* Cache the SKB length before we tack it onto the receive
-
* queue. Once it is added it no longer belongs to us and
-
* may be freed by other threads of control pulling packets
-
* from the queue.
-
*/
-
skb_len = skb->len;
-
-
skb_queue_tail(&sk->sk_receive_queue, skb); //把skb挂入到sk_receive_queue中
-
-
if (!sock_flag(sk, SOCK_DEAD))
-
sk->sk_data_ready(sk, skb_len);
-
out:
-
return err;
-
}
-
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
-
{
-
skb->sk = sk;
-
skb->destructor = sock_rfree;
-
atomic_add(skb->truesize, &sk->sk_rmem_alloc); //还是个原子操作
-
}
-
-
-
-
void sock_rfree(struct sk_buff *skb) //这个是在free skb的时候调用的
-
{
-
struct sock *sk = skb->sk;
-
-
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
-
}
阅读(2092) | 评论(0) | 转发(1) |