出没于杭州和青岛的程序猿一枚,对内核略懂一二
分类: LINUX
2013-08-16 13:13:47
Poll Internals
Author: Tony
Date:2013年8月17日星期六
根据man手册上的说明“POLLIN There is data to read.”,那么是不是说recvmsg/read一定能读取到数据呢?
在同事的一个udp程序中出现了POLLIN,但read的时候返回EAGAIN,同时之前的代码中block的socket没有出现过类似的问题。内核bug???
我和我的小伙伴们都迷惑了。。。
poll系统调用的流程:
fs/select.c: SYSCALL_DEFINE3(poll)-->do_sys_poll-->do_poll-->do_pollfd-->file->f_op->poll
net/socket.c:定义socket的文件操作struct file_operations socket_file_ops,poll对应于函数sock_poll
sock_poll-->sock->ops->poll
net/ipv4/af_inet.c:定义struct proto_ops inet_dgram_ops, poll对应于函数udp_poll
udp_poll的实现
/**
 * udp_poll - poll handler for UDP sockets
 * @file: open file backing the socket; its f_flags distinguish blocking
 *        from non-blocking mode
 * @sock: socket being polled
 * @wait: poll table, handed through to datagram_poll()
 *
 * Behaves like datagram_poll() with one correction for blocking sockets.
 * An application using a blocking fd could be told by select/poll that
 * data is available while the only queued packet has a checksum error,
 * and would then block in the following read. To work around such
 * (arguably broken) applications, readability is withdrawn when
 * first_packet_length() reports 0.
 *
 * Return: the event mask from datagram_poll(), possibly with POLLIN and
 * POLLRDNORM cleared.
 */
unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	/* Generic datagram event mask for this socket. */
	unsigned int events = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;

	/*
	 * Check for false positives due to checksum errors.
	 *
	 * The check only applies to a blocking socket that looks readable
	 * and whose receive side has not been shut down. The guards below
	 * keep the original evaluation order, so first_packet_length() is
	 * called only when all of them pass.
	 */
	if (!(events & POLLRDNORM))
		return events;
	if (file->f_flags & O_NONBLOCK)
		return events;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		return events;

	/*
	 * Note from the article: a datagram with an empty payload still
	 * carries the 8-byte UDP header, so first_packet_length() returns 8
	 * for it. poll therefore still reports POLLIN, and recvmsg returns 0
	 * with errno == 0.
	 */
	if (first_packet_length(sk) == 0)
		events &= ~(POLLIN | POLLRDNORM);

	return events;
}
datagram_poll-->sock_poll_wait-->poll_wait:把当前进程挂到socket的等待队列上(poll_wait本身不会睡眠);如果没有任何事件就绪,进程随后在do_poll中睡眠,等待被唤醒
ip_rcv-->ip_rcv_finish-->dst_input-->ip_local_deliver-->ip_local_deliver_finish-->net_protocol->handler
net/ipv4/af_inet.c: struct net_protocol udp_protocol定义handler为udp_rcv
udp_rcv-->__udp4_lib_rcv-->udp_queue_rcv_skb-->__udp_queue_rcv_skb/sk_add_backlog
这里假设sock没有被用户lock,那么就是__udp_queue_rcv_skb函数
__udp_queue_rcv_skb-->ip_queue_rcv_skb-->sock_queue_rcv_skb-->sk->sk_data_ready
net/core/sock.c:sk->sk_data_ready默认指向函数sock_def_readable(注意:下面贴出的代码是同一文件中的sock_def_wakeup,它与sock_def_readable的结构类似,都是在等待队列上有睡眠者时将其唤醒)
/*
* Default Socket Callbacks
*/
/*
 * sock_def_wakeup - wake every interruptible sleeper on a socket's wait queue.
 * @sk: socket whose wait queue (sk->sk_wq) is examined
 *
 * The wait-queue pointer is read under rcu_read_lock(), and the wake-up is
 * issued only when wq_has_sleeper() reports a waiter.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *waitq;

	rcu_read_lock();
	waitq = rcu_dereference(sk->sk_wq);
	if (!wq_has_sleeper(waitq))
		goto unlock;

	/* Wake the tasks sleeping on the queue, e.g. a process waiting in poll. */
	wake_up_interruptible_all(&waitq->wait);
unlock:
	rcu_read_unlock();
}
用户态调用read/recvmsg对应的函数是udp_recvmsg,具体的代码在net/ipv4/udp.c中。
/**
 * udp_recvmsg - receive one UDP datagram into a user message
 * @iocb: I/O control block (not used in this function body)
 * @sk: socket to receive from
 * @msg: destination message; msg_iov receives the payload, msg_name (if
 *       set) receives the sender address, msg_flags may gain MSG_TRUNC
 * @len: maximum number of payload bytes the caller accepts
 * @noblock: non-zero for a non-blocking receive (adds MSG_DONTWAIT)
 * @flags: receive flags; MSG_ERRQUEUE and MSG_TRUNC are handled here
 * @addr_len: if non-NULL, set to sizeof(struct sockaddr_in)
 *
 * Return: the number of payload bytes copied, or the full payload length
 * when MSG_TRUNC is set in @flags. Otherwise a negative error: the error
 * reported by __skb_recv_datagram() when no datagram was obtained, the
 * error from the copy to user space, or -EAGAIN when a non-blocking
 * receive hits a datagram with a bad checksum.
 */
int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int noblock, int flags, int *addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
	struct sk_buff *skb;
	unsigned int ulen;
	int peeked;
	int err;
	int is_udplite = IS_UDPLITE(sk);
	bool slow;

	/*
	 *	Check any passed addresses
	 */
	if (addr_len)
		*addr_len = sizeof(*sin);

	if (flags & MSG_ERRQUEUE)
		return ip_recv_error(sk, msg, len);

try_again:
	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				  &peeked, &err);
	if (!skb)
		goto out;

	/* Payload length: skb->len still includes the UDP header. */
	ulen = skb->len - sizeof(struct udphdr);
	if (len > ulen)
		len = ulen;
	else if (len < ulen)
		msg->msg_flags |= MSG_TRUNC;

	/*
	 * If checksum is needed at all, try to do it while copying the
	 * data.  If the data is truncated, or if we only want a partial
	 * coverage checksum (UDP-Lite), do it before the copy.
	 */
	if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
		if (udp_lib_checksum_complete(skb))
			goto csum_copy_err;
	}

	if (skb_csum_unnecessary(skb))
		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
					      msg->msg_iov, len);
	else {
		err = skb_copy_and_csum_datagram_iovec(skb,
						       sizeof(struct udphdr),
						       msg->msg_iov);

		if (err == -EINVAL)
			goto csum_copy_err;
	}

	/*
	 * Any other copy failure must be reported to the caller. Without
	 * this check err would be overwritten by "err = len" below and a
	 * failed copy would look like a successful receive.
	 */
	if (err)
		goto out_free;

	if (!peeked)
		UDP_INC_STATS_USER(sock_net(sk),
				UDP_MIB_INDATAGRAMS, is_udplite);

	sock_recv_ts_and_drops(msg, sk, skb);

	/* Copy the address. */
	if (sin) {
		sin->sin_family = AF_INET;
		sin->sin_port = udp_hdr(skb)->source;
		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
	}
	if (inet->cmsg_flags)
		ip_cmsg_recv(msg, skb);

	/* Report bytes copied, or the real payload length for MSG_TRUNC. */
	err = len;
	if (flags & MSG_TRUNC)
		err = ulen;

out_free:
	skb_free_datagram_locked(sk, skb);
out:
	return err;

csum_copy_err:
	/* Bad checksum: discard the datagram and count the error. */
	slow = lock_sock_fast(sk);
	if (!skb_kill_datagram(sk, skb, flags))
		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	unlock_sock_fast(sk, slow);

	/*
	 * On a checksum error nothing is read. A non-blocking caller gets
	 * -EAGAIN instead of data.
	 */
	if (noblock)
		return -EAGAIN;

	/* starting over for a new packet */
	msg->msg_flags &= ~MSG_TRUNC;
	goto try_again;
}
non-block socket:
poll();
if (POLLIN)
recvmsg();
/*
由于POLLIN返回只表示收到了ip头校验合法的数据包,并不保证udp部分校验的合法性。而udp_recvmsg会验证校验和,如果校验和错误,内核中的udp_recvmsg返回-EAGAIN,用户态的recvmsg则返回-1,且errno=EAGAIN;
*/
block socket:
poll();
if(POLLIN)
recvmsg();
/*
对于block类型的socket,poll系统调用返回POLLIN时会确保收到了udp校验合法的数据包,即使收到一个0字节的数据包也会返回,所以recvmsg一定会返回,但可能返回0,errno=0。
*/
由于select的具体实现,同样使用了poll的机制,所以select返回值的含义与poll相同。