poll 源码分析
2.1 数据结构
/*
* Structures and helpers for sys_poll/sys_poll
* (the header comment names sys_poll twice; NOTE(review): presumably
* "sys_select/sys_poll" was intended -- confirm against the kernel tree).
*
* struct poll_wqueues is the per-call state that sys_poll() keeps on its
* stack and hands to do_poll().
*/
struct poll_wqueues {
/* Embedded callback table; __pollwait() gets back to this structure from it via container_of(). */
poll_table pt;
/* Most recently allocated poll_table_page; __pollwait() pushes new pages onto the front. */
struct poll_table_page * table;
/* Set to -ENOMEM by __pollwait() when a page cannot be allocated; do_poll() reads it before sleeping. */
int error;
};
/*
* structures and helpers for f_op->poll implementations
*
* poll_queue_proc is the signature of the queueing callback (__pollwait()
* has this signature): it receives the file being polled, the wait queue
* head to register on, and the poll_table the callback came from.
*
* poll_table carries that callback; init_poll_funcptr() stores it in qproc.
*/
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
typedef struct poll_table_struct {
poll_queue_proc qproc;
} poll_table;
// poll_list: one node of the singly linked list that sys_poll() builds
// from the caller's pollfd array. Each node is allocated together with
// room for `len` pollfd entries directly behind the header.
struct poll_list {
	struct poll_list *next;   // following node, NULL on the last one
	int len;                  // number of valid elements in entries
	// Trailing variable-length storage, declared as a C99 flexible array
	// member (the 2.6.11 source spells it entries[0], a GNU extension);
	// member offsets and sizeof(struct poll_list) are unchanged.
	struct pollfd entries[];
};
// pollfd: one element of the array the caller passes to poll
struct pollfd {
int fd; // file descriptor to watch
short events; // events the caller asks to have monitored
short revents; // events actually reported back
};
2.2 poll源码分析
/*
 * sys_poll - entry point of the poll system call.
 *
 * ufds    : user-space address of the pollfd array to watch
 * nfds    : number of descriptors to watch, i.e. the element count of ufds
 * timeout : timeout argument; 0 is passed through unchanged, a negative
 *           or overflowing value becomes MAX_SCHEDULE_TIMEOUT, anything
 *           else is rescaled with HZ below
 *
 * Returns the value produced by do_poll(), or -EINVAL when nfds fails the
 * sanity check, -ENOMEM when a list node cannot be allocated, -EFAULT when
 * a copy from or to user space fails, -EINTR when do_poll() reported
 * nothing and a signal is pending.
 */
asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout)
{
	struct poll_wqueues table;
	struct poll_list *head = NULL;
	struct poll_list *tail = NULL;	/* always the last node of the list */
	struct poll_list *node;
	struct pollfd __user *uout;
	unsigned int remaining;
	int fdcount, err;

	/* Do a sanity check on nfds ... */
	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
		return -EINVAL;

	if (timeout != 0) {
		/* Careful about overflow in the intermediate values */
		if ((unsigned long) timeout >= MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = MAX_SCHEDULE_TIMEOUT;	/* Negative or overflow */
		else
			timeout = (unsigned long)(timeout*HZ+999)/1000+1;
	}

	/* Reset the wait-table state and install the queueing callback. */
	poll_initwait(&table);

	/*
	 * Build a kernel-side linked list mirroring the user's pollfd array,
	 * at most POLLFD_PER_PAGE entries per node.
	 */
	err = -ENOMEM;
	for (remaining = nfds; remaining != 0; remaining -= node->len) {
		unsigned int chunk = remaining;

		if (chunk > POLLFD_PER_PAGE)
			chunk = POLLFD_PER_PAGE;

		node = kmalloc(sizeof(struct poll_list) +
				sizeof(struct pollfd) * chunk,
				GFP_KERNEL);
		if (node == NULL)
			goto out_fds;
		node->next = NULL;
		node->len = chunk;

		/* Link the node in before copying so the cleanup path frees it too. */
		if (head == NULL)
			head = node;
		else
			tail->next = node;
		tail = node;

		/* Fetch the slice of the user array that belongs to this node. */
		if (copy_from_user(node->entries, ufds + nfds - remaining,
				sizeof(struct pollfd) * node->len)) {
			err = -EFAULT;
			goto out_fds;
		}
	}

	fdcount = do_poll(nfds, head, &table, timeout);

	/* OK, now copy the revents fields back to user space. */
	err = -EFAULT;
	uout = ufds;
	for (node = head; node != NULL; node = node->next) {
		struct pollfd *fds = node->entries;
		int j;

		for (j = 0; j < node->len; j++, uout++) {
			if (__put_user(fds[j].revents, &uout->revents))
				goto out_fds;
		}
	}

	if (!fdcount && signal_pending(current))
		err = -EINTR;
	else
		err = fdcount;

out_fds:
	/* Release every list node, then the wait-table state. */
	while (head != NULL) {
		node = head->next;
		kfree(head);
		head = node;
	}
	poll_freewait(&table);
	return err;
}
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p);
/*
 * poll_initwait - prepare a poll_wqueues for a poll call.
 * Empties the wait-table page list, clears the error code and installs
 * __pollwait as the queueing callback of the embedded poll_table.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	pwq->table = NULL;
	pwq->error = 0;
	init_poll_funcptr(&pwq->pt, __pollwait);
}
/*
* init_poll_funcptr - install the queueing callback of a poll_table.
* pt    : table whose qproc member is written
* qproc : callback to store; poll_initwait() passes __pollwait here
*/
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->qproc = qproc;
}
/*
* __pollwait - queueing callback installed by poll_initwait() (this is a
* callback, not a system call).
*
* filp         : file being polled; a reference is taken with get_file()
* wait_address : wait queue head the current task is registered on
* _p           : poll_table embedded in the caller's struct poll_wqueues
*
* Records one poll_table_entry for (filp, wait_address) and adds the
* current task to that wait queue. It does not sleep here; the sleeping
* happens in do_poll() via schedule_timeout(). On allocation failure it
* stores -ENOMEM in the poll_wqueues error field and returns without
* registering anything.
*/
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
/* Recover the enclosing poll_wqueues from its embedded pt member. */
struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;
/* No page yet, or the current page has no free slot: get a new page. */
if (!table || POLL_TABLE_FULL(table)) {
struct poll_table_page *new_table;
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
/* NOTE(review): presumably undoes the TASK_INTERRUPTIBLE set by do_poll() so the task will not sleep -- confirm. */
__set_current_state(TASK_RUNNING);
return;
}
/* First free slot is the start of the page's entry array. */
new_table->entry = new_table->entries;
/* Push the new page onto the front of the page list. */
new_table->next = table;
p->table = new_table;
table = new_table;
}
/* Add a new entry */
/* Fill the next free poll_table_entry and put current on the wait queue (registration only, no sleep). */
{
struct poll_table_entry * entry = table->entry;
/* Advance the page's free-slot cursor past the slot being used. */
table->entry = entry+1;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
add_wait_queue(wait_address,&entry->wait);
}
}
/*
* do_poll - scan every pollfd in the list until something is reported,
* the timeout runs out, a signal is pending or an error was recorded.
*
* nfds    : descriptor count (not referenced in this body)
* list    : head of the poll_list chain built by sys_poll()
* wait    : per-call wait state; its pt is handed to the drivers and its
*           error field is checked before sleeping
* timeout : remaining timeout; 0 means a single scan without waiting
*
* Returns the count accumulated by do_pollfd(), or wait->error if that is
* non-zero when the scan reported nothing.
*/
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, long timeout)
{
int count = 0;
poll_table* pt = &wait->pt;
/* With a zero timeout no poll_table is passed down to the drivers. */
if (!timeout)
pt = NULL;
for (;;) { /* loops until one of the break conditions below holds */
struct poll_list *walk;
/* NOTE(review): state is set before the scan, presumably so a wake-up arriving during the scan is not lost -- confirm. */
set_current_state(TASK_INTERRUPTIBLE);
walk = list;
while(walk != NULL) { /* walk the list and let do_pollfd() check each node's pollfd entries */
do_pollfd( walk->len, walk->entries, &pt, &count);
walk = walk->next;
}
/* Later passes hand no poll_table to the drivers. */
pt = NULL;
if (count || !timeout || signal_pending(current))
break;
/* An error recorded by __pollwait() (e.g. -ENOMEM) ends the call. */
count = wait->error;
if (count)
break;
/* Sleep; the return value becomes the new remaining timeout. */
timeout = schedule_timeout(timeout);
}
__set_current_state(TASK_RUNNING);
return count;
}
/*
 * do_pollfd - check one array of pollfd entries (a helper, not a system
 * call).
 *
 * num    : number of entries in fdpage
 * fdpage : entries to check; revents of every entry is written
 * pwait  : address of the caller's poll_table pointer; it is set to NULL
 *          as soon as one entry reports a non-zero mask
 * count  : incremented once for every entry with a non-zero mask
 *
 * Entries with a negative fd get revents 0. A descriptor for which fget()
 * returns NULL reports POLLNVAL. Otherwise the file's poll method (or
 * DEFAULT_POLLMASK when it has none) is filtered down to the requested
 * events plus POLLERR and POLLHUP.
 */
static void do_pollfd(unsigned int num, struct pollfd * fdpage,
	poll_table ** pwait, int *count)
{
	struct pollfd *const end = fdpage + num;
	struct pollfd *cur;

	for (cur = fdpage; cur < end; cur++) {
		int fd = cur->fd;
		unsigned int ready;
		struct file *filp;

		if (fd < 0) {
			cur->revents = 0;
			continue;
		}

		filp = fget(fd);	/* look up the file for this descriptor */
		if (filp == NULL) {
			ready = POLLNVAL;
		} else {
			/* Ask the driver's poll method for the current events. */
			if (filp->f_op && filp->f_op->poll)
				ready = filp->f_op->poll(filp, *pwait);
			else
				ready = DEFAULT_POLLMASK;
			ready &= cur->events | POLLERR | POLLHUP;
			fput(filp);
		}

		if (ready) {
			*pwait = NULL;
			(*count)++;
		}
		cur->revents = ready;
	}
}
/*
* Call path noted by the author: udp_poll -> datagram_poll() ...
* Only the start of the function is reproduced; the remainder is elided
* with "..." in this excerpt.
*/
unsigned int datagram_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
unsigned int mask;
/* NOTE(review): poll_wait() is not shown here; presumably it invokes wait->qproc, i.e. __pollwait as installed by poll_initwait() -- confirm. */
poll_wait(file, sk->sk_sleep, wait);
mask = 0;
...
...
}
总结:
*poll系统调用的流程如下:
(1) 先注册回调函数__pollwait;
(2) 再初始化table变量(类型为struct poll_wqueues);
(3) 拷贝用户传入的struct pollfd(其实主要是fd);
(4) 轮流调用所有fd对应的poll,该poll调用的一般流程是:把current挂到各个fd对应的设备等待队列上;若此时没有任何fd就绪,current就通过schedule_timeout进入睡眠;
(5) 设备收到一条消息(网络设备)或填写完文件数据(磁盘设备);
(6) 唤醒设备等待队列上的进程,此时current便被唤醒了。
(7) current醒来后重新遍历所有fd并调用其poll获取就绪事件,然后离开sys_poll,把各fd的revents拷贝回用户空间,并把结果返回给使用者。
*poll的优缺点
优点:不像select,poll对可监视的fd数量没有固定的上限。
缺点:(1) 要把用户传入的fd复制到内核,并对每一个fd分配内存,把这些fd构造成一个链表。
(2) 遍历链表,并对用户传入的每一个fd(不管有没有事件产生),都调用do_pollfd函数。
此两者在fd个数比较小的情况下没事,但一旦fd数量巨大,这将会造成poll性能的瓶颈。
附注:内核版本:2.6.11