Linux的poll与epoll实现（3）---epoll_ctl-define_shore

define_shore_me的ChinaUnix博客defineshoreme.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

define_shore_me

博客访问： 234219
博文数量： 59
博客积分： 1215
博客等级：少尉
技术积分： 575
用户组：普通用户
注册时间： 2011-11-09 02:18

文章分类

全部博文（59）

python（1）
Nginx（8）
Linux内核解读（5）
读书笔记（17）

《深入理解Androi（1）

《Unix网络编程卷（10）

Linux设备驱动开（6）
嵌入式（1）
Android（2）
Linux编程（11）

Axel源码学习（4）
数据库（2）
qt（1）
其他（6）
未分配的博文（5）

文章存档

2012年（53）

2011年（6）

我的朋友

相关博文

Linux的poll与epoll实现（3）---epoll_ctl

分类： C/C++

2012-02-05 16:53:45

大家都知道，这个epoll_ctl系统调用是epoll高效之所在，因为把文件描述符集合的传递与轮询分离，而对目标文件描述符集合的操作就在这个epoll_ctl里面完成。我们先来关注一下数据结构：

第一个是昨天提到的eventpoll，这里就不说了。

第二个是epitem，每一个需要传递的文件描述符都对应着一个epitem，我们来看一下源码：

/*
 * Per-descriptor bookkeeping record of the eventpoll interface.
 *
 * Each file descriptor added to the eventpoll interface gets one entry
 * of this type. The entry is linked into its owning eventpoll through
 * the rb-tree node below (the historical wording of this comment said
 * "linked to the hash").
 */
struct epitem {
/* RB-Tree node used to link this structure to the eventpoll rb-tree */
struct rb_node rbn;
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/* The file descriptor information (target file and fd) this item refers to */
struct epoll_filefd ffd;
/*
 * Number of active wait queues attached to poll operations.
 * A negative value is treated as an installation failure by ep_insert().
 */
int nwait;
/* List containing the poll wait queues */
struct list_head pwqlist;
/* The "container" of this item, i.e. the eventpoll it belongs to */
struct eventpoll *ep;
/* The structure that describes the interested events and the source fd */
struct epoll_event event;
/*
 * Usage count of the structure. It prevents the structure from
 * disappearing from underneath our processing.
 */
atomic_t usecnt;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* List header used to link the item to the transfer list */
struct list_head txlink;
/*
 * Event bits used during the collection/transfer of events to userspace.
 * NOTE(review): the original wording ("to pin items empty events set")
 * is unclear; confirm the exact use against the transfer code.
 */
unsigned int revents;
};

可以看到，这个结构体跟eventpoll紧密结合，eventpoll维护一个红黑树存储着epitem。

下面正式看看sys_epoll_ctl源码：

/*
 * sys_epoll_ctl - kernel part of the user space epoll_ctl(2).
 *
 * Implements the controller interface for the eventpoll file, which
 * enables the insertion/removal/change of file descriptors inside the
 * interest set.
 *
 * epfd:  descriptor that must refer to an eventpoll file
 * op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * fd:    target descriptor to be added, removed or modified
 * event: user space pointer to the epoll_event; it is copied in only
 *        when EP_OP_HASH_EVENT(op) is true
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or:
 *   -EFAULT  copy_from_user() of the event failed
 *   -EBADF   fget() returned NULL for epfd or for fd
 *   -EPERM   the target file has no poll method
 *   -EINVAL  epfd is not an eventpoll file, epfd and fd resolve to the
 *            same file, or op matched none of the switch cases
 *   -EEXIST  EPOLL_CTL_ADD for a target that is already present
 *   -ENOENT  EPOLL_CTL_DEL/EPOLL_CTL_MOD for a target that is not present
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));
/* Copy the event description into the kernel-side copy "epds" */
error = -EFAULT;
if (EP_OP_HASH_EVENT(op) && //(1)
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto eexit_1;
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); //(2)
if (!file)
goto eexit_1;
/* Get the "struct file *" for the target file */
tfile = fget(fd); //(3)
if (!tfile)
goto eexit_2;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll) //(4)
goto eexit_3;
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (file == tfile || !IS_FILE_EPOLL(file)) //(5)
goto eexit_3;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data; //(6)
/* Hold ep->sem for writing across the lookup and the requested operation */
down_write(&ep->sem);
/* Try to lookup the target (tfile, fd) among the items of this eventpoll */
epi = ep_find(ep, tfile, fd); //(7)
/* -EINVAL stays in place when op matches none of the cases below */
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
/* POLLERR and POLLHUP are always added to the requested events */
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
if (epi) {
/* POLLERR and POLLHUP are always added to the requested events */
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
/*
* The function ep_find() increments the usage count of the structure
* so, if this is not NULL, we need to release it.
*/
if (epi) //(8)
ep_release_epitem(epi);
up_write(&ep->sem);
/* Unwind in reverse order of acquisition: target file first, then epoll file */
eexit_3:
fput(tfile);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
current, epfd, op, fd, event, error));
return error;
}

(1)这里就是把epoll_event（用过epoll函数都知道这个）从用户态传到内核态。

(2)(3)这两个分别是获得创建的eventpoll和监听目标这两个文件描述符的文件对象。

(4)就是看看监听目标有没有支持poll也就是有没有实现poll函数。

(5)相关文件的检查。

(6)取得eventpoll。

(7)首先是在我们eventpoll的红黑树里面找到属于监听目标的epitem，然后进行用户想要的操作。

(8)看注释可以知道，因为ep_find增加了引用计数，这里要减去。

操作具体实现等下补上，先去吃饭，O(∩_∩)O~

好，吃饭回来，接着看。这里以ADD操作为例。看源码：

/*
 * ep_insert - create the epitem for (tfile, fd) and link it into "ep".
 *
 * Allocates and initializes a new epitem, calls the target file's poll
 * method with ep_ptable_queue_proc installed as the queue callback, links
 * the item to the file's f_ep_links list and to the eventpoll rb-tree and,
 * if the returned event bits already match the requested ones, puts the
 * item on the ready list and wakes up waiters.
 *
 * ep:    eventpoll that will own the new item
 * event: kernel-side copy of the requested events
 * tfile: target file; its usage count is held by the caller
 * fd:    target file descriptor number
 *
 * Returns 0 on success, or -ENOMEM when the item allocation fails or when
 * epi->nwait is negative after the poll call.
 *
 * The visible caller, sys_epoll_ctl(), invokes this with ep->sem held
 * for writing.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
/* "error" keeps this value for both failure exits below */
error = -ENOMEM;
if (!(epi = EPI_MEM_ALLOC()))
goto eexit_1;
/* Item initialization follow here ... */ //(1)
EP_RB_INITNODE(&epi->rbn);
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
EP_SET_FFD(&epi->ffd, tfile, fd);
epi->event = *event;
atomic_set(&epi->usecnt, 1);
epi->nwait = 0;
/* Initialize the poll table using the queue callback */
epq.epi = epi; //(2)
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function.
*/
revents = tfile->f_op->poll(tfile, &epq.pt); //(3)
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0) //(4)
goto eexit_2;
/* Add the current item to the list of active epoll hook for this file */ //(5)
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/* We have to drop the new item inside our item list to keep track of it */ //(6)
write_lock_irqsave(&ep->lock, flags);
/* Add the current item to the rb-tree */ //(7)
ep_rbtree_insert(ep, epi);
/* If the file is already "ready" we drop it inside the ready list */ //(8)
if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
/* Only record the need for a poll_wait wakeup; it is issued after unlocking */
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */ //(9)
/* NOTE(review): "psw" is not declared in this function; presumably a file-scope object - confirm */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
/* Failure after the poll hooks were (partially) installed: undo them */
eexit_2:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/ //(10)
write_lock_irqsave(&ep->lock, flags);
if (EP_IS_LINKED(&epi->rdllink))
EP_LIST_DEL(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
EPI_MEM_FREE(epi);
eexit_1:
return error;
}

我发现epoll模块的代码是内核代码中注释比较全的部分了，呵呵，很多牛人代码都不留痕迹的。

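（1）这部分都是初始化刚刚申请的epitem。比较有意思的是每个epitem都有一个nwait字段，按结构体里的注释，它记录的是挂到poll操作上的活动等待队列的个数（对应pwqlist这个链表）；而同一个监听目标之所以可以同时被多个epoll监听，靠的是文件对象上的f_ep_links链表，每个epitem通过fllink挂在上面（见下面的(5)）。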

（2）这部分就是最核心的部分了。epoll高效还有一个原因就是有回调唤醒机制。要理解这部分是最困难的，因为涉及很多数据结构，我们先来分析一下数据结构:（引用网上的图）

ep_ptable_queue_proc这个函数就是初始化每个epitem的回调函数，并把它放到等待队列，等待被事件唤醒。相信在接下来我们分析epoll_wait的时候会知道如何唤醒，如何回调。

(3)这里调用了目标文件的poll函数（tfile->f_op->poll），每个目标文件都应该有自己的poll实现。类似的说法在第一篇里引用过别人的话。

(4)在刚才调用poll、通过回调函数挂接等待队列的过程中，如果发生错误（按代码注释，典型情况是内存紧张导致等待队列分配失败），nwait会变成负值（代码里检查的是nwait < 0），此时跳到eexit_2做清理。

(5)把fllink连接到文件的轮询等待链表。

(6)加写锁。

(7)把这个epitem加到eventpoll的红黑树里面。

(8)这里把已经ready的加到表示准备好的队列里面，估计回调函数做的也跟这个差不多了。

(9)这里是完成刚才剩下的工作，也是wakeup，这里的wakeup有点不同，这里不细究，有兴趣的可以去看源码。

(10)出错处理，把ready队列里面相关的删掉。

阅读(2582) | 评论(0) | 转发(1) |

上一篇：Android开发者必须深入学习的10个应用开源项目

下一篇：Netbeans里的UML建模入门

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6