Chinaunix首页 | 论坛 | 博客
  • 博客访问: 641437
  • 博文数量: 363
  • 博客积分: 110
  • 博客等级: 民兵
  • 技术积分: 1347
  • 用 户 组: 普通用户
  • 注册时间: 2011-06-22 16:07
文章分类

全部博文(363)

文章存档

2018年(83)

2016年(1)

2014年(2)

2013年(34)

2012年(236)

2011年(7)

分类:

2012-02-06 14:08:56

大家都知道,这个epoll_ctl系统调用是epoll高效之所在,因为把文件描述符集合的传递与轮询分离,而对目标文件描述符集合的操作就在这个epoll_ctl里面完成。我们先来关注一下数据结构:
第一个是昨天提到的eventpoll,这里就不说了。
第二个是epitem,每一个需要传递的文件描述符都对应着一个epitem,我们来看一下源码:
  1. /*
  2.  * Each file descriptor added to the eventpoll interface will
  3.  * have an entry of this type linked to the eventpoll rb-tree.
  4.  */
  5. struct epitem {
  6.     /* RB-Tree node used to link this structure to the eventpoll rb-tree */
  7.     struct rb_node rbn;

  8.     /* List header used to link this structure to the eventpoll ready list */
  9.     struct list_head rdllink;

  10.     /* The file descriptor information this item refers to */
  11.     struct epoll_filefd ffd;

  12.     /* Number of active wait queues attached to poll operations */
  13.     int nwait;

  14.     /* List containing poll wait queues */
  15.     struct list_head pwqlist;

  16.     /* The "container" of this item */
  17.     struct eventpoll *ep;

  18.     /* The structure that describes the events of interest and the source fd */
  19.     struct epoll_event event;

  20.     /*
  21.      * Used to keep track of the usage count of the structure. This keeps
  22.      * the structure from disappearing from underneath our processing.
  23.      */
  24.     atomic_t usecnt;

  25.     /* List header used to link this item to the "struct file" items list */
  26.     struct list_head fllink;

  27.     /* List header used to link the item to the transfer list */
  28.     struct list_head txlink;

  29.     /*
  30.      * This is used during the collection/transfer of events to userspace
  31.      * to pin items empty events set.
  32.      */
  33.     unsigned int revents;
  34. };
可以看到,这个结构体跟eventpoll紧密结合,eventpoll维护一个红黑树存储着epitem。
 
下面正式看看sys_epoll_ctl源码:
  1. /*
  2.  * Kernel part of the user space epoll_ctl(2): inserts, removes or changes
  3.  * the file descriptor fd in the interest set of the eventpoll file epfd.
  4.  * Returns the result of the requested operation, or -EFAULT, -EBADF, -EPERM,
  5.  * -EINVAL, -EEXIST (ADD of a known fd) or -ENOENT (DEL/MOD of an unknown fd).
  6.  */
  7. asmlinkage long
  8. sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
  9. {
  10.     int error;
  11.     struct file *file, *tfile;
  12.     struct eventpoll *ep;
  13.     struct epitem *epi;
  14.     struct epoll_event epds;

  15.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
  16.          current, epfd, op, fd, event));

  17.     error = -EFAULT;
  18.     if (EP_OP_HASH_EVENT(op) &&                                    //(1)
  19.      copy_from_user(&epds, event, sizeof(struct epoll_event)))
  20.         goto eexit_1;

  21.     /* Get the "struct file *" for the eventpoll file */
  22.     error = -EBADF;
  23.     file = fget(epfd);                                              //(2)
  24.     if (!file)
  25.         goto eexit_1;

  26.     /* Get the "struct file *" for the target file */
  27.     tfile = fget(fd);                                              //(3)
  28.     if (!tfile)
  29.         goto eexit_2;

  30.     /* The target file descriptor must support poll */
  31.     error = -EPERM;
  32.     if (!tfile->f_op || !tfile->f_op->poll)                        //(4)
  33.         goto eexit_3;

  34.     /*
  35.      * We have to check that the file structure underneath the file descriptor
  36.      * the user passed to us _is_ an eventpoll file. And also we do not permit
  37.      * adding an epoll file descriptor inside itself.
  38.      */
  39.     error = -EINVAL;
  40.     if (file == tfile || !IS_FILE_EPOLL(file))                      //(5)
  41.         goto eexit_3;

  42.     /*
  43.      * At this point it is safe to assume that the "private_data" contains
  44.      * our own data structure.
  45.      */
  46.     ep = file->private_data;                                        //(6)


  47.     down_write(&ep->sem);

  48.     /* Try to look up the target file among the items of this eventpoll */
  49.     epi = ep_find(ep, tfile, fd);                                  //(7)

  50.     error = -EINVAL;    /* stays -EINVAL if op matches no case below */
  51.     switch (op) {
  52.     case EPOLL_CTL_ADD:
  53.         if (!epi) {
  54.             epds.events |= POLLERR | POLLHUP;

  55.             error = ep_insert(ep, &epds, tfile, fd);
  56.         } else
  57.             error = -EEXIST;
  58.         break;
  59.     case EPOLL_CTL_DEL:
  60.         if (epi)
  61.             error = ep_remove(ep, epi);
  62.         else
  63.             error = -ENOENT;
  64.         break;
  65.     case EPOLL_CTL_MOD:
  66.         if (epi) {
  67.             epds.events |= POLLERR | POLLHUP;
  68.             error = ep_modify(ep, epi, &epds);
  69.         } else
  70.             error = -ENOENT;
  71.         break;
  72.     }

  73.     /*
  74.      * The function ep_find() increments the usage count of the structure
  75.      * so, if this is not NULL, we need to release it.
  76.      */
  77.     if (epi)                                                       //(8)
  78.         ep_release_epitem(epi);

  79.     up_write(&ep->sem);

  80. eexit_3:
  81.     fput(tfile);
  82. eexit_2:
  83.     fput(file);
  84. eexit_1:
  85.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
  86.          current, epfd, op, fd, event, error));

  87.     return error;
  88. }
 
(1)这里就是把epoll_event(用过epoll函数都知道这个)从用户态传到内核态。
(2)(3)这两个分别是获得创建的eventpoll和监听目标这两个文件描述符的文件对象。
(4)就是看看监听目标有没有支持poll也就是有没有实现poll函数。
(5)相关文件的检查。
(6)取得eventpoll。
(7)首先是在我们eventpoll的红黑树里面找到属于监听目标的epitem,然后进行用户想要的操作。
(8)看注释可以知道,因为ep_find增加了引用计数,这里要减去。
 
操作具体实现等下补上,先去吃饭,O(∩_∩)O~
好,吃饭回来,接着看。这里以ADD操作为例。看源码:
  /*
   * Creates the epitem for (tfile, fd) and adds it to the eventpoll "ep".
   * The new item is initialized, the target file's poll method is called with
   * a poll table that uses ep_ptable_queue_proc, and the item is then linked
   * to the file's f_ep_links list and inserted into the rb-tree of "ep". If the
   * poll call already reported one of the requested events, the item is also
   * queued on ep->rdllist and waiters are woken up.
   * Returns 0 on success, or -ENOMEM when the item cannot be allocated or
   * when epi->nwait is negative after the poll call.
   */
  1. static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
  2.          struct file *tfile, int fd)
  3. {
  4.     int error, revents, pwake = 0;
  5.     unsigned long flags;
  6.     struct epitem *epi;
  7.     struct ep_pqueue epq;

  8.     error = -ENOMEM;
  9.     if (!(epi = EPI_MEM_ALLOC()))
  10.         goto eexit_1;

  11.     /* Item initialization follows here ... */                //(1)
  12.     EP_RB_INITNODE(&epi->rbn);
  13.     INIT_LIST_HEAD(&epi->rdllink);
  14.     INIT_LIST_HEAD(&epi->fllink);
  15.     INIT_LIST_HEAD(&epi->txlink);
  16.     INIT_LIST_HEAD(&epi->pwqlist);
  17.     epi->ep = ep;
  18.     EP_SET_FFD(&epi->ffd, tfile, fd);
  19.     epi->event = *event;
  20.     atomic_set(&epi->usecnt, 1);
  21.     epi->nwait = 0;

  22.     /* Initialize the poll table using the queue callback */
  23.     epq.epi = epi;                                           //(2)
  24.     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

  25.     /*
  26.      * Attach the item to the poll hooks and get current event bits.
  27.      * We can safely use the file* here because its usage count has
  28.      * been increased by the caller of this function.
  29.      */
  30.     revents = tfile->f_op->poll(tfile, &epq.pt);              //(3)

  31.     /*
  32.      * We have to check if something went wrong during the poll wait queue
  33.      * install process. Namely an allocation for a wait queue failed due
  34.      * to high memory pressure.
  35.      */
  36.     if (epi->nwait < 0)                                       //(4)
  37.         goto eexit_2;

  38.     /* Add the current item to the list of active epoll hooks for this file */  //(5)
  39.     spin_lock(&tfile->f_ep_lock);                              
  40.     list_add_tail(&epi->fllink, &tfile->f_ep_links);
  41.     spin_unlock(&tfile->f_ep_lock);

  42.     /* We have to drop the new item inside our item list to keep track of it */   //(6)
  43.     write_lock_irqsave(&ep->lock, flags);

  44.     /* Add the current item to the rb-tree */                           //(7)
  45.     ep_rbtree_insert(ep, epi);

  46.     /* If the file is already "ready" we drop it inside the ready list */     //(8)
  47.     if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
  48.         list_add_tail(&epi->rdllink, &ep->rdllist);

  49.         /* Notify waiting tasks that events are available */
  50.         if (waitqueue_active(&ep->wq))
  51.             wake_up(&ep->wq);
  52.         if (waitqueue_active(&ep->poll_wait))
  53.             pwake++;
  54.     }

  55.     write_unlock_irqrestore(&ep->lock, flags);

  56.     /* We have to call this outside the lock */                    //(9)
  57.     if (pwake)
  58.         ep_poll_safewake(&psw, &ep->poll_wait);

  59.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
  60.          current, ep, tfile, fd));

  61.     return 0;

  62. eexit_2:
  63.     ep_unregister_pollwait(ep, epi);

  64.     /*
  65.      * We need to do this because an event could have arrived on some
  66.      * allocated wait queue.
  67.      */                                                            //(10)
  68.     write_lock_irqsave(&ep->lock, flags);
  69.     if (EP_IS_LINKED(&epi->rdllink))
  70.         EP_LIST_DEL(&epi->rdllink);
  71.     write_unlock_irqrestore(&ep->lock, flags);

  72.     EPI_MEM_FREE(epi);
  73. eexit_1:
  74.     return error;
  75. }
我发现epoll模块的代码是内核代码中注释比较全的部分了,呵呵,很多牛人代码都不留痕迹的。
(1)这部分都是初始化刚刚申请的epitem,比较有意思的是每个epitem其实都有一个nwait字段,按照结构体注释,它记录的是这个epitem通过poll操作挂接的等待队列个数;而每个监听目标之所以可以同时被多个epoll监听,靠的是文件自身的f_ep_links链表(见下面的(5))。
(2)这部分就是最核心的部分了。epoll高效还有一个原因就是有回调唤醒机制。要理解这部分是最困难的,因为涉及很多数据结构,我们先来分析一下数据结构:(引用网上的图)
 
 
ep_ptable_queue_proc这个函数就是初始化每个epitem的回调函数,并把它放到等待队列,等待被事件唤醒。相信在接下来我们分析epoll_wait的时候会知道如何唤醒,如何回调。
(3)这里调用了目标文件的poll函数,每个目标文件都应该有自己的poll实现(在其中调用poll_wait)。类似的内容在第一篇里引用过别人的话。
(4)在刚才设置回调函数的时候,如果发生错误,nwait为-1。
(5)把fllink连接到文件的轮询等待链表。
(6)加写锁。
(7)把这个epitem加到eventpoll的红黑树里面。
(8)这里把已经ready的加到表示准备好的队列里面,估计回调函数做的也跟这个差不多了。
(9)这里是完成刚才剩下的工作,也是wakeup,这里的wakeup有点不同,这里不细究,有兴趣的可以去看源码。
(10)出错处理,把ready队列里面相关的删掉。
阅读(218) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~