Chinaunix首页 | 论坛 | 博客
  • 博客访问: 826354
  • 博文数量: 91
  • 博客积分: 2544
  • 博客等级: 少校
  • 技术积分: 1885
  • 用 户 组: 普通用户
  • 注册时间: 2006-12-12 09:08
文章存档

2016年(10)

2014年(2)

2013年(4)

2012年(23)

2011年(23)

2010年(13)

2009年(14)

2007年(2)

分类: LINUX

2014-08-06 11:30:57

epoll实现源码分析

0, 说明

很久以前就分析了,没有写成文章。最近又复习了一下epoll的实现,分析如下:

1, 数据结构

poll的安全唤醒队列,以及spinlock锁结构。
/*
 * State used to implement the "safe" poll wake up, whose purpose is to
 * avoid re-entering the poll callback from inside wake_up().
 * A single instance ("psw") is initialised by eventpoll_init().
 */
struct poll_safewake {
 /* List head; presumably tracks tasks currently doing a wake up - TODO confirm */
 struct list_head wake_task_list;
 /* Spinlock; presumably protects wake_task_list - TODO confirm */
 spinlock_t lock;
};

. epoll 文件结构
/*
 * Pairs a "struct file" pointer with a file descriptor number.
 * Embedded in "struct epitem" as the "ffd" member, where it identifies
 * the file the item refers to.
 */
struct epoll_filefd {
    struct file *file;
    int fd;
};

. eventpoll 结构
这是一个实现epoll时,内部实现使用的数据结构。
在epoll_create中会为epoll本身获取一个文件描述符和一个file结构,file->private_data指向的就是eventpoll结构。
该结构在函数:epoll_create()->sys_epoll_create1()中创建。
并在后续的epoll操作中都会用到。

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
 /* Spinlock protecting access to this structure */
 spinlock_t lock;
 /*
  * This mutex is used to ensure that files are not removed
  * while epoll is using them. This is held during the event
  * collection loop, the file cleanup path, the epoll file exit
  * code and the ctl operations.
  */
 struct mutex mtx;
 /* Wait queue head used by sys_epoll_wait() */
 wait_queue_head_t wq;
 /* Wait queue head used by file->poll() on the epoll file itself */
 wait_queue_head_t poll_wait;
 /* List of ready file descriptors (items are linked via epitem.rdllink) */
 struct list_head rdllist;
 /* RB tree root used to store monitored fd structs */
 struct rb_root rbr;
 /*
  * This is a single linked list that chains all the "struct epitem" that
  * happened while transferring ready events to userspace w/out
  * holding ->lock. ep_alloc() initialises it to EP_UNACTIVE_PTR, and
  * ep_poll_callback() chains items here only while it holds a
  * different value.
  */
 struct epitem *ovflist;
};


.epitem结构
每一个添加到eventpoll接口的文件描述符都对应一个epitem类型的实体,该结构中用rbn连接到eventpoll结构的红黑树根变量rbr。

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Instances are allocated from epi_cache by ep_insert().
 */
struct epitem {
 /* RB tree node used to link this structure to the eventpoll RB tree */
 struct rb_node rbn;

 /* List header used to link this structure to the eventpoll ready list */
 struct list_head rdllink;
 /*
  * Works together "struct eventpoll"->ovflist in keeping the
  * single linked chain of items. Holds EP_UNACTIVE_PTR while the
  * item is not on that chain.
  */
 struct epitem *next;

 /* The file descriptor information this item refers to */
 struct epoll_filefd ffd;

 /*
  * Number of active wait queue attached to poll operations.
  * ep_ptable_queue_proc() sets it to -1 to report an allocation failure.
  */
 int nwait;

 /* List containing poll wait queues ("struct eppoll_entry" via llink) */
 struct list_head pwqlist;

 /* The "container" of this item: the eventpoll instance it belongs to */
 struct eventpoll *ep;

 /* List header used to link this item to the "struct file" items list */
 struct list_head fllink;

 /* The structure that describe the interested events and the source fd */
 struct epoll_event event;
};

. eppoll_entry 结构
/*
 * Wait structure used by the poll hooks.
 * One entry is allocated from pwq_cache on each successful call to
 * ep_ptable_queue_proc().
 */
struct eppoll_entry {
    /* List header used to link this structure to the "struct epitem" */
    struct list_head llink;

    /* The "base" pointer is set to the container "struct epitem" */
    void *base;

    /*
     * Wait queue item that will be linked to the target file wait
     * queue head. Its wake-up function is ep_poll_callback().
     */
    wait_queue_t wait;

    /* The wait queue head that linked the "wait" wait queue item */
    wait_queue_head_t *whead;
};

. epoll_event结构
/*
 * Event description exchanged with user space (copied in by
 * sys_epoll_ctl() via copy_from_user()).
 * EPOLL_PACKED is defined outside this excerpt; presumably a packing
 * attribute - TODO confirm.
 */
struct epoll_event {
 /* Bit mask of interested events (POLLERR | POLLHUP are OR-ed in on ADD/MOD) */
 __u32 events;
 /* 64-bit user data; stored by the kernel code shown here without being interpreted */
 __u64 data;
} EPOLL_PACKED;

. epoll队列
/*
 * Wrapper struct used by poll queueing.
 * ep_insert() fills in both members and hands "pt" to the target file's
 * poll method; ep_ptable_queue_proc() then recovers "epi" from the
 * poll_table pointer through ep_item_from_epqueue().
 */
struct ep_pqueue {
 /* Poll table whose qproc is set to ep_ptable_queue_proc() */
 poll_table pt;
 /* The item being registered */
 struct epitem *epi;
};

/*
 * structures and helpers for f_op->poll implementations
 */

/*
 * Queueing callback type: receives the file being polled, the wait queue
 * head to register on, and the poll table the callback was taken from.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/* Poll table: carries the queueing callback handed to f_op->poll(). */
typedef struct poll_table_struct {
 poll_queue_proc qproc;
} poll_table;


. 结构体之间关系


2,实现解析
2.1 初始化
epoll系统调用作为内核的一个初始化模块(通过fs_initcall注册)添加到内核,整个系统调用的实现都在fs/eventpoll.c文件中。epoll的初始化流程如下:

* epoll模块初始化
epoll模块初始化函数主要对epoll用到的全局锁进行初始化,同时创建epitem结构和eppoll_entry结构内存池。

/*
 * One-time initialisation of the eventpoll code, registered through
 * fs_initcall() below. Sets up the global mutex "epmutex", the safe
 * wake up structure "psw" and the two slab caches used for
 * "struct epitem" and "struct eppoll_entry". Always returns 0; the
 * cache pointers are not checked here (SLAB_PANIC is passed, presumably
 * so that a failed cache creation panics instead - TODO confirm).
 */
static int __init eventpoll_init(void)
{
 mutex_init(&epmutex);
 /* Initialize the structure used to perform safe poll wait head wake ups */
 /* "psw" is the file-scope instance: static struct poll_safewake psw; */
 ep_poll_safewake_init(&psw);

 /* Allocates slab cache used to allocate "struct epitem" items */
 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
   0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
   NULL);

 /* Allocates slab cache used to allocate "struct eppoll_entry" */
 pwq_cache = kmem_cache_create("eventpoll_pwq",
   sizeof(struct eppoll_entry), 0,
   EPI_SLAB_DEBUG|SLAB_PANIC, NULL);

 return 0;
}
fs_initcall(eventpoll_init);


2.2 epoll_create()的实现

---------------------------------------------------
       #include <sys/epoll.h>
      int epoll_create(int size)
----------------------------------------------------

epoll使用内核的通用文件系统架构来实现的,所以,就需要注册一个struct file_operations的结构,并实现其中对应的回调函数。

/*
 * File callbacks that implement the eventpoll file behaviour.
 * This table is passed to anon_inode_getfd() by sys_epoll_create1(), so
 * it becomes the f_op of every epoll file. Only release and poll are
 * provided.
 */
static const struct file_operations eventpoll_fops = {
 .release = ep_eventpoll_release,
 .poll = ep_eventpoll_poll
};

/*
 * Legacy epoll_create(2) entry point.
 *
 * "size" is validated but otherwise unused: it is not forwarded to
 * sys_epoll_create1(), which is always called with flags == 0.
 * epoll_create(2) documents that size must be greater than zero, so
 * zero is rejected together with negative values.
 *
 * Returns -EINVAL for a non-positive size, otherwise whatever
 * sys_epoll_create1(0) returns (a new file descriptor or a negative
 * error code).
 */
asmlinkage long sys_epoll_create(int size)
{
 /* Reject size == 0 as well: the documented contract is size > 0. */
 if (size <= 0)
  return -EINVAL;

 /* The size hint is deliberately dropped; no flags are requested. */
 return sys_epoll_create1(0);
}


/*
 * Open an eventpoll file descriptor.
 *
 * This is where epoll_create() is ultimately implemented. The only flag
 * accepted is EPOLL_CLOEXEC; any other bit yields -EINVAL.
 *
 * Returns the new file descriptor on success, or a negative error code
 * coming from ep_alloc() or anon_inode_getfd().
 */
asmlinkage long sys_epoll_create1(int flags)
{
 /* Becomes the private_data of the new epoll file. */
 struct eventpoll *ep;
 int fd = -1;
 int error;

 /* Check the EPOLL_* constant for consistency. */
 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

 if ((flags & ~EPOLL_CLOEXEC) != 0)
  return -EINVAL;

 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
       current, flags));

 /* Create the internal data structure ( "struct eventpoll" ). */
 error = ep_alloc(&ep);
 if (error < 0) {
  /* Nothing was allocated; just report the error. */
  fd = error;
 } else {
  /*
   * Creates all the items needed to setup an eventpoll file.
   * That is, a file structure and a free file descriptor. The
   * call is given the eventpoll_fops table and "ep" as the
   * private data for the new file.
   */
  fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
          flags & O_CLOEXEC);
  /* On failure no file owns "ep", so release it here. */
  if (fd < 0)
   ep_free(ep);
 }

 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
       current, flags, fd));

 return fd;
}

// 创建并初始化eventpoll结构
static int ep_alloc(struct eventpoll **pep)
{
 struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 if (!ep)
  return -ENOMEM;
 spin_lock_init(&ep->lock);
 mutex_init(&ep->mtx);
 init_waitqueue_head(&ep->wq);
 init_waitqueue_head(&ep->poll_wait);
 INIT_LIST_HEAD(&ep->rdllist);
 ep->rbr = RB_ROOT;
 ep->ovflist = EP_UNACTIVE_PTR;
 *pep = ep;
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
       current, ep));
 return 0;
}

/*
 * poll callback of the eventpoll file itself (see eventpoll_fops).
 *
 * Registers the caller on ep->poll_wait and reports the epoll file as
 * readable (POLLIN | POLLRDNORM) when the ready list is not empty;
 * otherwise returns 0.
 */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
 struct eventpoll *ep = file->private_data;
 unsigned long irqflags;
 unsigned int mask;

 /* Insert inside our poll wait queue */
 poll_wait(file, &ep->poll_wait, wait);

 /* The ready list is inspected under ep->lock. */
 spin_lock_irqsave(&ep->lock, irqflags);
 mask = list_empty(&ep->rdllist) ? 0 : (POLLIN | POLLRDNORM);
 spin_unlock_irqrestore(&ep->lock, irqflags);

 return mask;
}
小结:epoll_create的要点如下:
(1) epoll的整个实现建立在内核的文件系统框架基础之上。
(2) epoll向内核文件系统注册了一个file_operations结构,并实现其中的poll和release回调函数。
(3) epoll文件系统中,poll回调函数的实现函数是:ep_eventpoll_poll,release回调函数是:ep_eventpoll_release。
(4) epoll_create(int size) 其中的size参数在内核中实际上没有使用。
(5) epoll_create函数实现初始化操作,在匿名文件系统上注册了一个目录结构和其操作。


2.3 epoll控制函数的实现(epoll_ctl)
通过该函数对epoll监控的fd进行添加,删除,修改操作。
/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * epfd:  descriptor of the eventpoll file.
 * op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD; any other value
 *        falls through the switch below with -EINVAL.
 * fd:    descriptor of the target file.
 * event: user-space event description; only copied in when
 *        ep_op_has_event(op) is true.
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or one of
 * -EFAULT, -EBADF, -EPERM, -EINVAL, -EEXIST, -ENOENT as set below.
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
         struct epoll_event __user *event)
{
 int error;
 struct file *file, *tfile;
 struct eventpoll *ep;
 struct epitem *epi;
 struct epoll_event epds;
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
       current, epfd, op, fd, event));
 error = -EFAULT;
 if (ep_op_has_event(op) &&
     copy_from_user(&epds, event, sizeof(struct epoll_event)))
  goto error_return;
 /* Get the "struct file *" for the eventpoll file */
 error = -EBADF;
 file = fget(epfd);
 if (!file)
  goto error_return;
 /* Get the "struct file *" for the target (monitored) file */
 tfile = fget(fd);
 if (!tfile)
  goto error_fput;

 /*
  * The target file descriptor must support poll: its poll method is
  * called later on (see ep_insert() and ep_modify()).
  */
 error = -EPERM;
 if (!tfile->f_op || !tfile->f_op->poll)
  goto error_tgt_fput;

 /*
  * We have to check that the file structure underneath the file descriptor
  * the user passed to us _is_ an eventpoll file. And also we do not permit
  * adding an epoll file descriptor inside itself.
  */
 error = -EINVAL;

 /* On failure both file references taken above are dropped via fput(). */
 if (file == tfile || !is_file_epoll(file))
  goto error_tgt_fput;

 /*
  * At this point it is safe to assume that the "private_data" contains
  * our own data structure.
  */
 ep = file->private_data;

 /* Held across the lookup and the ADD/DEL/MOD operation below. */
 mutex_lock(&ep->mtx);
 /*
  * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
  * above, we can be sure to be able to use the item looked up by
  * ep_find() till we release the mutex.
  */
 /* epi is treated as NULL when the (tfile, fd) pair is not registered yet. */
 epi = ep_find(ep, tfile, fd);

 error = -EINVAL;
 switch (op) {
 case EPOLL_CTL_ADD:    /* insert a new monitored fd */
  /* Only allowed when the pair is not in the RB tree yet. */
  if (!epi) {
   /* POLLERR and POLLHUP are always added to the requested mask. */
   epds.events |= POLLERR | POLLHUP;
   /* Insert the file/fd pair into the RB tree of "ep". */
   error = ep_insert(ep, &epds, tfile, fd);
  } else /* already registered */
   error = -EEXIST;
  break;
 case EPOLL_CTL_DEL:  /* remove a monitored fd */
  if (epi)
   error = ep_remove(ep, epi); /* remove the item from "ep" */
  else
   error = -ENOENT;
  break;
 case EPOLL_CTL_MOD:  /* change the events of a monitored fd */
  if (epi) {
   /* As for ADD, POLLERR and POLLHUP are always added. */
   epds.events |= POLLERR | POLLHUP;
   error = ep_modify(ep, epi, &epds);
  } else
   error = -ENOENT;
  break;
 }
 mutex_unlock(&ep->mtx);
error_tgt_fput:
 fput(tfile);
error_fput:
 fput(file);
error_return:
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
       current, epfd, op, fd, event, error));
 return error;
}


. 添加监控fd(ep_insert函数)

/*
 * Register the (tfile, fd) pair with the eventpoll instance "ep".
 *
 * Allocates a "struct epitem", hooks it onto the target file's wait
 * queue(s) by calling the file's poll method, links it into the file's
 * f_ep_links list and into the RB tree of "ep", and queues it on the
 * ready list right away if one of the requested events is already
 * pending.
 *
 * Returns 0 on success. Returns -ENOMEM when the item cannot be
 * allocated; the error_unregister path returns -ENOMEM as well, because
 * "error" is never reassigned.
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
       struct file *tfile, int fd)
{
 int error, revents, pwake = 0;
 unsigned long flags;
 struct epitem *epi;
 struct ep_pqueue epq;
 error = -ENOMEM;

 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
  goto error_return;

 /* Item initialization follow here ... */
 INIT_LIST_HEAD(&epi->rdllink);
 INIT_LIST_HEAD(&epi->fllink);
 INIT_LIST_HEAD(&epi->pwqlist);
 epi->ep = ep;
 ep_set_ffd(&epi->ffd, tfile, fd);
 epi->event = *event;
 epi->nwait = 0;
 epi->next = EP_UNACTIVE_PTR;

 /* Initialize the poll table using the queue callback */
 epq.epi = epi;

 /*
  * Install ep_ptable_queue_proc() as the queueing callback of epq.pt.
  * The target file's poll method is expected to call poll_wait(),
  * which in turn invokes this callback (for a socket that would be
  * the protocol's poll routine - not visible in this file).
  */
 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

 /*
  * Attach the item to the poll hooks and get current event bits.
  * We can safely use the file* here because its usage count has
  * been increased by the caller of this function. Note that after
  * this operation completes, the poll callback can start hitting
  * the new item.
  */
 revents = tfile->f_op->poll(tfile, &epq.pt);
 /*
  * We have to check if something went wrong during the poll wait queue
  * install process. Namely an allocation for a wait queue failed due
  * high memory pressure. ep_ptable_queue_proc() signals this by
  * setting nwait to -1.
  */
 if (epi->nwait < 0)
  goto error_unregister;

 /* Add the current item to the list of active epoll hook for this file */
 spin_lock(&tfile->f_ep_lock);
 list_add_tail(&epi->fllink, &tfile->f_ep_links);
 spin_unlock(&tfile->f_ep_lock);
 /*
  * Add the current item to the RB tree. All RB tree operations are
  * protected by "mtx", and ep_insert() is called with "mtx" held.
  */
 ep_rbtree_insert(ep, epi);

 /* We have to drop the new item inside our item list to keep track of it */
 spin_lock_irqsave(&ep->lock, flags);

 /* If the file is already "ready" we drop it inside the ready list */
 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
  list_add_tail(&epi->rdllink, &ep->rdllist);

  /* Notify waiting tasks that events are available */
  if (waitqueue_active(&ep->wq))
   wake_up_locked(&ep->wq);
  if (waitqueue_active(&ep->poll_wait))
   pwake++;
 }
 spin_unlock_irqrestore(&ep->lock, flags);
 /* We have to call this outside the lock */
 if (pwake)
  ep_poll_safewake(&psw, &ep->poll_wait);
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
       current, ep, tfile, fd));
 return 0;
error_unregister:
 ep_unregister_pollwait(ep, epi);
 /*
  * We need to do this because an event could have been arrived on some
  * allocated wait queue. Note that we don't care about the ep->ovflist
  * list, since that is used/cleaned only inside a section bound by "mtx".
  * And ep_insert() is called with "mtx" held.
  */
 spin_lock_irqsave(&ep->lock, flags);
 if (ep_is_linked(&epi->rdllink))
  list_del_init(&epi->rdllink);
 spin_unlock_irqrestore(&ep->lock, flags);
 kmem_cache_free(epi_cache, epi);
error_return:
 return error;
}

. ep_modify() 函数的实现
当epoll_ctl()函数的操作是EPOLL_CTL_MOD时,会调用ep_modify()函数改变要监控fd的事件。

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
    int pwake = 0;
    unsigned int revents;
    unsigned long flags;
    /*
     * Set the new event interest mask before calling f_op->poll(), otherwise
     * a potential race might occur. In fact if we do this operation inside
     * the lock, an event might happen between the f_op->poll() call and the
     * new event set registering.
     */
    epi->event.events = event->events;
    /*
     * Get current event bits. We can safely use the file* here because
     * its usage count has been increased by the caller of this function.
     */
    // 当poll_table的参数为NULL时,对应文件系统注册的poll函数将不再执行回调函数ep_ptable_queue_proc(),
    // 因为该回调函数已经注册过了。这里只需要直接返回准备好的事件对应的位。
    revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
    // 添加自旋锁
    spin_lock_irqsave(&ep->lock, flags);
    /* Copy the data member from inside the lock */
    // 把用户要监控的事件,复制给eventpoll句柄中的事件变量
    epi->event.data = event->data;

    /*
     * If the item is "hot" and it is not registered inside the ready
     * list, push it inside.
     */
    // 若在用户需要监控的事件中,有已经准备好的事件,把该事件添加到已准备好fd队列中,然后唤醒等待队列。
    if (revents & event->events) {
        if (!ep_is_linked(&epi->rdllink)) {
            list_add_tail(&epi->rdllink, &ep->rdllist);
            /* Notify waiting tasks that events are available */
            if (waitqueue_active(&ep->wq))
                wake_up_locked(&ep->wq);
            if (waitqueue_active(&ep->poll_wait))
                pwake++;
        }
    }
    spin_unlock_irqrestore(&ep->lock, flags); // 解锁
    /* We have to call this outside the lock */
    // 遍历等待队列,并执行对应的回调函数,该回调函数把准备好的fd,添加到已完成fd队列中
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);
    return 0;
}


. poll的回调函数
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * "whead" is the wait queue head supplied by the target file's poll
 * method, and "pt" is the poll table embedded in the "struct ep_pqueue"
 * set up by ep_insert().
 *
 * A new "struct eppoll_entry" is allocated, its wait queue item is given
 * ep_poll_callback() as wake-up function and is added to "whead"; the
 * entry is then linked onto the item's pwqlist. If the item is already
 * flagged as failed, or the allocation fails, nwait is set to -1.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
     poll_table *pt)
{
   /* Recover the item from the enclosing ep_pqueue. */
   struct epitem *epi = ep_item_from_epqueue(pt);
   struct eppoll_entry *entry = NULL;

   /* Do not allocate anything once a previous registration has failed. */
   if (epi->nwait >= 0)
     entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

   if (!entry) {
     /* We have to signal that an error occurred */
     epi->nwait = -1;
     return;
   }

   /* Wake ups on "whead" will now run ep_poll_callback(). */
   init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
   entry->whead = whead;
   entry->base = epi;
   add_wait_queue(whead, &entry->wait);

   /* Track the entry on the item. */
   list_add_tail(&entry->llink, &epi->pwqlist);
   epi->nwait++;
}


/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * It moves the reporting item onto the ready list of its eventpoll
 * instance (or onto the overflow chain while that chain is active) and
 * wakes up the waiters. Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
 struct epitem *epi = ep_item_from_wait(wait);
 struct eventpoll *ep = epi->ep;
 unsigned long irqflags;
 int need_pwake = 0;

 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
       current, epi->ffd.file, epi, ep));

 spin_lock_irqsave(&ep->lock, irqflags);

 /*
  * If the event mask does not contain any poll(2) event, we consider the
  * descriptor to be disabled. This condition is likely the effect of the
  * EPOLLONESHOT bit that disables the descriptor when an event is received,
  * until the next EPOLL_CTL_MOD will be issued.
  */
 if ((epi->event.events & ~EP_PRIVATE_BITS) == 0)
  goto out_unlock;

 /*
  * If we are transferring events to userspace, we can hold no locks
  * (because we're accessing user memory, and because of linux f_op->poll()
  * semantics). All the events that happens during that period of time are
  * chained in ep->ovflist and requeued later on.
  */
 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
  /* Chain the item only once. */
  if (epi->next == EP_UNACTIVE_PTR) {
   epi->next = ep->ovflist;
   ep->ovflist = epi;
  }
  goto out_unlock;
 }

 /* Queue the item on the ready list unless it is already there. */
 if (!ep_is_linked(&epi->rdllink))
  list_add_tail(&epi->rdllink, &ep->rdllist);

 /*
  * Wake up ( if active ) both the eventpoll wait list and the ->poll()
  * wait list.
  */
 if (waitqueue_active(&ep->wq))
  wake_up_locked(&ep->wq);
 if (waitqueue_active(&ep->poll_wait))
  need_pwake = 1;

out_unlock:
 spin_unlock_irqrestore(&ep->lock, irqflags);

 /* We have to call this outside the lock */
 if (need_pwake)
  ep_poll_safewake(&psw, &ep->poll_wait);

 return 1;
}

总结要点:
(1) epoll是基于linux虚拟文件系统框架实现的。
(2) 在epoll_ctl函数中,其实就是往epoll_create创建的eventpoll变量中的epitem红黑树中插入被监控fd和file的过程。
(3) 在插入过程中,设置poll_table参数中的回调函数qproc为ep_ptable_queue_proc。然后调用被监控fd对应文件系统的poll函数,tcp的是tcp_poll,udp的是udp_poll。在这些poll函数中,会把poll_table传入作为参数,而且会调用poll_table->qproc()函数,也就是这里的ep_ptable_queue_proc。
(4) 在ep_ptable_queue_proc函数中,会为被监控fd设置中断唤醒回调函数ep_poll_callback。当该fd有事件发生,而进程被唤醒时,则会调用ep_poll_callback函数。
(5) ep_poll_callback函数主要功能是把fd对应的epitem(通过其rdllink成员)添加到eventpoll的rdllist队列中,表示该fd有事件发生了。
(6) 而epoll_wait函数,最终会把eventpoll的rdllist队列中的数据往用户层复制,告诉用户哪些fd发生了什么事件。
阅读(3538) | 评论(0) | 转发(1) |
给主人留下些什么吧!~~