Category: LINUX

2010-03-23 18:49:44

epoll requires support from the underlying driver or filesystem: the monitored file must implement the poll file operation (f_op->poll). Regular files on ext2/3 do not provide a meaningful poll, so they cannot be monitored with epoll. The epoll workflow:
1. When an fd is registered, ep_poll_callback is hooked onto the wait queue exposed by the file's inode / poll implementation.
2. When the file becomes ready, the driver or filesystem wakes up that wait queue, invoking the callback registered in step 1. The callback adds the ready fd to the eventpoll ready list (rdllist) and wakes up the process sleeping in epoll_wait().
3. The epoll_wait() process walks the eventpoll ready list and copies each ready fd's information to user space, completing the notification.
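
From user space these three steps map directly onto the three system calls. Below is a minimal sketch, assuming sockfd is an already-open socket (i.e. a file whose f_op->poll is meaningful); the function name and the 16-slot sizes are illustrative, not taken from the kernel source.

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

/* Minimal sketch: register one fd and wait for it to become readable. */
int watch_one_fd(int sockfd)
{
    int epfd = epoll_create(16);                /* "size" is only a hint */
    if (epfd < 0)
        return -1;

    struct epoll_event ev = { .events = EPOLLIN, .data.fd = sockfd };
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) < 0) {   /* step 1: register */
        close(epfd);
        return -1;
    }

    struct epoll_event ready[16];
    int n = epoll_wait(epfd, ready, 16, 1000);  /* step 3: fetch ready fds (1s timeout) */
    for (int i = 0; i < n; i++)                 /* step 2 happened inside the kernel */
        printf("fd %d ready, events=0x%x\n", ready[i].data.fd, (unsigned)ready[i].events);

    close(epfd);
    return n;
}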


/*
* This structure is stored inside the "private_data" member of the file
* structure and rapresent the main data sructure for the eventpoll
* interface.
*/
struct eventpoll {
/* Protect the this structure access */
rwlock_t lock;

/*
* This semaphore is used to ensure that files are not removed
* while epoll is using them. This is read-held during the event
* collection loop and it is write-held during the file cleanup
* path, the epoll file exit code and the ctl operations.
*/
struct rw_semaphore sem;

/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;

/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;

/* List of ready file descriptors */
struct list_head rdllist;

/* RB-Tree root used to store monitored fd structs */
struct rb_root rbr;
};

/*
* It opens an eventpoll file descriptor by suggesting a storage of "size"
* file descriptors. The size parameter is just an hint about how to size
* data structures. It won't prevent the user to store more than "size"
* file descriptors inside the epoll interface. It is the kernel part of
* the userspace epoll_create(2).
*/
//Note: the size parameter is only a hint and is not actually used.
//This function just creates an eventpoll structure and returns a file descriptor
//through which that eventpoll structure can later be retrieved.

asmlinkage long sys_epoll_create(int size)
{
int error, fd = -1;
struct eventpoll *ep;
struct inode *inode;
struct file *file;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, size));

/*
* Sanity check on the size parameter, and create the internal data
* structure ( "struct eventpoll" ).
*/
error = -EINVAL;
if (size <= 0 || (error = ep_alloc(&ep)) != 0) // allocate and initialize an eventpoll structure
goto eexit_1;

/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure, and inode and a free file descriptor.
*/
//Allocate an inode in the eventpoll filesystem together with the matching fd and
//file structures, and set file->private_data = ep, i.e. tie ep to this file/fd.
error = ep_getfd(&fd, &inode, &file, ep);
if (error)
goto eexit_2;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, fd));

return fd;

eexit_2:
ep_free(ep);
kfree(ep);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, error));
return error;
}



struct epoll_event {
__u32 events;
__u64 data;
} EPOLL_PACKED;
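
For reference, the user-space header exposes the same layout with the data field as a union, so a caller can store either an fd or a pointer to per-connection state; the packing attribute mirrors EPOLL_PACKED so 32-bit and 64-bit layouts agree. A sketch of the declarations as applications see them (shown here only for comparison, not meant to be compiled alongside <sys/epoll.h>):

#include <stdint.h>

/* user-space view of the structure above */
typedef union epoll_data {
    void     *ptr;   /* e.g. a pointer to per-connection state */
    int       fd;
    uint32_t  u32;
    uint64_t  u64;
} epoll_data_t;

struct epoll_event {
    uint32_t     events;   /* requested/returned event mask (EPOLLIN, ...) */
    epoll_data_t data;     /* opaque user data, echoed back by epoll_wait() */
} __attribute__((packed)); /* corresponds to EPOLL_PACKED above */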

/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the hash.
*/
struct epitem {
/* RB-Tree node used to link this structure to the eventpoll rb-tree */
struct rb_node rbn;

/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;

/* The file descriptor information this item refers to */
struct epoll_filefd ffd;

/* Number of active wait queue attached to poll operations */
int nwait;

/* List containing poll wait queues */
struct list_head pwqlist;

/* The "container" of this item */
struct eventpoll *ep;

/* The structure that describe the interested events and the source fd */
struct epoll_event event;

/*
* Used to keep track of the usage count of the structure. This avoids
* that the structure will desappear from underneath our processing.
*/
atomic_t usecnt;

/* List header used to link this item to the "struct file" items list */
struct list_head fllink;

/* List header used to link the item to the transfer list */
struct list_head txlink;

/*
* This is used during the collection/transfer of events to userspace
* to pin items empty events set.
*/
unsigned int revents;
};


/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set. It represents
* the kernel part of the user space epoll_ctl(2).
*/
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi; //each monitored fd has one epitem; it is what the insertion/removal/change operations of epoll_ctl act on
struct epoll_event epds;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));

error = -EFAULT;
if (ep_op_hash_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto eexit_1;

/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); //the eventpoll file structure
if (!file)
goto eexit_1;

/* Get the "struct file *" for the target file */
tfile = fget(fd); //the target file structure
if (!tfile)
goto eexit_2;

/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll) //the target file must support poll
goto eexit_3;

/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
//An fd cannot be added to its own epoll set, but an epoll file descriptor may be
//added to another epoll/poll/select set.
if (file == tfile || !is_file_epoll(file))
goto eexit_3;

/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data; //retrieve the eventpoll structure

down_write(&ep->sem);

/* Try to lookup the file inside our hash table */
epi = ep_find(ep, tfile, fd); //look up the epi for this fd in the red-black tree; on success epi->usecnt is incremented

error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD: //insert operation
if (!epi) {
epds.events |= POLLERR | POLLHUP;

error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST; //inserting an entry that already exists is an error
break;
case EPOLL_CTL_DEL: //remove operation
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD: //modify operation
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}

/*
* The function ep_find() increments the usage count of the structure
* so, if this is not NULL, we need to release it.
*/
if (epi)
ep_release_epitem(epi); //drops the reference taken by ep_find(): epi->usecnt--; the epi is freed when the count reaches 0


up_write(&ep->sem);

eexit_3:
fput(tfile);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
current, epfd, op, fd, event, error));

return error;
}
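
The error values in the switch above are visible directly from user space. A small sketch of the three ctl operations (ep_watch and ep_unwatch are hypothetical helper names):

#include <errno.h>
#include <sys/epoll.h>

/* Hypothetical helper: register fd for EPOLLIN; if it is already in the
 * interest set (EEXIST from the EPOLL_CTL_ADD branch), modify it instead. */
static int ep_watch(int epfd, int fd)
{
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };

    if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) == 0)
        return 0;
    if (errno == EEXIST)
        return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
    return -1;
}

/* Hypothetical helper: removing an fd that was never added fails with
 * ENOENT, mirroring the EPOLL_CTL_DEL branch above. A dummy event is
 * passed because kernels before 2.6.9 reject a NULL pointer here. */
static int ep_unwatch(int epfd, int fd)
{
    struct epoll_event dummy = { 0 };
    return epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &dummy);
}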


/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
};

static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;

error = -ENOMEM; //insertion: first allocate an epi
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto eexit_1;

//initialize the epi
/* Item initialization follow here ... */
ep_rb_initnode(&epi->rbn);
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;   //the owning eventpoll
ep_set_ffd(&epi->ffd, tfile, fd); //the target file descriptor
epi->event = *event; //the events the user is interested in
atomic_set(&epi->usecnt, 1);
epi->nwait = 0;


//Note the initialization below: the poll_table is initialized with
//ep_ptable_queue_proc, much as the poll() implementation initializes it with
//__pollwait(). The callback is invoked from poll_wait(), and poll_wait() in turn
//is called inside the driver's or filesystem's file_operations->poll.

/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function.
*/
revents = tfile->f_op->poll(tfile, &epq.pt);

/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0)
goto eexit_2;

/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);

/* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);

/* Add the current item to the rb-tree */
ep_rbtree_insert(ep, epi); //insert into the rb-tree


//If the file is already ready, add it straight to the eventpoll's ep->rdllist.
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);

/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq)) //if the wait queue is non-empty, wake up the sleeping processes
__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}

write_unlock_irqrestore(&ep->lock, flags);

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait); //safely wake any outer epoll/poll waiters on ep->poll_wait (nested epoll case)

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));

return 0;

eexit_2:
ep_unregister_pollwait(ep, epi);

/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
ep_list_del(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);

kmem_cache_free(epi_cache, epi);
eexit_1:
return error;
}


static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->qproc = qproc;
}
/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
//This function hooks a wait-queue entry onto the device's wait queue, so that when
//the device has data to read or write, its read/write path calls wake_up() on that
//queue; for epoll this runs the callback installed below rather than directly waking
//a sleeping process.
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;

if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
//Initialize the wait entry's function to ep_poll_callback; it runs when the wait queue is woken up.
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait); //add to the device's wait queue
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
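
To see who ends up calling ep_ptable_queue_proc(), here is a hypothetical character-device poll method (struct mydev, readq and have_data are invented for illustration, not part of eventpoll.c). poll_wait() hands the device's wait queue to the qproc installed above, and the driver's data-arrival path later wakes that queue, which is what fires ep_poll_callback() below.

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

/* Hypothetical device: one read wait queue plus a data flag. */
struct mydev {
    wait_queue_head_t readq;
    int have_data;
};

static unsigned int mydev_poll(struct file *file, poll_table *wait)
{
    struct mydev *dev = file->private_data;
    unsigned int mask = 0;

    /* This call routes to pt->qproc, i.e. ep_ptable_queue_proc() when the
     * caller is epoll: an eppoll_entry carrying ep_poll_callback() gets
     * queued on dev->readq. */
    poll_wait(file, &dev->readq, wait);

    if (dev->have_data)
        mask |= POLLIN | POLLRDNORM;
    return mask;
}

/* Data-arrival path (interrupt handler, write side, ...): waking the queue
 * runs ep_poll_callback() for every epoll instance watching this device.
 *
 *     dev->have_data = 1;
 *     wake_up_interruptible(&dev->readq);
 */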


/*
* This is the callback that is passed to the wait queue wakeup
* machanism. It is called by the stored file descriptors when they
* have events to report.
*/
//This function embodies the essential difference between epoll and poll/select.
//poll/select walk every file descriptor to check whether each one has data to read
//or write; epoll instead lets wake_up() invoke this wait-queue callback when a file
//becomes readable/writable, which places that fd on the ready list. So epoll never
//has to scan all descriptors, only the ready list.

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
current, epi->ffd.file, epi, ep));

write_lock_irqsave(&ep->lock, flags);

/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto is_disabled;

/* If this file is already in the ready list we exit soon */
if (ep_is_linked(&epi->rdllink))
goto is_linked;

list_add_tail(&epi->rdllink, &ep->rdllist); //add to the ready list

is_linked:
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq)) //data is available, wake up
__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;

is_disabled:
write_unlock_irqrestore(&ep->lock, flags);

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);

return 1;
}



======================================================================

/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct file *file;
struct eventpoll *ep;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
current, epfd, events, maxevents, timeout));

/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;

/* Verify that the area passed by the user is writeable */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
error = -EFAULT;
goto eexit_1;
}

/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd);
if (!file)
goto eexit_1;

/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!is_file_epoll(file)) //check that this really is an epoll file descriptor: (f->f_op == &eventpoll_fops)
goto eexit_2;

/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;

/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
current, epfd, events, maxevents, timeout, error));

return error;
}


static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;

/*
* Calculate the timeout by checking for the "infinite" value ( -1 )
* and the overflow condition. The passed timeout is in milliseconds,
* that why (t * HZ) / 1000.
*/
jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
write_lock_irqsave(&ep->lock, flags);

res = 0;
//If no fd is ready yet, sleep here until woken up.
//As seen above, the wakeup comes from the callback hooked onto the device's wait queue via poll_wait().
if (list_empty(&ep->rdllist)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
__add_wait_queue(&ep->wq, &wait); //eventpoll's own wait queue, distinct from the device's wait queue

for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}

write_unlock_irqrestore(&ep->lock, flags);
jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);

set_current_state(TASK_RUNNING);
}

/* Is it worth to try to dig for events ? */
eavail = !list_empty(&ep->rdllist);

write_unlock_irqrestore(&ep->lock, flags);

/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
//there is data to process
if (!res && eavail &&
!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
goto retry;

return res;
}
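
On the user-space side, the -EINTR return and the millisecond timeout handled above usually translate into a retry loop. A minimal sketch (wait_events is a hypothetical wrapper name):

#include <errno.h>
#include <sys/epoll.h>

/* Hypothetical wrapper: block for up to timeout_ms (-1 = forever) and retry
 * when a signal interrupts the sleep (the -EINTR path in ep_poll()). Note
 * that retrying restarts the full timeout. */
static int wait_events(int epfd, struct epoll_event *evs, int maxevents,
                       int timeout_ms)
{
    int n;

    do {
        n = epoll_wait(epfd, evs, maxevents, timeout_ms);
    } while (n < 0 && errno == EINTR);

    return n;   /* 0 means the timeout expired with nothing ready */
}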


/*
* Perform the transfer of events to user space.
*/
static int ep_events_transfer(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
int eventcnt = 0;
struct list_head txlist;

INIT_LIST_HEAD(&txlist);

/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
*/
down_read(&ep->sem);

/* Collect/extract ready items */
if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
/* Build result set in userspace */
eventcnt = ep_send_events(ep, &txlist, events);

/* Reinject ready items into the ready list */
ep_reinject_items(ep, &txlist);
}

up_read(&ep->sem);

return eventcnt;
}


/*
* Since we have to release the lock during the __copy_to_user() operation and
* during the f_op->poll() call, we try to collect the maximum number of items
* by reducing the irqlock/irqunlock switching rate.
*/
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
int nepi;
unsigned long flags;
struct list_head *lsthead = &ep->rdllist, *lnk;
struct epitem *epi;

write_lock_irqsave(&ep->lock, flags);


//move the items from the ep->rdllist queue to the txlist queue
for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
epi = list_entry(lnk, struct epitem, rdllink);

lnk = lnk->next;

/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->txlink)) {
/*
* This is initialized in this way so that the default
* behaviour of the reinjecting code will be to push back
* the item inside the ready list.
*/
//the events pending processing
epi->revents = epi->event.events;

/* Link the ready item into the transfer list */
list_add(&epi->txlink, txlist);
nepi++;

/*
* Unlink the item from the ready list.
*/
ep_list_del(&epi->rdllink); //unlink from the ready list
}
}

write_unlock_irqrestore(&ep->lock, flags);

return nepi;
}


/*
* This function is called without holding the "ep->lock" since the call to
* __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
* because of the way poll() is traditionally implemented in Linux.
*/
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
struct epoll_event __user *events)
{
int eventcnt = 0;
unsigned int revents;
struct list_head *lnk;
struct epitem *epi;

/*
* We can loop without lock because this is a task private list.
* The test done during the collection loop will guarantee us that
* another task will not try to collect this file. Also, items
* cannot vanish during the loop because we are holding "sem".
*/
list_for_each(lnk, txlist) {
epi = list_entry(lnk, struct epitem, txlink);

/*
* Get the ready file event set. We can safely use the file
* because we are holding the "sem" in read and this will
* guarantee that both the file and the item will not vanish.
*/
//re-query the file's current ready events
revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

/*
* Set the return event set for the current file descriptor.
* Note that only the task task was successfully able to link
* the item to its "txlist" will write this field.
*/
epi->revents = revents & epi->event.events; //keep only the events the user asked for

if (epi->revents) { //copy to user space
if (__put_user(epi->revents,
&events[eventcnt].events) ||
__put_user(epi->event.data,
&events[eventcnt].data))
return -EFAULT;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
eventcnt++;
}
}
return eventcnt;
}



//As shown below, ep_eventpoll_poll() is a concrete poll implementation for the
//eventpoll file; it is a sample of what a device or filesystem has to provide.
//epoll implements its own poll() for two purposes: first, sys_epoll_wait() uses it
//to check whether a descriptor is an epoll descriptor (f->f_op == &eventpoll_fops);
//second, it lets this special epoll file descriptor itself be added to other
//epoll/poll/select descriptor sets.

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release   = ep_eventpoll_close,
.poll       = ep_eventpoll_poll
};


static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
unsigned int pollflags = 0;
unsigned long flags;
struct eventpoll *ep = file->private_data;

/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);

/* Check our condition */
read_lock_irqsave(&ep->lock, flags);
if (!list_empty(&ep->rdllist))
pollflags = POLLIN | POLLRDNORM;
read_unlock_irqrestore(&ep->lock, flags);

return pollflags;
}
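
Because the epoll fd has a working f_op->poll of its own (the function above), it can itself be placed inside another epoll/poll/select set; the outer set sees POLLIN whenever the inner instance's ready list is non-empty. A sketch (nest_epoll is a hypothetical name):

#include <sys/epoll.h>

/* Hypothetical nesting: watch an inner epoll instance from an outer one.
 * ep_eventpoll_poll() on inner_epfd reports POLLIN | POLLRDNORM as soon as
 * its ep->rdllist is non-empty. */
static int nest_epoll(int outer_epfd, int inner_epfd)
{
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner_epfd };
    return epoll_ctl(outer_epfd, EPOLL_CTL_ADD, inner_epfd, &ev);
}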