epoll源码分析---sys_epoll_ctl()函数-Larpenteur-ChinaUnix博客

尘世中一个迷途小书童riverhwp.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

Larpenteur

博客访问： 6456454
博文数量： 2759
博客积分： 1021
博客等级：中士
技术积分： 4091
用户组：普通用户
注册时间： 2012-03-11 14:14

文章分类

全部博文（2759）

Todo（1）
Advice（151）
Linux-未分类（223）
Ubuntu（47）
Database（145）
算法&DS（77）
Android（47）
Web（214）
Geek（237）
CPPC（296）
Java（113）
Python（99）
Matlab（19）
Git（19）
SVN（11）
Gnuplot（5）
面试（0）
机器-挖掘-AI（6）
开源项目（1）
Happy Drawe（9）
Programming（144）

Tools（23）

Shell（66）

Makefile（11）

GDB（26）

vim（18）
System（628）

Author（110）

Common（4）

Memory（66）

File system（82）

Driver（19）

IO（66）

Storage（45）

General（38）

Architecture（19）

Command（64）

Kernel（115）
Virtualization（39）
Cloud（33）
Hadoop（71）
Big Data（24）
未分配的博文（100）

文章存档

2019年（1）

2017年（84）

2016年（196）

2015年（204）

2014年（636）

2013年（1176）

2012年（463）

我的朋友

最近访客

推荐博文

epoll源码分析---sys_epoll_ctl()函数

分类：

2013-01-15 02:17:38

原文地址：epoll源码分析---sys_epoll_ctl()函数作者：justlinux2010

一、sys_epoll_ctl()函数

源码和注释如下：

/*
 * epoll_ctl() system call: add, remove or modify a watched file on an
 * eventpoll instance.
 *
 * @epfd:  fd of the eventpoll file, created by epoll_create
 * @op:    control command (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD)
 * @fd:    file descriptor to operate on
 * @event: user-space event object associated with @fd
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or:
 *   -EFAULT  the event could not be copied from user space
 *   -EBADF   @epfd or @fd does not resolve to an open file
 *   -EPERM   the target file does not support poll
 *   -EINVAL  @epfd is not an eventpoll file, @epfd and @fd are the same
 *            file, or @op is not one of the handled commands
 *   -EEXIST  EPOLL_CTL_ADD on an fd that is already watched
 *   -ENOENT  EPOLL_CTL_DEL/EPOLL_CTL_MOD on an fd that is not watched
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event uevent;
	struct file *epfile, *target;
	struct eventpoll *ep;
	struct epitem *item;
	int ret;

	/*
	 * Only commands that carry an event need it copied in from user
	 * space; nothing has been acquired yet, so fail with a plain return.
	 */
	if (ep_op_has_event(op) &&
	    copy_from_user(&uevent, event, sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file. */
	epfile = fget(epfd);
	if (!epfile)
		return -EBADF;

	/* Get the "struct file *" for the target file. */
	target = fget(fd);
	if (!target) {
		ret = -EBADF;
		goto put_epfile;
	}

	/* The target file descriptor must support poll. */
	if (!(target->f_op && target->f_op->poll)) {
		ret = -EPERM;
		goto put_target;
	}

	/*
	 * The file underneath epfd must really be an eventpoll file, and
	 * adding an epoll file descriptor inside itself is not permitted.
	 */
	if (epfile == target || !is_file_epoll(epfile)) {
		ret = -EINVAL;
		goto put_target;
	}

	/*
	 * From here on "private_data" is known to hold our own structure,
	 * the one set up by epoll_create.
	 */
	ep = epfile->private_data;

	mutex_lock(&ep->mtx);

	/*
	 * Look the (file, fd) pair up in the RB tree of watched files.
	 * Because "mtx" is held, the item returned by ep_find() stays usable
	 * until the mutex is released.
	 */
	item = ep_find(ep, target, fd);

	/* Any command not handled below is rejected as invalid. */
	ret = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		/* Insert only if the fd is not watched yet. */
		if (item) {
			ret = -EEXIST;
			break;
		}
		uevent.events |= POLLERR | POLLHUP;
		ret = ep_insert(ep, &uevent, target, fd);
		break;
	case EPOLL_CTL_DEL:
		ret = item ? ep_remove(ep, item) : -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (!item) {
			ret = -ENOENT;
			break;
		}
		uevent.events |= POLLERR | POLLHUP;
		ret = ep_modify(ep, item, &uevent);
		break;
	}

	mutex_unlock(&ep->mtx);

put_target:
	fput(target);
put_epfile:
	fput(epfile);
	return ret;
}

该函数首先在eventpoll中查找操作的fd对应的epitem对象是否存在，然后根据用户指定的命令参数，作相应的处理。每个添加到epoll的文件都会附加到一个epitem对象中。epoll的删除文件和修改文件命令，分别由ep_remove（）和ep_modify（）来完成，这两个函数比较简单，不作过多分析。主要关心的是epoll的添加命令对应的函数ep_insert()。

二、ep_insert（）函数

源码及分析如下：

/*
 * Create the epitem for a newly watched file and hook it into the eventpoll
 * instance.
 *
 * @ep:    eventpoll instance the file is added to
 * @event: events the caller is interested in (already copied from user space)
 * @tfile: file to watch
 * @fd:    file descriptor number of @tfile
 *
 * Returns 0 on success, -ENOSPC when the per-user watch limit is reached,
 * or -ENOMEM when the epitem or one of its wait queue entries cannot be
 * allocated.
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/*
	 * Refuse the insert once the number of files this user watches
	 * through epoll has reached max_user_watches.
	 * NOTE(review): this check and the atomic_inc() near the end of the
	 * function are separate steps - confirm whether concurrent inserts
	 * for the same user can overshoot the limit.
	 */
	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
		     max_user_watches))
		return -ENOSPC;

	/* Every file added to epoll is attached to its own epitem. */
	epi = kmem_cache_alloc(epi_cache, GFP_KERNEL);
	if (!epi)
		return -ENOMEM;

	/* Initialize the freshly allocated epitem. */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll the file to get its current state into revents. If fd is a
	 * socket, f_op is socket_file_ops and poll is sock_poll(); for a TCP
	 * socket that goes on to tcp_poll(). The poll handler calls
	 * sock_poll_wait(), which invokes the function epq.pt.qproc points
	 * to, i.e. ep_ptable_queue_proc().
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * ep_ptable_queue_proc() sets nwait to -1 when its memory allocation
	 * fails.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/*
	 * Add the current item to the list of active epoll hooks for this
	 * file (f_ep_links); the list is used to clean up the file's epitems
	 * when the file is removed from epoll.
	 */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/* Add the item to the RB tree that stores all watched files. */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the file is already ready for a requested event and the item is
	 * not on the ready list yet, queue it there now.
	 */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/*
		 * Notify waiting tasks that events are available: wake up a
		 * process sleeping in epoll_wait, if there is one.
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);

		/*
		 * Somebody is waiting for events on the eventpoll file
		 * itself: remember it in pwake and do the wakeup after the
		 * lock has been released.
		 */
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	/* Account for the new watch against the owning user. */
	atomic_inc(&ep->user->epoll_watches);

	/*
	 * We have to call this outside the lock: wake up the processes
	 * waiting for the eventpoll file to become ready.
	 */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}

ep_insert()函数首先分配fd要附加到的epitem实例，初始化后会将其添加到eventpoll中存储文件的红黑树、监视文件的f_ep_links链表以及监视文件的唤醒队列中。在加入监视文件的唤醒队列时，如果用户关心的事件已经发生，则会将epitem实例添加到eventpoll的就绪队列中。对poll函数的调用（即 revents = tfile->f_op->poll(tfile, &epq.pt); 这一行，原代码清单中的第52行）就是将epitem实例添加到文件的唤醒队列中，真正执行添加操作的是ep_ptable_queue_proc（）函数。

三、ep_ptable_queue_proc（）函数

源码及注释如下：

/*
 * Poll-table queue callback: called from a file's poll operation to hook
 * epoll's wakeup callback (ep_poll_callback) onto the target file's wait
 * queue.
 *
 * @file:  file being polled
 * @whead: wait queue head to attach to; when the watched file is a socket
 *         this is the address of the sk_sleep member of the sock structure
 * @pt:    poll table embedded in the ep_pqueue that identifies the epitem
 *
 * On failure the epitem's nwait is set to -1 so that the caller can detect
 * the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *item = ep_item_from_epqueue(pt);
	struct eppoll_entry *entry = NULL;

	/* Only try to allocate while no earlier error has been recorded. */
	if (item->nwait >= 0)
		entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

	if (!entry) {
		/*
		 * We have to signal that an error occurred: either the
		 * allocation failed just now or an error was already
		 * recorded on this item.
		 */
		item->nwait = -1;
		return;
	}

	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = item;
	add_wait_queue(whead, &entry->wait);
	list_add_tail(&entry->llink, &item->pwqlist);
	item->nwait++;
}

从上面的函数可以看出，注册在监视文件的唤醒队列上的回调方法是ep_poll_callback（）函数。也就是说，当有事件发生、监视文件唤醒其等待队列时，该回调函数就会被调用。例如，在tcp_prequeue（）函数中，当有数据到达时会唤醒等待队列sk_sleep上的进程，代码片段如下：

/*
 * Excerpt only: the "......." lines stand for code left out of this
 * snippet. Shown to illustrate where waiters on the socket's sk_sleep wait
 * queue are woken up.
 */
static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
.......
/*
 * Wake the entries queued on sk->sk_sleep; the POLLIN | POLLRDNORM |
 * POLLRDBAND mask is passed along with the wakeup.
 */
wake_up_interruptible_poll(sk->sk_sleep,
POLLIN | POLLRDNORM | POLLRDBAND);
.......
}

wake_up_interruptible_poll()函数会调用注册到sk_sleep中的回调函数，如果是eventpoll注册的话，该回调函数就是ep_poll_callback（）。

四、ep_poll_callback（）函数

/*
 * Wait queue callback run when a watched file that supports epoll reports an
 * event. The events that occurred are delivered through @key; see the call
 * to wake_up_interruptible_poll() in tcp_prequeue().
 *
 * @wait: address of the wait member of the eppoll_entry that
 *        ep_ptable_queue_proc() allocated when hooking into the file's
 *        wait queue
 * @mode: not used by this callback; its value is the state of the sleeping
 *        process
 * @sync: wakeup flag for the waiting process
 * @key:  event mask supplied by the waker; may be NULL for devices that do
 *        not report events this way
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct epitem *item = ep_item_from_wait(wait);
	struct eventpoll *owner = item->ep;
	unsigned long irq_state;
	int notify_pollers = 0;

	spin_lock_irqsave(&owner->lock, irq_state);

	/*
	 * item->event.events holds what user space is interested in. If the
	 * mask does not contain any poll(2) event, we consider the descriptor
	 * to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is
	 * received, until the next EPOLL_CTL_MOD will be issued.
	 */
	if ((item->event.events & ~EP_PRIVATE_BITS) == 0)
		goto unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test. A non-NULL key
	 * that matches none of the requested events is ignored.
	 */
	if (key && ((unsigned long) key & item->event.events) == 0)
		goto unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux
	 * f_op->poll() semantics). All the events that happen during that
	 * period of time are chained in ep->ovflist and requeued later on.
	 *
	 * ep_scan_ready_list(), which passes events to user space, stashes
	 * the ovflist elements in a temporary and sets ovflist to NULL while
	 * it runs, and EP_UNACTIVE_PTR is ((void *) -1L), so
	 * ovflist != EP_UNACTIVE_PTR means a transfer is in progress.
	 */
	if (unlikely(owner->ovflist != EP_UNACTIVE_PTR)) {
		/*
		 * A next pointer other than EP_UNACTIVE_PTR means the item
		 * is already chained on ovflist; do not add it twice.
		 */
		if (item->next == EP_UNACTIVE_PTR) {
			item->next = owner->ovflist;
			owner->ovflist = item;
		}
		goto unlock;
	}

	/*
	 * No transfer in progress and a requested event has occurred: put
	 * the item on the ready list unless it is already there.
	 */
	if (!ep_is_linked(&item->rdllink))
		list_add_tail(&item->rdllink, &owner->rdllist);

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list. The first holds the processes sleeping in epoll_wait().
	 */
	if (waitqueue_active(&owner->wq))
		wake_up_locked(&owner->wq);

	/* Waiters on the eventpoll file itself are woken after unlocking. */
	if (waitqueue_active(&owner->poll_wait))
		notify_pollers++;

unlock:
	spin_unlock_irqrestore(&owner->lock, irq_state);

	/* We have to call this outside the lock */
	if (notify_pollers)
		ep_poll_safewake(&owner->poll_wait);

	return 1;
}

该函数的主要功能是：当被监视文件的等待事件就绪时，将文件对应的epitem实例添加到就绪队列中；当用户调用epoll_wait()时，内核会将就绪队列中的事件报告给用户。

阅读(728) | 评论(0) | 转发(0) |

上一篇：如何实现shell并发

下一篇：Linux 下 Apache 编译安装操作

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6