重读2.4 052 fs/select.c
2008-4-22
这里讨论的是select和poll的系统调用实现方式.
首先要熟悉的是内核中为poll和select定义的各种输入输出消息的含义:
POLLIN
有数据可以读入,read不会阻塞,注意:select的情况下,即使到了EOF也算ready.
POLLPRI 有紧急数据可读,比如TCP socket上的带外数据,或者packet
模式的pseudo-terminal的master端发现slave的状态有变化.
POLLOUT 写入不会阻塞.
POLLRDHUP :从Linux 2.6.17开始支持,表示流式socket的对端关闭了连接,或者关闭了写方向(即tcp的半关闭状态),之后不会再有新数据可读.
POLLERR :出错 (output only,即只会出现在返回的revents中).
POLLHUP :Hang up (output only).
POLLNVAL :Invalid request: fd not open (output only).
如果定义了宏 _XOPEN_SOURCE下面的宏也有效:
POLLRDNORM : 等价于 POLLIN.
POLLRDBAND : 可以读取优先级带(priority band)的数据 (generally unused on Linux).
POLLWRNORM : Equivalent to POLLOUT.
POLLWRBAND : Priority data may be written.
/*
 * sys_select() - entry point of the select() system call (abridged excerpt;
 * the dotted lines mark code left out of this listing).
 *
 * Visible steps: derive the timeout from tvp, run sanity checks, set up the
 * fd bitmaps, call do_select(), then hand the results back to user space.
 */
asmlinkage long
sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
fd_set_bits fds;
char *bits;
.....
timeout = MAX_SCHEDULE_TIMEOUT; /* value used when tvp is NULL */
if (tvp) { /* fetch the caller-supplied timeout */
......
if ((unsigned long) sec < MAX_SELECT_SECONDS) {
/* usec is rounded up in units of 1000000/HZ, then sec * HZ is added */
timeout = ROUND_UP(usec, 1000000/HZ);
timeout += sec * (unsigned long) HZ;
}
}
ret = -EINVAL;
... /* sanity check */
/* Allocate six bitmaps: in/out/ex plus res_in/res_out/res_ex. */
......
ret = do_select(n, &fds, &timeout);
/* Handle how the timeout value is reported back: STICKY_TIMEOUTS. */
........
/* Return the result to the user (copied out, completely overwriting the bits the user passed in). */
....
}
/*
 * Poll event masks used by do_select(). POLLIN_SET is tested there against
 * the caller's input set; POLLOUT_SET and POLLEX_SET presumably play the same
 * role for the output and exception sets (that part of the loop is elided).
 */
#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI) /* as this shows, "except" means things like out-of-band data */
/*
 * do_select() - core scan loop of select() (abridged excerpt; the dotted
 * lines mark code left out of this listing).
 *
 * Polls fds 0..n-1, records ready ones in the result bitmaps, and sleeps
 * between passes until something is ready, the timeout reaches zero, or a
 * signal is pending. Returns retval (incremented once per ready bit found)
 * and writes the remaining timeout back through *timeout.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
....
retval = max_select_fd(n, fds);
/*
 * Per the original note (max_select_fd() is not shown here): checks the
 * user's sets against the bitmap of open files (each requested fd must be
 * open) and returns the number of fds to examine.
 */
.....
n = retval;
....
poll_initwait(&table);
wait = &table;
......
for (;;) {
set_current_state(TASK_INTERRUPTIBLE); /* interruptible sleep: a signal may wake us */
for (i = 0 ; i < n; i++) {
...
file = fget(i); /* get the file pointer for this fd */
mask = POLLNVAL;
if (file) {
mask = DEFAULT_POLLMASK;
if (file->f_op && file->f_op->poll)
mask = file->f_op->poll(file, wait); /* call this file's own poll method */
fput(file);
}
if ((mask & POLLIN_SET) && ISSET(bit,
__IN(fds,off))) {
SET(bit, __RES_IN(fds,off));
retval++;
wait = NULL; /* we have a result, so stop registering for wake-ups */
}
..... /* update the results according to the returned mask */
}
wait = NULL; /* only the first pass needs to hook onto the wait queues */
if (retval || !__timeout || signal_pending(current))
/* return on any of: a ready file, timeout expired, pending signal */
break;
.....
__timeout = schedule_timeout(__timeout); /* otherwise go to sleep... */
}
current->state = TASK_RUNNING;
poll_freewait(&table);
/*
 * Up-to-date the caller timeout.
 */
*timeout = __timeout;
return retval;
}
这里的逻辑是这样:
第一轮扫描的时候调用文件的poll函数(如,pipe_poll),
把正在调用select的进程挂到文件所属的某个等待队列,以备文件ready的时候
唤醒这个队列.
第二轮就不用挂载等待队列了,如果有ready的也就不再挂载了,让app尽快处理.
另:有信号(signal)的时候要返回,因为signal的处理函数是在进程返回用户态的时候才能得到服务.(见arch/../
kernel/entry.S)
简单看看一个poll的实现:
/*
 * pipe_poll() - poll method for pipes.
 * @filp: the pipe file being polled
 * @wait: the caller's poll table, handed on to poll_wait()
 *
 * Returns the mask of poll events computed from the pipe's current state.
 */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
unsigned int mask;
struct inode *inode = filp->f_dentry->d_inode;
poll_wait(filp, PIPE_WAIT(*inode), wait);
/*
 * Per the original note: a poll_table_entry is taken from wait (a table)
 * and its ->wait member is hooked onto the inode's wait queue.
 */
/* Reading only -- no need for acquiring the semaphore. */
mask = POLLIN | POLLRDNORM;
if (PIPE_EMPTY(*inode)) /* pipe is empty: report writable instead of readable */
mask = POLLOUT | POLLWRNORM;
/*
 * No writers left: report hang-up.
 * NOTE(review): the f_version / PIPE_WCOUNTER comparison presumably
 * suppresses POLLHUP until a writer has been seen -- confirm.
 */
if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
mask |= POLLHUP;
if (!PIPE_READERS(*inode)) /* no readers left: report an error condition */
mask |= POLLERR;
return mask;
}
/*
 * __pollwait() - queue the calling task on one wait queue for select/poll
 * (abridged excerpt; the table-initialisation branch is elided).
 * @filp: file being polled; a reference is taken with get_file()
 * @wait_address: head of the wait queue to join
 * @p: the caller's poll table
 */
void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
/* current is about to wait on this queue */
/* the wait table of this select call */
struct poll_table_page *table = p->table;
if (!table || POLL_TABLE_FULL(table)) {
.../* table initialisation */
}
/* Add a new entry */
{
struct poll_table_entry * entry = table->entry;
table->entry = entry+1;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address; /* e.g. the wait queue of the pipe's inode seen in pipe_poll() */
init_waitqueue_entry(&entry->wait, current); /* current is the task that called select */
add_wait_queue(wait_address,&entry->wait); /* the select poll table now waits on that queue */
}
}
可见,select的wait
table记载了当前调用select的进程现在在等待多少资源,加入了多少等待队列.
而sys_poll和select是极为类似的,不再分析了..
asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)