linux 内核等待队列

Linux内核的等待队列是以双循环链表为基础数据结构，与进程调度机制紧密结合，能够用于实现核心的异步事件通知机制。在Linux2.4.21中，等待队列在源代码树 include/linux/wait.h中，这是一个通过list_head连接的典型双循环链表，如下图所示。

在这个链表中，有两种数据结构：等待队列头（wait_queue_head_t）和等待队列项（wait_queue_t）。等待队列头和等待队列项中都包含一个list_head类型的域作为"连接件"。由于我们只需要对队列进行添加和删除操作，并不会修改其中的对象（等待队列项），因此，我们只需要提供一把保护整个基础设施和所有对象的锁，这把锁保存在等待队列头中，为wq_lock_t类型。在实现中，可以支持读写锁（rwlock）或自旋锁（spinlock）两种类型，通过一个宏定义来切换。如果使用读写锁，将wq_lock_t定义为rwlock_t类型；如果是自旋锁，将wq_lock_t定义为spinlock_t类型。无论哪种情况，分别相应设置wq_read_lock、wq_read_unlock、wq_read_lock_irqsave、 wq_read_unlock_irqrestore、wq_write_lock_irq、wq_write_unlock、 wq_write_lock_irqsave和wq_write_unlock_irqrestore等宏。
等待队列头
/*
 * Head of a wait queue: the anchor of the circular list of waiters together
 * with the single lock that guards the list and the entries linked on it.
 */
struct __wait_queue_head {
/* Either rwlock_t or spinlock_t, depending on how wq_lock_t is defined. */
wq_lock_t lock;
/* List anchor for the wait_queue_t entries; points at itself when empty. */
struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;
前面已经说过，等待队列的主体是进程，这反映在每个等待队列项中，是一个任务结构指针（struct task_struct * task）。flags为该进程的等待标志，当前只支持互斥。
等待队列项
/*
 * One entry on a wait queue, representing a single waiting task.
 */
struct __wait_queue {
/* Wait flags; WQ_FLAG_EXCLUSIVE is the only bit defined here. */
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
/* The task that is waiting on the queue. */
struct task_struct * task;
/* Links this entry into the list anchored at wait_queue_head_t.task_list. */
struct list_head task_list;
};
typedef struct __wait_queue wait_queue_t;
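每一个等待队列项代表一个睡眠进程，该进程等待某一事件的发生。在上面2.4内核的定义中，进程描述符的地址保存在task字段中；在较新的2.6内核中，该字段改名为private，并新增了func字段（见后文的DEFINE_WAIT）。task_list字段是一个list_head，由它把该等待队列项链接到等待相同事件的进程链表中。
等待队列项的func字段（2.6内核）指向唤醒该项时要调用的函数；至于睡眠进程是以互斥方式还是非互斥方式被唤醒，则由flags字段中的WQ_FLAG_EXCLUSIVE标志决定。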
声明和初始化
#define DECLARE_WAITQUEUE(name, tsk)     \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
#define __WAITQUEUE_INITIALIZER(name, tsk) {    \
task: tsk,      \
task_list: { NULL, NULL },     \
    __WAITQUEUE_DEBUG_INIT(name)}
通过DECLARE_WAITQUEUE宏将等待队列项初始化成对应的任务结构，并且用于连接的相关指针均设置为空。其中加入了调试相关代码。
#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {    \
lock: WAITQUEUE_RW_LOCK_UNLOCKED,   \
task_list: { &(name).task_list, &(name).task_list }, \
   __WAITQUEUE_HEAD_DEBUG_INIT(name)}
通过DECLARE_WAIT_QUEUE_HEAD宏初始化一个等待队列头，使得其所在链表为空，并设置链表为"未上锁"状态。其中加入了调试相关代码。
static inline void init_waitqueue_head(wait_queue_head_t *q)
该函数初始化一个已经存在的等待队列头，它将整个队列设置为"未上锁"状态，并将链表指针prev和next指向它自身。
{
    q->lock = WAITQUEUE_RW_LOCK_UNLOCKED;
    INIT_LIST_HEAD(&q->task_list);
}
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
该函数初始化一个已经存在的等待队列项，它设置对应的任务结构，同时将标志位清0。
{
    q->flags = 0;
    q->task = p;
}
static inline int waitqueue_active(wait_queue_head_t *q)
该函数检查等待队列是否为空。
{
    return !list_empty(&q->task_list);
}
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
将指定的等待队列项new添加到等待队列头head所在的链表头部，该函数假设已经获得锁。
{
    list_add(&new->task_list, &head->task_list);
}
static inline void __add_wait_queue_tail(wait_queue_head_t *head, wait_queue_t *new)
将指定的等待队列项new添加到等待队列头head所在的链表尾部，该函数假设已经获得锁。
{
    list_add_tail(&new->task_list, &head->task_list);
}
static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
该函数从等待队列头head所在的链表中删除指定的等待队列项old，它假设调用者已经获得锁，并且old确实在head所在的链表中。
{
    list_del(&old->task_list);
}
睡眠和唤醒操作
对等待队列的操作包括睡眠和唤醒（相关函数保存在源代码树的/kernel/sched.c和include/linux/sched.h中）。思想是更改当前进程（CURRENT）的任务状态，并要求重新调度，因为这时这个进程的状态已经改变，不再在调度表的就绪队列中，因此无法再获得执行机会，进入"睡眠"状态，直至被"唤醒"，即其任务状态重新被修改回就绪态。
2. 等待队列的睡眠过程
使用等待队列前通常先定义一个等待队列头：static wait_queue_head_t wq ,然后调用wait_event_*函数将等待某条件condition的当前进程插入到等待队列wq中并睡眠，一直等到condition条件满足后，内核再将睡眠在等待队列wq上的某一进程或所有进程唤醒。
定义等待队列头没什么好讲的，下面从调用wait_event_*开始分析：
这里我们举比较常用的wait_event_interruptible：
/**
* wait_event_interruptible - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible(wq, condition)                               \
({                                                                                 \
         int __ret = 0;                                                            \
         if (!(condition))                                                        \
                   __wait_event_interruptible(wq, condition, __ret);         \
         __ret;                                                                        \
})
这里很简单，判断一下condition条件是否满足，如果不满足则调用__wait_event_interruptible函数。
#define __wait_event_interruptible(wq, condition, ret)                            \
do {                                                                             \
         DEFINE_WAIT(__wait);                                                  \
                                                                                    \
         for (;;) {                                                             \
                   prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);    \
                   if (condition)                                                   \
                            break;                                                     \
                   if (!signal_pending(current)) {                             \
                            schedule();                                             \
                            continue;                                      \
                   }                                                                \
                   ret = -ERESTARTSYS;                                              \
                   break;                                                               \
         }                                                                         \
         finish_wait(&wq, &__wait);                                         \
} while (0)
__wait_event_interruptible首先定义了一个wait_queue_t类型的等待队列项__wait :
#define DEFINE_WAIT(name)                                                \
         wait_queue_t name = {                                                  \
                   .private    = current,                                     \
                   .func                   = autoremove_wake_function,                 \
                   .task_list = LIST_HEAD_INIT((name).task_list),     \
         }
可以发现，这里__wait的private成员(通常用来存放进程的描述符)已经被初始化为current, 表示该等待队列项对应为当前进程。func成员为该等待队列项对应的唤醒函数，唤醒者在唤醒该等待队列项时会调用它（见后文的__wake_up_common），这里已经被初始化为默认的 autoremove_wake_function函数。
然后在一个for (;;) 循环内调用prepare_to_wait函数：
/*
 * prepare_to_wait - queue a wait entry and mark the current task as sleeping
 * @q:     wait queue head to sleep on
 * @wait:  the caller's wait queue entry
 * @state: task state to set (e.g. TASK_INTERRUPTIBLE)
 *
 * Clears WQ_FLAG_EXCLUSIVE on @wait, adds @wait to @q unless it is already
 * linked on a list, and sets the current task's state to @state when
 * is_sync_wait(@wait) holds. Callers such as __wait_event_interruptible
 * re-test their condition after this returns and before calling schedule().
 */
void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
         unsigned long flags;
         /* This variant always queues a non-exclusive waiter. */
         wait->flags &= ~WQ_FLAG_EXCLUSIVE;
         spin_lock_irqsave(&q->lock, flags);
         /* Called once per loop iteration: only link the entry if it is not linked yet. */
         if (list_empty(&wait->task_list))
                   __add_wait_queue(q, wait);

         /*
          * don't alter the task state if this is just going to
          * queue an async wait queue callback
          */
         if (is_sync_wait(wait))
                   set_current_state(state);
         spin_unlock_irqrestore(&q->lock, flags);
}
prepare_to_wait 做如下两件事：将先前定义的等待队列项__wait插入到等待队列头wq（如果它尚未在队列中），然后将当前进程设为TASK_INTERRUPTIBLE状态。 prepare_to_wait执行完后立即再检查一下condition有没有满足，如果此时碰巧满足了则不必再睡眠了。如果还没有满足，则准备睡眠。
睡眠是通过调用schedule()函数实现的，由于之前已经将当前进程设置为TASK_INTERRUPTIBLE状态，因而这里再执行 schedule()进行进程切换的话，之后就永远不会再调度到该进程运行的，直到该进程被唤醒（即更改为TASK_RUNNING状态）。
这里在执行schedule()切换进程前会先判断一下有没有signal过来，如果有则将返回值设为-ERESTARTSYS并跳出循环；没有的话则执行schedule()进入睡眠。

for (;;) 循环的作用是让进程被唤醒后再一次去检查一下condition是否满足。主要是为了防止等待队列上的多个进程被同时唤醒后有可能其他进程已经抢先把资源占有过去造成资源又变为不可用，因此最好再判断一下。(当然，内核也提供了仅唤醒一个或多个进程（独占等待进程）的方式，有兴趣的可以参考相关资料)
进程被唤醒后最后一步是调用finish_wait(&wq, &__wait)函数进行清理工作。finish_wait将进程的状态再次设为TASK_RUNNING并从等待队列中删除该进程。
/*
 * finish_wait - clean up after waiting on a queue
 * @q:    wait queue head that was waited on
 * @wait: the caller's wait queue entry
 *
 * Puts the current task back into TASK_RUNNING and, if @wait is still
 * linked, unlinks it from @q under the queue lock.
 */
void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
         unsigned long flags;
         __set_current_state(TASK_RUNNING);
         /*
          * Lockless check first: the lock is only taken when the entry is
          * still linked.
          * NOTE(review): presumably the entry is usually already unlinked by
          * the wake function (DEFINE_WAIT installs autoremove_wake_function)
          * -- confirm.
          */
         if (!list_empty_careful(&wait->task_list)) {
                   spin_lock_irqsave(&q->lock, flags);
                   /* list_del_init rather than list_del: the entry may be queued again later. */
                   list_del_init(&wait->task_list);
                   spin_unlock_irqrestore(&q->lock, flags);
         }
}
再往后就是返回你先前调用wait_event_interruptible(wq, condition)被阻塞的地方继续往下执行。

3. 等待队列的唤醒过程
直到这里我们明白等待队列是如何睡眠的，下面我们分析等待队列的唤醒过程。
使用等待队列有个前提，必须得有人唤醒它。如果没人唤醒，睡眠在该等待队列上的所有进程就会一直睡下去，永远得不到运行（注意这与通常所说的“僵尸进程”不是一回事，后者指已经退出但尚未被父进程回收的进程）。
对于设备驱动来讲，通常是在中断处理函数内唤醒该设备的等待队列。驱动程序通常会提供一组自己的读写等待队列以实现上层(user level)所需的BLOCK和O_NONBLOCK操作。当设备资源可用时，如果驱动发现有进程睡眠在自己的读写等待队列上便会唤醒该等待队列。
唤醒一个等待队列是通过wake_up_*函数实现的。这里我们举对应的wake_up_interruptible作为例子分析。定义如下：
#define wake_up_interruptible(x)   __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
这里的参数x即要唤醒的等待队列对应的等待队列头。唤醒TASK_INTERRUPTIBLE类型的进程并且默认唤醒该队列上所有非独占等待进程和一个独占等待进程。
__wake_up定义如下：
/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,int nr_exclusive, void *key)
{
unsigned long flags;
/* The queue is walked with q->lock held and the interrupt state saved. */
spin_lock_irqsave(&q->lock, flags);
/* The fourth argument is the sync flag handed on to each entry's wake function. */
__wake_up_common(q, mode, nr_exclusive, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
/* NOTE(review): presumably performs the reschedule check deferred by the sync wake-up -- confirm. */
preempt_check_resched_delayed();
}
__wake_up 简单的调用__wake_up_common进行实际唤醒工作。
__wake_up_common定义如下：
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,int nr_exclusive, int sync, void *key)
{
struct list_head *tmp, *next;
/* No locking here; __wake_up takes q->lock before calling this function. */
list_for_each_safe(tmp, next, &q->task_list) {
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
/*
 * flags is sampled before func() runs.
 * NOTE(review): presumably because the wake function may unlink the
 * entry (DEFINE_WAIT installs autoremove_wake_function) -- confirm.
 */
unsigned flags = curr->flags;
/*
 * Stop only after a successful wake-up of an exclusive waiter has used up
 * the nr_exclusive budget. With nr_exclusive == 0 the pre-decrement goes
 * negative and never reaches zero, so every entry is visited.
 */
if (curr->func(curr, mode, sync, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
__wake_up_common循环遍历等待队列内的所有元素，分别执行其对应的唤醒函数。
这里的唤醒函数即先前定义等待队列项DEFINE_WAIT(__wait)时默认初始化的autoremove_wake_function函数。 autoremove_wake_function最终会调用try_to_wake_up函数将进程置为TASK_RUNNING状态。这样后面的进程调度便会调度到该进程，从而唤醒该进程继续执行。下面列出的是2.4内核中try_to_wake_up的实现（2.6内核中该函数的参数和实现有所不同，但核心思路一致：把进程状态改回TASK_RUNNING并放回运行队列）。
/*
 * try_to_wake_up - make a task runnable again
 * @p:           task to wake
 * @synchronous: non-zero for a synchronous wake-up
 *
 * Sets @p's state to TASK_RUNNING and, if it is not already on the run
 * queue, adds it there. Returns 1 when this call put the task on the run
 * queue, 0 when the task was already on it.
 */
static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
    unsigned long flags;
    int success = 0;

    /* The run queue is modified below, so its lock must be held. */
    spin_lock_irqsave(&runqueue_lock, flags);
    /* Mark the task runnable. */
    p->state = TASK_RUNNING;
    /* Already on the run queue: drop the lock and return 0. */
    if (task_on_runqueue(p))
        goto out;
    /* Otherwise put the task on the run queue. */
    add_to_runqueue(p);

    /*
     * Call reschedule_idle() unless this is a synchronous wake-up of a task
     * that is allowed to run on the current CPU.
     */
    if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id())))
        reschedule_idle(p);
    /* The task was queued by this call: report success. */
    success = 1;
out:
    spin_unlock_irqrestore(&runqueue_lock, flags);
    return success;
}
等待队列应用模式
等待队列的应用涉及两个进程，假设为A和B。A是资源的消费者，B是资源的生产者。A在消费的时候必须确保资源已经生产出来，为此定义一个资源等待队列。这个队列同时要被进程A和进程B使用，我们可以将它定义为一个全局变量。
DECLARE_WAIT_QUEUE_HEAD(wq); /* global: the queue that process A sleeps on and process B wakes */
在进程A中，执行逻辑如下：
while (resource is unavaiable) {
    interruptible_sleep_on( &wq );
}
consume_resource();
在进程B中，执行逻辑如下：
produce_resource();
wake_up_interruptible( &wq );

阅读(974) | 评论(0) | 转发(0) |

上一篇：什么是进程上下文

下一篇：Linux内核等待队列机制介绍 (转载)－－－好文章

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6