分类: LINUX
2011-03-31 17:16:21
分析 schedule() 函数之前,最好先看看这篇文章,它主要讲的是理论:http://www-128.ibm.com/developerworks/cn/linux/kernel/l-kn26sch/index.html
这里我主要是从代码的角度简单地分析一下。
/*
* __schedule() is the main scheduler function.
*
* Annotated walk-through of how one scheduling pass proceeds, as visible
* in the body below:
*  1. Sanity checks: warn when called during boot, while atomic, or from
*     the idle thread in a non-running state.
*  2. Compute run_time for the outgoing task (prev) from the sched_clock()
*     delta since prev->timestamp, capped at NS_MAX_SLEEP_AVG and divided
*     by CURRENT_BONUS(prev) (or 1 when that is zero).
*  3. Under rq->lock: if prev is no longer runnable and this is not a
*     PREEMPT_ACTIVE entry, either flip it back to TASK_RUNNING (it is
*     interruptible and has a signal pending) or deactivate_task() it.
*  4. Pick next: the idle task when rq->nr_running is zero; otherwise swap
*     the active/expired arrays if active is empty and take the first task
*     on the list of the first set bit in the array bitmap.
*  5. switch_tasks: update prev's accounting and, when prev != next,
*     perform the context switch.
*/
void __sched __schedule(void)
{
struct task_struct *prev, *next;
struct prio_array *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int cpu, idx, new_prio;
long *switch_count;
struct rq *rq;
WARN_ON(system_state == SYSTEM_BOOTING);
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
if (unlikely(in_atomic() && !current->exit_state)) {
stop_trace();
printk(KERN_ERR "BUG: scheduling while atomic: "
"%s/0x%08x/%d, CPU#%d\n",
current->comm, preempt_count(), current->pid,
smp_processor_id());
dump_stack();
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
// Disable preemption
preempt_disable(); // FIXME: disable irqs here
prev = current;
release_kernel_lock(prev);
rq = this_rq();
// Handle the idle task
/*
* The idle thread is not allowed to schedule!
* Remove this check after it has been exercised a bit.
*/
// Can this really happen? It requires the current task to be the idle
// task while its state is not TASK_RUNNING.
// Note: unlikely() only wraps the first comparison, not the whole condition.
if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
printk(KERN_ERR "BUG: scheduling from the idle thread!\n");
dump_stack();
}
schedstat_inc(rq, sched_cnt); // roughly rq->sched_cnt++ (per the original notes)
now = sched_clock(); // current time (nanosecond resolution, per the original notes)
// run_time = time elapsed since prev->timestamp, clamped to
// [0, NS_MAX_SLEEP_AVG].
if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
run_time = now - prev->timestamp;
if (unlikely((long long)(now - prev->timestamp) < 0)) // Can this happen? How would a negative delta arise?
run_time = 0;
} else
run_time = NS_MAX_SLEEP_AVG;
/*
* Tasks charged proportionately less run_time at high sleep_avg to
* delay them losing their interactive status
*/
// "?:" with an omitted middle operand: divide by CURRENT_BONUS(prev),
// or by 1 when that value is zero.
run_time /= (CURRENT_BONUS(prev) ? : 1);
cpu = smp_processor_id();
spin_lock_irq(&rq->lock);
switch_count = &prev->nvcsw; // TODO: temporary - to see it in vmstat
// prev is leaving the runnable state and we were not entered with
// PREEMPT_ACTIVE set in the preempt count.
if ((prev->state & ~TASK_RUNNING_MUTEX) &&
!(preempt_count() & PREEMPT_ACTIVE)) {
// Same counter as the default chosen above, so this assignment is
// currently redundant.
switch_count = &prev->nvcsw;
// An interruptible sleeper with a signal pending is made runnable
// again instead of being taken off the runqueue.
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
unlikely(signal_pending(prev))))
prev->state = TASK_RUNNING;
else {
if (prev->state == TASK_UNINTERRUPTIBLE) {
rq->nr_uninterruptible++;
incr_rt_nr_uninterruptible(prev, rq);
}
touch_softlockup_watchdog();
deactivate_task(prev, rq);
}
}
if (preempt_count() & PREEMPT_ACTIVE) // PREEMPT_ACTIVE is set in the preempt count
sub_preempt_count(PREEMPT_ACTIVE); // clear it (original note: "becomes preemptible again")
// A dead task is removed from the rq here
if (unlikely(prev->flags & PF_DEAD)) {
if (prev->state != TASK_RUNNING) {
printk("prev->state: %ld != TASK_RUNNING??\n",
prev->state);
WARN_ON(1);
} else
deactivate_task(prev, rq); // already dead, so its state is expected to be running; take it off the runqueue
prev->state = EXIT_DEAD;
}
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
if (unlikely(atomic_read(&rt_overload)))
balance_rt_tasks(rq, cpu);
#endif
// Path that ends up scheduling the idle task
if (unlikely(!rq->nr_running)) { // task count of this runqueue; per the original notes it covers both the active and the expired arrays
idle_balance(cpu, rq); // original note: not relevant on UP (uniprocessor)
if (!rq->nr_running) {
next = rq->idle; // nothing runnable on this runqueue: run the idle task
rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu);
goto switch_tasks;
}
}
// Swap the active and expired arrays if needed
array = rq->active;
// Rarely happens (hence unlikely())
if (unlikely(!array->nr_active)) { // the active array holds no tasks, so exchange it with the expired array
/*
* Switch the active and expired arrays.
*/
schedstat_inc(rq, sched_switch);
rq->active = rq->expired;
rq->expired = array;
array = rq->active; // array now refers to what used to be the expired array
rq->expired_timestamp = 0;
rq->best_expired_prio = MAX_PRIO;
}
// Choose the most suitable task to run next
idx = sched_find_first_bit(array->bitmap); // index of the first set bit (the highest priority, per the original notes)
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list); // take the first task on that list
// For a non-RT task whose sleep_type counts as interactive, recompute
// its priority with (possibly scaled) credit for the time since
// next->timestamp, and requeue it if the priority changed.
if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
delta = 0;
if (next->sleep_type == SLEEP_INTERACTIVE)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
new_prio = recalc_task_prio(next, next->timestamp + delta);
if (unlikely(next->prio != new_prio)) {
dequeue_task(next, array);
next->prio = new_prio;
enqueue_task(next, array);
}
}
next->sleep_type = SLEEP_NORMAL;
if (dependent_sleeper(cpu, rq, next))
next = rq->idle;
// The actual task switch starts here
switch_tasks: // task switch
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
prefetch(next);
prefetch_stack(next);
clear_tsk_need_resched(prev); // #define TIF_NEED_RESCHED 2
clear_tsk_need_resched_delayed(prev);
rcu_qsctr_inc(task_cpu(prev));
update_cpu_clock(prev, rq, now);
// Charge run_time against prev's sleep_avg, clamping at zero.
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0)
prev->sleep_avg = 0;
prev->timestamp = prev->last_ran = now;
trace_all_runnable_tasks(rq);
sched_info_switch(prev, next);
// Perform the switch (skipped when the same task was picked again)
if (likely(prev != next)) {
// Update per-runqueue / per-task bookkeeping
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
++*switch_count;
prepare_task_switch(rq, next);
MARK(kernel_sched_schedule, "%d %d %ld",
prev->pid, next->pid, prev->state);
prev = context_switch(rq, prev, next);
barrier();
trace_special_pid(prev->pid, PRIO(prev), PRIO(current));
/*
* this_rq must be evaluated again because prev may have moved
* CPUs since it called schedule(), thus the 'rq' on its stack
* frame will be invalid.
*/
finish_task_switch(this_rq(), prev);
__preempt_enable_no_resched();
} else {
// prev was selected again: no context switch; this branch drops
// rq->lock itself.
__preempt_enable_no_resched();
spin_unlock(&rq->lock);
trace_stop_sched_switched(next);
}
reacquire_kernel_lock(current);
}
-----接下来看看如何从 rq 里面删除进程。
//下面这两个函数很关键:把一个进程从就绪队列里删除,调用的就是这两个函数。
从上面的代码可以看出,schedule() 函数如果发现当前进程的状态是 TASK_INTERRUPTIBLE,并且有未决信号等待处理(也就是它收到了一个信号,signal_pending(prev) 返回真),就把当前进程的状态重新置为 TASK_RUNNING,而不调用 deactivate_task()。但是请注意,这仅表示该进程继续留在就绪队列里,以后 scheduler 仍然可以调度它;这一次选中的通常是另外一个进程,不过如果它仍然排在优先级最高的队列的最前面,也可能再次选中它自己(即代码中 prev == next 的那个分支)。
/*
 * dequeue_task - take task @p off the priority array @array.
 *
 * @p is expected to be linked on one of @array's per-priority lists
 * (array->queue[p->prio]) through p->run_list.  The task is unlinked,
 * the array's active count is dropped by one, and the bitmap bit for
 * p->prio is cleared when that list has become empty.  Finally
 * dec_rt_tasks() is called for the array's runqueue.
 */
static void dequeue_task(struct task_struct *p, struct prio_array *array)
{
	struct list_head *prio_list;

	list_del(&p->run_list);
	array->nr_active -= 1;

	/* Clear the priority bit once no task is left on this list. */
	prio_list = array->queue + p->prio;
	if (list_empty(prio_list))
		__clear_bit(p->prio, array->bitmap);

	dec_rt_tasks(p, array->rq);
}
/*
 * deactivate_task - remove task @p from runqueue @rq.
 *
 * Emits a trace point, calls dec_nr_running() for @rq, dequeues @p from
 * the priority array it currently sits on (p->array) and then sets
 * p->array to NULL.
 */
static void deactivate_task(struct task_struct *p, struct rq *rq)
{
	struct prio_array *cur_array;

	trace_special_pid(p->pid, PRIO(p), rq->nr_running);
	dec_nr_running(p, rq);

	cur_array = p->array;
	dequeue_task(p, cur_array);
	p->array = NULL;
}
====