linux2.6.34同时也支持实时进程的调度,基于实时调度类来实现。
内核中实时调度类的定义如下:
- static const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
- .enqueue_task = enqueue_task_rt,
- .dequeue_task = dequeue_task_rt,
- .yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
- .pick_next_task = pick_next_task_rt,
- .put_prev_task = put_prev_task_rt,
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
- .set_cpus_allowed = set_cpus_allowed_rt,
- .rq_online = rq_online_rt,
- .rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
- .post_schedule = post_schedule_rt,
- .task_woken = task_woken_rt,
- .switched_from = switched_from_rt,
- #endif
- .set_curr_task = set_curr_task_rt,
- .task_tick = task_tick_rt,
- .get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
- .switched_to = switched_to_rt,
- }
实时进程有两种调度策略:SCHED_RR 和 SCHED_FIFO。SCHED_RR确保几个优先级相同的实时进程按照确定的时间片轮流运行;SCHED_FIFO没有时间片的约束,只要没有被更高优先级的实时进程抢占、自己也不主动让出cpu,就可以运行任意长的时间。
首先,我们了解一下实时调度类的就绪队列的实现:
在每cpu的就绪队列中,就有cfs和rt的就绪队列,具体如下:
- struct rq {
- ......
- struct cfs_rq cfs;
- struct rt_rq rt;
- ......
- };
而我们再看实时就绪队列的实现:
- struct rt_rq {
- struct rt_prio_array active;
- unsigned long rt_nr_running;
- #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- struct {
- int curr; /* highest queued rt task prio */
- #ifdef CONFIG_SMP
- int next; /* next highest */
- #endif
- } highest_prio;
- #endif
- #ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
- #endif
- int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
- /* Nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
- #ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
- struct rq *rq;
- struct list_head leaf_rt_rq_list;
- struct task_group *tg;
- #endif
- }
其关键的是active这个结构,具体如下:
- /*
- * This is the priority-queue data structure of the RT scheduling class:
- */
- struct rt_prio_array {
- DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_RT_PRIO];
- };
从上面的代码中,我们可以发现实时调度类的就绪队列就是一个链表头数组(queue),每个优先级对应其中的一个链表,同一优先级的可运行实时进程都挂在同一个链表上,超级简单。同时,还有一个位图(bitmap),其作用是用来表示每个优先级对应的链表是否为空:若链表为空,则对应位为0,否则为1;位图比队列个数多出的1位用作分界符(delimiter)。
另外,需要注意的是更新实时进程的运行时间函数为update_curr_rt,其实现如下:
- static void update_curr_rt(struct rq *rq)
- {
- struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- u64 delta_exec;
- if (!task_has_rt_policy(curr))
- return;
- delta_exec = rq->clock - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
- schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock;
- cpuacct_charge(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
- if (!rt_bandwidth_enabled())
- return;
- for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
- if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
- resched_task(curr);
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- }
- }
从上面的实现我们可以看出,实时进程的时间统计为实际的时间,而没有什么虚拟时间,这样的实现就简单很多。
其次,我们通过几个关键函数的实现,来看看实时调度类是怎么运作的。
挑选下一个可运行的实时进程pick_next_task_rt,其核心的代码片段如下:
- static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
- {
- struct rt_prio_array *array = &rt_rq->active;
- struct sched_rt_entity *next = NULL;
- struct list_head *queue;
- int idx;
- idx = sched_find_first_bit(array->bitmap);
- BUG_ON(idx >= MAX_RT_PRIO);
- queue = array->queue + idx;
- next = list_entry(queue->next, struct sched_rt_entity, run_list);
- return next;
- }
上面的代码简单易懂:先通过sched_find_first_bit找到位图中第一个被置位(不为0)的位,得到对应队列的索引idx;然后,通过array->queue + idx得到该队列的链表头;之后就从这个队列中取出第一个可运行的实时调度实体,很简单吧。
周期调度的实现如下:
- static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
- {
- //更新时间
- update_curr_rt(rq);
- watchdog(rq, p);
- /*
- * RR tasks need a special form of timeslice management.
- * FIFO tasks have no timeslices.
- */
- //实时FIFO进程不需要设置时间
- if (p->policy != SCHED_RR)
- return;
- //时间片没有用完,返回继续工作
- if (--p->rt.time_slice)
- return;
- //重置时间片,100ms
- p->rt.time_slice = DEF_TIMESLICE;
- /*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
- */
- //就绪队列非空,重新插入,并设置为可调度
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
- }
- }
主要内容加上了注释,应当还是比较好理解的。
向运行队列添加进程由enqueue_task_rt完成,其核心函数enqueue_rt_entity如下:
- static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
- {
- //如果进程已经在运行队列中,则从运行队列中删除进程
- dequeue_rt_stack(rt_se);
- for_each_sched_rt_entity(rt_se)
- //重新添加进程到运行队列中,可能插在队列的头,也可能是尾
- __enqueue_rt_entity(rt_se, head);
- }
从运行队列中删除进程由dequeue_task_rt完成,其核心函数dequeue_rt_entity如下:
- static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
- {
- //从运行队列中删除进程
- dequeue_rt_stack(rt_se);
- for_each_sched_rt_entity(rt_se) {
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
- if (rt_rq && rt_rq->rt_nr_running)
- //将进程插入到运行队列的尾部
- __enqueue_rt_entity(rt_se, false);
- }
- }
再次,我们说明一个细节性的问题:linux系统为了检测某个cpu长时间无法正常调度(soft lockup)的情况,会在每个cpu上启动一个SCHED_FIFO策略的watchdog线程,并且系统默认设定一个阈值(60s)。在每个时钟中断的过程中都会调用softlockup_tick来检查位于每cpu上的时间戳:若是这个时间戳超过阈值的一半(30s)没有更新,就会唤醒watchdog线程更新一把;若是超过60s仍没有更新,系统就会打印soft lockup警告,并且在设置了softlockup_panic的情况下panic。而这个时间戳的更新是在watchdog线程中进行的,所以,只有当这个watchdog线程长时间得不到运行的时候,系统才可能发生panic。什么情况下会发生这种情况呢?一种情况就是你写了个“上帝线程”,其优先级不低于watchdog线程(MAX_RT_PRIO-1),又长时间运行而不让出cpu,就可能发生系统panic了。其相关代码实现和注解如下:
- void softlockup_tick(void)
- {
- int this_cpu = smp_processor_id();
- //关键就是这个每cpu变量softlockup_touch_ts,要及时得到更新
- unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
- unsigned long print_ts;
- struct pt_regs *regs = get_irq_regs();
- unsigned long now;
- /* Is detection switched off? */
- if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
- /* Be sure we don't false trigger if switched back on */
- if (touch_ts)
- per_cpu(softlockup_touch_ts, this_cpu) = 0;
- return;
- }
- if (touch_ts == 0) {
- if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
- /*
- * If the time stamp was touched atomically
- * make sure the scheduler tick is up to date.
- */
- per_cpu(softlock_touch_sync, this_cpu) = false;
- sched_clock_tick();
- }
- __touch_softlockup_watchdog();
- return;
- }
- print_ts = per_cpu(softlockup_print_ts, this_cpu);
- /* report at most once a second */
- if (print_ts == touch_ts || did_panic)
- return;
- /* do not print during early bootup: */
- if (unlikely(system_state != SYSTEM_RUNNING)) {
- __touch_softlockup_watchdog();
- return;
- }
- //取当前cpu时间戳
- now = get_timestamp(this_cpu);
- /*
- * Wake up the high-prio watchdog task twice per
- * threshold timespan.
- */
- //检查时间是否超过阀值的一半,即30s,若是,则启动watchdog线程进行时间戳更新
- if (time_after(now - softlockup_thresh/2, touch_ts))
- wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
- //检查时间是否超过阀值60s,否,则返回,是则继续
- /* Warn about unreasonable delays: */
- if (time_before_eq(now - softlockup_thresh, touch_ts))
- return;
- per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
- spin_lock(&print_lock);
- printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
- this_cpu, now - touch_ts,
- current->comm, task_pid_nr(current));
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
- spin_unlock(&print_lock);
- //系统状态不正常了,panic产生
- if (softlockup_panic)
- panic("softlockup: hung tasks");
- }
下面是系统启动之初,使能各个cpu的时候,创建相应watchdog线程的代码:
- cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
- {
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
- p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
- if (IS_ERR(p)) {
- printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
- return NOTIFY_BAD;
- }
- per_cpu(softlockup_touch_ts, hotcpu) = 0;
- per_cpu(softlockup_watchdog, hotcpu) = p;
- kthread_bind(p, hotcpu);
- break;
- 。。。。。。
- }
watchdog线程的实现如下:
- static int watchdog(void *__bind_cpu)
- {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- sched_setscheduler(current, SCHED_FIFO, &param);
- /* initialize timestamp */
- //更新时间戳
- __touch_softlockup_watchdog();
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 60 seconds then the
- * debug-printout triggers in softlockup_tick().
- */
- while (!kthread_should_stop()) {
- //更新时间戳
- __touch_softlockup_watchdog();
- schedule();
- if (kthread_should_stop())
- break;
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- return 0;
- }
其关键就是上面两处更新时间戳的地方,其实现如下:
- static void __touch_softlockup_watchdog(void)
- {
- int this_cpu = raw_smp_processor_id();
- __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
- }
怎么样,一结合来看,上下逻辑就很清楚了吧。
最后,我们来说说linux系统是如何保证高优先级的实时调度类先调度、接着cfs调度类,然后是idle调度类依次调度的呢?其实,仔细想想便可知道,系统任务的切换(或者调度类的切换)一定离不开schedule调度器这个关键点,而我们再仔细的分析一下它的代码,便会发现其奥妙在pick_next_task中,具体实现如下:
- static inline struct task_struct *
- pick_next_task(struct rq *rq)
- {
- const struct sched_class *class;
- struct task_struct *p;
- /*
- * Optimization: we know that if all tasks are in
- * the fair class we can call that function directly:
- */
- if (likely(rq->nr_running == rq->cfs.nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
- if (likely(p))
- return p;
- }
- //实现调度类按照实时、cfs和idle依次调度的关键点
- class = sched_class_highest;
- for ( ; ; ) {
- p = class->pick_next_task(rq);
- if (p)
- return p;
- /*
- * Will never be NULL as the idle class always
- * returns a non-NULL p:
- */
- class = class->next;
- }
- }
就在挑选下一个任务的过程中,完成了我们追问的关键点:从sched_class_highest开始,沿着各调度类的next指针依次尝试,直到某个调度类返回可运行的任务。另外,还有两个点set_rq_online和set_rq_offline都会完成类似的动作,在此,不做深入讨论。
阅读(3884) | 评论(0) | 转发(0) |