Chinaunix首页 | 论坛 | 博客
  • 博客访问: 258350
  • 博文数量: 35
  • 博客积分: 883
  • 博客等级: 准尉
  • 技术积分: 656
  • 用 户 组: 普通用户
  • 注册时间: 2011-06-17 09:38
文章分类

全部博文(35)

文章存档

2013年(1)

2012年(34)

分类: LINUX

2012-11-03 16:18:15

linux2.6.34同时也支持实时进程的调度,基于实时调度类来实现。
内核中实时调度类的定义如下:

点击(此处)折叠或打开

  1. static const struct sched_class rt_sched_class = {
  2.     .next            = &fair_sched_class,
  3.     .enqueue_task        = enqueue_task_rt,
  4.     .dequeue_task        = dequeue_task_rt,
  5.     .yield_task        = yield_task_rt,

  6.     .check_preempt_curr    = check_preempt_curr_rt,

  7.     .pick_next_task        = pick_next_task_rt,
  8.     .put_prev_task        = put_prev_task_rt,

  9. #ifdef CONFIG_SMP
  10.     .select_task_rq        = select_task_rq_rt,

  11.     .set_cpus_allowed = set_cpus_allowed_rt,
  12.     .rq_online = rq_online_rt,
  13.     .rq_offline = rq_offline_rt,
  14.     .pre_schedule        = pre_schedule_rt,
  15.     .post_schedule        = post_schedule_rt,
  16.     .task_woken        = task_woken_rt,
  17.     .switched_from        = switched_from_rt,
  18. #endif

  19.     .set_curr_task = set_curr_task_rt,
  20.     .task_tick        = task_tick_rt,

  21.     .get_rr_interval    = get_rr_interval_rt,

  22.     .prio_changed        = prio_changed_rt,
  23.     .switched_to        = switched_to_rt,
  24. }
实时调度类支持两种调度策略:SCHED_RR 和 SCHED_FIFO。SCHED_RR确保几个优先级相同的实时进程按照确定的时间片依次运行;SCHED_FIFO没有时间片的约束,可以运行任意的时间。
首先,我们了解一下实时调度类的就绪队列的实现:
在每cpu的就绪队列中,就有cfs和rt的就绪队列,具体如下:

点击(此处)折叠或打开

  1. struct rq {
  2. ......
  3.     struct cfs_rq cfs;
  4.     struct rt_rq rt;

  5. ......
  6. };
而我们再看实时就绪队列的实现:

点击(此处)折叠或打开

  1. struct rt_rq {
  2.     struct rt_prio_array active;
  3.     unsigned long rt_nr_running;
  4. #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
  5.     struct {
  6.         int curr; /* highest queued rt task prio */
  7. #ifdef CONFIG_SMP
  8.         int next; /* next highest */
  9. #endif
  10.     } highest_prio;
  11. #endif
  12. #ifdef CONFIG_SMP
  13.     unsigned long rt_nr_migratory;
  14.     unsigned long rt_nr_total;
  15.     int overloaded;
  16.     struct plist_head pushable_tasks;
  17. #endif
  18.     int rt_throttled;
  19.     u64 rt_time;
  20.     u64 rt_runtime;
  21.     /* Nests inside the rq lock: */
  22.     raw_spinlock_t rt_runtime_lock;

  23. #ifdef CONFIG_RT_GROUP_SCHED
  24.     unsigned long rt_nr_boosted;

  25.     struct rq *rq;
  26.     struct list_head leaf_rt_rq_list;
  27.     struct task_group *tg;
  28. #endif
  29. }
其关键的是active这个结构,具体如下:

点击(此处)折叠或打开

  1. /*
  2.  * This is the priority-queue data structure of the RT scheduling class:
  3.  */
  4. struct rt_prio_array {
  5.     DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
  6.     struct list_head queue[MAX_RT_PRIO];
  7. };
从上面的代码中,我们可以发现实时调度类的就绪队列就是一个链表头数组(struct list_head queue[MAX_RT_PRIO]),可以理解成按不同优先级组织的一个个链表,每个优先级对应一个链表,超级简单。同时,还有一个位图,其作用是用来表示对应每个不同优先级的链表是否为空,若为空则对应位为0,非空则为1。
另外,需要注意的是更新实时进程的运行时间函数为update_curr_rt,其实现如下:

点击(此处)折叠或打开

  1. static void update_curr_rt(struct rq *rq)
  2. {
  3.     struct task_struct *curr = rq->curr;
  4.     struct sched_rt_entity *rt_se = &curr->rt;
  5.     struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  6.     u64 delta_exec;

  7.     if (!task_has_rt_policy(curr))
  8.         return;

  9.     delta_exec = rq->clock - curr->se.exec_start;
  10.     if (unlikely((s64)delta_exec < 0))
  11.         delta_exec = 0;

  12.     schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

  13.     curr->se.sum_exec_runtime += delta_exec;
  14.     account_group_exec_runtime(curr, delta_exec);

  15.     curr->se.exec_start = rq->clock;
  16.     cpuacct_charge(curr, delta_exec);

  17.     sched_rt_avg_update(rq, delta_exec);

  18.     if (!rt_bandwidth_enabled())
  19.         return;

  20.     for_each_sched_rt_entity(rt_se) {
  21.         rt_rq = rt_rq_of_se(rt_se);

  22.         if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
  23.             raw_spin_lock(&rt_rq->rt_runtime_lock);
  24.             rt_rq->rt_time += delta_exec;
  25.             if (sched_rt_runtime_exceeded(rt_rq))
  26.                 resched_task(curr);
  27.             raw_spin_unlock(&rt_rq->rt_runtime_lock);
  28.         }
  29.     }
  30. }
从上面的实现我们可以看出,实时进程的时间统计为实际的时间,而没有什么虚拟时间,这样的实现就简单很多。
其次,我们通过几个关键的函数的实现,来看看实时调度类是怎么运作的
挑选下一个可运行的实时进程pick_next_task_rt,其核心的代码片段如下:

点击(此处)折叠或打开

  1. static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
  2.                          struct rt_rq *rt_rq)
  3. {
  4.     struct rt_prio_array *array = &rt_rq->active;
  5.     struct sched_rt_entity *next = NULL;
  6.     struct list_head *queue;
  7.     int idx;

  8.     idx = sched_find_first_bit(array->bitmap);
  9.     BUG_ON(idx >= MAX_RT_PRIO);

  10.     queue = array->queue + idx;
  11.     next = list_entry(queue->next, struct sched_rt_entity, run_list);

  12.     return next;
  13. }
上面的代码简单易懂,先通过sched_find_first_bit找到位图中首个不为0的位,得到对应队列的索引idx,然后,通过array->queue + idx就可以计算得到该优先级队列的链表头,之后就从这个队列中取出第一个可运行的实时进程,很简单吧。
周期调度的实现如下:

点击(此处)折叠或打开

  1. static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
  2. {
  3. //更新时间
  4.     update_curr_rt(rq);

  5.     watchdog(rq, p);

  6.     /*
  7.      * RR tasks need a special form of timeslice management.
  8.      * FIFO tasks have no timeslices.
  9.      */
  10. //实时FIFO进程不需要设置时间
  11.     if (p->policy != SCHED_RR)
  12.         return;
  13. //时间片没有用完,返回继续工作
  14.     if (--p->rt.time_slice)
  15.         return;
  16. //重置时间片,100ms
  17.     p->rt.time_slice = DEF_TIMESLICE;

  18.     /*
  19.      * Requeue to the end of queue if we are not the only element
  20.      * on the queue:
  21.      */
  22. //就绪队列非空,重新插入,并设置为可调度
  23.     if (p->rt.run_list.prev != p->rt.run_list.next) {
  24.         requeue_task_rt(rq, p, 0);
  25.         set_tsk_need_resched(p);
  26.     }
  27. }
主要内容加上了注释,应当还是比较好理解的。
向运行队列添加进程enqueue_task_rt,如下:

点击(此处)折叠或打开

  1. static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
  2. {
  3. //如果进程已经在运行队列中,则从运行队列中删除进程
  4.     dequeue_rt_stack(rt_se);
  5.     for_each_sched_rt_entity(rt_se)
  6. //重新添加进程到运行队列中,可能插在队列的头,也可能是尾
  7.         __enqueue_rt_entity(rt_se, head);
  8. }
从运行队列中删除进程dequeue_task_rt,如下:

点击(此处)折叠或打开

  1. static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  2. {
  3. //从运行队列中删除进程
  4.     dequeue_rt_stack(rt_se);

  5.     for_each_sched_rt_entity(rt_se) {
  6.         struct rt_rq *rt_rq = group_rt_rq(rt_se);

  7.         if (rt_rq && rt_rq->rt_nr_running)
  8. //将进程插入到运行队列的尾部
  9.             __enqueue_rt_entity(rt_se, false);
  10.     }
  11. }
再次,我们说明一个细节性的问题,就是linux系统为了保证FIFO进程能够得到调度,会启动一个FIFO的watchdog线程,并且系统默认设定一个阈值(60s),并且,在每个时钟中断的过程中调用softlockup_tick来检查位于每cpu上的时间戳,若是这个时间戳超过30s而小于60s没有更新,就会唤醒watchdog线程更新一把,若是超过60s没有更新,系统就会打印soft lockup告警信息,如果设置了softlockup_panic,还会进一步panic。而这个时间戳的更新是在watchdog线程中进行的,所以,只有当这个watchdog线程得不到运行的时候,系统可能就会发生panic。什么情况下会发生这种情况呢?一种情况就是你写了个上帝线程,比FIFO线程优先级高,长时间运行,就可能发生系统panic了。其相关代码实现和注解如下:

点击(此处)折叠或打开

  1. void softlockup_tick(void)
  2. {
  3.     int this_cpu = smp_processor_id();
  4. //关键就是这个每cpu变量softlockup_touch_ts,要及时得到更新
  5.     unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
  6.     unsigned long print_ts;
  7.     struct pt_regs *regs = get_irq_regs();
  8.     unsigned long now;

  9.     /* Is detection switched off? */
  10.     if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
  11.         /* Be sure we don't false trigger if switched back on */
  12.         if (touch_ts)
  13.             per_cpu(softlockup_touch_ts, this_cpu) = 0;
  14.         return;
  15.     }

  16.     if (touch_ts == 0) {
  17.         if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
  18.             /*
  19.              * If the time stamp was touched atomically
  20.              * make sure the scheduler tick is up to date.
  21.              */
  22.             per_cpu(softlock_touch_sync, this_cpu) = false;
  23.             sched_clock_tick();
  24.         }
  25.         __touch_softlockup_watchdog();
  26.         return;
  27.     }

  28.     print_ts = per_cpu(softlockup_print_ts, this_cpu);

  29.     /* report at most once a second */
  30.     if (print_ts == touch_ts || did_panic)
  31.         return;

  32.     /* do not print during early bootup: */
  33.     if (unlikely(system_state != SYSTEM_RUNNING)) {
  34.         __touch_softlockup_watchdog();
  35.         return;
  36.     }
  37. //取当前cpu时间戳
  38.     now = get_timestamp(this_cpu);

  39.     /*
  40.      * Wake up the high-prio watchdog task twice per
  41.      * threshold timespan.
  42.      */
  43. //检查时间是否超过阀值的一半,即30s,若是,则启动watchdog线程进行时间戳更新
  44.     if (time_after(now - softlockup_thresh/2, touch_ts))
  45.         wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
  46. //检查时间是否超过阀值60s,否,则返回,是则继续
  47.     /* Warn about unreasonable delays: */
  48.     if (time_before_eq(now - softlockup_thresh, touch_ts))
  49.         return;

  50.     per_cpu(softlockup_print_ts, this_cpu) = touch_ts;

  51.     spin_lock(&print_lock);
  52.     printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
  53.             this_cpu, now - touch_ts,
  54.             current->comm, task_pid_nr(current));
  55.     print_modules();
  56.     print_irqtrace_events(current);
  57.     if (regs)
  58.         show_regs(regs);
  59.     else
  60.         dump_stack();
  61.     spin_unlock(&print_lock);
  62. //系统状态不正常了,panic产生
  63.     if (softlockup_panic)
  64.         panic("softlockup: hung tasks");
  65. }
下面是系统启动之初,使能各个cpu的时候,就创建相应的watchdog线程

点击(此处)折叠或打开

  1. cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  2. {
  3.     int hotcpu = (unsigned long)hcpu;
  4.     struct task_struct *p;

  5.     switch (action) {
  6.     case CPU_UP_PREPARE:
  7.     case CPU_UP_PREPARE_FROZEN:
  8.         BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
  9.         p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
  10.         if (IS_ERR(p)) {
  11.             printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
  12.             return NOTIFY_BAD;
  13.         }
  14.         per_cpu(softlockup_touch_ts, hotcpu) = 0;
  15.         per_cpu(softlockup_watchdog, hotcpu) = p;
  16.         kthread_bind(p, hotcpu);
  17.         break;
  18. 。。。。。。
  19. }
watchdog线程的实现如下:

点击(此处)折叠或打开

  1. static int watchdog(void *__bind_cpu)
  2. {
  3.     struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

  4.     sched_setscheduler(current, SCHED_FIFO, &param);

  5.     /* initialize timestamp */
  6. //更新时间戳
  7.     __touch_softlockup_watchdog();

  8.     set_current_state(TASK_INTERRUPTIBLE);
  9.     /*
  10.      * Run briefly once per second to reset the softlockup timestamp.
  11.      * If this gets delayed for more than 60 seconds then the
  12.      * debug-printout triggers in softlockup_tick().
  13.      */
  14.     while (!kthread_should_stop()) {
  15. //更新时间戳
  16.         __touch_softlockup_watchdog();
  17.         schedule();

  18.         if (kthread_should_stop())
  19.             break;

  20.         set_current_state(TASK_INTERRUPTIBLE);
  21.     }
  22.     __set_current_state(TASK_RUNNING);

  23.     return 0;
  24. }
其关键就是上面两处更新时间戳的地方,其实现如下:

点击(此处)折叠或打开

  1. static void __touch_softlockup_watchdog(void)
  2. {
  3.     int this_cpu = raw_smp_processor_id();

  4.     __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
  5. }
怎么样,一结合来看,上下逻辑就很清楚了吧。
最后,我们来说说linux系统是如何保证高优先级的实时调度类先调度、接着cfs调度类,然后是idle调度类依次调度的呢?其实,仔细想想便可知道,系统任务的切换(或者调度类的切换)一定离不开schedule调度器这个关键点,而我们再仔细的分析一下它的代码,便会发现其奥妙在pick_next_task中,具体实现如下:

点击(此处)折叠或打开

  1. static inline struct task_struct *
  2. pick_next_task(struct rq *rq)
  3. {
  4.     const struct sched_class *class;
  5.     struct task_struct *p;

  6.     /*
  7.      * Optimization: we know that if all tasks are in
  8.      * the fair class we can call that function directly:
  9.      */
  10.     if (likely(rq->nr_running == rq->cfs.nr_running)) {
  11.         p = fair_sched_class.pick_next_task(rq);
  12.         if (likely(p))
  13.             return p;
  14.     }
  15. //实现调度类按照实时、cfs和idle依次调度的关键点
  16.     class = sched_class_highest;
  17.     for ( ; ; ) {
  18.         p = class->pick_next_task(rq);
  19.         if (p)
  20.             return p;
  21.         /*
  22.          * Will never be NULL as the idle class always
  23.          * returns a non-NULL p:
  24.          */
  25.         class = class->next;
  26.     }
  27. }
就在挑选下一个任务的过程中,完成了我们追问的关键点。另外,还有两个点set_rq_online和set_rq_offline都会完成类似的动作,在此,不做深入讨论。






 

 





 
阅读(3896) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~