实时调度类-alex-huang-ChinaUnix博客

alex-huang的ChinaUnix博客

首页　| 　博文目录　| 　关于我

alex-huang

博客访问： 261004
博文数量： 35
博客积分： 883
博客等级：准尉
技术积分： 656
用户组：普通用户
注册时间： 2011-06-17 09:38

文章分类

全部博文（35）

心理学（8）
国学（2）
中国式管理（0）
市场销售（2）
财务（0）
人力资源（3）
MBA（1）
linux内核之2.6.3（16）
PMP管理（3）
未分配的博文（0）

文章存档

2013年（1）

2012年（34）

我的朋友

相关博文

实时调度类

分类： LINUX

2012-11-03 16:18:15

linux2.6.34同时也支持实时进程的调度，基于实时调度类来实现。

内核中实时调度类的定义如下：

点击(此处)折叠或打开

/*
 * Method table of the real-time scheduling class.
 *
 * .next links to fair_sched_class, so code that walks the class chain
 * through ->next reaches the fair class right after this one.
 * The hooks between #ifdef CONFIG_SMP and #endif are only present in
 * SMP builds.
 */
static const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,
	.check_preempt_curr	= check_preempt_curr_rt,
	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,
	.set_cpus_allowed	= set_cpus_allowed_rt,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.pre_schedule		= pre_schedule_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif
	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,
	.get_rr_interval	= get_rr_interval_rt,
	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,
};

实时调度类支持两种调度策略：SCHED_RR 和 SCHED_FIFO。SCHED_RR确保几个优先级相同的实时进程按照确定的时间片依次运行；SCHED_FIFO没有时间片的约束，可以运行任意的时间。

首先，我们了解一下实时调度类的就绪队列的实现：

在每cpu的就绪队列中，就有cfs和rt的就绪队列，具体如下：

点击(此处)折叠或打开

/*
 * Per-CPU run queue (excerpt). The "......" lines stand for members
 * that this listing leaves out; only the two embedded class-specific
 * run queues are shown.
 */
struct rq {
......
struct cfs_rq cfs; /* run queue of the CFS (fair) class */
struct rt_rq rt; /* run queue of the real-time class */
......
};

而我们再看实时就绪队列的实现：

点击(此处)折叠或打开

/*
 * Run queue of the real-time scheduling class.
 *
 * Which members exist depends on CONFIG_SMP and CONFIG_RT_GROUP_SCHED,
 * as marked by the preprocessor conditionals below.
 */
struct rt_rq {
	/* Per-priority lists plus bitmap; the pick-next path reads this. */
	struct rt_prio_array active;
	/* Checked for non-zero before a group entity is re-enqueued. */
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	/* Execution time charged so far; update_curr_rt() adds delta_exec. */
	u64 rt_time;
	/* NOTE(review): presumably the runtime budget rt_time is compared against - confirm. */
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;
	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

其关键的是active这个结构，具体如下：

点击(此处)折叠或打开

/*
* This is the priority-queue data structure of the RT scheduling class:
* one list head per RT priority level plus a bitmap over those levels.
* The pick-next path searches the bitmap for its first set bit and uses
* that index to select the list, so a bit is expected to be set only
* while the matching list is non-empty.
* NOTE(review): the code that sets and clears the bits is not part of
* this listing - confirm against the enqueue/dequeue helpers.
*/
struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_RT_PRIO]; /* one list per priority index */
};

从上面的代码中，我们可以发现实时调度类的就绪队列就是一个链表头（struct list_head）数组，每个优先级对应其中一个链表，可以理解成按不同优先级组织的一个个链表，粗略地说也可以简单地理解为就是一个链表，超级简单。同时，还有一个位图，其作用是表示每个优先级对应的链表是否为空：链表为空时对应位为0，非空时对应位为1。

另外，需要注意的是更新实时进程的运行时间函数为update_curr_rt，其实现如下：

点击(此处)折叠或打开

/*
 * Account the CPU time consumed by the task currently running on @rq
 * since its exec_start stamp, and charge that time to the RT run
 * queue(s) of its scheduling entity.
 *
 * The elapsed time is taken directly from rq->clock; no scaling is
 * applied to it.
 */
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
/* Nothing to account unless the running task has an RT policy. */
if (!task_has_rt_policy(curr))
return;
/* Time elapsed since exec_start was last stamped. */
delta_exec = rq->clock - curr->se.exec_start;
/* A delta that is negative when viewed as signed is clamped to 0. */
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
/* Start the next measurement interval from the current clock. */
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
/* Everything below is skipped when RT bandwidth control is off. */
if (!rt_bandwidth_enabled())
return;
/*
 * For each entity visited by for_each_sched_rt_entity(): unless its
 * run queue has RUNTIME_INF runtime, add the delta to rt_time under
 * rt_runtime_lock.
 */
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
/* Ask for curr to be rescheduled once the check reports true. */
if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
}

从上面的实现我们可以看出，实时进程的时间统计为实际的时间，而没有什么虚拟时间，这样的实现就简单很多。

其次，我们通过几个关键的函数的实现，来看看实时调度类是怎么运作的

挑选下一个可运行的实时进程pick_next_task_rt，其核心的代码片段如下：

点击(此处)折叠或打开

/*
 * Return the first entity on the list selected by the first set bit of
 * @rt_rq's priority bitmap.
 *
 * @rq is not used in this function body.
 * The caller must only call this when some bit below MAX_RT_PRIO is
 * set: an index at or above MAX_RT_PRIO trips the BUG_ON().
 */
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
struct list_head *queue;
int idx;
/* Index of the first set bit in the bitmap. */
idx = sched_find_first_bit(array->bitmap);
BUG_ON(idx >= MAX_RT_PRIO);
/* List head that belongs to that index. */
queue = array->queue + idx;
/* First element on that list, converted back to its entity. */
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
}

上面的代码简单易懂：先通过sched_find_first_bit找到位图中第一个置位的bit的下标idx，也就是第一个非空队列的索引；然后，通过array->queue + idx就可以得到该队列对应的链表头；之后就从这个链表中取出第一个可运行的实时进程，很简单吧。

周期调度的实现如下：

点击(此处)折叠或打开

/*
 * Periodic tick handler of the RT class for task @p on @rq.
 *
 * Updates the runtime accounting, then manages the time slice of
 * SCHED_RR tasks. @queued is not used in this function body.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
/* Bring the runtime accounting of the current task up to date. */
update_curr_rt(rq);
/*
 * NOTE(review): this watchdog(rq, p) takes different arguments from
 * the softlockup watchdog() thread function quoted later in the
 * article; presumably a separate RT-class helper - confirm.
 */
watchdog(rq, p);
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
/* Only SCHED_RR tasks continue past this point. */
if (p->policy != SCHED_RR)
return;
/* Use up one tick of the slice; if some is left, keep running. */
if (--p->rt.time_slice)
return;
/* Slice exhausted: refill it (100 ms according to the article). */
p->rt.time_slice = DEF_TIMESLICE;
/*
* Requeue to the end of queue if we are not the only element
* on the queue:
*/
/* prev != next means other entries share this list with the task. */
if (p->rt.run_list.prev != p->rt.run_list.next) {
requeue_task_rt(rq, p, 0);
set_tsk_need_resched(p);
}
}

主要内容加上了注释，应当还是比较好理解的。

向运行队列添加进程enqueue_task_rt，如下：

点击(此处)折叠或打开

/*
 * Put @rt_se on its run queue(s).
 *
 * @head is handed unchanged to __enqueue_rt_entity(); per the article
 * it selects insertion at the head rather than the tail of the list.
 */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
/* First take the entity off the queues (a no-op case is presumably
 * handled inside dequeue_rt_stack() - confirm). */
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
/* Then (re-)insert each visited entity, at head or tail per @head. */
__enqueue_rt_entity(rt_se, head);
}

从运行队列中删除进程dequeue_task_rt，如下：

点击(此处)折叠或打开

/*
 * Remove @rt_se from its run queue(s).
 *
 * After the removal, every entity visited by for_each_sched_rt_entity()
 * whose group_rt_rq() exists and still has rt_nr_running != 0 is put
 * back on the queue.
 */
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
/* Take the entity off the queues. */
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
/* Re-insert with head == false (tail insertion per the article). */
__enqueue_rt_entity(rt_se, false);
}
}

再次，我们说明一个细节性的问题：linux系统为了检测某个cpu被长时间独占、其他任务得不到调度的情况（soft lockup），会为每个cpu启动一个采用SCHED_FIFO策略、优先级为MAX_RT_PRIO-1的watchdog线程，并且系统默认设定一个阈值（60s）。在每个时钟中断的过程中会调用softlockup_tick来检查位于每cpu上的时间戳：若是这个时间戳超过阈值的一半（30s）没有更新，就会唤醒watchdog线程更新一把；若是超过60s仍没有更新，系统就会打印“BUG: soft lockup”告警，并且在设置了softlockup_panic的情况下panic。而这个时间戳的更新是在watchdog线程中进行的，所以，只有当这个watchdog线程长时间得不到运行的时候，系统才可能告警甚至panic。什么情况下会发生这种情况呢？一种情况就是你写了个“上帝线程”，它是优先级不低于watchdog线程的SCHED_FIFO线程，长时间运行而不让出cpu，就可能触发告警乃至系统panic了。其相关代码实现和注解如下：

点击(此处)折叠或打开

/*
 * Per-tick soft-lockup check for the current CPU.
 *
 * Compares the CPU's softlockup_touch_ts stamp with the current
 * timestamp. Past half of softlockup_thresh the CPU's watchdog thread
 * is woken; past the full threshold a "BUG: soft lockup" report is
 * printed, and the system panics if softlockup_panic is set.
 */
void softlockup_tick(void)
{
int this_cpu = smp_processor_id();
/* The per-CPU stamp that has to be refreshed in time. */
unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
unsigned long print_ts;
struct pt_regs *regs = get_irq_regs();
unsigned long now;
/* Is detection switched off? */
if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
/* Be sure we don't false trigger if switched back on */
if (touch_ts)
per_cpu(softlockup_touch_ts, this_cpu) = 0;
return;
}
/* A zero stamp is re-initialised here instead of being checked. */
if (touch_ts == 0) {
if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
/*
* If the time stamp was touched atomically
* make sure the scheduler tick is up to date.
*/
per_cpu(softlock_touch_sync, this_cpu) = false;
sched_clock_tick();
}
__touch_softlockup_watchdog();
return;
}
print_ts = per_cpu(softlockup_print_ts, this_cpu);
/* report at most once a second */
if (print_ts == touch_ts || did_panic)
return;
/* do not print during early bootup: */
if (unlikely(system_state != SYSTEM_RUNNING)) {
__touch_softlockup_watchdog();
return;
}
/* Current timestamp of this CPU. */
now = get_timestamp(this_cpu);
/*
* Wake up the high-prio watchdog task twice per
* threshold timespan.
*/
/* More than half the threshold since the last touch: wake the
 * watchdog thread so that it can refresh the stamp. */
if (time_after(now - softlockup_thresh/2, touch_ts))
wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
/* Still within the full threshold: nothing to report yet. */
/* Warn about unreasonable delays: */
if (time_before_eq(now - softlockup_thresh, touch_ts))
return;
/* Remember the stamp that is being reported (see print_ts above). */
per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
spin_lock(&print_lock);
printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
this_cpu, now - touch_ts,
current->comm, task_pid_nr(current));
print_modules();
print_irqtrace_events(current);
if (regs)
show_regs(regs);
else
dump_stack();
spin_unlock(&print_lock);
/* The report alone does not stop the system; panic only happens
 * when softlockup_panic is set. */
if (softlockup_panic)
panic("softlockup: hung tasks");
}

下面是系统启动之初，使能各个cpu的时候，创建相应watchdog线程的代码：

点击(此处)折叠或打开

/*
 * CPU notifier callback (excerpt). On CPU_UP_PREPARE and
 * CPU_UP_PREPARE_FROZEN it creates the "watchdog/N" thread for the CPU
 * that is coming up.
 *
 * NOTE(review): this listing omits the function's return type and, at
 * the elision marker, the remaining cases and the end of the switch.
 */
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
struct task_struct *p;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
/* This CPU must not have a watchdog thread registered yet. */
BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
if (IS_ERR(p)) {
printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
/* Reset the stamp, record the thread and bind it to this CPU. */
per_cpu(softlockup_touch_ts, hotcpu) = 0;
per_cpu(softlockup_watchdog, hotcpu) = p;
kthread_bind(p, hotcpu);
break;
。。。。。。
}

watchdog线程的实现如下:

点击(此处)折叠或打开

/*
 * Body of the per-CPU soft-lockup watchdog thread.
 *
 * Switches itself to SCHED_FIFO at priority MAX_RT_PRIO-1, then loops:
 * refresh the soft-lockup timestamp, sleep in schedule() until woken,
 * and repeat until kthread_should_stop() reports true. Returns 0.
 * @__bind_cpu is not used in this function body.
 */
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(current, SCHED_FIFO, &param);
/* initialize timestamp */
/* First timestamp refresh. */
__touch_softlockup_watchdog();
/* Mark the task as sleeping before the first schedule() call. */
set_current_state(TASK_INTERRUPTIBLE);
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
/* Refresh the timestamp on every pass through the loop. */
__touch_softlockup_watchdog();
schedule();
if (kthread_should_stop())
break;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}

其关键就是上面两处更新时间戳的地方，其实现如下，

点击(此处)折叠或打开

/*
 * Store the current timestamp of the executing CPU in that CPU's
 * softlockup_touch_ts variable.
 */
static void __touch_softlockup_watchdog(void)
{
int this_cpu = raw_smp_processor_id();
__raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
}

怎么样，一结合来看，上下逻辑就很清楚了吧。

最后，我们来说说linux系统是如何保证高优先级的实时调度类先调度、接着cfs调度类，然后是idle调度类依次调度的呢？其实，仔细想想便可知道，系统任务的切换（或者调度类的切换）一定离不开schedule调度器这个关键点，而我们再仔细的分析一下它的代码，便会发现其奥妙在pick_next_task中，具体实现如下：

点击(此处)折叠或打开

/*
 * Choose the next task to run on @rq.
 *
 * Tries the fair class directly when all runnable tasks belong to it;
 * otherwise asks each scheduling class in turn, starting at
 * sched_class_highest and following ->next, and returns the first
 * task any class offers.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.nr_running)) {
p = fair_sched_class.pick_next_task(rq);
if (likely(p))
return p;
}
/*
 * The class walk below is what orders the classes: a class is only
 * asked after every class ahead of it in the ->next chain returned
 * NULL (per the article: real-time, then CFS, then idle).
 */
class = sched_class_highest;
for ( ; ; ) {
p = class->pick_next_task(rq);
if (p)
return p;
/*
* Will never be NULL as the idle class always
* returns a non-NULL p:
*/
class = class->next;
}
}

就在挑选下一个任务的过程中，完成了我们追问的关键点。另外，还有两个点set_rq_online和set_rq_offline都会完成类似的动作，在此不做深入讨论。

阅读(3949) | 评论(0) | 转发(0) |

上一篇：完全公平调度CFS探索之实现

下一篇：2012 Wind River 开发者大会（深圳区域）

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6