2012年(2)
分类: LINUX
2012-05-31 13:20:28
在2.6.21内核之前,时钟中断是周期性的,即以HZ为频率触发,系统总是被动地接受时钟中断,然后运行中断处理程序。如果实在没有任务可以运行,那么就执行idle,这也许也算一种创意,可是时钟中断还是会周期性地打破idle,然后查询有没有需要做的事情,如果没有就继续idle。
以往的进程在特定的固定时间片内运行,时钟的定时中断提供了时间片的监督工作,一切显得十分和谐,可是系统内核本身就是没有主权,一切都在硬件的安排下进行。
从2.6.21开始,nohz(CONFIG_NO_HZ)才出现,nohz其实就是动态设置下一次的中断时间,而不是无条件地使用系统默认的HZ周期中断。
这样cfs调度器,再也不用受制于底层的时钟以及时间片分配特性,linux可以动态设置时间片长短,按照自己的方式来进行调度。
nohz其实就是托了抽象出来的clocksource和 clock_event_device的福,clocksource和 clock_event_device,这两个结构体就是时钟以及时钟行为的抽象。
先熟悉两个数据结构
struct timer_list :软件时钟,记录了软件时钟的到期时间以及到期后要执行的操作。
struct tvec_base(在下面的2.6.24代码中名为 struct tvec_t_base_s,typedef 为 tvec_base_t):用于组织、管理软件时钟的结构。在 SMP 系统中,每个 CPU 有一个。
/*
 * struct timer_list - a software timer.
 *
 * Records when the timer expires and what to run once it has expired.
 */
struct timer_list {
struct list_head entry; // list node linking this timer into the list it is queued on
unsigned long expires; // expiry time, in ticks
void (*function)(unsigned long); // callback executed when the timer expires
unsigned long data; // argument for the callback
struct tvec_t_base_s *base; // the struct tvec_t_base_s this timer belongs to
#ifdef CONFIG_TIMER_STATS
// Fields below exist only with CONFIG_TIMER_STATS.
// NOTE(review): presumably they record who started the timer (call site,
// command name, pid) -- confirm against the timer statistics code.
void *start_site;
char start_comm[16];
int start_pid;
#endif
};
/*
 * struct tvec_t_base_s - container that organizes and manages software timers.
 *
 * The timers are spread over five vectors (tv1..tv5) according to how far
 * their expiry time lies beyond timer_jiffies.
 */
struct tvec_t_base_s {
spinlock_t lock; // NOTE(review): presumably guards this base's timer vectors -- confirm
struct timer_list *running_timer; // the software timer currently being processed
unsigned long timer_jiffies; // expiry time of the software timers currently being processed
tvec_root_t tv1; // timers expiring between timer_jiffies and timer_jiffies + 2^8 (bounds included)
tvec_t tv2; // timers expiring between timer_jiffies + 2^8 and timer_jiffies + 2^14 (bounds included)
tvec_t tv3; // same scheme, offsets 2^14 .. 2^20
tvec_t tv4; // same scheme, offsets 2^20 .. 2^26
tvec_t tv5; // same scheme, offsets 2^26 .. 2^32
} ____cacheline_aligned;
typedef struct tvec_t_base_s tvec_base_t;
//下面开始跟踪内核中timer的代码,内核版本2.6.24
/*
 * Bottom half of the timer interrupt.
 *
 * Runs the timers and the timer-tq in bottom half (softirq) context:
 * first the hrtimer queues, then the classic timers of this CPU.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	/* Timer base belonging to the CPU we are running on. */
	tvec_base_t *cpu_timers = __get_cpu_var(tvec_bases);

	/* This call is where a switch to nohz or high-res mode may happen. */
	hrtimer_run_queues();

	/* Nothing to run while jiffies is still behind timer_jiffies. */
	if (!time_after_eq(jiffies, cpu_timers->timer_jiffies))
		return;

	/* Run the expired timers' callbacks. */
	__run_timers(cpu_timers);
}
/*
 * Expire hrtimers; invoked from the timer softirq on every jiffy.
 *
 * For HRT this is the fallback code that runs the softirq in the timer
 * softirq context, in case hrtimer initialization failed or has not been
 * done yet.
 */
void hrtimer_run_queues(void)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	int idx = 0;

	/* High-resolution mode is already active: skip the fallback path. */
	if (hrtimer_hres_active())
		return;

	/*
	 * This _is_ ugly: the softirq context has to check whether we can
	 * switch to highres and / or nohz mode. The clocksource switch
	 * happens in the timer interrupt with xtime_lock held, and the
	 * notification from there only sets the check bit in the
	 * tick_oneshot code; otherwise we might deadlock vs. xtime_lock.
	 *
	 * The && short-circuits, so hrtimer_switch_to_hres() is attempted
	 * only when tick_check_oneshot_change() reports a possible change.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()) &&
	    hrtimer_switch_to_hres())
		return;

	hrtimer_get_softirq_time(cpu_base);

	/* Process every clock base of this CPU in index order. */
	while (idx < HRTIMER_MAX_CLOCK_BASES) {
		run_hrtimer_queue(cpu_base, idx);
		idx++;
	}
}
/**
 * Check whether a change happened which makes oneshot mode possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). @allow_nohz signals that we may switch into low-res nohz
 * mode, because high resolution timers are disabled (either at compile
 * time or at runtime).
 *
 * Returns 1 only when oneshot mode is possible and @allow_nohz is zero.
 * Returns 0 in every other case, including after the switch to nohz has
 * been attempted here.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	/*
	 * Evaluated strictly left to right: the check bit is consumed
	 * first, and the remaining conditions are only looked at when
	 * that bit was set.
	 */
	if (!test_and_clear_bit(0, &ts->check_clocks) ||
	    ts->nohz_mode != NOHZ_MODE_INACTIVE ||
	    !timekeeping_is_continuous() ||
	    !tick_is_oneshot_available())
		return 0;

	if (allow_nohz) {
		/* All conditions are met: switch this CPU to nohz. */
		tick_nohz_switch_to_nohz();
		return 0;
	}

	return 1;
}
/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 *
 * Does nothing when tick_nohz_enabled is zero or when the switch to
 * oneshot mode fails. Otherwise marks this CPU as NOHZ_MODE_LOWRES and
 * programs the first tick event, with interrupts disabled throughout.
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	ktime_t expiry;

	if (!tick_nohz_enabled)
		return;

	local_irq_disable();

	/*
	 * Put the timer into oneshot (single-shot) mode and install
	 * tick_nohz_handler as its callback. A non-zero result means the
	 * switch failed, so back out.
	 */
	if (tick_switch_to_oneshot(tick_nohz_handler)) {
		local_irq_enable();
		return;
	}

	ts->nohz_mode = NOHZ_MODE_LOWRES;

	/*
	 * Recycle the hrtimer in ts, so we can share the
	 * hrtimer_forward with the highres code.
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

	/* Get the next period */
	expiry = tick_init_jiffy_update();
	ts->sched_timer.expires = expiry;

	/*
	 * Keep pushing the expiry out by one tick_period until
	 * tick_program_event() returns zero.
	 */
	while (tick_program_event(expiry, 0)) {
		expiry = ktime_add(expiry, tick_period);
		ts->sched_timer.expires = expiry;
	}

	local_irq_enable();

	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
	       smp_processor_id());
}
/*
 * The nohz low res interrupt handler
 *
 * Installed as the oneshot callback by tick_nohz_switch_to_nohz().
 * In order: takes over / performs the jiffies update if this CPU is the
 * do_timer CPU, touches the watchdog when the idle tick is stopped, does
 * the per-tick process accounting, and finally re-arms the timer unless
 * the tick is stopped.
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
int cpu = smp_processor_id();
ktime_t now = ktime_get();
// Reset the device's next-event stamp to KTIME_MAX.
// NOTE(review): presumably this means "no event programmed" -- confirm.
dev->next_event.tv64 = KTIME_MAX;
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the cpu in charge went
* into a long sleep. If two cpus happen to assign themself to
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
if (unlikely(tick_do_timer_cpu == -1))
tick_do_timer_cpu = cpu;
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
* time. This happens on complete idle SMP systems while
* waiting on the login prompt. We also increment the "start
* of idle" jiffy stamp so the idle accounting adjustment we
* do when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) { // the idle tick is already stopped: pet the watchdog on behalf of idle
touch_softlockup_watchdog();
ts->idle_jiffies++;
}
update_process_times(user_mode(regs)); // the per-tick scheduler hook is invoked from inside this call
profile_tick(CPU_PROFILING);
/* Do not restart, when we are in the idle loop */
if (ts->tick_stopped)
return;
while (tick_nohz_reprogram(ts, now)) { // re-arm the timer; loop again while reprogramming returns non-zero
now = ktime_get();
tick_do_update_jiffies64(now); // jiffies is updated here
}
}
//tick_sched结构
/**
 * struct tick_sched - sched tick emulation and no idle tick control/stats
 * @sched_timer: hrtimer to schedule the periodic tick in high
 * resolution mode; tick_nohz_switch_to_nohz() also
 * re-initialises and reuses it for the low-res nohz tick
 * @check_clocks: bit 0 is tested and cleared by
 * tick_check_oneshot_change() before it considers a
 * switch to oneshot mode
 * @nohz_mode: compared against NOHZ_MODE_INACTIVE by
 * tick_check_oneshot_change() and set to NOHZ_MODE_LOWRES
 * by tick_nohz_switch_to_nohz()
 * @idle_tick: Store the last idle tick expiry time when the tick
 * timer is modified for idle sleeps. This is necessary
 * to resume the tick timer operation in the timeline
 * when the CPU returns from idle
 * @tick_stopped: Indicator that the idle tick has been stopped
 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
 * @idle_calls: Total number of idle calls
 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
 * @idle_entrytime: Time when the idle call was entered
 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
 * @sleep_length: Duration of the current idle sleep
 *
 * NOTE(review): @last_jiffies, @next_jiffies and @idle_expires are not
 * documented in the original comment and their use is not visible in
 * this excerpt -- confirm against the tick-sched code before relying on them.
 */
struct tick_sched {
struct hrtimer sched_timer;
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t idle_tick;
int tick_stopped;
unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
ktime_t idle_entrytime;
ktime_t idle_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
ktime_t idle_expires;
};
/*
 * Charge one tick to the process that is currently running.
 *
 * Called from the timer interrupt handler.
 * @user_tick is 1 when the tick counts as user time, 0 for system time.
 */
void update_process_times(int user_tick)
{
	struct task_struct *tsk = current;
	int this_cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(tsk, user_tick);

	run_local_timers();

	if (rcu_pending(this_cpu)) {
		rcu_check_callbacks(this_cpu, user_tick);
	}

	/* The scheduler gets exactly one call per tick. */
	scheduler_tick();

	run_posix_cpu_timers(tsk);
}
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 *
 * Under rq->lock it advances the runqueue clock, updates the CPU load and
 * hands the tick to the scheduling class of the current task.
 */
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
// Earliest value rq->clock may hold after this tick; computed before the lock is taken.
u64 next_tick = rq->tick_timestamp + TICK_NSEC;
spin_lock(&rq->lock);
__update_rq_clock(rq);
/*
* Let rq->clock advance by at least TICK_NSEC:
*/
if (unlikely(rq->clock < next_tick))
rq->clock = next_tick;
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
if (curr != rq->idle) /* FIXME: needed? */
curr->sched_class->task_tick(rq, curr); // per-class tick hook: task_tick_fair for normal tasks, task_tick_rt for real-time tasks, or task_tick_idle
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
// SMP only: record whether this CPU was idle at the tick, then trigger load balancing.
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
}
//每种进程调度策略都有这样一个结构体,下面是fair_sched_class,即普通进程调度策略
/*
 * All the scheduling class methods:
 *
 * Method table for the fair (normal-task) scheduling policy. Each
 * scheduling policy has one such struct sched_class; .next links this
 * one to idle_sched_class.
 */
static const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
#ifdef CONFIG_SMP
// Load-balancing hooks are only present on SMP builds.
.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
#endif
.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair, // per-tick hook for normal tasks; called from scheduler_tick()
.task_new = task_new_fair,
};
//就跟踪到这里。task_tick_fair和task_tick_rt以后再分析