Linux学习总结—进程切换和调度算法深入分析-istvh-ChinaUnix博客

Linux循序渐进

首页　| 　博文目录　| 　关于我

istvh

博客访问： 629890
博文数量： 172
博客积分： 10010
博客等级：上将
技术积分： 1252
用户组：普通用户
注册时间： 2009-06-29 22:26

文章分类

全部博文（172）

software（3）
algorithm（24）
Linux（144）
未分配的博文（1）

文章存档

2011年（6）

2010年（7）

2009年（159）

我的朋友

相关博文

Linux学习总结—进程切换和调度算法深入分析

分类： LINUX

2009-07-19 21:36:26

一、Linux进程切换深入分析
#define CLONE_KERNEL     (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
创建内核线程时使用的CLONE标志。
1．#define unlikely(x)      __builtin_expect(!!(x), 0)
编译器优化，__builtin_expect的返回值就是表达式x的值（x是整型表达式），第二个参数0表示并不预期该事件发生，也就是说x为真（非0）的可能性很小，这是为了让编译器对下面的语句进行优化。
2．进程内核态堆栈结构：
进程是动态实体，进程描述符是存放在动态内存中的。在一块进程内存区上，Linux存放了两个数据结构：指向task_struct的thread_info和内核态的进程栈。大小一般2页8K，这要求页面帧对齐2的13次幂，在X86上编译时可以配置大小为4K。thread_info在内存区开始处，内核栈从内存尾向下增长。在C语言中可以用union结构表示：
图1. 8K内核栈和进程描述符task_struct及thread_info的相互关系

/*
 * One per-process kernel memory area shared by the thread_info
 * descriptor and the kernel-mode stack. According to the accompanying
 * text, thread_info sits at the start of the area and the stack grows
 * downward from the end, so both fit in the same block.
 */
union thread_union {
        struct thread_info thread_info;
        unsigned long stack[2048]; /* 1024 for 4KB stacks */
    };

CPU的esp寄存器用作指向堆栈顶部的指针，当从用户态转向内核态时，进程内核栈总是空的，所以esp就会指向堆栈底部。
使用alloc_thread_info 和free_thread_info用于分配和释放一个存放thread_info结构和内核堆栈的内存区。
内核通过当前esp指针可以很方便的得到thread_info结构的地址。current_thread_info(void)的原理即如下：
movl $0xffffe000,%ecx /* or 0xfffff000 for 4KB stacks */
   andl %esp,%ecx
movl %ecx,p
thread_info中task指针是第一个，所以current宏相当于current_thread_info( )->task，从而也就得到task指针。

每个进程有自己独立的进程空间，所有进程共享CPU寄存器。进程继续执行时必须装入寄存器恢复的数据集称为硬件上下文环境。在Linux中部分硬件上下文存放在进程描述符中，部分存放到内核态堆栈里。

3. 进程切换堆栈原理：
每个进程有自己独立的进程空间，所有进程共享CPU寄存器。进程继续执行时必须装入寄存器恢复的数据集称为硬件上下文环境。在Linux中部分硬件上下文存放在进程描述符中，部分存放到内核态堆栈里。
80x86体系支持在进程TSS段跳转时自动执行进程硬件上下文切换。Linux使用软件方法实现。软件方式效率差不多，但更灵活，可以控制流程，留下优化空间。
80x86用TSS段保存硬件上下文内容，每个CPU有一个TSS段。从用户态到内核态切换时，从TSS中取出内核栈地址。用户态进程访问I/O端口时，TSS中的I/O访问位图可以验证权限。tss_struct描述了TSS格式，init_tss存放初始TSS内容，每次进程切换，内核更新TSS中的某些字段，以反映当前运行进程的权限等级。每个进程有个反映任务CPU状态的thread_struct结构变量thread，除eax、ecx等通用寄存器内容保存在内核态堆栈中，其他大部分寄存器都保存在此结构中。该结构一部分对应于tss_struct中的内容，进程切换时把thread中某些内容更新到tss_struct中就可以反映当前任务的运行CPU环境。
/*
 * In-memory layout of the x86 Task State Segment as described by the
 * accompanying text: one TSS exists per CPU, the kernel stack address
 * is taken from it on a user-to-kernel transition, and its I/O
 * permission bitmap is consulted when user code touches an I/O port.
 * The structure is declared packed, so field order and widths below
 * are significant and must not be rearranged.
 */
struct tss_struct {
    unsigned short    back_link,__blh;
    /* Refreshed on every switch: __switch_to does tss->esp0 = thread->esp0. */
    unsigned long esp0;
    unsigned short    ss0,__ss0h;
    unsigned long esp1;
    unsigned short    ss1,__ss1h;   /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
    unsigned long esp2;
    unsigned short    ss2,__ss2h;
    unsigned long __cr3;
    unsigned long eip;
    unsigned long eflags;
    unsigned long eax,ecx,edx,ebx;
    unsigned long esp;
    unsigned long ebp;
    unsigned long esi;
    unsigned long edi;
    unsigned short    es, __esh;
    unsigned short    cs, __csh;
    unsigned short    ss, __ssh;
    unsigned short    ds, __dsh;
    unsigned short    fs, __fsh;
    unsigned short    gs, __gsh;
    unsigned short    ldt, __ldth;
    /*
     * io_bitmap_base is set during a task switch to IO_BITMAP_OFFSET,
     * INVALID_IO_BITMAP_OFFSET or INVALID_IO_BITMAP_OFFSET_LAZY,
     * depending on whether the incoming task has its own bitmap and
     * whether it is already loaded in this TSS.
     */
    unsigned short    trace, io_bitmap_base;
    /*
     * The extra 1 is there because the CPU will access an
     * additional byte beyond the end of the IO permission
     * bitmap. The extra byte must be all 1 bits, and must
     * be within the limit.
     */
    unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
    /*
     * Cache the current maximum and the last task that used the bitmap:
     */
    unsigned long io_bitmap_max;
    struct thread_struct *io_bitmap_owner;
    /*
     * pads the TSS to be cacheline-aligned (size is 0x100)
     */
    unsigned long __cacheline_filler[35];
    /*
     * .. and then another 0x100 bytes for emergency kernel stack
     */
    unsigned long stack[64];
} __attribute__((packed));

/*
 * Per-task CPU state kept in the process descriptor (the `thread`
 * member). The switch_to macro saves and restores esp and eip through
 * this structure; per the accompanying text, general-purpose registers
 * such as eax and ecx live on the kernel-mode stack instead.
 * NOTE(review): the article's asm listing uses fixed offsets (480/484)
 * for eip/esp, so field order here presumably must not change.
 */
struct thread_struct {
/* cached TLS descriptors. */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
/* Copied into tss->esp0 by load_esp0() when this task is switched in. */
unsigned long esp0;
unsigned long sysenter_cs;
/* Resume address and kernel stack pointer saved/loaded by switch_to. */
unsigned long eip;
unsigned long esp;
unsigned long fs;
unsigned long gs;
/* Hardware debugging registers */
unsigned long debugreg[8]; /* %%db0-7 debug registers */
/* fault info */
unsigned long cr2, trap_no, error_code;
/* floating point info */
union i387_union i387;
/* virtual 86 mode info */
struct vm86_struct __user * vm86_info;
unsigned long     screen_bitmap;
unsigned long     v86flags, v86mask, saved_esp0;
unsigned int      saved_fs, saved_gs;
/* IO permissions */
unsigned long *io_bitmap_ptr;
    unsigned long iopl;
/* max allowed port in the bitmap, in bytes: */
unsigned long io_bitmap_max;
};

4．进程切换流程解析switch_to
进程切换本质上两步：
1)      进程页表PGD切换；
2)      内核态堆栈和硬件上下文切换（包括CPU寄存器）；
   上面两步通过context_switch()实现，它通过调用switch_mm()切换进程空间，switch_to切换内核上下文环境。

首先看看context_switch()做了些什么：
1)        进程描述符中active_mm指向进程使用的地址空间，mm指向进程拥有的地址空间，对于普通进程它们相同。对于内核线程，它的mm总为NULL。所以context_switch()首先判断if (!next->mm)即next为内核线程，则使用prev的进程地址空间：
if (!next->mm) {
    next->active_mm = prev->active_mm;
    atomic_inc(&prev->active_mm->mm_count);
    enter_lazy_tlb(prev->active_mm, next);
}
2)        否则，如果next是普通进程，则用next进程空间替换prev的地址空间：
    switch_mm(oldmm, mm, next);
3)        如果prev是内核线程或者正在退出（prev->mm为NULL），则把prev->active_mm设为NULL，并把它借用的地址空间oldmm记录到runqueue的prev_mm中，留待finish_task_switch()里调用mmdrop()释放引用：
if (!prev->mm) {
      prev->active_mm = NULL;
      WARN_ON(rq->prev_mm);
      rq->prev_mm = oldmm;
}

下面看看switch_mm()如何切换进程空间：
1)        获取cpu逻辑号。
2)        清除cpu_vm_mask位标志。cpu_clear(cpu, prev->cpu_vm_mask)
3)        设置cpu_tlbstate状态。per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK
4)        设置cpu_tlbstate的active_mm为next。per_cpu(cpu_tlbstate, cpu).active_mm = next
5)        设置next的cpu_vm_mask标志。cpu_set(cpu, next->cpu_vm_mask)
6)        装载next的pgd页表到cr3寄存器。load_cr3(next->pgd)
7)      如果next的LDT描述符改变，则加载next的LDT描述符。
if (unlikely(prev->context.ldt != next->context.ldt))
           load_LDT_nolock(&next->context);

最后，switch_to进行内核堆栈和CPU环境切换操作：
/*
 * switch_to(prev, next, last) - switch kernel stack and resume point
 * from task `prev` to task `next`.
 *
 * Sequence performed by the asm below:
 *   - push eflags and ebp on the current (prev) kernel stack;
 *   - store esp into prev->thread.esp and load esp from next->thread.esp;
 *   - store the address of local label 1 into prev->thread.eip, so that
 *     prev resumes there when it is switched back in;
 *   - push next->thread.eip and jump (not call) to __switch_to, so the
 *     pushed value acts as its return address;
 *   - at label 1, pop ebp and eflags back.
 *
 * prev is passed in eax (tied to output operand 2) and next in edx.
 * On resume, `last` receives whatever eax holds at that point. esi and
 * edi are declared as outputs, which tells the compiler the asm
 * overwrites them.
 */
#define switch_to(prev,next,last) do {               \
    unsigned long esi,edi;                    \
    asm volatile("pushfl\n\t"       /* Save flags */ \
            "pushl %%ebp\n\t"                 \
            "movl %%esp,%0\n\t" /* save ESP */       \
            "movl %5,%%esp\n\t" /* restore ESP */ \
            "movl $1f,%1\n\t"      /* save EIP */       \
            "pushl %6\n\t"      /* restore EIP */ \
            "jmp __switch_to\n"           \
            "1:\t"                     \
            "popl %%ebp\n\t"                  \
            "popfl"                    \
            :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
             "=a" (last),"=S" (esi),"=D" (edi)          \
            :"m" (next->thread.esp),"m" (next->thread.eip), \
             "2" (prev), "d" (next));            \
} while (0)

流程描述，prev是进程A的task结构，next是进程B的task结构，last是进程C的结构：
1)      保存prev和next指针的值到eax和edx：
movl prev, %eax
movl next, %edx
2)      保存eflags 和 ebp 寄存器内容到prev内核态堆栈中：
pushfl
pushl %ebp

3)      将esp内容保存到prev->thread.esp中，该字段指向prev内核堆栈的top地址。
movl %esp,484(%eax)

4)      将next->thread.esp加载到esp中，现在开始，esp指向next的内核堆栈，进程切换完成。
movl 484(%edx), %esp

5)      保存下面Label 1到prev->thread.eip指针中，当prev进程恢复运行时，从该位置开始运行。
movl $1f, 480(%eax)

6)      将next->thread.eip的指针内容压到next的内核态堆栈中，通常它的内容也是Label 1。
pushl 480(%edx)

7)      跳转到__switch_to（）C函数执行。
jmp __switch_to

8)      被替换的进程A继续执行，它在Label 1处，首先是恢复eflags和ebp寄存器内容。注意这里是发生在调度器选择prev在CPU上运行后，此时esp已经指向了prev的内核堆栈。
1:
      popl %ebp
   popfl

9)      将eax内容保存到last任务结构中。这里eax是被进程A切换下来的进程C的task结构指针。
movl %eax, last

5．__switch_to深入分析
__switch_to参数是存放在eax和edx中的内容，这通过
#define fastcall __attribute__((regparm(3)))告诉gcc编译器。
1)      获取tss_struct tss、prev_p和next_p的thread_struct结构prev和next、当前CPU逻辑ID。
2)      调用__unlazy_fpu(prev_p)根据条件标志选择是否保存prev_p的FPU, MMX, 和XMM寄存器内容。
3)      load_esp0(tss, next)将next的堆栈地址存放到tss中：tss->esp0 = thread->esp0。
4)      savesegment(gs, prev->gs)保存gs寄存器到prev->gs，fs已经在栈入口保存，es和ds在内核态下不需要保存。
5)      load_TLS(next, cpu)从next的tls_array 缓存中加载线程的Thread-Local Storage描述符。TLS在GDT表中位置6、7、8。
cpu_gdt_table[cpu][6] = next_p->thread.tls_array[0];
cpu_gdt_table[cpu][7] = next_p->thread.tls_array[1];
    cpu_gdt_table[cpu][8] = next_p->thread.tls_array[2];
6)      如果当前特权级别是0并且prev->iopl != next->iopl则恢复IOPL设置set_iopl_mask(next->iopl)。
7)      根据thread_info的TIF标志_TIF_WORK_CTXSW和TIF_IO_BITMAP判断是否需要处理debug寄存器和IO位图：__switch_to_xtra(next_p, tss);
•        只有当next_p挂起时即if (test_tsk_thread_flag(next_p, TIF_DEBUG))使用了debug寄存器才需要恢复set_debugreg(next->debugreg[i], i)。只有调试器需要监控prev的状态时，prev_p->thread.debugreg数组的内容才会被修改。Debug寄存器dr0～dr7，dr4和dr5不用。
•         当prev_p或者next_p定义了自己的I/O访问位图时，必须更新TSS的I/O bitmap。
if (prev_p->thread.io_bitmap_ptr || next_p->thread.io_bitmap_ptr)          handle_io_bitmap(&next_p->thread, &init_tss[cpu]);
进程的I/O访问位图存放在io_bitmap_ptr指针里，通常进程很少修改IO位图，只有当前时间片中访问IO端口才会把实际的IO位图加载到TSS中。
-         当next_p没有自定义位图时：
tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 返回
-         如果next == tss->io_bitmap_owner则设置有效的偏移量：tss->io_bitmap_base = IO_BITMAP_OFFSET; 返回
-         否则tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
只有第二种情况tss->io_bitmap_base设置的是有效的io_bitmap偏移量，对于其他两种情况，当用户进程访问I/O端口时将会触发"General protection"异常，do_general_protection( )异常处理函数根据io_bitmap_base的值处理异常：如果是0x8000(INVALID_IO_BITMAP_OFFSET)则发送SIGSEGV信号给用户进程；如果是0x9000(INVALID_IO_BITMAP_OFFSET_LAZY)则拷贝进程的thread中的io_bitmap_ptr内容到io_bitmap中，并设置io_bitmap_base为正确的偏移量(104)。

8)      disable_tsc(prev_p, next_p)设置cr4中的TSC Disable位。
9)      arch_leave_lazy_cpu_mode()设置CPU的lazy模式。
10) 如果next_p->fpu_counter > 5则恢复next_p的FPU寄存器内容：
math_state_restore()。FPU寄存器存放在next_p->thread.i387中，i387是i387_union的union结构：
/*
 * Storage for a task's saved FPU state (the i387 member of
 * thread_struct). The three members are alternative layouts sharing
 * the same memory.
 * NOTE(review): presumably fsave/fxsave match the FSAVE/FXSAVE
 * instruction images and soft is for math emulation -- confirm.
 */
union i387_union {
struct i387_fsave_struct fsave;
struct i387_fxsave_struct   fxsave;
struct i387_soft_struct soft;
};
/*
 * Saved FPU/SSE register image, one of the layouts in i387_union.
 * The structure is declared with 16-byte alignment.
 * NOTE(review): field names suggest this mirrors the FXSAVE memory
 * format (control/status/tag words, instruction and operand pointers,
 * MXCSR) -- confirm against the processor manual.
 */
struct i387_fxsave_struct {
unsigned short    cwd;
unsigned short    swd;
unsigned short    twd;
unsigned short    fop;
long   fip;
long   fcs;
long   foo;
long   fos;
long   mxcsr;
long   mxcsr_mask;
long   st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
long   xmm_space[32];    /* 8*16 bytes for each XMM-reg = 128 bytes */
long   padding[56];
} __attribute__ ((aligned (16)));

11) 如果需要，则从next->gs中恢复gs寄存器内容。
if (prev->gs | next->gs)
         loadsegment(gs, next->gs);
二、Linux实时调度schedule
1．概述
三种调度策略：SCHED_FIFO，SCHED_RR和SCHED_NORMAL。
FIFO实时调度算法当调度器将CPU指定给某个进程时，它把该进程放到运行队列首；除非有更高优先级的进程，否则该进程将一直占用CPU。
Round Robin实时进程调度把CPU指定给某进程，把它放到运行队列尾。时间片运行完再选择其他进程调度。这样保证了同优先级的公平竞争CPU。
SCHED_NORMAL是普通的基于运行时间和等待时间等，动态调整进程优先级的一种调度策略。
实时进程优先级0～99，普通进程100～139（数值越小优先级越高）。
2．实时进程调度的时机
1)      该进程被更高优先级的进程抢占；
2)      进程执行一个阻塞操作，被放到睡眠队列，状态为TASK_INTERRUPTIBLE或TASK_UNINTERRUPTIBLE；
3)      进程被暂停(状态为TASK_STOPPED 或TASK_TRACED)，或者进程被杀死(状态为EXIT_ZOMBIE 或 EXIT_DEAD)
4)      进程调用sched_yield()主动放弃CPU；
5)      RR实时进程用完了CPU分配的时间片；

3．调度器相关函数
1)      scheduler_tick( )
更新当前进程的运行时间片tick值，在update_process_times( )中调用，判断进程的时间片是否用完。

2)      try_to_wake_up( )
唤醒一个睡眠的进程并把它的状态设为TASK_RUNNING，插入到运行队列中。

3)      recalc_task_prio( )
更新进程的睡眠时间和动态优先级，SCHED_NORMAL调度。

4)      schedule( )
进程调度

5)      load_balance()
SMP系统的负载均衡。

4．schedule( )函数
进程调度有两种方式：直接调用和延迟调用。
直接调用schedule，当前进程资源不可用时会直接调用调度器，这种情况下，内核线程进行如下处理：
1)      将current插入到合适的等待队列中；
2)      将current状态变为TASK_INTERRUPTIBLE 或TASK_UNINTERRUPTIBLE
3)      调用schedule();
4)      检查资源是否可用，如果不可用，转到第2）步；
5)      一旦资源可用，从等待队列中移除current进程；

在设备驱动程序中也经常会检查TIF_NEED_RESCHED并调用schedule()。

延迟调用方式是通过设置current进程的TIF_NEED_RESCHED标志为1。当恢复用户态进程的执行前，会检查该标志并决定是否调用schedule()。延迟调度的情形有：
1)      在scheduler_tick()中如果current用完了时间片则设置该标志；
2)      在try_to_wake_up( )中唤醒一个进程并且该进程比当前运行进程优先级高。
3)      调用sched_setscheduler()时。

schedule()函数工作流程：
进程切换前的工作：
1)      禁止内核抢占，初始化局部变量prev，释放prev占有的大内核锁；
need_resched:
    preempt_disable();
    prev = current;
    release_kernel_lock(prev);
2)      读取调度TSC时间，计算调整run_time时间，更新调度状态rq->sched_cnt参数，获取rq的spin锁：spin_lock_irq(&rq->lock)。
3)      检查prev状态：如果状态不是TASK_RUNNING且没有在内核态被抢占，则从运行队列中移除；但是如果prev状态是TASK_INTERRUPTIBLE并且拥有非阻塞挂起的信号，则把进程状态设为TASK_RUNNING不移出运行队列。
     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
       switch_count = &prev->nvcsw;
       if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
              unlikely(signal_pending(prev))))
           prev->state = TASK_RUNNING;
       else {
           if (prev->state == TASK_UNINTERRUPTIBLE)
              rq->nr_uninterruptible++;
           deactivate_task(prev, rq);
       }
    }
4)      获取当前CPU逻辑号，如果当前运行队列为空，则调用idle_balance(cpu, rq)从其他CPU运行队列上拉进程到本地CPU的运行队列上。如果调整后，当前运行队列仍为空则next赋为idle进程，跳转到任务切换代码行去。
    if (unlikely(!rq->nr_running)) {
       idle_balance(cpu, rq);
       if (!rq->nr_running) {
           next = rq->idle;
           rq->expired_timestamp = 0;
           goto switch_tasks;
       }
    }
5)      如果runqueue中有进程，并且active队列中的活动进程数为0，则交换active 和 expired队列指针。
    array = rq->active;
    if (unlikely(!array->nr_active)) {
       schedstat_inc(rq, sched_switch);
       rq->active = rq->expired;
       rq->expired = array;
       array = rq->active;
       rq->expired_timestamp = 0;
       rq->best_expired_prio = MAX_PRIO;
    }

6)      从运行队列的活动prio_array数据的位图中查找第一个位设置为1的索引，根据索引找到该优先级队列的第一个task。
idx = sched_find_first_bit(array->bitmap);
    queue = array->queue + idx;
    next = list_entry(queue->next, struct task_struct, run_list);

7)      如果next是普通进程，并且next->sleep_type是SLEEP_INTERACTIVE 或SLEEP_INTERRUPTED，则重新计算进程睡眠时间和进程优先级。

进程切换工作：
8)      更新sched_goidle，预取next结构数据，清除TIF_NEED_RESCHED标志，设置quiescent状态计数为1：rcu_data ->passed_quiesc = 1;
switch_tasks:
if (next == rq->idle)
    schedstat_inc(rq, sched_goidle);
prefetch(next);
prefetch_stack(next);
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));

9)      更新prev进程运行时间戳prev->sleep_avg，prev->timestamp;
10) 调度信息切换到next，更新next的时间戳和运行队列信息：
sched_info_switch(prev, next);
if (likely(prev != next)) {
    next->timestamp = next->last_ran = now;
    rq->nr_switches++;
    rq->curr = next;
    ++*switch_count;
     ……
}
11) 进行进程切换，context_switch参见前面的分析，它进行进程空间和内核堆栈切换。prepare_lock_switch 功能是在定义了__ARCH_WANT_INTERRUPTS_ON_CTXSW情况下，在切换前开中断spin_unlock_irq(&rq->lock); barrier()是保证代码执行顺序不变。
     prepare_task_switch(rq, next);
    prev = context_switch(rq, prev, next);
    barrier();
    finish_task_switch(this_rq(), prev);

进程切换后的工作：
进程切换context_switch语句之后的代码并不是由next进程立即执行的，而是由调度器选择prev进程继续执行的。此时prev变量指向的已经是被prev进程替换的其他进程的指针。

12) finish_task_switch()必须与prepare_task_switch配对使用，并注意锁的顺序。它所做的工作，finish_lock_switch调用local_irq_enable(),获取prev的状态和rq->prev_mm，如果mm非空，则调用mmdrop(mm)减少mm的引用计数，如果为0则释放进程页表和虚拟空间。如果prev_state为TASK_DEAD则释放进程的task结构。

struct mm_struct *mm = rq->prev_mm;
long prev_state;

rq->prev_mm = NULL;
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
    mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
    kprobe_flush_task(prev);
    put_task_struct(prev);
}

13) 最后，if (unlikely(task->lock_depth >= 0))则通过__reacquire_kernel_lock重新获取大内核锁；如果reacquire_kernel_lock返回值小于0，则goto need_resched_nonpreemptible。之后允许抢占，如果TIF_NEED_RESCHED被设置，则跳转到need_resched重新进行调度。
prev = current;
if (unlikely(reacquire_kernel_lock(prev) < 0))
    goto need_resched_nonpreemptible;
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
    goto need_resched;

阅读(1734) | 评论(0) | 转发(0) |

上一篇：Linux学习总结—Linux调度器分析

下一篇：linux进程调度方法(SCHED_OTHER,SCHED_FIFO,SCHED_RR)

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6