分类:
2009-11-27 19:48:41
/*
 * startup_32: 32-bit entry point of the compressed-kernel bootstrap
 * (Linux 2.0-era arch/i386/boot/compressed/head.S).
 *
 * On entry we are in protected mode.  Under __SMP__, %bx tells us who
 * we are: %bx == 0 means the boot processor, %bx == 1 means a secondary
 * processor being trampolined in (which skips decompression and jumps
 * straight to the already-decompressed kernel at 0x100000).
 *
 * NOTE(review): the pasted text was line-wrap damaged ("#ifdef" split
 * from "__SMP__", instructions broken across lines, a duplicated ljmp);
 * reconstructed here into valid GNU as syntax.
 */
startup_32:
	cld
	cli
	movl $(KERNEL_DS),%eax		# load flat kernel data selector
	mov %ax,%ds			# ...into every data segment register
	mov %ax,%es
	mov %ax,%fs
	mov %ax,%gs
#ifdef __SMP__
	orw %bx,%bx			# What state are we in? BX=1 for SMP
					# 0 for boot
	jz 2f				# Initial boot (boot-CPU path)

/*
 * We are trampolining an SMP processor (secondary-CPU path).
 */
	mov %ax,%ss
	xorl %eax,%eax			# Back to 0
	mov %cx,%ax			# SP low 16 bits (passed in by the
					# trampoline code)
	movl %eax,%esp
	pushl 0				# Clear NT
					# NOTE(review): faithful to the
					# original source, which reads
					# "pushl 0" (pushes the dword at
					# address 0); the stated intent
					# ("clear NT") suggests "pushl $0".
	popfl
	ljmp $(KERNEL_CS), $0x100000	# Into C and sanity: jump to the
					# kernel the boot CPU decompressed
2:					# boot CPU continues here
#endif
	lss SYMBOL_NAME(stack_start),%esp
	xorl %eax,%eax
1:	incl %eax			# check that A20 really IS enabled
	movl %eax,0x000000		# loop forever if it isn't
	cmpl %eax,0x100000
	je 1b
/*
 * Initialize eflags. Some BIOS's leave bits like NT set. This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
	pushl $0
	popfl
/*
 * Clear BSS
 */
	xorl %eax,%eax
	movl $ SYMBOL_NAME(_edata),%edi
	movl $ SYMBOL_NAME(_end),%ecx
	subl %edi,%ecx
	cld
	rep
	stosb
/*
 * Do the decompression, and jump to the new kernel..
 */
	subl $16,%esp			# place for structure on the stack
	pushl %esp			# address of structure as first arg
	call SYMBOL_NAME(decompress_kernel)
	orl %eax,%eax			# nonzero: high-loaded kernel, must
	jnz 3f				# be moved first (label 3 not shown)
	xorl %ebx,%ebx
	ljmp $(KERNEL_CS), $0x100000	# jump to the decompressed kernel,
					# which eventually calls start_kernel
这个其实就是跳到start_kernel函数。
/*
 * start_kernel - architecture-independent kernel entry point.
 *
 * Runs with interrupts disabled.  Under __SMP__ every CPU arrives here,
 * but only the first caller (the boot CPU) performs system-wide
 * initialization; all later callers are diverted to start_secondary().
 * Never returns: the boot CPU ends up in cpu_idle() as task[0].
 */
asmlinkage void start_kernel(void)
{
	char * command_line;

	/*
	 * This little check will move.
	 */
#ifdef __SMP__
	/*
	 * Function-static, not a local: the boot CPU is always the first
	 * one through here, sees 1 and clears it; every secondary CPU
	 * then sees 0 and takes the secondary path instead.
	 */
	static int first_cpu = 1;

	if (!first_cpu)
		start_secondary();
	first_cpu = 0;
#endif
	/*
	 * Interrupts are still disabled. Do necessary setups, then
	 * enable them.
	 */
	setup_arch(&command_line, &memory_start, &memory_end);
	memory_start = paging_init(memory_start, memory_end);
	trap_init();
	init_IRQ();
	sched_init();
	time_init();
	parse_options(command_line);
#ifdef CONFIG_MODULES
	init_modules();
#endif
#ifdef CONFIG_PROFILE
	if (!prof_shift)
#ifdef CONFIG_PROFILE_SHIFT
		prof_shift = CONFIG_PROFILE_SHIFT;
#else
		prof_shift = 2;
#endif
#endif
	if (prof_shift) {
		/* Carve the profiling buffer out of boot memory;
		 * only text is profiled. */
		prof_buffer = (unsigned int *) memory_start;
		prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
		prof_len >>= prof_shift;
		memory_start += prof_len * sizeof(unsigned int);
	}
	memory_start = console_init(memory_start, memory_end);
#ifdef CONFIG_PCI
	memory_start = pci_init(memory_start, memory_end);
#endif
	memory_start = kmalloc_init(memory_start, memory_end);
	sti();
	calibrate_delay();
	memory_start = inode_init(memory_start, memory_end);
	memory_start = file_table_init(memory_start, memory_end);
	memory_start = name_cache_init(memory_start, memory_end);
#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start && initrd_start < memory_start) {
		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
			"disabling it.\n", initrd_start, memory_start);
		initrd_start = 0;
	}
#endif
	mem_init(memory_start, memory_end);
	buffer_init();
	sock_init();
#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD)
	ipc_init();
#endif
	dquot_init();
	arch_syms_export();
	sti();
	check_bugs();

	printk(linux_banner);
#ifdef __SMP__
	smp_init();
#endif
	sysctl_init();
	/*
	 * We count on the initial thread going ok.
	 * Like idlers, init is an unlocked kernel thread, which will
	 * make syscalls (and thus be locked).
	 */
	kernel_thread(init, NULL, 0);
	/*
	 * task[0] is meant to be used as an "idle" task: it may not sleep, but
	 * it might do some general things like count free pages or it could be
	 * used to implement a reasonable LRU algorithm for the paging routines:
	 * anything that can be useful, but shouldn't take time from the real
	 * processes.
	 *
	 * Right now task[0] just does a infinite idle loop.
	 */
	cpu_idle(NULL);
}
/*
 * start_secondary - entry path for every non-boot CPU.
 *
 * Sets up this CPU's own trap and interrupt handling, checks in with
 * the boot CPU via smp_callin() (which spins until the boot CPU sends
 * the "commence" signal), then drops into the idle loop forever.
 */
asmlinkage void start_secondary(void)
{
	trap_init();
	init_IRQ();	/* this CPU's own IRQ setup */
	smp_callin();	/* rendezvous: wait for the boot CPU's go-ahead */
	cpu_idle(NULL);	/* become this CPU's idle task */
}
void smp_callin(void)
{
extern void
calibrate_delay(void);
int
cpuid=GET_APIC_ID(apic_read(APIC_ID));
unsigned long l;
/*
* Activate
our APIC
*/
SMP_PRINTK(("CALLIN
%d\n",smp_processor_id()));
l=apic_read(APIC_SPIV);
l|=(1<<8); /*
Enable */
apic_write(APIC_SPIV,l);
sti();
/*
* Get
our bogomips.
*/
calibrate_delay();
/*
* Save
our processor parameters
*/
smp_store_cpu_info(cpuid);
/*
* Allow
the master to continue.
*/
set_bit(cpuid, (unsigned
long *)&cpu_callin_map[0]);
/*
* Until
we are ready for SMP scheduling
*/
load_ldt(0);
/* printk("Testing
faulting...\n");
*(long *)0=1; OOPS... */
local_flush_tlb();
while(!smp_commenced);
//这个可以看成是自旋锁,等待主cpu发smp_commenced信号即开始信号。
if (cpu_number_map[cpuid]
== -1)
while(1);
local_flush_tlb();
SMP_PRINTK(("Commenced..\n"));
load_TR(cpu_number_map[cpuid]);
/* while(1);*/
}
/*
 * cpu_idle - per-CPU idle loop; never returns.
 * @unused: ignored, present only to match the kernel_thread entry
 *          signature.
 */
int cpu_idle(void *unused)
{
	while (1)
		idle();
}
主cpu给各次cpu发开始信号是在init函数中调用smp_begin函数:
/*
 * smp_begin - called by the boot CPU (from the init task) to release
 * the secondary CPUs spinning inside smp_callin().
 */
static void smp_begin()
{
	smp_threads_ready = 1;
	/* smp_commence() notifies the secondary CPUs (via IPI-based
	 * interrupts, per the original author's note). */
	smp_commence();
}
每个cpu有一个current指针。
刚开始的时候由主cpu赋值为init_task;
在主cpu调用 sched_init赋值。
/*
 * sched_init - scheduler bring-up, executed once on the boot CPU.
 *
 * Points the per-CPU current pointers at init_task and registers the
 * timer, task-queue and immediate bottom halves.
 */
void sched_init(void)
{
	/*
	 * We have to do a little magic to get the first
	 * process right in SMP mode.
	 */
	int cpu = smp_processor_id();	/* 0: only the boot CPU calls this */

#ifndef __SMP__
	current_set[cpu] = &init_task;
#else
	/* Mark init_task as running on the boot CPU... */
	init_task.processor = cpu;
	/* ...and make every CPU slot initially point at init_task. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		current_set[cpu] = &init_task;
#endif
	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);
}
同时这些还会在 smp_init丰富。
static void smp_init(void)
{
int i, j;
smp_boot_cpus();
/*
* Create
the slave init tasks as sharing pid 0.
*
* This
should only happen if we have virtual CPU numbers
* higher
than 0.
*/
for (i=1;
i
{
struct task_struct
*n, *p;
j =
cpu_logical_map[i];
/*
* We use
kernel_thread for the idlers which are
* unlocked
tasks running in kernel space.
*/
kernel_thread(cpu_idle,
NULL, CLONE_PID);
//这个其实就是创建线程然后这个线程体现在task[i]上了,因为创建的时候的task_struct就是从task[i]取的。
/*
* Don't
assume linear processor numbering
*/
current_set[j]=task[i];
current_set[j]->processor=j;
cli();
n =
task[i]->next_run;
p =
task[i]->prev_run;
nr_running--;
n->prev_run = p;
p->next_run = n;
task[i]->next_run
= task[i]->prev_run = task[i];
sti();
}
}
上面执行完后就给每个cpu加了一个idle任务。
然后kernel_thread(init, NULL, 0)创建的init任务。
//每个cpu在时间中断时都可能调用这个共同的函数。
/*
 * schedule - the scheduler proper; may be entered by any CPU.
 *
 * Runs bottom halves if pending, requeues an exhausted SCHED_RR task,
 * updates prev's state, then walks the run queue (a doubly linked list
 * anchored at init_task) picking the task with the highest goodness()
 * and switches to it.  The cli()/sti() placement and statement order
 * are significant and preserved exactly from the original.
 *
 * Bug fixed vs. the pasted text: the "Aiee" printk format string had
 * been split across two source lines (invalid C); rejoined.
 */
asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * prev, * next;
	unsigned long timeout = 0;
	int this_cpu = smp_processor_id();

/* check alarm, wake up any interruptible tasks that have got a signal */

	if (intr_count)
		goto scheduling_in_interrupt;

	if (bh_active & bh_mask) {
		intr_count = 1;
		do_bottom_half();
		intr_count = 0;
	}

	run_task_queue(&tq_scheduler);

	need_resched = 0;
	prev = current;
	cli();
	/* move an exhausted RR process to be last.. */
	if (!prev->counter && prev->policy == SCHED_RR) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);
	}
	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (prev->signal & ~prev->blocked)
				goto makerunnable;
			timeout = prev->timeout;
			if (timeout && (timeout <= jiffies)) {
				prev->timeout = 0;
				timeout = 0;
		makerunnable:
				prev->state = TASK_RUNNING;
				break;
			}
		default:
			del_from_runqueue(prev);
		case TASK_RUNNING:
	}
	/* First candidate: head of the run queue. */
	p = init_task.next_run;
	sti();

#ifdef __SMP__
	/*
	 * This is safe as we do not permit re-entry of schedule()
	 */
	prev->processor = NO_PROC_ID;
#define idle_task (task[cpu_number_map[this_cpu]])
#else
#define idle_task (&init_task)
#endif

/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
/* this is the scheduler proper: */
	c = -1000;
	next = idle_task;
	/* Walk every runnable task (reaching init_task again ends the
	 * walk) and remember the one with the best goodness(). */
	while (p != &init_task) {
		int weight = goodness(p, prev, this_cpu);
		if (weight > c)
			c = weight, next = p;
		p = p->next_run;
	}

	/* if all runnable processes have "counter == 0", re-calculate counters */
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
#ifdef __SMP__
	/*
	 * Allocate process to CPU: mark the chosen task as now owned
	 * by this CPU.
	 */
	next->processor = this_cpu;
	next->last_processor = this_cpu;
#endif
#ifdef __SMP_PROF__
	/* mark processor running an idle thread */
	if (0 == next->pid)
		set_bit(this_cpu, &smp_idle_map);
	else
		clear_bit(this_cpu, &smp_idle_map);
#endif
	if (prev != next) {
		struct timer_list timer;

		kstat.context_swtch++;
		if (timeout) {
			init_timer(&timer);
			timer.expires = timeout;
			timer.data = (unsigned long) prev;
			timer.function = process_timeout;
			add_timer(&timer);
		}
		get_mmu_context(next);
		switch_to(prev, next);
		if (timeout)
			del_timer(&timer);
	}
	return;

scheduling_in_interrupt:
	printk("Aiee: scheduling in interrupt %p\n",
		__builtin_return_address(0));
}
上面需要注意的是current变量,在单核中肯定就是一个变量,在多核中肯定是各个cpu有自己的current:
其定义如下:
#define current (0+current_set[smp_processor_id()])
在smp中current是current_set数组中的一个元素,是指具体一个cpu的当前进程。
从上面可以看出一个cpu是从全局task找一个task来运行,每个cpu有一个idle_task,这个task的编号是固定的。
所有的task可以通过init_task来找到,因为创建新进程(内核线程)的时候,会将新建的挂到链表上。
而init_task是静态挂在这上面的。
附上task_struct:
/*
 * task_struct - per-process (and per-kernel-thread) descriptor,
 * Linux 2.0 era.
 *
 * The leading fields are at fixed offsets relied upon elsewhere (per
 * the "hardcoded - don't touch" comment below), so field order must
 * not change.  Under __SMP__ the trailing fields record which CPU the
 * task is running on.
 *
 * NOTE(review): the pasted text is line-wrap damaged (declarations
 * split across lines); the tokens are kept exactly as found.
 */
struct task_struct {
/* these are hardcoded - don't touch */
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped
*/
long counter;
long priority;
unsigned long signal;
unsigned long blocked; /* bitmap of masked signals */
unsigned long flags; /* per process flags, defined below */
int errno;
long debugreg[8]; /* Hardware debugging registers */
struct exec_domain
*exec_domain;
/* various fields */
struct linux_binfmt
*binfmt;
/* all tasks are on one doubly linked list (next_task/prev_task);
 * runnable tasks are additionally on the run queue
 * (next_run/prev_run), anchored at init_task */
struct task_struct
*next_task, *prev_task;
struct task_struct
*next_run, *prev_run;
unsigned long
saved_kernel_stack;
unsigned long
kernel_stack_page;
int exit_code,
exit_signal;
/* ??? */
unsigned long
personality;
int dumpable:1;
int did_exec:1;
/* shouldn't this be
pid_t? */
int pid;
int pgrp;
int tty_old_pgrp;
int session;
/* boolean value for
session group leader */
int leader;
int groups[NGROUPS];
/*
* pointers to (original) parent process,
youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->p_pptr->pid)
*/
struct task_struct
*p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
struct wait_queue
*wait_chldexit; /* for wait4() */
unsigned short
uid,euid,suid,fsuid;
unsigned short gid,egid,sgid,fsgid;
unsigned long timeout,
policy, rt_priority;
unsigned long
it_real_value, it_prof_value, it_virt_value;
unsigned long
it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list
real_timer;
long utime, stime,
cutime, cstime, start_time;
/* mm fault and swap info: this can arguably be seen as either
mm-specific or thread-specific */
unsigned long min_flt,
maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
int swappable:1;
unsigned long
swap_address;
unsigned long
old_maj_flt; /* old value of maj_flt */
unsigned long dec_flt; /* page fault count of the last time */
unsigned long swap_cnt; /* number of pages to swap on next pass */
/* limits */
struct rlimit
rlim[RLIM_NLIMITS];
unsigned short used_math;
char comm[16];
/* file system info */
int link_count;
struct tty_struct *tty;
/* NULL if no tty */
/* ipc stuff */
struct sem_undo *semundo;
struct sem_queue
*semsleeping;
/* ldt for this task - used by Wine. If NULL, default_ldt is used */
struct desc_struct *ldt;
/* tss for this task */
struct thread_struct tss;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct
*files;
/* memory management info */
struct mm_struct *mm;
/* signal handlers */
struct signal_struct
*sig;
#ifdef __SMP__
int processor;
int last_processor;
int lock_depth; /* Lock depth. We can context switch in and
out of holding a syscall kernel lock... */
#endif
};
故这个p = init_task.next_run;
p可以获取到所有在就绪状态的task;