1. 在 ./init/main.c 中第一次出现fork调用
-
static inline _syscall0(int,fork)
-
void main(void)
-
{
-
move_to_user_mode(); //fork之前,进程的特权级是3,且当前执行的是进程0
-
if( !fork() ){ //由进程0去创建进程1 -->调用fork完成这个操作
-
init();
-
}
-
}
2. fork函数的原型及实现
在init/main.c中有如下形式-->inline _syscall0(int,fork)
其中_syscall0 是一个宏定义:
在 ./include/unistd.h 中
-
#define _syscall0(type,name) \
-
type name(void) \
-
{ \
-
long __res; \
-
__asm__ volatile ("int $0x80" \
-
: "=a" (__res) \
-
: "0" (__NR_##name)); \
-
if (__res >= 0) \
-
return (type) __res; \
-
errno = -__res; \
-
return -1; \
-
}
|
将宏展开后(其中 __NR_fork=2):
-
inline int fork(void)
-
{
-
long __res;
-
__asm__ volatile ("int $0x80"
-
: "=a" (__res)
-
: "0" (__NR_fork)); -->2
-
if (__res >= 0) //这是为什么父进程会返回>0
-
return (int) __res;
-
errno = -__res;
-
return -1;
-
}
|
汇编的格式看更方便:
-
inline _syscall0(int,fork)
-
64c0: 83 ec 10 sub $0x10,%esp
-
64c3: b8 02 00 00 00 mov $0x2,%eax -->把NR_fork放在eax中
-
64c8: cd 80 int $0x80 -->作为int 0x80的参数
-
64ca: 89 44 24 0c mov %eax,0xc(%esp)
-
64ce: 83 7c 24 0c 00 cmpl $0x0,0xc(%esp)
-
64d3: 78 06 js 64db <fork+0x1b>
-
64d5: 8b 44 24 0c mov 0xc(%esp),%eax
-
64d9: eb 10 jmp 64eb <fork+0x2b>
-
64db: 8b 44 24 0c mov 0xc(%esp),%eax
-
64df: f7 d8 neg %eax
-
64e1: a3 04 1c 04 00 mov %eax,0x41c04
-
64e6: b8 ff ff ff ff mov $0xffffffff,%eax
-
64eb: 83 c4 10 add $0x10,%esp
-
64ee: c3 ret
|
函数是用NR_fork 来初始化eax (eax=2), 说明fork就是int 0x80的 2号调用,进入中断
3. 在 ./kernel/system_call.s 中找到int 0x80中断的处理函数
-
.align 4
-
system_call:
-
push %ds
-
push %es
-
push %fs
-
pushl %eax # save the orig_eax
-
pushl %edx
-
pushl %ecx # push %ebx,%ecx,%edx as parameters
-
pushl %ebx # to the system call
-
movl $0x10,%edx # set up ds,es to kernel space
-
mov %dx,%ds
-
mov %dx,%es
-
movl $0x17,%edx # fs points to local data space
-
mov %dx,%fs
-
cmpl NR_syscalls,%eax
-
jae bad_sys_call
-
call sys_call_table(,%eax,4) -->即调用sys_fork函数
-
pushl %eax
-
2:
-
movl current,%eax
-
cmpl $0,state(%eax) # state
-
jne reschedule -->如果不是就绪状态就重新调度
-
cmpl $0,counter(%eax) # counter
-
je reschedule -->如果时间片为0,就重新调度
-
ret_from_sys_call: -->如果是就绪状态并且时间片不为0
-
//如果是进程0,则直接返回
-
movl current,%eax
-
cmpl task,%eax # task[0] cannot have signals
-
je 3f
-
cmpw $0x0f,CS(%esp) # was old code segment supervisor ?
-
jne 3f
-
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
-
jne 3f
-
movl signal(%eax),%ebx
-
movl blocked(%eax),%ecx
-
notl %ecx
-
andl %ebx,%ecx
-
bsfl %ecx,%ecx
-
je 3f
-
btrl %ecx,%ebx
-
movl %ebx,signal(%eax)
-
incl %ecx
-
pushl %ecx
-
call do_signal
-
popl %ecx
-
testl %eax, %eax
-
jne 2b # see if we need to switch tasks, or do more signals
-
3: popl %eax
-
popl %ebx
-
popl %ecx
-
popl %edx
-
addl $4, %esp # skip orig_eax
-
pop %fs
-
pop %es
-
pop %ds
-
iret
a. 在./include/linux/sched.h 中有 fn_ptr 的定义
typedef int (*fn_ptr) ()
b. 在 ./include/linux/sys.h 中
fn_ptr sys_call_table[] = { sys_setup, sys_exit,sys_fork,...}
c. 所以call 的调用地址就是
_sys_call_table + %eax * 4 即:sys_fork ( 其中eax=2, 每个指针的长度是4, 所以用_sys_call_table的首地址加 2*4, 就是 _sys_call_table 数组中下标为2的元素(即第3个元素)的地址,即sys_fork )
call _sys_call_table(,%eax,4) 即:call _sys_fork
4. sys_fork --> 在./kernel/system_call.s中
-
.align 2
-
_sys_fork:
-
call _find_empty_process
-
testl %eax,%eax //如果eax为负则test后SF=1置位
-
js 1f //如果符号位置位(SF:负数=1 非负=0)则说明出错跳转
-
push %gs
-
pushl %esi
-
pushl %edi
-
pushl %ebp
-
pushl %eax //依次压入gs,esi,edi,ebp,eax这5个参数传给copy_process(eax=find_empty_process返回的空闲任务号nr,不是last_pid);最后压栈的eax对应第一个参数nr,注意顺序
-
call _copy_process
-
add $20,%esp
-
1: ret
可见fork又主要是调用了两个函数:
find_empty_process 和 copy_process ;它们都在kernel/fork.c中实现
说明:gcc编译器在生成汇编代码时,其函数名及变量名前都会加_,所以在汇编中调用C的函数或变量的时候,需要手动加上一个下划线。
4.1 find_empty_process -->在kernel/fork.c中
为新进程分配pid及未使用的task[i]结构体
-
144 int find_empty_process(void)
-
145 {
-
146 int i;
-
147
-
148 repeat:
-
//last_pid是一个有符号的long,当last_pid越界后变为负数,把last_pid置1
-
149 if ((++last_pid)<0) last_pid=1;
-
//有了pid号之后,在64个task中一一搜索,检查这个pid号是否被占用
-
150 for(i=0 ; i<NR_TASKS ; i++)
-
151 if (task[i] && ((task[i]->pid == last_pid) ||
-
152 (task[i]->pgrp == last_pid)))
-
153 goto repeat;
-
//在64个task中选出还没有使用的一项返回(从task[1]开始找,task[0]固定给进程0)
-
154 for(i=1 ; i<NR_TASKS ; i++)
-
155 if (!task[i])
-
156 return i; -->若没出错则返回正值,并保存在eax中sys_fork会判断这个值
-
157 return -EAGAIN; -->若出错则返回负值,并保存在eax中sys_fork会判断这个值
-
158 }
4.2 copy_process --> 在kernel/fork.c中
-
/*
-
* Ok, this is the main fork-routine. It copies the system process
-
* information (task[nr]) and sets up the necessary registers. It
-
* also copies the data segment in it's entirety.
-
*/
-
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
-
long ebx,long ecx,long edx, long orig_eax,
-
long fs,long es,long ds,
-
long eip,long cs,long eflags,long esp,long ss)
-
{
-
struct task_struct *p;
-
int i;
-
struct file *f;
-
//首先为task_struct分配空间
-
p = (struct task_struct *) get_free_page();
-
if (!p)
-
return -EAGAIN;
-
task[nr] = p; //第1次fork时,nr=1
-
//*p = *current; /* this doesn't copy the supervisor stack */
-
memcpy(p, current, sizeof(struct task_struct));
-
p->state = TASK_UNINTERRUPTIBLE;
-
p->pid = last_pid;
-
p->counter = p->priority;
-
p->signal = 0;
-
p->alarm = 0;
-
p->leader = 0; /* process leadership doesn't inherit */
-
p->utime = p->stime = 0;
-
p->cutime = p->cstime = 0;
-
p->start_time = jiffies;
-
p->tss.back_link = 0;
-
p->tss.esp0 = PAGE_SIZE + (long) p;
-
p->tss.ss0 = 0x10;
-
p->tss.eip = eip;
-
p->tss.eflags = eflags;
-
p->tss.eax = 0; //这是为什么子进程会返回0
-
p->tss.ecx = ecx;
-
p->tss.edx = edx;
-
p->tss.ebx = ebx;
-
p->tss.esp = esp;
-
p->tss.ebp = ebp;
-
p->tss.esi = esi;
-
p->tss.edi = edi;
-
p->tss.es = es & 0xffff;
-
p->tss.cs = cs & 0xffff;
-
p->tss.ss = ss & 0xffff;
-
p->tss.ds = ds & 0xffff;
-
p->tss.fs = fs & 0xffff;
-
p->tss.gs = gs & 0xffff;
-
p->tss.ldt = _LDT(nr);
-
p->tss.trace_bitmap = 0x80000000;
-
if (last_task_used_math == current)
-
__asm__("clts ; fnsave %0 ; frstor %0"::"m" (p->tss.i387));
-
if (copy_mem(nr,p)) {
-
task[nr] = NULL;
-
free_page((long) p);
-
return -EAGAIN;
-
}
-
for (i=0; i<NR_OPEN;i++)
-
if (f=p->filp[i])
-
f->f_count++;
-
if (current->pwd)
-
current->pwd->i_count++;
-
if (current->root)
-
current->root->i_count++;
-
if (current->executable)
-
current->executable->i_count++;
-
if (current->library)
-
current->library->i_count++;
-
//进程0的nr=0,进程1的nr=1,所以这儿设置的是gdt[6]=tss1, gdt[7]=ldt1
-
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
-
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
-
p->p_pptr = current;
-
p->p_cptr = 0;
-
p->p_ysptr = 0;
-
p->p_osptr = current->p_cptr;
-
if (p->p_osptr)
-
p->p_osptr->p_ysptr = p;
-
current->p_cptr = p;
-
p->state = TASK_RUNNING; /* do this last, just in case */
-
return last_pid;
-
}
注:要想知道copy_process中的这一堆参数是怎么来的,需要从头开始捋
a. fork的流程
-->int 0x80中断
-->进入0x80的中断处理函数system_call
-->system_call中会调用call sys_call_table 即sys_fork
-->sys_fork会调用_copy_process
b.上述各个阶段传的参数
b.1 int 0x80中断时CPU会自动将ss,esp,eflags,cs,eip压栈 -->5个
b.2 system_call中给copy_process传ds,es,fs,orig_eax,edx,ecx,ebx -->7个
b.3 system_call中call sys_call_table(,%eax,4)把下一条指令的eip(返回地址)压栈-->none
b.4 sys_fork中给copy_process传gs,esi,edi,ebp,eax=nr(find_empty_process返回的空闲任务号) -->5个
-
int copy_mem(int nr,struct task_struct * p)
-
{
-
unsigned long old_data_base,new_data_base,data_limit;
-
unsigned long old_code_base,new_code_base,code_limit;
-
//取进程0的ldt中代码段与数据段的段限长与段基址,在进程0的段基址上加64M,就是进程1的段基址,段限长不变都是640K
-
code_limit=get_limit(0x0f); -->进程0的代码段界限
-
data_limit=get_limit(0x17); -->进程0的数据段界限
-
old_code_base = get_base(current->ldt[1]); -->进程0的代码段基地址
-
old_data_base = get_base(current->ldt[2]); -->进程0的数据段基地址
-
if (old_data_base != old_code_base)
-
panic("We don't support separate I&D");
-
if (data_limit < code_limit)
-
panic("Bad data_limit");
-
new_data_base = new_code_base = nr * TASK_SIZE; -->进程1的LDT的代码段与数据段基地址=进程0的基地址+64M
-
p->start_code = new_code_base;
-
set_base(p->ldt[1],new_code_base); -->设置进程1的LDT的代码段与数据段基地址
-
set_base(p->ldt[2],new_data_base);
-
//
-
if (copy_page_tables(old_data_base,new_data_base,data_limit)) {
-
free_page_tables(new_data_base,data_limit);
-
return -ENOMEM;
-
}
-
return 0;
-
}
-
//这个函数只会被fork调用
-
int copy_page_tables(unsigned long from,unsigned long to,long size)
-
{
-
unsigned long * from_page_table;
-
unsigned long * to_page_table;
-
unsigned long this_page;
-
unsigned long * from_dir, * to_dir;
-
unsigned long new_page;
-
unsigned long nr;
-
//检查边界是不是4M对齐,from和to都是进程的基地址,进程的基地址都是以64M为单位的加减的。
-
if ((from&0x3fffff) || (to&0x3fffff))
-
panic("copy_page_tables called with wrong alignment");
-
//取from与to的页目录表项地址,及size占了几个页目录表项
-
from_dir = (unsigned long *) ((from>>20) & 0xffc);
-
to_dir = (unsigned long *) ((to>>20) & 0xffc);
-
size = ((unsigned) (size+0x3fffff)) >> 22; -->一个页目录表项可以映射4M内存,这个size是以4M为单位
-
for( ; size-->0 ; from_dir++,to_dir++) {
-
//检查P位: 若dst的P位已置位,说明目的页目录项已被占用,出错(panic); 若src的P位未置位,说明该页目录项没有映射页表,无需复制,直接跳过(continue)。
-
if (1 & *to_dir)
-
panic("copy_page_tables: already exist");
-
if (!(1 & *from_dir))
-
continue;
-
from_page_table = (unsigned long *) (0xfffff000 & *from_dir); //取页目录表所映射的页表地址
-
if (!(to_page_table = (unsigned long *) get_free_page())) //为dst的页目录表要映射的页表分配内存
-
return -1; /* Out of memory, see freeing */
-
*to_dir = ((unsigned long) to_page_table) | 7; //设置页目录表项的存在标志
-
nr = (from==0)?0xA0:1024;
-
for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
-
this_page = *from_page_table;
-
if (!this_page)
-
continue;
-
if (!(1 & this_page)) {
-
if (!(new_page = get_free_page()))
-
return -1;
-
read_swap_page(this_page>>1, (char *) new_page);
-
*to_page_table = this_page;
-
*from_page_table = new_page | (PAGE_DIRTY | 7);
-
continue;
-
}
-
this_page &= ~2;
-
*to_page_table = this_page;
-
if (this_page > LOW_MEM) {
-
*from_page_table = this_page;
-
this_page -= LOW_MEM;
-
this_page >>= 12;
-
mem_map[this_page]++;
-
}
-
}
-
}
-
invalidate();
-
return 0;
-
}
4.3 set_ldt_desc --> 在include/asm/system.h中
在gdt[n+常数]处嵌入ldt,将参数addr填到基地址中
-
#define _set_tssldt_desc(n,addr,0x82)
-
__asm__ ("movw $104,%1" -->BYTE[0-1]:TSS与LDT段限长=104
-
"movw %%ax,%2" -->BYTE[2-3]:段基址[0-15]
-
"rorl $16,%%eax" -->将eax的高16位与低16位交换
-
"movb %%al,%3" -->BYTE[4]: al是段基址的[16-23]
-
"movb $0x82,%4" -->BYTE[5]: P=1 DPL=0 S=0(系统段) TYPE=2(LDT); 设置TSS时此处为0x89, TYPE=9(可用的386 TSS)
-
"movb $0x00,%5" -->BYTE[6]: 置0 G=0,DB=0,AVL=0
-
"movb %%ah,%6" -->BYTE[7]: ah是段基址的[24-31]
-
"rorl $16,%%eax" -->将eax的高16位与低16位再交换一次,即把eax恢复成原值
-
::"a" (addr), "m" (*(n)), "m" (*(n+2)), "m" (*(n+4)),
-
"m" (*(n+5)), "m" (*(n+6)), "m" (*(n+7))
-
)
二. 几个问题
2.1
int 0x80与system_call是如何关联起来的?--> 在kernel/sched.c中
sched_init
--> set_system_gate(0x80,&system_call);
-->_set_gate(&idt[n],15,3,addr) //type=15=0xF=1111B说明是一个陷阱门
-
#define _set_gate(gate_addr,type,dpl,addr) \
-
__asm__ ("movw %%dx,%%ax\n\t" \ -->edx是addr的全部地址,这儿只用了低16位
-
"movw %0,%%dx\n\t" \
-
"movl %%eax,%1\n\t" \ -->eax的低16位是addr的低16位,eax的高16位是选择子
-
"movl %%edx,%2" \ -->edx的低16位是属性,edx的高16位是addr的高16位
-
: \ -->movl一次移动4个字节,这样既填充了属性又填充了addr
-
: "i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ -->0x8000-->P=1 dpl偏移是13,type偏移是8
-
"o" (*((char *) (gate_addr))), \
-
"o" (*(4+(char *) (gate_addr))), \
-
"d" ((char *) (addr)),"a" (0x00080000))
陷阱门 -->不会关中断-->可以嵌套
中断门 -->会关中断 -->不可以嵌套
2.2 fork后父子进程的返回值为什么不一样?
fork是在c语言中调用的,c语言中的返回值都是在eax中,
父进程: 返回值是int 0x80调用的返回值,system_call-->sys_fork-->copy_process的返回值,即last_pid也就是子进程的pid.
子进程:在copy_process中设置p->tss.eax=0,当任务调度执行到子进程时,eax会被tss中的eax覆盖,则fork后返回0.
附录1. test影响SF位
-
mov eax, 1
-
test eax, eax
-
-
mov eax, -1
-
test eax, eax
下面是bochs的调试结果:
-
Next at t=156838332
-
(0) [0x000000001718] 000f:00001718 (unk. ctxt): mov eax, 0x00000001 ; b801000000
-
<bochs:8> n
-
Next at t=156838333
-
(0) [0x00000000171d] 000f:0000171d (unk. ctxt): test eax, eax ; 85c0
-
<bochs:9> r
-
eflags 0x00000286: id vip vif ac vm rf nt IOPL=0 of df IF tf SF zf af PF cf -->原先SF=1是置位的
-
<bochs:10> n
-
Next at t=156838334
-
(0) [0x00000000171f] 000f:0000171f (unk. ctxt): mov eax, 0xffffffff ; b8ffffffff
-
<bochs:11> r
-
eflags 0x00000202: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af pf cf -->eax=1,执行test后,sf=0
-
<bochs:12> n
-
Next at t=156838335
-
(0) [0x000000001724] 000f:00001724 (unk. ctxt): test eax, eax ; 85c0
-
<bochs:13> r
-
eflags 0x00000202: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af pf cf
-
<bochs:14> n
-
Next at t=156838336
-
(0) [0x000000001726] 000f:00001726 (unk. ctxt): mov al, 0x46 ; b046
-
<bochs:15> r
-
eflags 0x00000286: id vip vif ac vm rf nt IOPL=0 of df IF tf SF zf af PF cf -->eax=-1,执行test后,SF=1