linux0.12--6.fork的实现-wangcong02345-ChinaUnix博客

wangcong02345wangcong02345.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

wangcong02345

博客访问： 2179847
博文数量： 438
博客积分： 3871
博客等级：中校
技术积分： 6075
用户组：普通用户
注册时间： 2011-09-10 00:11

个人简介

邮箱: wangcong02345@163.com

文章分类

全部博文（438）

android应用学习（16）
framework学习（91）
Linux开发（75）
linux内核驱动（71）
opengl（2）
写操作系统（63）
音视频（39）
算法（19）
bootloader（26）
others（4）
windows使用（11）
Linux使用（21）
未分配的博文（0）

文章存档

2017年（15）

2016年（119）

2015年（91）

2014年（62）

2013年（56）

2012年（79）

2011年（16）

我的朋友

相关博文

linux0.12--6.fork的实现

分类： LINUX

2016-09-21 17:42:57

1. 在 ./init/main.c 中第一次出现fork调用

static inline _syscall0(int,fork)
/* Excerpt of main(): switch to user mode, then fork; the task in which fork() returns 0 runs init(). */
void main(void)
{
move_to_user_mode(); // before fork() the privilege level is 3 and the running task is task 0
if( !fork() ){ // task 0 creates task 1 by calling fork(); the branch is taken where fork() returns 0
init();
}
}

2. fork函数的原型及实现
在init/main.c中有如下形式-->inline _syscall0(int,fork)

其中_syscall0 是一个宏定义:
在 ./include/unistd.h 中

/*
 * _syscall0(type,name): defines `type name(void)`, a wrapper for a system
 * call that takes no arguments. The call number __NR_##name is placed in
 * eax (input operand "0" shares the register of output operand "=a"),
 * int $0x80 is executed, and the result is read back from eax.
 * A non-negative result is returned cast to `type`; a negative result is
 * negated into errno and the wrapper returns -1.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
if (__res >= 0) \
return (type) __res; \
errno = -__res; \
return -1; \
}

将宏展开后: NR_fork=2

inline int fork(void)
{
long __res;
__asm__ volatile ("int $0x80"
: "=a" (__res)
: "0" (__NR_fork)); -->2
if (__res >= 0) //这是为什么父进程会返回>0
return (int) __res;
errno = -__res;
return -1;
}

汇编的格式看更方便:

inline _syscall0(int,fork)
64c0: 83 ec 10 sub $0x10,%esp
64c3: b8 02 00 00 00 mov $0x2,%eax -->把NR_fork放在eax中
64c8: cd 80 int $0x80 -->作为int 0x80的参数
64ca: 89 44 24 0c mov %eax,0xc(%esp)
64ce: 83 7c 24 0c 00 cmpl $0x0,0xc(%esp)
64d3: 78 06 js 64db <fork+0x1b>
64d5: 8b 44 24 0c mov 0xc(%esp),%eax
64d9: eb 10 jmp 64eb <fork+0x2b>
64db: 8b 44 24 0c mov 0xc(%esp),%eax
64df: f7 d8 neg %eax
64e1: a3 04 1c 04 00 mov %eax,0x41c04
64e6: b8 ff ff ff ff mov $0xffffffff,%eax
64eb: 83 c4 10 add $0x10,%esp
64ee: c3 ret

函数是用NR_fork 来初始化eax (eax=2), 说明fork就是int 0x80的 2号调用,进入中断
3. 在 ./kernel/system_call.s 中找到int 0x80中断的处理函数

.align 4
system_call:
push %ds
push %es
push %fs
pushl %eax # save the orig_eax
pushl %edx
pushl %ecx # push %ebx,%ecx,%edx as parameters
pushl %ebx # to the system call
movl $0x10,%edx # set up ds,es to kernel space
mov %dx,%ds
mov %dx,%es
movl $0x17,%edx # fs points to local data space
mov %dx,%fs
cmpl NR_syscalls,%eax
jae bad_sys_call
call sys_call_table(,%eax,4) -->即调用sys_fork函数
pushl %eax
2:
movl current,%eax
cmpl $0,state(%eax) # state
jne reschedule -->如果不是就绪状态就重新调度
cmpl $0,counter(%eax) # counter
je reschedule -->如果时间片为0,就重新调度
ret_from_sys_call: -->如果是就绪状态并且时间片不为0
//如果是进程0,则直接返回
movl current,%eax
cmpl task,%eax # task[0] cannot have signals
je 3f
cmpw $0x0f,CS(%esp) # was old code segment supervisor ?
jne 3f
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
jne 3f
movl signal(%eax),%ebx
movl blocked(%eax),%ecx
notl %ecx
andl %ebx,%ecx
bsfl %ecx,%ecx
je 3f
btrl %ecx,%ebx
movl %ebx,signal(%eax)
incl %ecx
pushl %ecx
call do_signal
popl %ecx
testl %eax, %eax
jne 2b # see if we need to switch tasks, or do more signals
3: popl %eax
popl %ebx
popl %ecx
popl %edx
addl $4, %esp # skip orig_eax
pop %fs
pop %es
pop %ds
iret

a. 在./include/linux/sched.h 中有 fn_ptr 的定义
typedef int (*fn_ptr) ()
b. 在 ./include/linux/sys.h 中
fn_ptr sys_call_table[] = { sys_setup, sys_exit,sys_fork,...}
c. 所以call 的调用地址就是
_sys_call_table + %eax * 4 即：sys_fork ( 其中eax=2, 每个指针的长度是4，所以用_sys_call_table的首地址加 2*4, 就是 _sys_call_table 数组中下标为2的元素（即第三个元素）的地址，该元素就是sys_fork )
call _sys_call_table(,%eax,4) 即：call _sys_fork
4. sys_fork --> 在./kernel/system_call.s中

# _sys_fork: handler reached through sys_call_table for the fork system call.
# _find_empty_process leaves either a free task[] slot number or a negative
# error code in eax; the slot number is passed on as the first argument (nr)
# of _copy_process, whose return value is left in eax for the caller.
.align 2
_sys_fork:
call _find_empty_process
testl %eax,%eax # sets SF when eax is negative
js 1f # SF set: no free slot was found, return the error code unchanged
push %gs
pushl %esi
pushl %edi
pushl %ebp
pushl %eax # copy_process arguments, pushed last-to-first: gs, esi, edi, ebp, nr (the task slot, not last_pid)
call _copy_process
add $20,%esp # discard the five 4-byte arguments pushed above
1: ret

可见fork又主要是调用了两个函数:
find_empty_process 和 copy_process ;它们都在kernel/fork.c中实现
说明：gcc编译器在生成汇编代码时，会在C的函数名及变量名前加上下划线_，所以在汇编中调用C的函数或变量的时候，需要手动加上一个下划线。
4.1 find_empty_process -->在kernel/fork.c中
为新进程分配pid及未使用的task[i]结构体

144 int find_empty_process(void)
145 {
146 int i;
147
148 repeat:
//last_pid是一个有符号的long,当last_pid越界后变为负数，把last_pid置1
149 if ((++last_pid)<0) last_pid=1;
//有了pid号之后，在64个task中一一搜索，检查这个pid号是否被占用
150 for(i=0 ; i<NR_TASKS ; i++)
151 if (task[i] && ((task[i]->pid == last_pid) ||
152 (task[i]->pgrp == last_pid)))
153 goto repeat;
//在64个task中选出一个还未使用的项，返回它的下标
154 for(i=1 ; i<NR_TASKS ; i++)
155 if (!task[i])
156 return i; -->若没出错则返回正值，并保存在eax中sys_fork会判断这个值
157 return -EAGAIN; -->若出错则返回负值，并保存在eax中sys_fork会判断这个值
158 }

4.2 copy_process --> 在kernel/fork.c中

/*
 * Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 *
 * nr is the task[] slot to fill. The other parameters are stored into
 * the new task's TSS below, except none and orig_eax, which are not
 * referenced in the body.
 *
 * Returns last_pid (also stored as the new task's pid) on success, or
 * -EAGAIN when get_free_page() yields no page or copy_mem() fails.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
long ebx,long ecx,long edx, long orig_eax,
long fs,long es,long ds,
long eip,long cs,long eflags,long esp,long ss)
{
struct task_struct *p;
int i;
struct file *f;
// First allocate one free page to hold the new task_struct.
p = (struct task_struct *) get_free_page();
if (!p)
return -EAGAIN;
task[nr] = p; // on the first fork, nr = 1
//*p = *current; /* this doesn't copy the supervisor stack */
// Start from a copy of the parent's task_struct, then override the per-task fields.
memcpy(p, current, sizeof(struct task_struct));
// Not TASK_RUNNING until the very end of this function (see the last assignment to state).
p->state = TASK_UNINTERRUPTIBLE;
p->pid = last_pid;
p->counter = p->priority;
p->signal = 0;
p->alarm = 0;
p->leader = 0; /* process leadership doesn't inherit */
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies;
p->tss.back_link = 0;
// esp0 points just past the end of the page that holds this task_struct.
p->tss.esp0 = PAGE_SIZE + (long) p;
p->tss.ss0 = 0x10;
p->tss.eip = eip;
p->tss.eflags = eflags;
p->tss.eax = 0; // this is why fork() returns 0 in the child
p->tss.ecx = ecx;
p->tss.edx = edx;
p->tss.ebx = ebx;
p->tss.esp = esp;
p->tss.ebp = ebp;
p->tss.esi = esi;
p->tss.edi = edi;
// Segment values arrive as longs; only the low 16 bits are kept.
p->tss.es = es & 0xffff;
p->tss.cs = cs & 0xffff;
p->tss.ss = ss & 0xffff;
p->tss.ds = ds & 0xffff;
p->tss.fs = fs & 0xffff;
p->tss.gs = gs & 0xffff;
p->tss.ldt = _LDT(nr);
p->tss.trace_bitmap = 0x80000000;
// If the parent was the last task to use the math unit, save its state into the child's TSS.
if (last_task_used_math == current)
__asm__("clts ; fnsave %0 ; frstor %0"::"m" (p->tss.i387));
// On copy_mem() failure, release the slot and the page taken above.
if (copy_mem(nr,p)) {
task[nr] = NULL;
free_page((long) p);
return -EAGAIN;
}
// Every open file copied from the parent gets one more reference.
for (i=0; i<NR_OPEN;i++)
if (f=p->filp[i])
f->f_count++;
// The pwd/root/executable/library inodes are now shared with the child, so bump their counts.
if (current->pwd)
current->pwd->i_count++;
if (current->root)
current->root->i_count++;
if (current->executable)
current->executable->i_count++;
if (current->library)
current->library->i_count++;
// Original note: task 0 has nr=0 and task 1 has nr=1, so for nr=1 these fill gdt[6] (TSS1)
// and gdt[7] (LDT1) -- assumes FIRST_TSS_ENTRY=4 and FIRST_LDT_ENTRY=5; TODO confirm in sched.h.
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
// Insert p at the head of current's child list: the previous head (if any) is linked
// through p_osptr/p_ysptr, and current->p_cptr now points at p.
p->p_pptr = current;
p->p_cptr = 0;
p->p_ysptr = 0;
p->p_osptr = current->p_cptr;
if (p->p_osptr)
p->p_osptr->p_ysptr = p;
current->p_cptr = p;
p->state = TASK_RUNNING; /* do this last, just in case */
return last_pid;
}

注:要想知道copy_process中的这一堆参数是怎么来的，需要从头开始捋
a. fork的流程
-->int 0x80中断
-->进入0x80的中断处理函数system_call
  -->system_call中会调用call sys_call_table 即sys_fork
  -->sys_fork会调用_copy_process
b.上述各个阶段传的参数
b.1 int 0x80中断时CPU会自动将ss,esp,eflags,cs,eip压栈 -->5个
  b.2 system_call中给copy_process传ds,es,fs,orig_eax,edx,ecx,ebx -->7个
  b.3 system_call中call sys_call_table(,%eax,4)把下一条指令的地址(返回地址)压栈-->none
  b.4 sys_fork中给copy_process传gs,esi,edi,ebp,eax=nr(find_empty_process返回的task[]下标，不是last_pid) -->5个

int copy_mem(int nr,struct task_struct * p)
{
unsigned long old_data_base,new_data_base,data_limit;
unsigned long old_code_base,new_code_base,code_limit;
//取进程0的ldt中代码段与数据段的段限长与段基，在进程0的段基地址上加64M,就是进程1的段基，段界限不变都是640K
code_limit=get_limit(0x0f); -->进程0的代码段界限
data_limit=get_limit(0x17); -->进程0的数据段界限
old_code_base = get_base(current->ldt[1]); -->进程0的代码段基地址
old_data_base = get_base(current->ldt[2]); -->进程0的数据段基地址
if (old_data_base != old_code_base)
panic("We don't support separate I&D");
if (data_limit < code_limit)
panic("Bad data_limit");
new_data_base = new_code_base = nr * TASK_SIZE; -->进程1的LDT的代码段与数据段基地址=进程0的基地址+64M
p->start_code = new_code_base;
set_base(p->ldt[1],new_code_base); -->设置进程1的LDT的代码段与数据段基地址
set_base(p->ldt[2],new_data_base);
//
if (copy_page_tables(old_data_base,new_data_base,data_limit)) {
free_page_tables(new_data_base,data_limit);
return -ENOMEM;
}
return 0;
}

//这个函数只会被fork调用
int copy_page_tables(unsigned long from,unsigned long to,long size)
{
unsigned long * from_page_table;
unsigned long * to_page_table;
unsigned long this_page;
unsigned long * from_dir, * to_dir;
unsigned long new_page;
unsigned long nr;
//检查边界是不是4M对齐，from和to都是进程的基地址，进程的基地址都是以64M为单位的加减的。
if ((from&0x3fffff) || (to&0x3fffff))
panic("copy_page_tables called with wrong alignment");
//取from与to的页目录表项地址，及size占了几个页目录表项
from_dir = (unsigned long *) ((from>>20) & 0xffc);
to_dir = (unsigned long *) ((to>>20) & 0xffc);
size = ((unsigned) (size+0x3fffff)) >> 22; -->一个页目录表项可以映射4M内存，这个size是以4M为单位
for( ; size-->0 ; from_dir++,to_dir++) {
//检查P位，若dst的P位存在，说明目的页目录项己被占用，出错(panic); 若src的P位不存在，说明这一项没有对应的页表，不需要复制，直接跳过(continue)。
if (1 & *to_dir)
panic("copy_page_tables: already exist");
if (!(1 & *from_dir))
continue;
from_page_table = (unsigned long *) (0xfffff000 & *from_dir); //取页目录表所映射的页表地址
if (!(to_page_table = (unsigned long *) get_free_page())) //为dst的页目录表要映射的页表分配内存
return -1; /* Out of memory, see freeing */
*to_dir = ((unsigned long) to_page_table) | 7; //设置页目录表项的存在标志
nr = (from==0)?0xA0:1024;
for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
this_page = *from_page_table;
if (!this_page)
continue;
if (!(1 & this_page)) {
if (!(new_page = get_free_page()))
return -1;
read_swap_page(this_page>>1, (char *) new_page);
*to_page_table = this_page;
*from_page_table = new_page | (PAGE_DIRTY | 7);
continue;
}
this_page &= ~2;
*to_page_table = this_page;
if (this_page > LOW_MEM) {
*from_page_table = this_page;
this_page -= LOW_MEM;
this_page >>= 12;
mem_map[this_page]++;
}
}
}
invalidate();
return 0;
}

4.3 set_ldt_desc --> 在include/asm/system.h中

在gdt[n+常数]处嵌入ldt,将参数addr填到基地址中

#define _set_tssldt_desc(n,addr,type) -->下面以type="0x82"(即set_ldt_desc)为例展开
__asm__ ("movw $104,%1" -->BYTE[0-1]:TSS与LDT段限长=104
"movw %%ax,%2" -->BYTE[2-3]:段基址[0-15]
"rorl $16,%%eax" -->将eax的高16位与低16位交换
"movb %%al,%3" -->BYTE[4]: al是段基址的[16-23]
"movb $0x82,%4" -->BYTE[5]: P=1 DPL=0 S=0(系统段) TYPE=2表示LDT描述符; set_tss_desc时这里是0x89, TYPE=9表示可用的386 TSS描述符
"movb $0x00,%5" -->BYTE[6]: 置0 G=0,DB=0,AVL=0
"movb %%ah,%6" -->BYTE[7]: ah是段基址的[24-31]
"rorl $16,%%eax" -->将eax的高16位与低16位再交换一次，即把eax恢复成原值
::"a" (addr), "m" (*(n)), "m" (*(n+2)), "m" (*(n+4)),
"m" (*(n+5)), "m" (*(n+6)), "m" (*(n+7))
)

二. 几个问题
2.1 int 0x80与system_call是如何关联起来的？--> 在kernel/sched.c中
sched_init
--> set_system_gate(0x80,&system_call);
-->_set_gate(&idt[n],15,3,addr) //type=15=0xF=1111B说明是一个陷阱门

#define _set_gate(gate_addr,type,dpl,addr) \
__asm__ ("movw %%dx,%%ax\n\t" \ -->edx是addr的全部地址，这儿只用了低16位
"movw %0,%%dx\n\t" \
"movl %%eax,%1\n\t" \ -->eax的低16位是addr的低16位，eax的高16位是选择子
"movl %%edx,%2" \ -->edx的低16位是属性，edx的高16位是addr的高16位
: \ -->movl一次移动4个字节，这样既填充了属性又填充了addr
: "i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ -->0x8000-->P=1 dpl偏移是13,type偏移是8
"o" (*((char *) (gate_addr))), \
"o" (*(4+(char *) (gate_addr))), \
"d" ((char *) (addr)),"a" (0x00080000))

陷阱门 -->不会关中断-->可以嵌套
中断门 -->会关中断 -->不可以嵌套

2.2 fork后父子进程的返回值为什么不一样？
fork是在c语言中调用的，c语言中的返回值都是在eax中，
父进程: 返回值是int 0x80调用的返回值，system_call-->sys_fork-->copy_process的返回值，即last_pid也就是子进程的pid.
子进程:在copy_process中设置p->tss.eax=0,当任务调度执行到子进程时，eax会被tss中的eax覆盖，则fork后返回0.

附录1. test影响SF位

mov eax, 1
test eax, eax
mov eax, -1
test eax, eax

下面是bochs的调试结果:

Next at t=156838332
(0) [0x000000001718] 000f:00001718 (unk. ctxt): mov eax, 0x00000001 ; b801000000
<bochs:8> n
Next at t=156838333
(0) [0x00000000171d] 000f:0000171d (unk. ctxt): test eax, eax ; 85c0
<bochs:9> r
eflags 0x00000286: id vip vif ac vm rf nt IOPL=0 of df IF tf SF zf af PF cf -->原先SF=1是置位的
<bochs:10> n
Next at t=156838334
(0) [0x00000000171f] 000f:0000171f (unk. ctxt): mov eax, 0xffffffff ; b8ffffffff
<bochs:11> r
eflags 0x00000202: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af pf cf -->eax=1,执行test后，sf=0
<bochs:12> n
Next at t=156838335
(0) [0x000000001724] 000f:00001724 (unk. ctxt): test eax, eax ; 85c0
<bochs:13> r
eflags 0x00000202: id vip vif ac vm rf nt IOPL=0 of df IF tf sf zf af pf cf
<bochs:14> n
Next at t=156838336
(0) [0x000000001726] 000f:00001726 (unk. ctxt): mov al, 0x46 ; b046
<bochs:15> r
eflags 0x00000286: id vip vif ac vm rf nt IOPL=0 of df IF tf SF zf af PF cf -->eax=-1,执行test后，SF=1

阅读(892) | 评论(0) | 转发(0) |

上一篇：linux使用---18.slickedit2013的安装及使用

下一篇：linux0.12--7.进程0的创建与执行

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6