出处:http://ericxiao.cublog.cn/
------------------------------------------
在linux内核,用linux_binfmt结构来表示每一个加载模块.它的定义如下:
/*
 * Describes one binary-format loader. Handlers of this type are linked
 * together through the lh member.
 */
struct linux_binfmt {
// list linkage used to chain the handlers together
struct list_head lh;
// module this handler belongs to
struct module *module;
// loads an executable file
int (*load_binary)(struct linux_binprm *, struct pt_regs * regs);
// loads a shared library
int (*load_shlib)(struct file *);
// NOTE(review): presumably writes a core dump for this format - confirm
int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
unsigned long min_coredump; /* minimal dump size */
int hasvdso;
}
结构中的lh将之组成一个链表,这个链表的表头是formats.
为了说明,我们来看一下如何注册一个可执行文件的加载模块.
/*
 * register_binfmt - add a binary-format handler to the formats list.
 * @fmt: handler to register.
 *
 * Returns -EINVAL when @fmt is NULL; otherwise links the handler onto
 * the formats list while holding binfmt_lock for writing and returns 0.
 */
int register_binfmt(struct linux_binfmt * fmt)
{
	int status = -EINVAL;

	if (fmt) {
		write_lock(&binfmt_lock);
		/* link the new handler onto the list */
		list_add(&fmt->lh, &formats);
		write_unlock(&binfmt_lock);
		status = 0;
	}

	return status;
}
所以,在加载可执行文件的时候,只要遍历formats这个链表,然后依次调用每个加载模块的load_binary尝试加载这个可执行文件.这正是search_binary_handler()所做的.代码如下:
/*
 * search_binary_handler - offer the file described by bprm to each
 * registered binary-format handler in turn.
 * @bprm: exec parameters (file, first bytes of the file in buf, mm, ...).
 * @regs: register frame passed through to the handler's load_binary.
 *
 * Walks the formats list under binfmt_lock (read side), dropping the lock
 * around each load_binary call. A handler returning >= 0 ends the search:
 * the file reference is dropped, current->did_exec is set and that value
 * is returned. If every handler returns -ENOEXEC and CONFIG_KMOD is set,
 * a "binfmt-%04x" module is requested and the list is scanned once more,
 * unless the first four bytes of the file are all printable.
 *
 * Returns the handler's non-negative result on success, otherwise a
 * negative error (initially -ENOENT, or the last handler's error).
 */
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
int try,retval;
struct linux_binfmt *fmt;
#ifdef __alpha__
/* handle /sbin/loader.. */
{
struct exec * eh = (struct exec *) bprm->buf;
if (!bprm->loader && eh->fh.f_magic == 0x183 &&
(eh->fh.f_flags & 0x3000) == 0x3000)
{
struct file * file;
unsigned long loader;
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
loader = bprm->vma->vm_end - sizeof(void *);
file = open_exec("/sbin/loader");
retval = PTR_ERR(file);
if (IS_ERR(file))
return retval;
/* Remember if the application is TASO. */
bprm->sh_bang = eh->ah.entry < 0x100000000UL;
bprm->file = file;
bprm->loader = loader;
retval = prepare_binprm(bprm);
if (retval<0)
return retval;
/* should call search_binary_handler recursively here,
but it does not matter */
}
}
#endif
retval = security_bprm_check(bprm);
if (retval)
return retval;
/* kernel module loader fixup */
/* so we don't try to load run modprobe in kernel space. */
set_fs(USER_DS);
retval = audit_bprm(bprm);
if (retval)
return retval;
retval = -ENOENT;
// at most two passes: the second pass rescans the list after a module has been requested
for (try=0; try<2; try++) {
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
// this handler's load routine
int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
if (!fn)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
// run the load routine; if it does not succeed, keep walking the list
retval = fn(bprm, regs);
// the handler loaded the file
if (retval >= 0) {
put_binfmt(fmt);
allow_write_access(bprm->file);
if (bprm->file)
fput(bprm->file);
bprm->file = NULL;
current->did_exec = 1;
proc_exec_connector(current);
return retval;
}
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (retval != -ENOEXEC || bprm->mm == NULL)
break;
if (!bprm->file) {
read_unlock(&binfmt_lock);
return retval;
}
}
read_unlock(&binfmt_lock);
// every registered handler failed on this file: load another module and try once more
if (retval != -ENOEXEC || bprm->mm == NULL) {
break;
// CONFIG_KMOD: dynamic module loading is configured
#ifdef CONFIG_KMOD
}else{
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
if (printable(bprm->buf[0]) &&
printable(bprm->buf[1]) &&
printable(bprm->buf[2]) &&
printable(bprm->buf[3]))
break; /* -ENOEXEC */
request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
#endif
}
}
return retval;
}
到这里,我们看到了可执行文件的加载过程,接下来,我们以a.out型的可执行文件的加载过程为例,来看一看linux是怎么处理可执行文件的.
四:a.out文件格式的加载
a.out模块的处理是在fs/binfmt_aout.c中.如下所示:
/*
 * Binary-format handler for a.out files: names the routines used to load
 * an executable, load a shared library and dump core, and sets the
 * minimal core-dump size to one page.
 */
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
.core_dump = aout_core_dump,
.min_coredump = PAGE_SIZE
};
对应的加载接口为load_aout_binary().代码如下:
static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
{
struct exec ex;
unsigned long error;
unsigned long fd_offset;
unsigned long rlim;
int retval;
//文件头信息匹配
ex = *((struct exec *) bprm->buf); /* exec-header */
if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
N_TRSIZE(ex) || N_DRSIZE(ex) ||
i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
return -ENOEXEC;
}
/*
* Requires a mmap handler. This prevents people from using a.out
* as part of an exploit attack against /proc-related vulnerabilities.
*/
//如果文件不支持OPEN,或者MMAP。无效
if (!bprm->file->f_op || !bprm->file->f_op->mmap)
return -ENOEXEC;
//可执行文件正文的起始位置
//每种类型的正文起始位置
fd_offset = N_TXTOFF(ex);
/* Check initial limits. This avoids letting people circumvent
* size limits imposed on them by creating programs with large
* arrays in the data or bss.
*/
//判断data+bss是否超过了限制
rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (ex.a_data + ex.a_bss > rlim)
return -ENOMEM;
/* Flush all traces of the currently running executable */
//已经取得了足够的信息,是跟当前进程脱离的时候了
retval = flush_old_exec(bprm);
if (retval)
return retval;
/* OK, This is the point of no return */
#if defined(__alpha__)
SET_AOUT_PERSONALITY(bprm, ex);
#elif defined(__sparc__)
set_personality(PER_SUNOS);
#if !defined(__sparc_v9__)
memcpy(¤t->thread.core_exec, &ex, sizeof(struct exec));
#endif
#else
//设置进程的个性标志
set_personality(PER_LINUX);
#endif
//设置进程的代码段的起始与终止位置
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
//设置进程数段段的起始与终止位置
current->mm->end_data = ex.a_data +
(current->mm->start_data = N_DATADDR(ex));
//设置进程BSS区间
current->mm->brk = ex.a_bss +
(current->mm->start_brk = N_BSSADDR(ex));
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
compute_creds(bprm);
//进程已经fork 完成了,不再需要PF_FORKNOEXEC
current->flags &= ~PF_FORKNOEXEC;
#ifdef __sparc__
if (N_MAGIC(ex) == NMAGIC) {
loff_t pos = fd_offset;
/* Fuck me plenty... */
down_write(¤t->mm->mmap_sem);
error = do_brk(N_TXTADDR(ex), ex.a_text);
up_write(¤t->mm->mmap_sem);
bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
ex.a_text, &pos);
down_write(¤t->mm->mmap_sem);
error = do_brk(N_DATADDR(ex), ex.a_data);
up_write(¤t->mm->mmap_sem);
bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
ex.a_data, &pos);
goto beyond_if;
}
#endif
//如果是OMAGIC格式
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
loff_t pos;
text_addr = N_TXTADDR(ex);
#if defined(__alpha__) || defined(__sparc__)
pos = fd_offset;
map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
#else
pos = 32;
map_size = ex.a_text+ex.a_data;
#endif
down_write(¤t->mm->mmap_sem);
//为进程的代码段分配空间
error = do_brk(text_addr & PAGE_MASK, map_size);
up_write(¤t->mm->mmap_sem);
if (error != (text_addr & PAGE_MASK)) {
send_sig(SIGKILL, current, 0);
return error;
}
//读文件数据读入代码段
error = bprm->file->f_op->read(bprm->file,
(char __user *)text_addr,
ex.a_text+ex.a_data, &pos);
if ((signed long)error < 0) {
send_sig(SIGKILL, current, 0);
return error;
}
//x86上为一空函数
flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
} else {
static unsigned long error_time, error_time2;
//数据段,代码段是否页框对齐
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
{
printk(KERN_NOTICE "executable not page aligned\n");
error_time2 = jiffies;
}
//
if ((fd_offset & ~PAGE_MASK) != 0 &&
(jiffies-error_time) > 5*HZ)
{
printk(KERN_WARNING
"fd_offset is not page aligned. Please convert program: %s\n",
bprm->file->f_path.dentry->d_name.name);
error_time = jiffies;
}
if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
//不支持mmap
loff_t pos = fd_offset;
down_write(¤t->mm->mmap_sem);
//分配段空间
do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
up_write(¤t->mm->mmap_sem);
//从文件中读入相关数据
bprm->file->f_op->read(bprm->file,
(char __user *)N_TXTADDR(ex),
ex.a_text+ex.a_data, &pos);
flush_icache_range((unsigned long) N_TXTADDR(ex),
(unsigned long) N_TXTADDR(ex) +
ex.a_text+ex.a_data);
goto beyond_if;
}
//如果支持MMAP。将直接将文件映射到内存即可
down_write(¤t->mm->mmap_sem);
error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset);
up_write(¤t->mm->mmap_sem);
if (error != N_TXTADDR(ex)) {
send_sig(SIGKILL, current, 0);
return error;
}
down_write(¤t->mm->mmap_sem);
error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
fd_offset + ex.a_text);
up_write(¤t->mm->mmap_sem);
if (error != N_DATADDR(ex)) {
send_sig(SIGKILL, current, 0);
return error;
}
}
beyond_if:
//设置进程的binfmt
set_binfmt(&aout_format);
//为BSS段分配空间
retval = set_brk(current->mm->start_brk, current->mm->brk);
if (retval < 0) {
//分配失败,发送SIGKILL信号,杀掉当前进程
send_sig(SIGKILL, current, 0);
return retval;
}
//扩大进程的栈
retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
/* Someone check-me: is this error path enough? */
send_sig(SIGKILL, current, 0);
return retval;
}
//调整栈空间的布局
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
#ifdef __alpha__
regs->gp = ex.a_gpvalue;
#endif
//设置新的EIP与ESP.使其返回到用户空间后,可以开始运行这个程序
start_thread(regs, ex.a_entry, current->mm->start_stack);
if (unlikely(current->ptrace & PT_PTRACED)) {
if (current->ptrace & PT_TRACE_EXEC)
ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
else
send_sig(SIGTRAP, current, 0);
}
return 0;
}
首先判断文件的文件头信息,检查是否属于a.out文件.如果不属于a.out格式,则返回-ENOEXEC出错退出,让其它module继续尝试.
因为execve()系统调用会完全代替进程,因此,在运行该进程之前,先解除父子进程的共享关系,这是由flush_old_exec()完成的.代码如下:
/*
 * flush_old_exec - detach the current task from its previous image.
 * @bprm: exec parameters; bprm->mm becomes the task's mm and
 *        bprm->filename supplies the new comm name.
 *
 * Calls de_thread(), unshare_files() and exec_mmap() in that order, then
 * resets the alternate signal stack fields, the dumpable flag, the task
 * name, the signal handlers and the open files.
 *
 * Returns 0 on success, or the error from de_thread(), unshare_files()
 * or exec_mmap(). When exec_mmap() fails the saved files_struct is put
 * back with reset_files_struct().
 */
int flush_old_exec(struct linux_binprm * bprm)
{
char * name;
int i, ch, retval;
struct files_struct *files;
char tcomm[sizeof(current->comm)];
// if signal handling is shared between parent and child, break that sharing
retval = de_thread(current);
if (retval)
goto out;
// take a private copy of the shared files
files = current->files; /* refcounted so safe to hold */
retval = unshare_files();
if (retval)
goto out;
// the user address space may still be a copy of the parent's: make it independent
// by switching the task's mm over to bprm->mm
// this is why bprm->mm was initialised earlier
retval = exec_mmap(bprm->mm);
if (retval)
goto mmap_failed;
bprm->mm = NULL; /* We're using it now */
put_files_struct(files);
current->sas_ss_sp = current->sas_ss_size = 0;
if (current->euid == current->uid && current->egid == current->gid)
set_dumpable(current->mm, 1);
else
set_dumpable(current->mm, suid_dumpable);
name = bprm->filename;
/* Copies the binary name from after last slash */
// extract the executable's name; the copy is truncated to fit tcomm
for (i=0; (ch = *(name++)) != '\0';) {
if (ch == '/')
i = 0; /* overwrite what we wrote */
else
if (i < (sizeof(tcomm) - 1))
tcomm[i++] = ch;
}
tcomm[i] = '\0';
// store the executable name as the task's comm
set_task_comm(current, tcomm);
current->flags &= ~PF_RANDOMIZE;
// flush_thread: per the original author's note, only concerns the coprocessor and debug state
flush_thread();
current->mm->task_size = TASK_SIZE;
if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
suid_keys(current);
set_dumpable(current->mm, suid_dumpable);
current->pdeath_signal = 0;
} else if (file_permission(bprm->file, MAY_READ) ||
(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
suid_keys(current);
set_dumpable(current->mm, suid_dumpable);
}
/* An exec changes our domain. We are no longer part of the thread
group */
current->self_exec_id++;
// the sharing with the parent has been broken, so
// reset the signal handlers to their default actions
flush_signal_handlers(current, 0);
// close open files (NOTE(review): presumably only the close-on-exec ones - confirm)
flush_old_files(current->files);
return 0;
mmap_failed:
reset_files_struct(current, files);
out:
return retval;
}
我们重点分析一下exec_mmap():
/*
 * exec_mmap - make mm the address space of the current task.
 * @mm: the new mm to install.
 *
 * Calls mm_release() on the old mm, then sets tsk->mm and tsk->active_mm
 * to mm under task_lock() and activates it. Afterwards the reference on
 * the old mm (or, when there was no old mm, on the previous active_mm)
 * is dropped.
 *
 * Returns 0 on success, or -EINTR when the old mm has core_waiters set.
 */
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct * old_mm, *active_mm;
tsk = current;
old_mm = current->mm;
mm_release(tsk, old_mm);
if (old_mm) {
// hold the old mm's mmap_sem (read side) while checking core_waiters and switching
down_read(&old_mm->mmap_sem);
if (unlikely(old_mm->core_waiters)) {
up_read(&old_mm->mmap_sem);
return -EINTR;
}
}
task_lock(tsk);
active_mm = tsk->active_mm;
tsk->mm = mm;
tsk->active_mm = mm;
// switch the task's address space; the article defers the details to its scheduling chapter
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
// drop the reference on old_mm / active_mm; per the original note, when the
// count reaches zero the space is freed or the mappings are torn down
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
mmput(old_mm);
return 0;
}
mmdrop(active_mm);
return 0;
}
值得注意的是mm_release()中有一个重要的操作:
/*
 * mm_release - called when a task stops using an mm.
 * @tsk: the task giving up the mm.
 * @mm:  the mm being released.
 *
 * Deactivates the mm for the task, completes tsk->vfork_done if it is
 * set, and, when clear_child_tid is set, the task is not PF_SIGNALED and
 * the mm has more than one user, writes 0 to that user address and wakes
 * one futex waiter on it.
 */
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
struct completion *vfork_done = tsk->vfork_done;
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
/* notify parent sleeping on vfork() */
// if the child was created with CLONE_VFORK, it has finished using the
// parent's space, so it is time to wake the parent
if (vfork_done) {
tsk->vfork_done = NULL;
complete(vfork_done);
}
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
if (tsk->clear_child_tid
&& !(tsk->flags & PF_SIGNALED)
&& atomic_read(&mm->mm_users) > 1) {
u32 __user * tidptr = tsk->clear_child_tid;
tsk->clear_child_tid = NULL;
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
*/
put_user(0, tidptr);
sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
}
还记得我们之前讨论过的CLONE_VFORK标志吗?到这里就可以唤醒父进程了.因为此时子进程结束了对父进程空间的共享.
与父进程脱离关系之后,子进程就拥有了自己独立的资源.然后加载数据段和代码段.分配BSS段空间.把栈空间也伸缩适当大小.
之后我们遇到的再一个重点是栈空间的布局.我们来分析这一个过程.
static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
{
……
……
current->mm->start_stack =
(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
#ifdef __alpha__
regs->gp = ex.a_gpvalue;
#endif
start_thread(regs, ex.a_entry, current->mm->start_stack);
……
}
create_aout_tables()代码如下:
/*
 * create_aout_tables - build the argc/argv/envp tables on the user stack.
 * @p:    user address where the argument strings start (bprm->p at the
 *        call site); the environment strings follow them.
 * @bprm: exec parameters supplying argc and envc (plus loader and exec
 *        on alpha).
 *
 * Reserves envc+1 and argc+1 pointer slots below the aligned @p, stores
 * argc below them (preceded by the envp and argv addresses on some
 * architectures), then walks the NUL-terminated strings starting at @p
 * to fill in both pointer arrays, each ending with a NULL entry. Also
 * records arg_start, arg_end, env_start and env_end in current->mm.
 *
 * Returns the resulting stack pointer, which points at argc.
 */
static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
{
char __user * __user *argv;
char __user * __user *envp;
unsigned long __user *sp;
// number of arguments of the executable
int argc = bprm->argc;
// number of environment variables
int envc = bprm->envc;
// sp starts out as p (i.e. bprm->p), rounded down to pointer alignment
// corresponds to initial state (1) in the article's diagram
sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
#ifdef __sparc__
/* This imposes the proper stack alignment for a new process. */
sp = (void __user *) (((unsigned long) sp) & ~7);
if ((envc+argc+3)&1) --sp;
#endif
#ifdef __alpha__
/* whee.. test-programs are so much fun. */
put_user(0, --sp);
put_user(0, --sp);
if (bprm->loader) {
put_user(0, --sp);
put_user(0x3eb, --sp);
put_user(bprm->loader, --sp);
put_user(0x3ea, --sp);
}
put_user(bprm->exec, --sp);
put_user(0x3e9, --sp);
#endif
// reserve the envp[] slots, including the terminating NULL
sp -= envc+1;
envp = (char __user * __user *) sp;
// reserve the argv[] slots, including the terminating NULL
sp -= argc+1;
argv = (char __user * __user *) sp;
#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__)
put_user((unsigned long) envp,--sp);
put_user((unsigned long) argv,--sp);
#endif
put_user(argc,--sp);
// corresponds to state (2) in the article's diagram
current->mm->arg_start = (unsigned long) p;
// store each argument string's address in argv[], then skip past its NUL
while (argc-->0) {
char c;
put_user(p,argv++);
do {
get_user(c,p++);
} while (c);
}
put_user(NULL,argv);
current->mm->arg_end = current->mm->env_start = (unsigned long) p;
// same walk for the environment strings
while (envc-->0) {
char c;
put_user(p,envp++);
do {
get_user(c,p++);
} while (c);
}
put_user(NULL,envp);
current->mm->env_end = (unsigned long) p;
// corresponds to state (3) in the article's diagram
return sp;
}
我们用图来表示上面的操作过程:
对照上面的分析图就很容易看懂代码了.
最后,设置eip的值为可执行文件的入口地址(即文件头中的ex.a_entry),esp为当前栈指针位置(current->mm->start_stack),返回到用户空间就可以顺利地执行了.这一过程是start_thread()完成的.这个函数比较简单,就不分析了.