分类: LINUX
2011-01-31 13:54:46
open 系统调用剖析<2.6.18版>
文莎(wenshawenzhang@gmail.com)
本文遵循自顶向下的方法,追踪open系统调用的源码执行路径。其中也包括我在读源码过程中遇到的一些问题,也希望各位高手能帮我解决。文中有什么不妥的地方,也希望各位能批评指正。
下表为向下执行时调用的主要函数:
sys_open()
  |-do_sys_open()
      |-getname()
      |-get_unused_fd()
      |-do_filp_open()
          |-open_namei()
              |-path_lookup_open()
              |-path_lookup_create()
              |-vfs_create()
              |-may_open()
                  |-break_lease()
                      |-__break_lease()
                  |-locks_verify_locked()
                      |-locks_mandatory_locked()
          |-nameidata_to_filp()
      |-fd_install()
-------------------------------------------------------------------------------------------------------------
./fs/open.c
/*
 * sys_open - entry point of the open system call.
 * @filename: user-space pointer to the path to open
 * @flags:    open flags as passed by the caller
 * @mode:     mode argument, forwarded unchanged to do_sys_open()
 *
 * ORs O_LARGEFILE into the flags when force_o_largefile() is true
 * (presumably on non-32-bit systems - confirm against its definition),
 * then hands the request to do_sys_open() with AT_FDCWD as the
 * directory descriptor and returns its result.
 */
asmlinkage long sys_open(const char __user *filename, int flags, int mode)
{
	long fd;

	if (force_o_largefile())
		flags = flags | O_LARGEFILE;

	fd = do_sys_open(AT_FDCWD, filename, flags, mode);

	/* avoid REGPARM breakage on x86: */
	prevent_tail_call(fd);
	return fd;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/open.c
/*
 * do_sys_open - common worker behind sys_open().
 * @dfd:      directory descriptor passed through to do_filp_open()
 * @filename: user-space pointer to the path
 * @flags:    open flags
 * @mode:     mode argument
 *
 * Returns the new file descriptor on success. On failure returns the
 * error from getname(), get_unused_fd() or do_filp_open(), whichever
 * failed first.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	struct file *filp;
	char *kname;
	int fd;

	/* getname() presumably copies the name into kernel memory - see its definition */
	kname = getname(filename);
	if (IS_ERR(kname)) {
		fd = PTR_ERR(kname);
		return fd;
	}

	/* reserve a descriptor before doing the actual open */
	fd = get_unused_fd();
	if (fd < 0)
		goto out_putname;

	filp = do_filp_open(dfd, kname, flags, mode);
	if (IS_ERR(filp)) {
		/* open failed: give the reserved descriptor back */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
		goto out_putname;
	}

	/* notify about the open, then bind the file to the descriptor */
	fsnotify_open(filp->f_dentry);
	fd_install(fd, filp);

out_putname:
	putname(kname);
	return fd;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/open.c
/*
* Note that while the flag value (low two bits) for sys_open means:
* 00 - read-only
* 01 - write-only
* 10 - read-write
* 11 - special
* it is changed into
* 00 - no permissions needed
* 01 - read-permission
* 10 - write-permission
* 11 - read-write
* for the internal routines (ie open_namei()/follow_link() etc). 00 is
* used by symlinks.
*/
static struct file *do_filp_open(int dfd, const char *filename, int flags,
int mode)
{
int namei_flags, error;
struct nameidata nd;
namei_flags = flags;
if ((namei_flags+1) & O_ACCMODE) //将sys_open()的访问模式标志转换为open_namei()的访问模式标志
namei_flags++;
error = open_namei(dfd, filename, namei_flags, mode, &nd); //真正的打开文件函数,返回结果保存在nameidata结构体中。
if (!error)
return nameidata_to_filp(&nd, flags); //将文件节点对应的nameidata结构体转换为file结构体
return ERR_PTR(error);
}
-------------------------------------------------------------------------------------------------------------
sys_open() |-do_sys_open() |-getname() |-get_unused_fd() |-do_filp_open() |-open_namei() |-path_lookup_open() |-path_lookup_create() |-vfs_create() |-may_open() |-break_lease() |-__break_lease() |-locks_verify_locked() |-locks_mandatory_locked() |-nameidata_to_filp() |-fd_install() |
-------------------------------------------------------------------------------------------------------------
./fs/namei.c
/*
* open_namei()
*
* namei for open - this is in fact almost the whole open-routine.
*
* Note that the low bits of "flag" aren't the same as in the open
* system call - they are 00 - no permissions needed
* 01 - read permission needed
* 10 - write permission needed
* 11 - read/write permissions needed
* which is a lot more logical, and also allows the "no perm" needed
* for symlinks (where the permissions are checked later).
* SMP-safe
*
* Returns 0 on success and a negative errno otherwise. Failures that
* leave through the "exit" label release the open intent (unless
* nd->intent.open.file holds an error pointer) and call path_release(nd).
*/
int open_namei(int dfd, const char *pathname, int flag,
int mode, struct nameidata *nd)
{
int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
acc_mode = ACC_MODE(flag); // derive the access mode from flag
/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC) // truncation requires the write-permission check as well
acc_mode |= MAY_WRITE;
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flag & O_APPEND) // mark append so it can be told apart from a plain write
acc_mode |= MAY_APPEND;
/*
* The simplest case - just a plain lookup.
*/
if (!(flag & O_CREAT)) { // no O_CREAT: only look the path up, then go to the may_open() check
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
nd, flag);
if (error)
return error;
goto ok;
}
/*
* Create - we need to know the parent.
*/
error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode); // O_CREAT: look up with LOOKUP_PARENT first, i.e. find the parent
if (error)
return error;
/*
* We have the parent and last component. First of all, check
* that we are not asked to creat(2) an obvious directory - that
* will not do.
*/
error = -EISDIR;
if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
goto exit;
dir = nd->dentry;
nd->flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
path.dentry = lookup_hash(nd); // look up the last component in the parent; a dentry with d_inode == NULL means it does not exist yet
path.mnt = nd->mnt;
do_last:
error = PTR_ERR(path.dentry);
if (IS_ERR(path.dentry)) { // lookup failed: drop the directory mutex and bail out
mutex_unlock(&dir->d_inode->i_mutex);
goto exit;
}
if (IS_ERR(nd->intent.open.file)) {
mutex_unlock(&dir->d_inode->i_mutex);
error = PTR_ERR(nd->intent.open.file);
goto exit_dput;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) { // d_inode is NULL: the file has to be created
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
error = vfs_create(dir->d_inode, path.dentry, mode, nd); // the actual creation
mutex_unlock(&dir->d_inode->i_mutex);
dput(nd->dentry);
nd->dentry = path.dentry;
if (error)
goto exit;
/* Don't check for write permission, don't truncate */
acc_mode = 0;
flag &= ~O_TRUNC;
goto ok;
}
/*
* It already exists.
*/
mutex_unlock(&dir->d_inode->i_mutex);
audit_inode_update(path.dentry->d_inode);
error = -EEXIST;
if (flag & O_EXCL)
goto exit_dput;
if (__follow_mount(&path)) { // non-zero presumably means a mount point was crossed (path now at the mounted root) - confirm; refused with -ELOOP under O_NOFOLLOW
error = -ELOOP;
if (flag & O_NOFOLLOW)
goto exit_dput;
}
error = -ENOENT;
if (!path.dentry->d_inode) // still no inode: fail with -ENOENT
goto exit_dput;
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) // inode provides follow_link (a symlink): handled under do_link
goto do_link;
path_to_nameidata(&path, nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) // an existing directory is rejected on the O_CREAT path
goto exit;
ok:
error = may_open(nd, acc_mode, flag); // both newly created and pre-existing nodes end up here for the final checks in may_open()
if (error)
goto exit;
return 0;
exit_dput:
dput_path(&path, nd);
exit:
if (!IS_ERR(nd->intent.open.file))
release_open_intent(nd);
path_release(nd);
return error;
do_link:
error = -ELOOP;
if (flag & O_NOFOLLOW)
goto exit_dput;
/*
* This is subtle. Instead of calling do_follow_link() we do the
* thing by hands. The reason is that this way we have zero link_count
* and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
* After that we have the parent and last component, i.e.
* we are in the same situation as after the first path_walk().
* Well, almost - if the last component is normal we get its copy
* stored in nd->last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
nd->flags |= LOOKUP_PARENT;
error = security_inode_follow_link(path.dentry, nd);
if (error)
goto exit_dput;
error = __do_follow_link(&path, nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
release_open_intent(nd);
return error;
}
nd->flags &= ~LOOKUP_PARENT;
if (nd->last_type == LAST_BIND)
goto ok;
error = -EISDIR;
if (nd->last_type != LAST_NORM)
goto exit;
if (nd->last.name[nd->last.len]) {
__putname(nd->last.name);
goto exit;
}
error = -ELOOP;
if (count++==32) { // give up with -ELOOP once count has reached 32 symlinks followed in this loop
__putname(nd->last.name);
goto exit;
}
dir = nd->dentry;
mutex_lock(&dir->d_inode->i_mutex);
path.dentry = lookup_hash(nd); // same last-component lookup as before, now under the symlink target's parent
path.mnt = nd->mnt;
__putname(nd->last.name);
goto do_last;
}
-------------------------------------------------------------------------------------------------------------
sys_open() |-do_sys_open() |-getname() |-get_unused_fd() |-do_filp_open() |-open_namei() |-path_lookup_open() |-path_lookup_create() |-vfs_create() |-may_open() |-break_lease() |-__break_lease() |-locks_verify_locked() |-locks_mandatory_locked() |-nameidata_to_filp() |-fd_install() |
-------------------------------------------------------------------------------------------------------------
./fs/namei.c
/*
 * vfs_create - create a regular file in directory @dir.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the file to create
 * @mode:   requested mode; reduced to the S_IALLUGO bits and forced to
 *          S_IFREG before use
 * @nd:     nameidata, passed to may_create() and to the ->create() method
 *
 * Returns 0 on success or the error reported by may_create(),
 * security_inode_create() or the directory's ->create() method;
 * -EACCES when the directory has no ->create() method.
 */
int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	int err;

	/* checks that must pass before anything is created */
	err = may_create(dir, dentry, nd);
	if (err)
		return err;

	/* the directory cannot create files at all */
	if (!(dir->i_op && dir->i_op->create))
		return -EACCES;	/* shouldn't it be ENOSYS? */

	mode = (mode & S_IALLUGO) | S_IFREG;

	err = security_inode_create(dir, dentry, mode);
	if (err)
		return err;

	DQUOT_INIT(dir);

	/* delegate to the create operation of the parent directory */
	err = dir->i_op->create(dir, dentry, mode, nd);
	if (err)
		return err;

	/* creation succeeded: send the notification */
	fsnotify_create(dir, dentry);
	return 0;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/namei.c
/*
* may_open - final checks (and optional truncation) before a node found
* by the lookup is handed out as an open file.
* nd: nameidata whose dentry/mnt describe the node
* acc_mode: access mode passed to vfs_permission()
* flag: open flags; a local copy is modified (O_TRUNC may be cleared)
*
* Returns 0 when the open may proceed, otherwise a negative errno.
*/
int may_open(struct nameidata *nd, int acc_mode, int flag)
{
struct dentry *dentry = nd->dentry;
struct inode *inode = dentry->d_inode;
int error;
if (!inode) // the dentry has no inode
return -ENOENT;
if (S_ISLNK(inode->i_mode)) // the inode is a symbolic link
return -ELOOP;
if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) // a directory, and write access was requested
return -EISDIR;
error = vfs_permission(nd, acc_mode); // permission check
if (error)
return error;
/*
* FIFO's, sockets and device files are special: they don't
* actually live on the filesystem itself, and as such you
* can write to them even if the filesystem is read-only.
*/
if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
flag &= ~O_TRUNC; // drop O_TRUNC for FIFOs and sockets
} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
if (nd->mnt->mnt_flags & MNT_NODEV) // block/char device node on a mount flagged MNT_NODEV: refuse
return -EACCES;
flag &= ~O_TRUNC;
} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE)) // write access requested on a read-only inode
return -EROFS;
/*
* An append-only file must be opened in append mode for writing.
*/
if (IS_APPEND(inode)) {
if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
return -EPERM;
if (flag & O_TRUNC)
return -EPERM;
}
/* O_NOATIME can only be set by the owner or superuser */
if (flag & O_NOATIME)
if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
return -EPERM;
/*
* Ensure there are no outstanding leases on the file.
*/
error = break_lease(inode, flag); // break any leases held on the inode
if (error)
return error;
if (flag & O_TRUNC) {
error = get_write_access(inode);
if (error)
return error;
/*
* Refuse to truncate files with mandatory locks held on them.
*/
error = locks_verify_locked(inode); // non-zero when a conflicting mandatory lock is held; the truncate is then skipped
if (!error) {
DQUOT_INIT(inode);
error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL); // truncate the file to length 0
}
put_write_access(inode);
if (error)
return error;
} else
if (flag & FMODE_WRITE)
DQUOT_INIT(inode);
return 0;
}
-------------------------------------------------------------------------------------------------------------
sys_open() |-do_sys_open() |-getname() |-get_unused_fd() |-do_filp_open() |-open_namei() |-path_lookup_open() |-path_lookup_create() |-vfs_create() |-may_open() |-break_lease() |-__break_lease() |-locks_verify_locked() |-locks_mandatory_locked() |-nameidata_to_filp() |-fd_install() |
-------------------------------------------------------------------------------------------------------------
./include/linux/fs.h
/*
 * break_lease - inline fast path in front of __break_lease().
 * @inode: inode being opened
 * @mode:  open mode/flags, passed through unchanged
 *
 * Returns 0 immediately when the inode has no lock list (i_flock is
 * NULL); otherwise returns the result of __break_lease().
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
	return inode->i_flock ? __break_lease(inode, mode) : 0;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/locks.c
/**
* __break_lease - revoke all outstanding leases on file
* @inode: the inode of the file to return
* @mode: the open mode (read or write)
*
* break_lease (inlined for speed) has checked there already
* is a lease on this file. Leases are broken on a call to open()
* or truncate(). This function can sleep unless you
* specified %O_NONBLOCK to your open().
*
* Returns 0 when nothing (more) has to be waited for, -EWOULDBLOCK when
* the caller holds one of the leases or asked for %O_NONBLOCK, or another
* negative error (the lease_alloc() failure, or a negative result from
* locks_block_on_timeout()).
*/
int __break_lease(struct inode *inode, unsigned int mode)
{
int error = 0, future;
struct file_lock *new_fl, *flock;
struct file_lock *fl;
int alloc_err;
unsigned long break_time;
int i_have_this_lease = 0;
alloc_err = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK,
&new_fl); // allocate a lease lock for this opener; a failure is only recorded in alloc_err here
lock_kernel();
time_out_leases(inode); // presumably handles leases on the inode whose break time has passed - see its definition
flock = inode->i_flock;
if ((flock == NULL) || !IS_LEASE(flock)) // no lock at all, or the first lock is not a lease: nothing to break
goto out;
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) // does the current process (matched via current->files) own one of the leases?
if (fl->fl_owner == current->files)
i_have_this_lease = 1;
if (mode & FMODE_WRITE) {
/* If we want write access, we have to revoke any lease. */
future = F_UNLCK | F_INPROGRESS; // write access: every lease is to be removed
} else if (flock->fl_type & F_INPROGRESS) {
/* If the lease is already being broken, we just leave it */ // a break is already in progress: keep its target type
future = flock->fl_type;
} else if (flock->fl_type & F_WRLCK) {
/* Downgrade the exclusive lease to a read-only lease. */
future = F_RDLCK | F_INPROGRESS;
} else {
/* the existing lease was read-only, so we can read too. */
goto out;
}
if (alloc_err && !i_have_this_lease && ((mode & O_NONBLOCK) == 0)) { // allocation failed, we do not hold the lease and O_NONBLOCK is clear: new_fl would be needed for the wait below, so fail now
error = alloc_err;
goto out;
}
break_time = 0;
if (lease_break_time > 0) {
break_time = jiffies + lease_break_time * HZ; // deadline by which the lease has to be given up
if (break_time == 0)
break_time++; /* so that 0 means no break time */
}
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { // move every lease not yet in the target state to "future", record the deadline and notify through the fl_break callback
if (fl->fl_type != future) {
fl->fl_type = future;
fl->fl_break_time = break_time;
/* lease must have lmops break callback */
fl->fl_lmops->fl_break(fl);
}
}
if (i_have_this_lease || (mode & O_NONBLOCK)) { // we hold one of the leases ourselves, or blocking is not allowed: do not wait
error = -EWOULDBLOCK;
goto out;
}
restart:
break_time = flock->fl_break_time; // compute the remaining time until the break deadline
if (break_time != 0) {
break_time -= jiffies;
if (break_time == 0)
break_time++;
}
error = locks_block_on_timeout(flock, new_fl, break_time); // wait on flock using new_fl, presumably until the timeout expires or the waiter is woken - see its definition
if (error >= 0) { // non-negative return: re-examine the lease state of the inode
if (error == 0)
time_out_leases(inode);
/* Wait for the next lease that has not been broken yet */
for (flock = inode->i_flock; flock && IS_LEASE(flock); // a lease is still marked F_INPROGRESS: wait again
flock = flock->fl_next) {
if (flock->fl_type & F_INPROGRESS)
goto restart;
}
error = 0;
}
out:
unlock_kernel();
if (!alloc_err)
locks_free_lock(new_fl);
return error;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./include/linux/fs.h
/*
 * locks_verify_locked - inline gate in front of locks_mandatory_locked().
 * @inode: inode to check
 *
 * Returns 0 when MANDATORY_LOCK(inode) is false, otherwise the result
 * of locks_mandatory_locked(). Per the accompanying article,
 * MANDATORY_LOCK() holds when the filesystem is mounted with
 * MS_MANDLOCK and the file has the set-group-ID bit set with the
 * group-execute bit clear - confirm against the macro definition.
 */
static inline int locks_verify_locked(struct inode *inode)
{
	return MANDATORY_LOCK(inode) ? locks_mandatory_locked(inode) : 0;
}
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/locks.c
/**
 * locks_mandatory_locked - Check for an active lock
 * @inode: the file to check
 *
 * Walks the inode's lock list under the big kernel lock looking for a
 * POSIX lock whose owner is not the current process (current->files).
 * Non-POSIX entries and the caller's own locks are skipped.
 * This function is called from locks_verify_locked() only.
 *
 * Returns -EAGAIN when such a lock is found, 0 otherwise.
 */
int locks_mandatory_locked(struct inode *inode)
{
	fl_owner_t me = current->files;
	struct file_lock *lock;

	lock_kernel();
	lock = inode->i_flock;
	/* stop at the first POSIX lock held by someone else */
	while (lock != NULL && !(IS_POSIX(lock) && lock->fl_owner != me))
		lock = lock->fl_next;
	unlock_kernel();

	if (lock)
		return -EAGAIN;
	return 0;
}
-------------------------------------------------------------------------------------------------------------
sys_open() |-do_sys_open() |-getname() |-get_unused_fd() |-do_filp_open() |-open_namei() |-path_lookup_open() |-path_lookup_create() |-vfs_create() |-may_open() |-break_lease() |-__break_lease() |-locks_verify_locked() |-locks_mandatory_locked() |-nameidata_to_filp() |-fd_install() |
---------------------------------------------------------------------------------------------------------
./fs/open.c
/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array. At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us. We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * @fd:   descriptor slot to fill; the slot must currently be empty
 * @file: file to publish in that slot
 */
void fastcall fd_install(unsigned int fd, struct file * file)
{
	struct files_struct *owner = current->files;
	struct fdtable *table;

	spin_lock(&owner->file_lock);
	table = files_fdtable(owner);
	/* an already occupied slot is a bug */
	BUG_ON(table->fd[fd] != NULL);
	/* publish the file in the descriptor array */
	rcu_assign_pointer(table->fd[fd], file);
	spin_unlock(&owner->file_lock);
}
-------------------------------------------------------------------------------------------------------------
从上面的代码中可以看出,对file的各种操作,都会对应到inode的i_fop中(inode中的成员是i_fop,f_op是file结构体的成员)。对不存在的文件会调用vfs_create(),继而调用目录结点的create()方法。下面分析一下rootfs和ext2中的create实现。rootfs(使用ramfs的实现)中目录inode对应的操作如下:
-------------------------------------------------------------------------------------------------------------
./fs/ramfs/inode.c
/*
* Inode operations table for ramfs directories. File creation goes
* through .create (ramfs_create); the other entries are the simple_*
* helpers or ramfs-specific functions named below.
*/
static struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
./fs/ramfs/inode.c
/*
 * ramfs_create - ->create() method for ramfs directories.
 *
 * Creates a regular file by calling ramfs_mknod() with S_IFREG OR-ed
 * into @mode and a device number of 0. @nd is not used.
 * Returns the result of ramfs_mknod().
 */
static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
{
	int file_mode = mode | S_IFREG;

	return ramfs_mknod(dir, dentry, file_mode, 0);
}
-------------------------------------------------------------------------------------------------------------
从上面可以看到.上面的过程与rootfs中目录的建立大体相同.只是文件的模式改为了S_IFREG.即一般的文件.
-------------------------------------------------------------------------------------------------------------
ext2中目录对应的操作为:
-------------------------------------------------------------------------------------------------------------
./fs/ext2/namei.c
/*
* Inode operations table for ext2 directories. File creation goes
* through .create (ext2_create). The extended-attribute entries are
* only present when CONFIG_EXT2_FS_XATTR is defined.
*/
struct inode_operations ext2_dir_inode_operations = {
.create = ext2_create,
.lookup = ext2_lookup,
.link = ext2_link,
.unlink = ext2_unlink,
.symlink = ext2_symlink,
.mkdir = ext2_mkdir,
.rmdir = ext2_rmdir,
.mknod = ext2_mknod,
.rename = ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
.permission = ext2_permission,
};
-------------------------------------------------------------------------------------------------------------
其create函数的入口为ext2_create().
-------------------------------------------------------------------------------------------------------------
./fs/ext2/namei.c
/*
 * ext2_create - ->create() method for ext2 directories.
 *
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 *
 * Returns the error from ext2_new_inode() when allocation fails,
 * otherwise the result of ext2_add_nondir(). @nd is not used.
 */
static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
{
	struct inode *new_inode = ext2_new_inode(dir, mode);

	/* allocating the new inode failed */
	if (IS_ERR(new_inode))
		return PTR_ERR(new_inode);

	/* pick inode, file and address-space operations for the new file */
	new_inode->i_op = &ext2_file_inode_operations;
	if (ext2_use_xip(new_inode->i_sb)) {
		new_inode->i_mapping->a_ops = &ext2_aops_xip;
		new_inode->i_fop = &ext2_xip_file_operations;
	} else if (test_opt(new_inode->i_sb, NOBH)) {
		new_inode->i_mapping->a_ops = &ext2_nobh_aops;
		new_inode->i_fop = &ext2_file_operations;
	} else {
		new_inode->i_mapping->a_ops = &ext2_aops;
		new_inode->i_fop = &ext2_file_operations;
	}

	/* mark the inode dirty before linking it into the directory */
	mark_inode_dirty(new_inode);
	return ext2_add_nondir(dentry, new_inode);
}
-------------------------------------------------------------------------------------------------------------
这部分的代码算是勉强读完了,但是还是没有真正理解对file的各种操作(file的f_op)与inode的i_fop是在何时建立对应联系的,关键是这种联系在代码中是如何体现的。