Linux虚拟文件系统--open()

open()系统调用用来打开一个文件，本文就VFS层，对open系统调用的过程进行一个简单的分析。

[cpp]view plaincopy
				
				/*
				 * open(2) system call entry point.
				 *
				 * ORs O_LARGEFILE into @flags when force_o_largefile() is true, then
				 * delegates to do_sys_open() with AT_FDCWD as the directory descriptor.
				 * The value returned by do_sys_open() is returned unchanged.
				 */
				SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
				{
				    long result;

				    if (force_o_largefile()) {
				        flags |= O_LARGEFILE;
				    }

				    result = do_sys_open(AT_FDCWD, filename, flags, mode);

				    /* avoid REGPARM breakage on x86: */
				    asmlinkage_protect(3, result, filename, flags, mode);

				    return result;
				}

force_o_largefile()用来判断系统是否为32位；如果不是32位（也就是说为64位），则将O_LARGEFILE置位。主体工作由do_sys_open()来完成。

[cpp]view plaincopy
				
				/*
				 * Common implementation behind open(): copy the user-space path into
				 * the kernel, reserve a file descriptor, open the file and bind the
				 * resulting struct file to the descriptor.
				 *
				 * @dfd:      directory descriptor passed through to do_filp_open()
				 * @filename: user-space pointer to the path string
				 * @flags:    open flags
				 * @mode:     creation mode passed through to do_filp_open()
				 *
				 * Returns the new descriptor on success, or the negative error taken
				 * from getname(), get_unused_fd_flags() or do_filp_open().
				 */
				long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
				{
				    char *kname = getname(filename);    /* kernel copy of the path string */
				    struct file *filp;
				    int fd;

				    if (IS_ERR(kname)) {
				        fd = PTR_ERR(kname);
				        return fd;
				    }

				    /* reserve a descriptor before doing the actual open */
				    fd = get_unused_fd_flags(flags);
				    if (fd < 0)
				        goto out_putname;

				    /* the real open work happens here */
				    filp = do_filp_open(dfd, kname, flags, mode, 0);
				    if (IS_ERR(filp)) {
				        /* give the reserved descriptor back and report the error */
				        put_unused_fd(fd);
				        fd = PTR_ERR(filp);
				        goto out_putname;
				    }

				    fsnotify_open(filp->f_path.dentry);
				    fd_install(fd, filp);

				out_putname:
				    putname(kname);
				    return fd;
				}

open操作是特定于某个进程进行的，因此涉及到了VFS中特定于进程的结构，这里简单的介绍下

[cpp]view plaincopy
				
				"font-size:12px;">struct files_struct {   
			
				  /*  
			
				   * read mostly part  
			
				   */   
			
				    atomic_t count;   
			
				    struct fdtable *fdt;   
			
				    struct fdtable fdtab;   
			
				  /*  
			
				   * written part on a separate cache line in SMP  
			
				   */   
			
				    spinlock_t file_lock ____cacheline_aligned_in_smp;   
			
				    int next_fd;   
			
				    struct embedded_fd_set close_on_exec_init;   
			
				    struct embedded_fd_set open_fds_init;   
			
				    struct file * fd_array[NR_OPEN_DEFAULT];   
			
				};

count表示共享该结构的进程数

fdtab是内嵌在files_struct中的一个fdtable结构，即该进程的文件描述符表

fdt指向进程当前正在使用的fdtable；当文件描述符表被扩充后（见后面的expand_fdtable()），fdt会被更新为指向新分配的fdtable

next_fd表示下一次分配文件描述符时开始搜索的位置：alloc_fd()从不小于next_fd的位置开始查找空闲位，分配成功后（start <= next_fd时）将其更新为新分配的fd+1，因此它并不一定等于最大文件描述符号+1

embedded_fd_set是一个位图结构，用来标记文件描述符，close_on_exec_init用来标记那些执行exec时要关闭的文件的文件描述符，open_fds_init用来标记已经分配出去了的文件描述符

fd_array用来存储进程打开的文件的struct file指针

do_sys_open()的一个重要任务就是调用get_unused_fd_flags()为即将打开的文件分配一个文件描述符

[cpp]view plaincopy
				
				/* Allocate an unused file descriptor via alloc_fd(), searching from 0 upwards. */
				#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

[cpp]view plaincopy
				
				"font-size:12px;">int alloc_fd(unsigned start, unsigned flags)   
			
				{   
			
				    struct files_struct *files = current->files;//获取当前进程的files_struct   
			
				    unsigned int fd;   
			
				    int error;   
			
				    struct fdtable *fdt;   
			
				    spin_lock(&files->file_lock);   
			
				repeat:   
			
				    fdt = files_fdtable(files);//获取进程的fdtable   
			
				    fd = start;   
			
				    if (fd < files->next_fd)   
			
				        fd = files->next_fd;   
			
				    if (fd < fdt->max_fds)   
			
				        fd = find_next_zero_bit(fdt->open_fds->fds_bits,   
			
				                       fdt->max_fds, fd);//从位图中获取一个空闲位   
			
				    error = expand_files(files, fd);//这里根据需要扩充文件描述符数组   
			
				    if (error < 0)   
			
				        goto out;   
			
				    /*  
			
				     * If we needed to expand the fs array we  
			
				     * might have blocked - try again.  
			
				     */   
			
				    if (error)//之前进行了扩充操作，重新进行一次空闲bit的搜索   
			
				        goto repeat;   
			
				    if (start <= files->next_fd)   
			
				        files->next_fd = fd + 1;   
			
				    FD_SET(fd, fdt->open_fds);//在open_fds的位图上置位   
			
				    if (flags & O_CLOEXEC)//如果设定了O_CLOEXEC，则在close_on_exec位图上将相应位置位   
			
				        FD_SET(fd, fdt->close_on_exec);   
			
				    else   
			
				        FD_CLR(fd, fdt->close_on_exec);   
			
				    error = fd;   
			
				#if 1   
			
				    /* Sanity check */   
			
				    if (rcu_dereference(fdt->fd[fd]) != NULL) {   
			
				        printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);   
			
				        rcu_assign_pointer(fdt->fd[fd], NULL);   
			
				    }   
			
				#endif   
			
				out:   
			
				    spin_unlock(&files->file_lock);   
			
				    return error;   
			
				}

[cpp]view plaincopy
				
				int expand_files(struct files_struct *files, int nr)   
			
				{   
			
				    struct fdtable *fdt;   
			
				    fdt = files_fdtable(files);   
			
				    /*  
			
				     * N.B. For clone tasks sharing a files structure, this test  
			
				     * will limit the total number of files that can be opened.  
			
				     */   
			
				     /*如果nr大于进程允许的最大打开文件数，则返回错误*/   
			
				    if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)   
			
				        return -EMFILE;   
			
				    /*nr小于最大文件描述符，则不用进行fdtable的扩展，直接返回*/   
			
				    if (nr < fdt->max_fds)   
			
				        return 0;   
			
				    /*扩展的话不能超过sysctl_nr_opend的上限*/   
			
				    if (nr >= sysctl_nr_open)   
			
				        return -EMFILE;   
			
				    /* 到这里表示确实需要进行扩充，进行实际的扩展操作 */   
			
				    return expand_fdtable(files, nr);   
			
				}

实际的扩充操作：

[cpp]view plaincopy
				
				"font-size:12px;">static int expand_fdtable(struct files_struct *files, int nr)   
			
				    __releases(files->file_lock)   
			
				    __acquires(files->file_lock)   
			
				{   
			
				    struct fdtable *new_fdt, *cur_fdt;   
			
				    spin_unlock(&files->file_lock);   
			
				    new_fdt = alloc_fdtable(nr);//根据nr重新创建一个新的fdtable   
			
				    spin_lock(&files->file_lock);   
			
				    if (!new_fdt)   
			
				        return -ENOMEM;   
			
				    /*  
			
				     * extremely unlikely race - sysctl_nr_open decreased between the check in  
			
				     * caller and alloc_fdtable().  Cheaper to catch it here...  
			
				     */   
			
				     /*这里为了防止因为竞争，在alloc_fdtable调用之前systl_nr_open减小了新创建的fdtable小于nr*/   
			
				    if (unlikely(new_fdt->max_fds <= nr)) {   
			
				        free_fdarr(new_fdt);   
			
				        free_fdset(new_fdt);   
			
				        kfree(new_fdt);   
			
				        return -EMFILE;   
			
				    }   
			
				    /*  
			
				     * Check again since another task may have expanded the fd table while  
			
				     * we dropped the lock  
			
				     */   
			
				    cur_fdt = files_fdtable(files);//获取旧的fdtable   
			
				    if (nr >= cur_fdt->max_fds) {//新的nr必须大于旧的fdtable的大小   
			
				        /* Continue as planned */   
			
				        copy_fdtable(new_fdt, cur_fdt);//将旧的fdtable中的内容拷贝至新的fdtable   
			
				        rcu_assign_pointer(files->fdt, new_fdt);//用新的fdtable替换旧的fdtable   
			
				        if (cur_fdt->max_fds > NR_OPEN_DEFAULT)   
			
				            free_fdtable(cur_fdt);//释放旧的fdtable   
			
				    } else {   
			
				        /* Somebody else expanded, so undo our attempt */   
			
				        free_fdarr(new_fdt);   
			
				        free_fdset(new_fdt);   
			
				        kfree(new_fdt);   
			
				    }   
			
				    return 1;   
			
				}

到此为止，分配新的fd的工作完成，如果分配fd成功，接下来do_sys_open()就要通过do_filp_open()函数查找文件并执行相应的打开操作

do_filp_open的工作针对两种情况进行：

1.flag中未标识O_CREAT，也就是只进行单纯的搜索打开，如果没有搜索到目标文件的话，不会进行创建，这种情况处理起来比较简单，主要工作就是通过路径解析来查找文件，查找到了的话再根据文件系统定义的open方式进行打开

2.flag中标识了O_CREAT，也就是说如果没找到目标文件要进行创建。这种情况要先查找目标文件的父目录(通过将LOOKUP_PARENT标识置位然后进行路径解析来实现)，因为假如没查找到目标文件的话，创建工作需要在父目录下完成；然后再查找最后一个文件分量，也就是目标文件，并进行打开操作，其中涉及到的许多部分在前面几篇文章中也都已经分析过了

[cpp]view plaincopy
				
				"font-size:12px;">struct file *do_filp_open(int dfd, const char *pathname,   
			
				        int open_flag, int mode, int acc_mode)   
			
				{   
			
				    struct file *filp;   
			
				    struct nameidata nd;   
			
				    int error;   
			
				    struct path path;   
			
				    struct dentry *dir;   
			
				    int count = 0;   
			
				    int will_write;   
			
				    int flag = open_to_namei_flags(open_flag);   
			
				    if (!acc_mode)   
			
				        acc_mode = MAY_OPEN | ACC_MODE(flag);   
			
				    /* O_TRUNC implies we need access checks for write permissions */   
			
				    if (flag & O_TRUNC)   
			
				        acc_mode |= MAY_WRITE;   
			
				    /* Allow the LSM permission hook to distinguish append   
			
				       access from general write access. */   
			
				    if (flag & O_APPEND)   
			
				        acc_mode |= MAY_APPEND;   
			
				    /*  
			
				     * The simplest case - just a plain lookup.  
			
				     */   
			
				    /*如果没有设置O_CREAT，则在未找到文件的情况下不用创建文件，直接通过查找来打开文件*/   
			
				    if (!(flag & O_CREAT)) {   
			
				        error = path_lookup_open(dfd, pathname, lookup_flags(flag),   
			
				                     &nd, flag);   
			
				        if (error)   
			
				            return ERR_PTR(error);   
			
				        goto ok;  //成功查找到了目标文件的话，就跳转到ok去执行后续操作   
			
				    }   
			
				    /*  
			
				     * Create - we need to know the parent.  
			
				     */   
			
				     /*如果需要creat,那么就要知道目标文件的父目录，因此需要设置LOOKUP_PARENT标识*/   
			
				    error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);   
			
				    if (error)   
			
				        return ERR_PTR(error);   
			
				    /*进行路径名的解析，父目录将保存到nd中*/   
			
				    error = path_walk(pathname, &nd);   
			
				    if (error) {   
			
				        if (nd.root.mnt)   
			
				            path_put(&nd.root);   
			
				        return ERR_PTR(error);   
			
				    }   
			
				    if (unlikely(!audit_dummy_context()))   
			
				        audit_inode(pathname, nd.path.dentry);   
			
				    /*  
			
				     * We have the parent and last component. First of all, check  
			
				     * that we are not asked to creat(2) an obvious directory - that  
			
				     * will not do.  
			
				     */   
			
				    error = -EISDIR;   
			
				    /*这里要先保证路径名的最后一个分量是普通文件名(不为.和..)，并且长度不为0*/   
			
				    if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])   
			
				        goto exit_parent;   
			
				    error = -ENFILE;   
			
				    filp = get_empty_filp();//分配一个struct file   
			
				    if (filp == NULL)   
			
				        goto exit_parent;   
			
				    /*将打开文件的信息保存在nd.intent中*/   
			
				    nd.intent.open.file = filp;   
			
				    nd.intent.open.flags = flag;   
			
				    nd.intent.open.create_mode = mode;   
			
				    dir = nd.path.dentry;//获取父目录   
			
				    nd.flags &= ~LOOKUP_PARENT;//取消LOOKUP_PARENT标识   
			
				    nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;//设置CREATE和OPEN标识   
			
				    if (flag & O_EXCL)   
			
				        nd.flags |= LOOKUP_EXCL;   
			
				    mutex_lock(&dir->d_inode->i_mutex);   
			
				    //lookup_hash进行最终分量的查找，先查找dentry缓存，没找到的话再通过特定于文件系统的lookup方式从磁盘查找   
			
				    path.dentry = lookup_hash(&nd);   
			
				    path.mnt = nd.path.mnt;   
			
				do_last:   
			
				    error = PTR_ERR(path.dentry);//检查目标dentry是否有效   
			
				    if (IS_ERR(path.dentry)) {   
			
				        mutex_unlock(&dir->d_inode->i_mutex);   
			
				        goto exit;   
			
				    }   
			
				    if (IS_ERR(nd.intent.open.file)) {//检查file是否有效   
			
				        error = PTR_ERR(nd.intent.open.file);   
			
				        goto exit_mutex_unlock;   
			
				    }   
			
				    /* Negative dentry, just create the file */   
			
				    if (!path.dentry->d_inode) {//dentry没有对应上inode，创建之，可能的情况就是该文件被删除了   
			
				        /*  
			
				         * This write is needed to ensure that a  
			
				         * ro->rw transition does not occur between  
			
				         * the time when the file is created and when  
			
				         * a permanent write count is taken through  
			
				         * the 'struct file' in nameidata_to_filp().  
			
				         */   
			
				        error = mnt_want_write(nd.path.mnt);   
			
				        if (error)   
			
				            goto exit_mutex_unlock;   
			
				        /*__open_namei_create将会调用到父目录所属文件系统中定义的create方式创建文件*/   
			
				        error = __open_namei_create(&nd, &path, flag, mode);   
			
				        if (error) {   
			
				            mnt_drop_write(nd.path.mnt);   
			
				            goto exit;   
			
				        }   
			
				        /*nameidata_to_filp将会调用目标文件的inode对应的open函数进行打开操作*/   
			
				        filp = nameidata_to_filp(&nd, open_flag);   
			
				        if (IS_ERR(filp))   
			
				            ima_counts_put(&nd.path,   
			
				                       acc_mode & (MAY_READ | MAY_WRITE |   
			
				                           MAY_EXEC));   
			
				        mnt_drop_write(nd.path.mnt);   
			
				        if (nd.root.mnt)   
			
				            path_put(&nd.root);   
			
				        return filp;   
			
				    }   
			
				    /*  
			
				     * 下面的情况对应目标文件存在  
			
				     */   
			
				    mutex_unlock(&dir->d_inode->i_mutex);   
			
				    audit_inode(pathname, path.dentry);   
			
				    error = -EEXIST;   
			
				    if (flag & O_EXCL)   
			
				        goto exit_dput;   
			
				    /*下面要做一些必要的检查*/   
			
				    if (__follow_mount(&path)) {//检测目标对象上是否挂载了文件系统   
			
				        error = -ELOOP;   
			
				        if (flag & O_NOFOLLOW)   
			
				            goto exit_dput;   
			
				    }   
			
				    error = -ENOENT;   
			
				    if (!path.dentry->d_inode)//检测目标对象的inode是否存在   
			
				        goto exit_dput;   
			
				    if (path.dentry->d_inode->i_op->follow_link)//检测目标对象是否为链接文件   
			
				        goto do_link;   
			
				    /*检查OK，将path保存至nd*/   
			
				    path_to_nameidata(&path, &nd);   
			
				    error = -EISDIR;   
			
				    if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))   
			
				        goto exit;   
			
				ok:   
			
				    /*  
			
				     * Consider:  
			
				     * 1. may_open() truncates a file  
			
				     * 2. a rw->ro mount transition occurs  
			
				     * 3. nameidata_to_filp() fails due to  
			
				     *    the ro mount.  
			
				     * That would be inconsistent, and should  
			
				     * be avoided. Taking this mnt write here  
			
				     * ensures that (2) can not occur.  
			
				     */   
			
				    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);   
			
				    if (will_write) {   
			
				        error = mnt_want_write(nd.path.mnt);   
			
				        if (error)   
			
				            goto exit;   
			
				    }   
			
				    /*may_open()会做一些检测*/   
			
				    error = may_open(&nd.path, acc_mode, flag);   
			
				    if (error) {   
			
				        if (will_write)   
			
				            mnt_drop_write(nd.path.mnt);   
			
				        goto exit;   
			
				    }   
			
				    //执行文件系统定义的打开操作，并保存信息至filp   
			
				    filp = nameidata_to_filp(&nd, open_flag);   
			
				    if (IS_ERR(filp))   
			
				        ima_counts_put(&nd.path,   
			
				                   acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));   
			
				    /*  
			
				     * It is now safe to drop the mnt write  
			
				     * because the filp has had a write taken  
			
				     * on its behalf.  
			
				     */   
			
				    if (will_write)   
			
				        mnt_drop_write(nd.path.mnt);   
			
				    if (nd.root.mnt)   
			
				        path_put(&nd.root);   
			
				    return filp;   
			
				exit_mutex_unlock:   
			
				    mutex_unlock(&dir->d_inode->i_mutex);   
			
				exit_dput:   
			
				    path_put_conditional(&path, &nd);   
			
				exit:   
			
				    if (!IS_ERR(nd.intent.open.file))   
			
				        release_open_intent(&nd);   
			
				exit_parent:   
			
				    if (nd.root.mnt)   
			
				        path_put(&nd.root);   
			
				    path_put(&nd.path);   
			
				    return ERR_PTR(error);   
			
				do_link://目标文件为符号链接的处理，前文已经分析过   
			
				    error = -ELOOP;   
			
				    if (flag & O_NOFOLLOW)   
			
				        goto exit_dput;   
			
				    /*  
			
				     * This is subtle. Instead of calling do_follow_link() we do the  
			
				     * thing by hands. The reason is that this way we have zero link_count  
			
				     * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.  
			
				     * After that we have the parent and last component, i.e.  
			
				     * we are in the same situation as after the first path_walk().  
			
				     * Well, almost - if the last component is normal we get its copy  
			
				     * stored in nd->last.name and we will have to putname() it when we  
			
				     * are done. Procfs-like symlinks just set LAST_BIND.  
			
				     */   
			
				    nd.flags |= LOOKUP_PARENT;   
			
				    error = security_inode_follow_link(path.dentry, &nd);   
			
				    if (error)   
			
				        goto exit_dput;   
			
				    error = __do_follow_link(&path, &nd);   
			
				    if (error) {   
			
				        /* Does someone understand code flow here? Or it is only  
			
				         * me so stupid? Anathema to whoever designed this non-sense  
			
				         * with "intent.open".  
			
				         */   
			
				        release_open_intent(&nd);   
			
				        if (nd.root.mnt)   
			
				            path_put(&nd.root);   
			
				        return ERR_PTR(error);   
			
				    }   
			
				    nd.flags &= ~LOOKUP_PARENT;   
			
				    if (nd.last_type == LAST_BIND)   
			
				        goto ok;   
			
				    error = -EISDIR;   
			
				    if (nd.last_type != LAST_NORM)   
			
				        goto exit;   
			
				    if (nd.last.name[nd.last.len]) {   
			
				        __putname(nd.last.name);   
			
				        goto exit;   
			
				    }   
			
				    error = -ELOOP;   
			
				    if (count++==32) {   
			
				        __putname(nd.last.name);   
			
				        goto exit;   
			
				    }   
			
				    dir = nd.path.dentry;   
			
				    mutex_lock(&dir->d_inode->i_mutex);   
			
				    path.dentry = lookup_hash(&nd);   
			
				    path.mnt = nd.path.mnt;   
			
				    __putname(nd.last.name);   
			
				    goto do_last;   
			
				}

阅读(396) | 评论(0) | 转发(0) |

上一篇：Shell 逻辑运算符、逻辑表达式详解

下一篇：linux内核五大模块

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6