Chinaunix首页 | 论坛 | 博客
  • 博客访问: 2251999
  • 博文数量: 218
  • 博客积分: 5767
  • 博客等级: 大校
  • 技术积分: 5883
  • 用 户 组: 普通用户
  • 注册时间: 2008-03-01 14:44
文章存档

2012年(53)

2011年(131)

2009年(1)

2008年(33)

分类: LINUX

2011-09-14 08:32:47

http://blog.chinaunix.net/space.php?uid=12567959&do=blog&id=161001

open系统调用

open()系统调用的服务例程为sys_open()函数,该函数接收的参数为:要打开的文件的路径名filename、访问模式的一些标志flags,以及如果该文件被创建所需要的许可位掩码mode。如果该系统调用成功,就返回一个文件描述符,也就是指向文件对象的指针数组current->files->fd_array或者current->files->fdt->fd中新分配给文件的索引;否则,服务例程返回一个负的错误码(C库的open()封装函数据此返回-1并设置errno)。

 

open()系统调用的所有标志

---------------------------------------------------------------------

include/asm-generic/fcntl.h

#define O_ACCMODE 00000003

#define O_RDONLY  00000000   /* 为只读而打开 */

#define O_WRONLY  00000001   /* 为只写而打开 */

#define O_RDWR       00000002  /* 为读和写而打开 */

#ifndef O_CREAT /*如果文件不存在则创建它 */

#define O_CREAT      00000100   /* not fcntl */

#endif

#ifndef O_EXCL /* 对于O_CREAT标志,如果文件已经存在,则失败 */

#define O_EXCL       00000200   /* not fcntl */

#endif

#ifndef O_NOCTTY /* 从不把文件看作终端 */

#define O_NOCTTY  00000400   /* not fcntl */

#endif

#ifndef O_TRUNC /* 截断文件(删除所有的现有内容) */

#define O_TRUNC      00001000   /* not fcntl */

#endif

#ifndef O_APPEND /* 总是在文件末尾写 */

#define O_APPEND  00002000

#endif

#ifndef O_NONBLOCK /* 非阻塞打开 */

#define O_NONBLOCK   00004000

#endif

#ifndef O_DSYNC /* 同步写(阻塞,直到物理写终止) */

#define O_DSYNC      00010000   /* used to be O_SYNC, see below */

#endif

#ifndef FASYNC /* 通过信号发出I/O事件通知 */

#define FASYNC       00020000   /* fcntl, for BSD compatibility */

#endif

#ifndef O_DIRECT

#define O_DIRECT  00040000   /* direct disk access hint */

#endif

/* 大型文件(文件长度大于off_t所能表示的范围但小于off64_t*/

#ifndef O_LARGEFILE

#define O_LARGEFILE  00100000

#endif

 

#ifndef O_DIRECTORY /* 如果文件不是一个目录,则失败 */

#define O_DIRECTORY  00200000   /* must be a directory */

#endif

 

#ifndef O_NOFOLLOW /* 不解析路径名尾部的符号链接 */

#define O_NOFOLLOW   00400000   /* don't follow links */

#endif

#ifndef O_NOATIME /*不更新索引节点的访问时间。*/

#define O_NOATIME 01000000

#endif

#ifndef O_CLOEXEC

#define O_CLOEXEC 02000000   /* set close_on_exec */

#endif

---------------------------------------------------------------------

有一些标志的定义是因体系结构而异的。

 

sys_open()定义如下:

---------------------------------------------------------------------

fs/open.c

/*
 * open() system call entry point.
 *
 * Adds O_LARGEFILE to flags when force_o_largefile() is true, then hands
 * the real work to do_sys_open() with AT_FDCWD as the base directory.
 * Returns the value produced by do_sys_open().
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)

{

    long ret;

 

    if (force_o_largefile())

       flags |= O_LARGEFILE;

 

    ret = do_sys_open(AT_FDCWD, filename, flags, mode);

    /* avoid REGPARM breakage on x86: */

    asmlinkage_protect(3, ret, filename, flags, mode);

    return ret;

}

---------------------------------------------------------------------

这个函数的操作如下:

首先,调用force_o_largefile()来判断是否支持大文件,若是,则设置标志的O_LARGEFILE位。force_o_largefile()其实是一个宏。这个宏也是因体系结构而异的。

其次,调用do_sys_open(AT_FDCWD, filename, flags, mode)来完成实际的打开文件的任务。下面有更详细说明。

最后,调用asmlinkage_protect()以使系统调用正确返回。它也是一个宏,为了防止编译器错误而设。其他平台为空,只有x86平台有定义,为:

---------------------------------------------------------------------

arch/x86/include/asm/linkage.h

/*

 * Make sure the compiler doesn't do anything stupid with the

 * arguments on the stack - they are owned by the *caller*, not

 * the callee. This just fools gcc into not spilling into them,

 * and keeps it from doing tailcall recursion and/or using the

 * stack slots for temporaries, since they are live and "used"

 * all the way to the end of the function.

 *

 * NOTE! On x86-64, all the arguments are in registers, so this

 * only matters on a 32-bit kernel.

 */

#define asmlinkage_protect(n, ret, args...) \

    __asmlinkage_protect##n(ret, ##args)

#define __asmlinkage_protect_n(ret, args...) \

    __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)

#define __asmlinkage_protect0(ret) \

    __asmlinkage_protect_n(ret)

#define __asmlinkage_protect1(ret, arg1) \

    __asmlinkage_protect_n(ret, "g" (arg1))

#define __asmlinkage_protect2(ret, arg1, arg2) \

    __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))

#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \

    __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))

---------------------------------------------------------------------

 

do_sys_open()函数定义如下:

---------------------------------------------------------------------

fs/open.c

/*
 * Open the file named by the user-space string filename relative to dfd.
 *
 * Steps: fetch the path with getname(), reserve a descriptor with
 * get_unused_fd_flags(), build the struct file with do_filp_open(), then
 * publish it with fd_install().
 *
 * Returns the new descriptor on success, or the negative error taken from
 * getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)

{

    char *tmp = getname(filename);

    /* Pre-load the return value with the error encoded in tmp, if any. */
    int fd = PTR_ERR(tmp);

 

    if (!IS_ERR(tmp)) {

       fd = get_unused_fd_flags(flags);

       if (fd >= 0) {

           struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);

           if (IS_ERR(f)) {

              /* Open failed: give the reserved descriptor back. */
              put_unused_fd(fd);

              fd = PTR_ERR(f);

           } else {

              fsnotify_open(f->f_path.dentry);

              fd_install(fd, f);

           }

       }

       /* Release the path buffer obtained from getname(). */
       putname(tmp);

    }

    return fd;

}

---------------------------------------------------------------------

第一个参数是多么的眼熟啊,查找路径名的do_path_lookup()函数的第一个参数也是它,用于说明查找路径名的基目录。关于它,前面路径名查找已有说明了。其余参数则是sys_open()将传进来的参数传给了它。

这个函数执行如下操作:

1、调用getname(filename)从进程地址空间读取文件的路径名,将路径名的地址存放在局部变量tmp中。getname(filename)本质上,首先从名为"names_cache"的路径名slab缓存中分配内存区,然后将路径名从用户空间复制到该内存区中。

 

2、调用get_unused_fd_flags (flags)从当前进程的文件描述符表中找一个空位。其定义为:

---------------------------------------------------------------------

include/linux/file.h

#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

---------------------------------------------------------------------

这是一个宏,仅仅是对alloc_fd()函数特殊参数下调用的别名,对alloc_fd()函数定义如下:

---------------------------------------------------------------------

fs/file.c

/*

 * allocate a file descriptor, mark it busy.

 */

/*
 * Allocate a descriptor number in current->files, searching upward from
 * the larger of start and files->next_fd.
 *
 * The whole search and the bitmap updates run under files->file_lock.
 * The chosen bit is set in open_fds; the close_on_exec bit is set or
 * cleared depending on whether flags contains O_CLOEXEC.
 *
 * Returns the descriptor number, or the negative value returned by
 * expand_files().
 */
int alloc_fd(unsigned start, unsigned flags)

{

    struct files_struct *files = current->files;

    unsigned int fd;

    int error;

    struct fdtable *fdt;

 

    spin_lock(&files->file_lock);

repeat:

    /* Re-read the table on every pass; this label is re-entered after expansion. */
    fdt = files_fdtable(files);

    fd = start;

    if (fd < files->next_fd)                            

       fd = files->next_fd;

 

    /* Only scan the bitmap when the starting point lies inside the table. */
    if (fd < fdt->max_fds)

       fd = find_next_zero_bit(fdt->open_fds->fds_bits,

                     fdt->max_fds, fd);

 

    error = expand_files(files, fd);

    if (error < 0)

       goto out;

 

    /*

     * If we needed to expand the fs array we

     * might have blocked - try again.

     */

    if (error)

       goto repeat;

 

    /* Advance the search hint only when this search began at or below it. */
    if (start <= files->next_fd)

       files->next_fd = fd + 1;

 

    FD_SET(fd, fdt->open_fds);

    if (flags & O_CLOEXEC)

       FD_SET(fd, fdt->close_on_exec);

    else

       FD_CLR(fd, fdt->close_on_exec);

    /* From here on "error" carries the successful result. */
    error = fd;

#if 1

    /* Sanity check */

    if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {

       printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);

       rcu_assign_pointer(fdt->fd[fd], NULL);

    }

#endif

 

out:

    spin_unlock(&files->file_lock);

    return error;

}

---------------------------------------------------------------------

这个函数执行如下操作:

a.将起始值start赋给局部变量fd;若它小于current->files->next_fd字段,则将next_fd字段赋值给局部变量fd。

b.调用find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, fd)来在文件描述符表中寻找下一个可以分配的文件描述符。这个函数也是因体系结构而异,系统中通用的函数定义为:

---------------------------------------------------------------------

lib/find_next_bit.c

/*
 * Find the first zero bit in the bitmap at addr, at or after bit offset.
 *
 * addr:   start of the bitmap, as an array of unsigned long words
 * size:   number of valid bits in the bitmap
 * offset: bit index at which to start searching
 *
 * Returns the index of the zero bit found, or size when offset >= size
 * or when no zero bit exists in [offset, size).
 */
unsigned long find_next_zero_bit(const unsigned long *addr,

unsigned long size,   unsigned long offset)

{

    /* p: word containing bit "offset"; result: bit index of that word's bit 0. */
    const unsigned long *p = addr + BITOP_WORD(offset);

    unsigned long result = offset & ~(BITS_PER_LONG-1);

    unsigned long tmp;

 

    if (offset >= size)

        return size;

    /* From here "size" counts the bits remaining from "result" onward. */
    size -= result;

    offset %= BITS_PER_LONG;

    if (offset) {

       /* Partial first word: force the bits below "offset" to 1 so they are skipped. */
       tmp = *(p++);

       tmp |= ~0UL >> (BITS_PER_LONG - offset);

       if (size < BITS_PER_LONG)

           goto found_first;

       if (~tmp)

           goto found_middle;

       size -= BITS_PER_LONG;

       result += BITS_PER_LONG;

    }

    /* Scan whole words while at least BITS_PER_LONG bits remain. */
    while (size & ~(BITS_PER_LONG-1)) {

       if (~(tmp = *(p++)))

           goto found_middle;

       result += BITS_PER_LONG;

       size -= BITS_PER_LONG;

    }

    if (!size)

       return result;

    tmp = *p;

 

found_first:

    /* Final partial word: force the bits at and above "size" to 1. */
    tmp |= ~0UL << size;

    if (tmp == ~0UL)  /* Are any bits zero? */

       return result + size;    /* Nope. */

found_middle:

    return result + ffz(tmp);

}

---------------------------------------------------------------------

(1)、上面的BITOP_WORD(offset)清一色定义为:

#define BITOP_WORD(nr)      ((nr) / BITS_PER_LONG)

首先求得第一个要查找的long的位置。

(2)、offset & ~(BITS_PER_LONG-1)等价于offset整除BITS_PER_LONG(32位系统上为32,64位系统上为64)再乘以BITS_PER_LONG,以此来求得所要查找的第一个字第0位在表中的位置,只不过会比除法运算效率高很多。

(3)、如果传递的起始查找位置甚至大于最大可能值,则返回最大可能值。

(4)、查找第一个0位所在的long型值的位置。而局部变量tmp中会保存该long型量的值。

(5)、返回查找到的第一个0位的位置或可能的最大值。

 

c.调用expand_files(files, fd),来扩展文件描述符表,files_struct的fd_array数组成员可以在打开的文件较少时使用,但当打开的文件较多时,就会对文件描述符表进行扩展。

 

d.更新files->next_fd字段,将分配的文件描述符添加进fdt->open_fds,如果设置了O_CLOEXEC则将文件描述符添加进fdt->close_on_exec,若没有,则清除fdt->close_on_exec中的相应位。

 

e.返回文件描述符。

 

3、调用do_filp_open(dfd, tmp, flags, mode, 0)函数,传递给它的参数依次为查找路径名的基目录、文件路径名、访问模式标志以及许可权位掩码、访问模式位。这个函数定义为:

---------------------------------------------------------------------

fs/namei.c

1761 /*

1762  * Note that the low bits of the passed in "open_flag"

1763  * are not the same as in the local variable "flag". See

1764  * open_to_namei_flags() for more details.

1765  */

1766 struct file *do_filp_open(int dfd, const char *pathname,

1767                 int open_flag, int mode, int acc_mode)

1768 {

1769         struct file *filp;

1770         struct nameidata nd;

1771         int error;

1772         struct path path;

1773         int count = 0;

1774         int flag = open_to_namei_flags(open_flag);

1775         int force_reval = 0;

1776

1777         if (!(open_flag & O_CREAT))

1778                 mode = 0;

1779

1780         /*

1781          * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only

1782          * check for O_DSYNC if the need any syncing at all we enforce it's

1783          * always set instead of having to deal with possibly weird behaviour

1784          * for malicious applications setting only __O_SYNC.

1785          */

1786         if (open_flag & __O_SYNC)

1787                 open_flag |= O_DSYNC;

1788

1789         if (!acc_mode)

1790                 acc_mode = MAY_OPEN | ACC_MODE(open_flag);

1791

1792         /* O_TRUNC implies we need access checks for write permissions */

1793         if (open_flag & O_TRUNC)

1794                 acc_mode |= MAY_WRITE;

1795

1796         /* Allow the LSM permission hook to distinguish append

1797            access from general write access. */

1798         if (open_flag & O_APPEND)

1799                 acc_mode |= MAY_APPEND;

1800

1801         /* find the parent */

1802 reval:

1803         error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);

1804         if (error)

1805                 return ERR_PTR(error);

1806         if (force_reval)

1807                 nd.flags |= LOOKUP_REVAL;

1808

1809         current->total_link_count = 0;

1810         error = link_path_walk(pathname, &nd);

1811         if (error) {

1812                 filp = ERR_PTR(error);

1813                 goto out;

1814         }

1815         if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))

1816                 audit_inode(pathname, nd.path.dentry);

1817

1818         /*

1819          * We have the parent and last component.

1820          */

1821

1822         error = -ENFILE;

1823         filp = get_empty_filp();

1824         if (filp == NULL)

1825                 goto exit_parent;

1826         nd.intent.open.file = filp;

1827         filp->f_flags = open_flag;

1828         nd.intent.open.flags = flag;

1829         nd.intent.open.create_mode = mode;

1830         nd.flags &= ~LOOKUP_PARENT;

1831         nd.flags |= LOOKUP_OPEN;

1832         if (open_flag & O_CREAT) {

1833                 nd.flags |= LOOKUP_CREATE;

1834                 if (open_flag & O_EXCL)

1835                         nd.flags |= LOOKUP_EXCL;

1836         }

1837         if (open_flag & O_DIRECTORY)

1838                 nd.flags |= LOOKUP_DIRECTORY;

1839         if (!(open_flag & O_NOFOLLOW))

1840                 nd.flags |= LOOKUP_FOLLOW;

1841         filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);

1842         while (unlikely(!filp)) { /* trailing symlink */

1843                 struct path holder;

1844                 struct inode *inode = path.dentry->d_inode;

1845                 void *cookie;

1846                 error = -ELOOP;

1847                 /* S_ISDIR part is a temporary automount kludge */

1848                 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))

1849                         goto exit_dput;

1850                 if (count++ == 32)

1851                         goto exit_dput;

1852                 /*

1853                  * This is subtle. Instead of calling do_follow_link() we do

1854                  * the thing by hands. The reason is that this way we have zero

1855                  * link_count and path_walk() (called from ->follow_link)

1856                  * honoring LOOKUP_PARENT.  After that we have the parent and

1857                  * last component, i.e. we are in the same situation as after

1858                  * the first path_walk().  Well, almost - if the last component

1859                  * is normal we get its copy stored in nd->last.name and we will

1860                  * have to putname() it when we are done. Procfs-like symlinks

1861                  * just set LAST_BIND.

1862                  */

1863                 nd.flags |= LOOKUP_PARENT;

1864                 error = security_inode_follow_link(path.dentry, &nd);

1865                 if (error)

1866                         goto exit_dput;

1867                 error = __do_follow_link(&path, &nd, &cookie);

1868                 if (unlikely(error)) {

1869                         /* nd.path had been dropped */

1870                         if (!IS_ERR(cookie) && inode->i_op->put_link)

1871                                 inode->i_op->put_link(path.dentry, &nd, cookie);

1872                         path_put(&path);

1873                         release_open_intent(&nd);

1874                         filp = ERR_PTR(error);

1875                         goto out;

1876                 }

1877                 holder = path;

1878                 nd.flags &= ~LOOKUP_PARENT;

1879                 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);

1880                 if (inode->i_op->put_link)

1881                         inode->i_op->put_link(holder.dentry, &nd, cookie);

1882                 path_put(&holder);

1883         }

1884 out:

1885         if (nd.root.mnt)

1886                 path_put(&nd.root);

1887         if (filp == ERR_PTR(-ESTALE) && !force_reval) {

1888                 force_reval = 1;

1889                 goto reval;

1890         }

1891         return filp;

1892

1893 exit_dput:

1894         path_put_conditional(&path, &nd);

1895         if (!IS_ERR(nd.intent.open.file))

1896                 release_open_intent(&nd);

1897 exit_parent:

1898         path_put(&nd.path);

1899         filp = ERR_PTR(error);

1900         goto out;

1901 }

---------------------------------------------------------------------

这个函数依次执行下列步骤:

a.把访问模式拷贝到flag标志中,但是,用特殊的格式对访问模式标志O_RDONLY、O_WRONLY和O_RDWR进行编码。由函数open_to_namei_flags()完成,其定义如下:

---------------------------------------------------------------------

fs/namei.c

/*

 * Note that while the flag value (low two bits) for sys_open means:

 *  00 - read-only

 *  01 - write-only

 *  10 - read-write

 *  11 - special

 * it is changed into

 *  00 - no permissions needed

 *  01 - read-permission

 *  10 - write-permission

 *  11 - read-write

 * for the internal routines (ie open_namei()/follow_link() etc)

 * This is more logical, and also allows the 00 "no perm needed"

 * to be used for symlinks (where the permissions are checked

 * later).

 *

*/

/*
 * Convert sys_open() flags to the internal namei encoding.
 *
 * Adds one to flag unless (flag + 1) has both O_ACCMODE bits clear,
 * i.e. unless the low two bits of flag are both set. All other bits of
 * flag are returned unchanged in that case.
 */
static inline int open_to_namei_flags(int flag)

{

    if ((flag+1) & O_ACCMODE)

       flag++;

    return flag;

}

---------------------------------------------------------------------

注释中说的已经很清楚了。

根据open_flag原来的值适当更新打开标志open_flag,根据open_flag适当更新访问模式acc_mode

 

b.调用path_init(dfd, pathname, LOOKUP_PARENT, &nd),将查找的路径名的基路径找到,并赋给nd的path字段。注意在这个函数中设置了nd->flags的LOOKUP_PARENT标志,也就是要查找路径名最后一个分量的父目录。

 

c.设置current->total_link_count为0。

 

d.调用link_path_walk(pathname, &nd)查找路径名最后一个分量的父目录的路径path结构体,保存在nd.path中。因为,路径名的最后一个分量有可能是不存在的而需要创建。

 

e.调用get_empty_filp()函数从file对象的slab缓存filp_cachep中分配一个file结构,并初始化它的一些字段。如果返回值为NULL,则返回错误码-ENFILE的指针形式。

 

f.设置nd.intent.open.file为上一步分配的file结构的地址filp,设置filp的打开文件时所指定的标志f_flags为传递进来并经过适当修改的打开标志open_flag。设置nd.intent.open.flags为访问模式flag,设置nd.intent.open.create_mode为创建模式mode。清除nd.flags的LOOKUP_PARENT标志,设置其LOOKUP_OPEN标志。如果在打开标志中设置了O_CREAT,则设置nd.flags的LOOKUP_CREATE标志,若同时设置了打开标志的O_EXCL,则同时设置nd.flags的LOOKUP_EXCL。若设置了打开标志的O_DIRECTORY,则设置nd.flags的LOOKUP_DIRECTORY。若没有设置打开标志的O_NOFOLLOW位,则设置nd.flags的LOOKUP_FOLLOW。

即是根据打开标志来设置nd.flags的相应位。这些标志似乎都只与查找的路径名的最后一个分量有关。

 

g.调用do_last()来完成路径名最后一个分量的处理。又是一个非常长的函数,它接受六个参数,nd为前面查找的路径名最后一个分量的父目录的nameidata结构,path为一个path结构的局部变量,打开标志,acc_mode,创建模式(如果需要的话)mode和路径名。该函数定义如下:

---------------------------------------------------------------------

fs/namei.c

1617 static struct file *do_last(struct nameidata *nd, struct path *path,

1618                             int open_flag, int acc_mode,

1619                             int mode, const char *pathname)

1620 {

1621         struct dentry *dir = nd->path.dentry;

1622         struct file *filp;

1623         int error = -EISDIR;

1624

1625         switch (nd->last_type) {

1626         case LAST_DOTDOT:

1627                 follow_dotdot(nd);

1628                 dir = nd->path.dentry;

1629         case LAST_DOT:

1630                 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {

1631                         if (!dir->d_op->d_revalidate(dir, nd)) {

1632                                 error = -ESTALE;

1633                                 goto exit;

1634                         }

1635                 }

1636                 /* fallthrough */

1637         case LAST_ROOT:

1638                 if (open_flag & O_CREAT)

1639                         goto exit;

1640                 /* fallthrough */

1641         case LAST_BIND:

1642                 audit_inode(pathname, dir);

1643                 goto ok;

1644         }

1645

1646         /* trailing slashes? */

1647         if (nd->last.name[nd->last.len]) {

1648                 if (open_flag & O_CREAT)

1649                         goto exit;

1650                 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;

1651         }

1652

1653         /* just plain open? */

1654         if (!(open_flag & O_CREAT)) {

1655                 error = do_lookup(nd, &nd->last, path);

1656                 if (error)

1657                         goto exit;

1658                 error = -ENOENT;

1659                 if (!path->dentry->d_inode)

1660                         goto exit_dput;

1661                 if (path->dentry->d_inode->i_op->follow_link)

1662                         return NULL;

1663                 error = -ENOTDIR;

1664                 if (nd->flags & LOOKUP_DIRECTORY) {

1665                         if (!path->dentry->d_inode->i_op->lookup)

1666                                 goto exit_dput;

1667                 }

1668                 path_to_nameidata(path, nd);

1669                 audit_inode(pathname, nd->path.dentry);

1670                 goto ok;

1671         }

1672

1673         /* OK, it's O_CREAT */

1674         mutex_lock(&dir->d_inode->i_mutex);

1675

1676         path->dentry = lookup_hash(nd);

1677         path->mnt = nd->path.mnt;

1678

1679         error = PTR_ERR(path->dentry);

1680         if (IS_ERR(path->dentry)) {

1681                 mutex_unlock(&dir->d_inode->i_mutex);

1682                 goto exit;

1683         }

1684

1685         if (IS_ERR(nd->intent.open.file)) {

1686                 error = PTR_ERR(nd->intent.open.file);

1687                 goto exit_mutex_unlock;

1688         }

1689

1690         /* Negative dentry, just create the file */

1691         if (!path->dentry->d_inode) {

1692                 /*

1693                  * This write is needed to ensure that a

1694                  * ro->rw transition does not occur between

1695                  * the time when the file is created and when

1696                  * a permanent write count is taken through

1697                  * the 'struct file' in nameidata_to_filp().

1698                  */

1699                 error = mnt_want_write(nd->path.mnt);

1700                 if (error)

1701                         goto exit_mutex_unlock;

1702                 error = __open_namei_create(nd, path, open_flag, mode);

1703                 if (error) {

1704                         mnt_drop_write(nd->path.mnt);

1705                         goto exit;

1706                 }

1707                 filp = nameidata_to_filp(nd);

1708                 mnt_drop_write(nd->path.mnt);

1709                 if (!IS_ERR(filp)) {

1710                         error = ima_file_check(filp, acc_mode);

1711                         if (error) {

1712                                 fput(filp);

1713                                 filp = ERR_PTR(error);

1714                         }

1715                 }

1716                 return filp;

1717         }

1718

1719         /*

1720          * It already exists.

1721          */

1722         mutex_unlock(&dir->d_inode->i_mutex);

1723         audit_inode(pathname, path->dentry);

1724

1725         error = -EEXIST;

1726         if (open_flag & O_EXCL)

1727                 goto exit_dput;

1728

1729         if (__follow_mount(path)) {

1730                 error = -ELOOP;

1731                 if (open_flag & O_NOFOLLOW)

1732                         goto exit_dput;

1733         }

1734

1735         error = -ENOENT;

1736         if (!path->dentry->d_inode)

1737                 goto exit_dput;

1738

1739         if (path->dentry->d_inode->i_op->follow_link)

1740                 return NULL;

1741

1742         path_to_nameidata(path, nd);

1743         error = -EISDIR;

1744         if (S_ISDIR(path->dentry->d_inode->i_mode))

1745                 goto exit;

1746 ok:

1747         filp = finish_open(nd, open_flag, acc_mode);

1748         return filp;

1749

1750 exit_mutex_unlock:

1751         mutex_unlock(&dir->d_inode->i_mutex);

1752 exit_dput:

1753         path_put_conditional(path, nd);

1754 exit:

1755         if (!IS_ERR(nd->intent.open.file))

1756                 release_open_intent(nd);

1757         path_put(&nd->path);

1758         return ERR_PTR(error);

1759 }

---------------------------------------------------------------------

再对这些参数进行一下说明:nd指向的nameidata的path中存放的是路径名最后一个分量的父目录的路径path,其last字段中存放的是路径名最后一个分量的名字的信息qstr结构。

(1)这个函数首先根据路径名的最后一个分量的名字信息,来采取一些动作。

如果最后一个分量是“..”则调用follow_dotdot(nd)返回上一级目录,并设置局部变量dir为nd->path.dentry。

若最后一个分量是“.”,则检查nd->path.mnt->mnt_sb->s_type->fs_flags的FS_REVAL_DOT标志,若设置了该标志,则调用目录项的dir->d_op->d_revalidate(dir, nd)方法,若该方法失败,则释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-ESTALE。

若最后一个分量为根目录,且打开标志设置了O_CREAT,则释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-EISDIR。

若为LAST_BIND类型(即类似procfs的符号链接),则调用finish_open()来完成最后的打开文件操作,并返回file结构指针filp。finish_open()函数稍后解释。

这一步中处理那些最后一个分量的路径已经获得并保存在nd->path中或者最后一个分量为符号链接的情况。同时我们也可以看到,是可以直接使用open来打开目录的,但是不能创建目录。

 

(2)、若路径名的最后一个分量是以“/”结尾(这根据nd->last.name[nd->last.len]的值来判断,在link_path_walk()函数中求出nd->last值的相关部分可以看出,若以“/”结尾,则nd->last.name[nd->last.len]的值正是字符'/')的,则检查打开标志是否设置了O_CREAT,若是则释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-EISDIR;若没有,则设置nd查找标志nd->flags的LOOKUP_DIRECTORY和LOOKUP_FOLLOW位。

 

(3)、若打开标志没有设置O_CREAT,则调用do_lookup(nd, &nd->last, path)来完成路径名中最后一个分量路径的查找。

若返回错误码,则释放先前分配的file结构,减少nd->path的引用计数,并返回该错误码。

若查找的结果path->dentry->d_inode为NULL,则调用path_put_conditional(path, nd)来释放查找到的path->dentry,若最后一个分量表示的是挂载点则还要释放path->mnt。释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-ENOENT。

若最后一个分量表示的是符号链接,则返回NULL。

若设置了查找标志的LOOKUP_DIRECTORY位,则还要判断找到的是否为一个目录(通过检查path->dentry->d_inode->i_op->lookup),若不是目录,则调用path_put_conditional(path, nd)来释放查找到的path->dentry,若最后一个分量表示的是挂载点则还要释放path->mnt。释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-ENOTDIR。

调用path_to_nameidata(path, nd)将使得nd->path中保存有路径名最后一个分量的路径。完成审计信息记录。

调用finish_open()来完成最后的打开文件操作,并返回file结构指针filp。finish_open()函数稍后解释。

 

(4)、打开标志设置了O_CREAT,若文件不存在则要创建的情况。首先要对父目录的inode上锁(mutex_lock(&dir->d_inode->i_mutex)),调用lookup_hash(nd)在目录项缓存中查找或者创建路径名最后一个分量的目录项。其定义为:

---------------------------------------------------------------------

fs/namei.c

1122 static struct dentry *__lookup_hash(struct qstr *name,

1123                 struct dentry *base, struct nameidata *nd)

1124 {

1125         struct dentry *dentry;

1126         struct inode *inode;

1127         int err;

1128

1129         inode = base->d_inode;

1130

1131         /*

1132          * See if the low-level filesystem might want

1133          * to use its own hash..

1134          */

1135         if (base->d_op && base->d_op->d_hash) {

1136                 err = base->d_op->d_hash(base, name);

1137                 dentry = ERR_PTR(err);

1138                 if (err < 0)

1139                         goto out;

1140         }

1141

1142         dentry = __d_lookup(base, name);

1143

1144         /* lockess __d_lookup may fail due to concurrent d_move()

1145          * in some unrelated directory, so try with d_lookup

1146          */

1147         if (!dentry)

1148                 dentry = d_lookup(base, name);

1149

1150         if (dentry && dentry->d_op && dentry->d_op->d_revalidate)

1151                 dentry = do_revalidate(dentry, nd);

1152

1153         if (!dentry) {

1154                 struct dentry *new;

1155

1156                 /* Don't create child dentry for a dead directory. */

1157                 dentry = ERR_PTR(-ENOENT);

1158                 if (IS_DEADDIR(inode))

1159                         goto out;

1160

1161                 new = d_alloc(base, name);

1162                 dentry = ERR_PTR(-ENOMEM);

1163                 if (!new)

1164                         goto out;

1165                 dentry = inode->i_op->lookup(inode, new, nd);

1166                 if (!dentry)

1167                         dentry = new;

1168                 else

1169                         dput(new);

1170         }

1171 out:

1172         return dentry;

1173 }

 

1175 /*

1176  * Restricted form of lookup. Doesn't follow links, single-component only,

1177  * needs parent already locked. Doesn't follow mounts.

1178  * SMP-safe.

1179  */

1180 static struct dentry *lookup_hash(struct nameidata *nd)

1181 {

1182         int err;

1183

1184         err = exec_permission(nd->path.dentry->d_inode);

1185         if (err)

1186                 return ERR_PTR(err);

1187         return __lookup_hash(&nd->last, nd->path.dentry, nd);

1188 }

---------------------------------------------------------------------

lookup_hash(nd)函数当在目录项缓存中没有找到要找的目录项时,会分配目录项,并且会调用父目录的inode->i_op->lookup(inode, new, nd)方法来创建所要查找的文件的inode等信息,并设置目录项的适当字段。但是在目录中没有所要查找的文件时,lookup(inode, new, nd)方法并不返回错误。

lookup_hash(nd)函数返回的结果被赋给path->dentry,path用来存放路径名最后一个分量的path结构。

初始化path->mnt为父目录的vfsmount对象。

 

(5)、检查path->dentry是否是一个错误码的指针形式,若是则对父目录的inode解锁(mutex_unlock(&dir->d_inode->i_mutex)),减少nd->path的引用计数,并返回该错误码。

 

(6)、检查nd->intent.open.file是否包含一个错误码,若是则首先对父目录的inode解锁,接着调用path_put_conditional(path, nd)来释放由lookup_hash(nd)查找到或创建的path->dentry,若最后一个分量表示的是挂载点则还要释放path->mnt。减少nd->path的引用计数,并返回nd->intent.open.file中所包含的错误码(即PTR_ERR(nd->intent.open.file))。

 

(7)、检查path->dentry->d_inode是否为NULL,若是,则说明要创建一个文件。

首先调用mnt_want_write(nd->path.mnt),来确保在创建文件和在nameidata_to_filp()中通过struct file取得固定的写计数之间不会发生ro -> rw的转换。这个函数本质上增加vfsmount对象的写者计数器mnt->mnt_writers。

 

调用__open_namei_create(nd, path, open_flag, mode)函数来创建一个文件,这个函数定义为:

---------------------------------------------------------------------

fs/namei.c

1502 /*

1503  * Be careful about ever adding any more callers of this

1504  * function.  Its flags must be in the namei format, not

1505  * what get passed to sys_open().

1506  */

1507 static int __open_namei_create(struct nameidata *nd, struct path *path,

1508                                 int open_flag, int mode)

1509 {

1510         int error;

1511         struct dentry *dir = nd->path.dentry;

1512

1513         if (!IS_POSIXACL(dir->d_inode))

1514                 mode &= ~current_umask();

1515         error = security_path_mknod(&nd->path, path->dentry, mode, 0);

1516         if (error)

1517                 goto out_unlock;

1518         error = vfs_create(dir->d_inode, path->dentry, mode, nd);

1519 out_unlock:

1520         mutex_unlock(&dir->d_inode->i_mutex);

1521         dput(nd->path.dentry);

1522         nd->path.dentry = path->dentry;

1523         if (error)

1524                 return error;

1525         /* Don't check for write permission, don't truncate */

1526         return may_open(&nd->path, 0, open_flag & ~O_TRUNC);

1527 }

---------------------------------------------------------------------

__open_namei_create()函数在执行了访问权限检查后,就通过vfs_create()调用父目录inode的create方法dir->i_op->create(dir, dentry, mode, nd)来创建文件。之后,__open_namei_create()解除对于父目录inode的锁定,释放父目录目录项,并将路径名最后一个分量目录项path->dentry赋给nd->path.dentry。然后返回对may_open(&nd->path, 0, open_flag & ~O_TRUNC)调用的返回值。

 

调用nameidata_to_filp(nd)来将一个nameidata转换为一个打开的filp,这个函数本质上主要调用__dentry_open(nd->path.dentry, nd->path.mnt, filp,     NULL, cred)来根据当前进程的状态和获得的目录项来设置nd->intent.open.file所指向的file结构的各字段。

 

调用mnt_drop_write(nd->path.mnt)来减少nd->path.mnt写者计数器的值。

 

返回filp

 

(8)、尽管设置了打开标志的O_CREAT,但是却找到了所需的文件。则首先对父目录inode解锁。检查打开标志是否设置了O_EXCL,若是,则调用path_put_conditional(path, nd)来释放查找到的path->dentry,若最后一个分量表示的是挂载点则还要释放path->mnt。释放先前分配的file结构,减少nd->path的引用计数,并返回错误码-EEXIST

 

调用__follow_mount(path) 找到挂载在本路径上的文件系统,即vfsmount对象的地址和目录项对象地址。

 

调用finish_open()来完成打开操作。

 

(9)、返回filp

 

h.若最后一个分量是一个符号链接,则追踪符号链接。

 

i、返回filp

 

4、将文件安装在fd数组中。

 

5、释放路径名所占用的临时内存空间tmp

 

6、返回文件描述符fd


close系统调用

close()系统调用

程序关闭打开的文件使用close()系统调用,它接收的参数为要关闭文件的文件描述符fd。sys_close()服务例程定义如下:

---------------------------------------------------------------------

fs/open.c

/*

 * Careful here! We test whether the file pointer is NULL before

 * releasing the fd. This ensures that one clone task can't release

 * an fd while another clone is opening it.

 */

/*
 * close() system call entry point.
 *
 * Under files->file_lock: checks that fd is inside the descriptor table
 * and that its slot holds a file, clears the slot and the close_on_exec
 * bit, and releases the descriptor number with __put_unused_fd(). The
 * lock is dropped before filp_close() is called.
 *
 * Returns -EBADF for an out-of-range or empty slot; otherwise the result
 * of filp_close(), with the four restart codes converted to -EINTR.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)

{

    struct file * filp;

    struct files_struct *files = current->files;

    struct fdtable *fdt;

    int retval;

 

    spin_lock(&files->file_lock);

    fdt = files_fdtable(files);

    if (fd >= fdt->max_fds)

       goto out_unlock;

    filp = fdt->fd[fd];

    if (!filp)

       goto out_unlock;

    /* Empty the slot first, then drop the descriptor's bitmap bits. */
    rcu_assign_pointer(fdt->fd[fd], NULL);

    FD_CLR(fd, fdt->close_on_exec);

    __put_unused_fd(files, fd);

    spin_unlock(&files->file_lock);

    retval = filp_close(filp, files);

 

    /* can't restart close syscall because file table entry was cleared */

    if (unlikely(retval == -ERESTARTSYS ||

            retval == -ERESTARTNOINTR ||

            retval == -ERESTARTNOHAND ||

            retval == -ERESTART_RESTARTBLOCK))

       retval = -EINTR;

 

    return retval;

 

out_unlock:

    spin_unlock(&files->file_lock);

    return -EBADF;

}

---------------------------------------------------------------------

sys_close()服务例程执行下列操作:

1、若fd不小于current->files->fdt->max_fds,则返回-EBADF;否则获得存放在当前进程current->files->fdt->fd[fd]中的文件对象,如果它为NULL,同样返回-EBADF。

 

2、把current->files->fdt->fd[fd]置为NULL。释放文件描述符fd,这是通过清除current->files->fdt->close_on_exec字段相应的位及调用__put_unused_fd(files, fd)函数来进行的。__put_unused_fd()定义如下:

---------------------------------------------------------------------

fs/open.c

/*
 * Mark descriptor fd as free in files: clear its bit in open_fds and
 * lower files->next_fd to fd when fd is smaller than the current value.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)

{

    struct fdtable *fdt = files_fdtable(files);

    __FD_CLR(fd, fdt->open_fds);

    if (fd < files->next_fd)

       files->next_fd = fd;

}

---------------------------------------------------------------------

这个函数清除current->files->fdt->open_fds字段相应的位。然后对比fd和files->next_fd,如果前者更小,则更新后者为前者。由此可见files_struct结构的next_fd中存放的是文件描述符表中可用的最小的文件描述符。这个字段,一来可以为文件描述符的快速分配提供支持,二来则有助于缩小所需搜索的可用的文件描述符的范围。

 

3、调用filp_close(),该函数定义如下:

---------------------------------------------------------------------

fs/open.c

/*

 * "id" is the POSIX thread ID. We use the

 * files pointer for this..

 */

/*
 * Close one reference to filp on behalf of owner id.
 *
 * Logs an error and returns 0 when the file's reference count is already
 * zero. Otherwise calls the file's flush method if one is defined, then
 * dnotify_flush(), locks_remove_posix() and fput().
 *
 * Returns the value from the flush method, or 0 when there is none.
 */
int filp_close(struct file *filp, fl_owner_t id)

{

    int retval = 0;

 

    if (!file_count(filp)) {

       printk(KERN_ERR "VFS: Close: file count is 0\n");

       return 0;

    }

 

    if (filp->f_op && filp->f_op->flush)

       retval = filp->f_op->flush(filp, id);

 

    dnotify_flush(filp, id);

    locks_remove_posix(filp, id);

    fput(filp);

    return retval;

}

---------------------------------------------------------------------

该函数执行下列操作:

a.判断文件的引用计数是否为0,若是,则返回0

b.调用文件操作的flush方法(如果已定义)。

c.调用dnotify_flush(filp, id),并调用locks_remove_posix(filp, id)释放文件上属于id所标识的属主的POSIX锁。参见后面“文件加锁”部分。

d.调用fput(filp)释放文件对象,该函数定义为:

---------------------------------------------------------------------

fs/file_table.c

/* __fput is called from task context when aio completion releases the last

 * last use of a struct file *.  Do not use otherwise.

 */

/*
 * Tear down a struct file.
 *
 * Runs the close notifications and lock cleanup, calls the file's fasync
 * (only when FASYNC is set in f_flags) and release methods if defined,
 * drops the references the file held, and frees it with file_free().
 * The dentry and vfsmount references are dropped last.
 */
void __fput(struct file *file)

{

    /* Saved up front: f_path is cleared and the file freed before these are dropped. */
    struct dentry *dentry = file->f_path.dentry;

    struct vfsmount *mnt = file->f_path.mnt;

    struct inode *inode = dentry->d_inode;

 

    might_sleep();

 

    fsnotify_close(file);

    /*

     * The function eventpoll_release() should be the first called

     * in the file cleanup chain.

     */

    eventpoll_release(file);

    locks_remove_flock(file);

 

    if (unlikely(file->f_flags & FASYNC)) {

       if (file->f_op && file->f_op->fasync)

           file->f_op->fasync(-1, file, 0);

    }

    if (file->f_op && file->f_op->release)

       file->f_op->release(inode, file);

    security_file_free(file);

    ima_file_free(file);

    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))

       cdev_put(inode->i_cdev);

    fops_put(file->f_op);

    put_pid(file->f_owner.pid);

    file_kill(file);

    if (file->f_mode & FMODE_WRITE)

       drop_file_write_access(file);

    file->f_path.dentry = NULL;

    file->f_path.mnt = NULL;

    file_free(file);

    dput(dentry);

    mntput(mnt);

}

 

/*
 * Drop one reference to file: atomically decrement f_count and call
 * __fput() when the decrement-and-test reports that it reached zero.
 */
void fput(struct file *file)

{

    if (atomic_long_dec_and_test(&file->f_count))

       __fput(file);

}

---------------------------------------------------------------------

在这个函数中,监测与文件有关的file对象、目录项对象及vfsmount对象的引用计数,若引用计数为0,且条件合适,则将它们归还给相应的slab缓存。

 

4、返回0,或一个出错码。出错码可由flush方法或文件中的前一个写操作错误产生。
阅读(2039) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~