Chinaunix首页 | 论坛 | 博客
  • 博客访问: 509030
  • 博文数量: 80
  • 博客积分: 1475
  • 博客等级: 上尉
  • 技术积分: 1047
  • 用 户 组: 普通用户
  • 注册时间: 2010-04-01 22:58
文章分类

全部博文(80)

文章存档

2012年(3)

2010年(77)

我的朋友

分类: LINUX

2010-05-09 14:57:45

一:前言
在用户空间中,建立目录所用的API为mkdir().它在内核中的系统调用入口是sys_mkdir().今天跟踪一下这个
函数,来分析linux文件系统中目录的建立过程.
二:sys_mkdir()
sys_mkdir()对应的代码如下:
/*
 * sys_mkdir - system-call entry point reached from the user-space mkdir().
 * @pathname: user-space pointer to the path of the directory to create
 * @mode:     requested permission bits
 *
 * Returns 0 on success, or the negative errno produced by whichever step
 * failed (getname, path_lookup, lookup_create or vfs_mkdir).
 */
asmlinkage long sys_mkdir(const char __user * pathname, int mode)
{
     struct nameidata nd;
     struct dentry *child;
     char *name;
     int error;

     // Fetch the path string referred to by the user-space pointer.
     name = getname(pathname);
     if (IS_ERR(name)) {
          error = PTR_ERR(name);
          return error;
     }

     // Resolve the path only as far as the parent directory (LOOKUP_PARENT).
     error = path_lookup(name, LOOKUP_PARENT, &nd);
     if (error)
          goto out_putname;

     // Look up (or allocate) the dentry of the last path component.
     child = lookup_create(&nd, 1);
     if (IS_ERR(child)) {
          error = PTR_ERR(child);
     } else {
          // Apply the process umask unless the parent inode is flagged POSIX-ACL.
          if (!IS_POSIXACL(nd.dentry->d_inode))
               mode &= ~current->fs->umask;
          // Hand over to the VFS layer, which calls the filesystem-specific mkdir.
          error = vfs_mkdir(nd.dentry->d_inode, child, mode);
          // Drop the dentry reference obtained from lookup_create().
          dput(child);
     }

     // Release the parent directory's i_sem in both the success and failure case.
     up(&nd.dentry->d_inode->i_sem);
     // Release what path_lookup() stored in nd.
     path_release(&nd);

out_putname:
     putname(name);
     return error;
}
这个函数里面有几个重要的子函数. path_lookup()在前一篇文章中已经分析过了.如果不太了解,请参阅相关的部份.
lookup_create()的代码如下:
/*
 * Body of lookup_create(); the article omits the signature line.  Judging
 * from the call lookup_create(&nd, 1) in sys_mkdir() and the names used
 * below, it takes (struct nameidata *nd, int is_dir) and returns a
 * struct dentry * -- TODO confirm against the kernel tree being discussed.
 *
 * Takes the parent directory's i_sem, then looks up the last path component
 * (nd->last) under nd->dentry.  Returns the dentry, or an ERR_PTR() value.
 * i_sem is not released on any return path shown here.
 */
{
     struct dentry *dentry;
 
     // Take the parent directory's semaphore to guard against concurrent operations
     down(&nd->dentry->d_inode->i_sem);
     dentry = ERR_PTR(-EEXIST);
     // Bail out with -EEXIST when the last component is not of type LAST_NORM
     // (the original note calls this "the earlier lookup failed" -- presumably
     // it covers names such as "." or ".."; confirm against path_lookup())
     if (nd->last_type != LAST_NORM)
         goto fail;
 
     // Clear the LOOKUP_PARENT flag
     nd->flags &= ~LOOKUP_PARENT;
     // Look for the matching dentry in the cache; if there is none, a new one is created
     dentry = lookup_hash(&nd->last, nd->dentry);
     // Lookup or creation failed
     if (IS_ERR(dentry))
         goto fail;
     // Not creating a directory, the byte following the name is not NUL,
     // and the dentry has no inode yet: fail with -ENOENT
     if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
         goto enoent;
     return dentry;
enoent:
     // Drop the reference we just obtained before reporting -ENOENT
     dput(dentry);
     dentry = ERR_PTR(-ENOENT);
fail:
     return dentry;
}
lookup_hash() → __lookup_hash():
 
static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
{
     struct dentry * dentry;
     struct inode *inode;
     int err;
 
     inode = base->d_inode;
     //检查是否有相关的权限
     err = permission(inode, MAY_EXEC, nd);
     dentry = ERR_PTR(err);
     if (err)
         goto out;
 
     /*
      * See if the low-level filesystem might want
      * to use its own hash..
      */
      //如果自定义了hash计算
     if (base->d_op && base->d_op->d_hash) {
         err = base->d_op->d_hash(base, name);
         dentry = ERR_PTR(err);
         if (err < 0)
              goto out;
     }
 
     //从缓存中寻找
     dentry = cached_lookup(base, name, nd);
     if (!dentry) {
         //如果缓存中没有相关项。则新建之
         struct dentry *new = d_alloc(base, name);
         dentry = ERR_PTR(-ENOMEM);
         if (!new)
              goto out;
         //到具体的文件系统中查找
         dentry = inode->i_op->lookup(inode, new, nd);
         if (!dentry)
              dentry = new;
         else
              dput(new);
     }
out:
     return dentry;
}
值得注意的是:经过上述的操作,返回的dentry有可能是原本就存在的.对这种情况是怎么排除的呢?继续看sys_mkdir()的另一个子函数:
int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
     //对异常情况的排除和权限的检查
     int error = may_create(dir, dentry, NULL);
 
     if (error)
         return error;
 
     //如果父结点不允许mkdir操作
     if (!dir->i_op || !dir->i_op->mkdir)
         return -EPERM;
 
     mode &= (S_IRWXUGO|S_ISVTX);
     error = security_inode_mkdir(dir, dentry, mode);
     if (error)
         return error;
 
     DQUOT_INIT(dir);
     //调用父结点的mkdir操作
     error = dir->i_op->mkdir(dir, dentry, mode);
    
     if (!error) {
         //如果成功,通告与之关联的进程
         inode_dir_notify(dir, DN_CREATE);
         security_inode_post_mkdir(dir,dentry, mode);
     }
     return error;
}
在这里看到,最终会调用父目录inode的i_op->mkdir操作.另外,对于上面说的相应结点已经存在的情况是在may_create()中检测的:
/*
 * may_create - check whether @child may be created inside @dir.
 * @dir:   inode of the parent directory
 * @child: dentry of the entry about to be created
 * @nd:    lookup context handed through to permission()
 *
 * Returns -EEXIST when @child already has an inode, -ENOENT when
 * IS_DEADDIR(@dir) holds, and otherwise the result of the
 * MAY_WRITE | MAY_EXEC permission check on @dir.
 */
static inline int may_create(struct inode *dir, struct dentry *child,
                   struct nameidata *nd)
{
     int status;

     if (child->d_inode) {
          // A newly created dentry has no inode; one being present means the name exists.
          status = -EEXIST;
     } else if (IS_DEADDIR(dir)) {
          // The parent directory is no longer valid.
          status = -ENOENT;
     } else {
          status = permission(dir, MAY_WRITE | MAY_EXEC, nd);
     }

     return status;
}
mkdir的大体架构就如此了.下面讨论一下rootfs和ext2中的目录创建.
 
三:rootfs的目录创建
在前一篇文章分析到.挂载rootfs时,对文件系统根目录的inode.i_op赋值如下:
/*
 * Inode operations for ramfs directories.  Per the surrounding article this
 * table is assigned to i_op of the rootfs root directory inode at mount time.
 * Only create, symlink, mkdir and mknod are ramfs-specific; the remaining
 * entries point at the generic simple_* helpers.
 */
static struct inode_operations ramfs_dir_inode_operations = {
     .create       = ramfs_create,
     .lookup       = simple_lookup,
     .link         = simple_link,
     .unlink       = simple_unlink,
     .symlink = ramfs_symlink,
     .mkdir        = ramfs_mkdir,
     .rmdir        = simple_rmdir,
     .mknod        = ramfs_mknod,
     .rename       = simple_rename,
};
对应的mkdir操作入口是ramfs_mkdir():
/*
 * ramfs_mkdir - mkdir operation for ramfs directories.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the directory to create
 * @mode:   permission bits; S_IFDIR is OR-ed in before the node is created
 *
 * Returns 0 on success or the error from ramfs_mknod().
 */
static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
{
     int err;

     // Create the node as a directory.
     err = ramfs_mknod(dir, dentry, S_IFDIR | mode, 0);
     if (err)
          return err;

     // Creation succeeded: bump the parent's link count.
     dir->i_nlink += 1;
     return 0;
}
ramfs_mknod()的代码如下:
/*
 * ramfs_mknod - create a ramfs inode and attach it to @dentry.
 * @dir:    inode of the parent directory
 * @dentry: dentry that will refer to the new inode
 * @mode:   file type and permission bits of the new inode
 * @dev:    device number, passed through to ramfs_get_inode()
 *
 * Returns 0 on success, or -ENOSPC when no inode could be obtained.
 */
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
     // Obtain an inode from the filesystem.
     struct inode *node = ramfs_get_inode(dir->i_sb, mode, dev);

     if (!node)
          return -ENOSPC;

     // A set-gid parent passes its group on; new directories also inherit the bit.
     if (dir->i_mode & S_ISGID) {
          node->i_gid = dir->i_gid;
          if (S_ISDIR(mode))
               node->i_mode |= S_ISGID;
     }

     // Tie the dentry to the freshly obtained inode.
     d_instantiate(dentry, node);
     // Extra reference: pins the dentry in core.
     dget(dentry);
     return 0;
}
这个函数中的子函数我们都在前面已经分析过.请自行查阅本站的其它文档.其操作非常简单。就是分配一个inode。然后将inode 与dentry建立关联.因为rootfs是一个基于RAM的文件系统。其inode的分配就是在内存中创建一个inode空间,然后为其各项操作赋值而已.
 
四:ext2中的目录创建
经过上一章的分析可以看到.ext2文件系统根目录的inode.i_op被赋值为ext2_dir_inode_operations.其结构如下所示:
struct inode_operations ext2_dir_inode_operations = {
     .create       = ext2_create,
     .lookup       = ext2_lookup,
     .link         = ext2_link,
     .unlink       = ext2_unlink,
     .symlink = ext2_symlink,
     .mkdir        = ext2_mkdir,
     .rmdir        = ext2_rmdir,
     .mknod        = ext2_mknod,
     .rename       = ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
     .setxattr = generic_setxattr,
     .getxattr = generic_getxattr,
     .listxattr    = ext2_listxattr,
     .removexattr  = generic_removexattr,
#endif
     .setattr = ext2_setattr,
     .permission   = ext2_permission,
}
mkdir对应的入口为ext2_mkdir().代码如下:
/*
 * ext2_mkdir - mkdir operation for ext2 directories.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the directory to create
 * @mode:   permission bits; S_IFDIR is OR-ed in when the inode is allocated
 *
 * Returns 0 on success.  Returns -EMLINK when the parent already has
 * EXT2_LINK_MAX links, otherwise the error of the helper that failed.
 * On failure the count changes made here are undone in reverse order.
 */
static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
{
     struct inode * inode;
     int err = -EMLINK;
 
     // Refuse when the parent's link count has reached EXT2_LINK_MAX
     if (dir->i_nlink >= EXT2_LINK_MAX)
         goto out;
 
     // Bump the parent's count and mark it dirty (original note; presumably
     // this is the link count checked just above -- confirm in ext2_inc_count())
     ext2_inc_count(dir);
 
     // Allocate an inode in the file system
     inode = ext2_new_inode (dir, S_IFDIR | mode);
     err = PTR_ERR(inode);
     if (IS_ERR(inode))
         goto out_dir;
 
     // Install the directory operation tables on the new inode
     inode->i_op = &ext2_dir_inode_operations;
     inode->i_fop = &ext2_dir_operations;
 
     // Choose the address-space operations for the inode's i_mapping
     if (test_opt(inode->i_sb, NOBH))
         inode->i_mapping->a_ops = &ext2_nobh_aops;
     else
         inode->i_mapping->a_ops = &ext2_aops;
 
     // Bump the new inode's count and mark it dirty (original note)
     ext2_inc_count(inode);
 
     // Initialise the new directory's contents
     err = ext2_make_empty(inode, dir);
     if (err)
         goto out_fail;
 
     // Update the parent directory: add an entry for the new inode
     err = ext2_add_link(dentry, inode);
     if (err)
         goto out_fail;
 
     // Tie the dentry to the inode
     d_instantiate(dentry, inode);
out:
     return err;
 
out_fail:
     // Two decrements on the new inode, then drop the reference to it
     ext2_dec_count(inode);
     ext2_dec_count(inode);
     iput(inode);
out_dir:
     // Undo the ext2_inc_count(dir) done at the top
     ext2_dec_count(dir);
     goto out;
}
逐个分析上面所涉及到的子函数.
在ext2中分配一个inode是由ext2_new_inode()完成的.它的代码如下:
 
/*
 * ext2_new_inode - allocate a new ext2 inode for an object created in @dir.
 * @dir:  parent directory; supplies the super block, and its gid and flags
 *        are inherited as coded below
 * @mode: file type and permission bits of the new inode
 *
 * Picks a starting block group, claims a free bit in a group's inode bitmap,
 * updates the free-inode and directory counters, then fills in the in-memory
 * inode.  Returns the inode, or an ERR_PTR() value: -ENOMEM, -ENOSPC, -EIO,
 * or the error from ext2_init_acl().
 */
struct inode *ext2_new_inode(struct inode *dir, int mode)
{
     struct super_block *sb;
     struct buffer_head *bitmap_bh = NULL;
     struct buffer_head *bh2;
     int group, i;
     ino_t ino = 0;
     struct inode * inode;
     struct ext2_group_desc *gdp;
     struct ext2_super_block *es;
     struct ext2_inode_info *ei;
     struct ext2_sb_info *sbi;
     int err;
 
     sb = dir->i_sb;
     // Allocate an in-memory inode
     inode = new_inode(sb);
     if (!inode)
         return ERR_PTR(-ENOMEM);
 
     // ext2-private part of the inode
     ei = EXT2_I(inode);
     // ext2-private part of the super_block
     sbi = EXT2_SB(sb);
 
     // The ext2 super block
     es = sbi->s_es;
 
     // Pick a suitable block group to allocate the inode from
     if (S_ISDIR(mode)) {
         if (test_opt(sb, OLDALLOC))
              group = find_group_dir(sb, dir);
         else
              group = find_group_orlov(sb, dir);
     } else
         group = find_group_other(sb, dir);
 
     if (group == -1) {
         err = -ENOSPC;
         goto fail;
     }
 
     // Walk the group descriptors, at most s_groups_count iterations
     for (i = 0; i < sbi->s_groups_count; i++) {
         // The walk starts at the block group chosen above
 
         // Fetch the group descriptor
         gdp = ext2_get_group_desc(sb, group, &bh2);
 
         // Release bitmap_bh from the previous pass; the variable is reused just below
         brelse(bitmap_bh);
 
         // Read this group's inode bitmap
         bitmap_bh = read_inode_bitmap(sb, group);
         if (!bitmap_bh) {
              err = -EIO;
              goto fail;
         }
         ino = 0;
 
repeat_in_this_group:
         // Find the first unused bit in the bitmap, starting at ino
         ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data,
                             EXT2_INODES_PER_GROUP(sb), ino);
         // No free bit within this group's inode count: try the next group
         if (ino >= EXT2_INODES_PER_GROUP(sb)) {
              /*
               * Rare race: find_group_xx() decided that there were
               * free inodes in this group, but by the time we tried
               * to allocate one, they're all gone.  This can also
               * occur because the counters which find_group_orlov()
               * uses are approximate.  So just go and search the
               * next block group.
               */
               // Wrap around to group 0 after the last group, then loop again
              if (++group == sbi->s_groups_count)
                   group = 0;
              continue;
         }
 
         // Set the allocation bit in the inode bitmap
         if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group),
                            ino, bitmap_bh->b_data)) {
              /* we lost this inode */
              // The bit was already set, so another kernel control path took this inode.
              // Look for the next unused inode after it.
 
              // If the next index is past the group's inode count, allocate from the next group
              if (++ino >= EXT2_INODES_PER_GROUP(sb)) {
                   /* this group is exhausted, try next group */
                   if (++group == sbi->s_groups_count)
                       group = 0;
                   continue;
              }
              /* try to find free inode in the same group */
              // Search the same group again for an unused inode
              goto repeat_in_this_group;
         }
 
         // Reaching this point means the allocation succeeded
         goto got;
     }
 
     /*
      * Scanned all blockgroups.
      */
     err = -ENOSPC;
     goto fail;
got:
     mark_buffer_dirty(bitmap_bh);
     if (sb->s_flags & MS_SYNCHRONOUS)
         sync_dirty_buffer(bitmap_bh);
     brelse(bitmap_bh);
 
     // Convert the in-group bit index into a file-system-wide inode number
     ino += group * EXT2_INODES_PER_GROUP(sb) + 1;
 
     // Fail if the number is below the first usable inode number
     // or above the super block's total inode count
     if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
         ext2_error (sb, "ext2_new_inode",
                  "reserved inode or inode > inodes count - "
                  "block_group = %d,inode=%lu", group,
                  (unsigned long) ino);
         err = -EIO;
         goto fail;
     }
 
     // Update the statistics counters
     percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
     if (S_ISDIR(mode))
         percpu_counter_inc(&sbi->s_dirs_counter);
 
     // The group descriptor fields and s_debts are changed under the group's lock
     spin_lock(sb_bgl_lock(sbi, group));
    
     gdp->bg_free_inodes_count =
                cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
 
     // Update s_debts: incremented (capped at 255) for a directory,
     // decremented (not below 0) for anything else
     if (S_ISDIR(mode)) {
         if (sbi->s_debts[group] < 255)
              sbi->s_debts[group]++;
         gdp->bg_used_dirs_count =
              cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
     } else {
         if (sbi->s_debts[group])
              sbi->s_debts[group]--;
     }
     spin_unlock(sb_bgl_lock(sbi, group));
 
     sb->s_dirt = 1;
     mark_buffer_dirty(bh2);
     // Ownership: uid from the current task; gid from the parent when the
     // GRPID option or the parent's set-gid bit says so, else from the task
     inode->i_uid = current->fsuid;
     if (test_opt (sb, GRPID))
         inode->i_gid = dir->i_gid;
     else if (dir->i_mode & S_ISGID) {
         inode->i_gid = dir->i_gid;
         if (S_ISDIR(mode))
              mode |= S_ISGID;
     } else
         inode->i_gid = current->fsgid;
     inode->i_mode = mode;
 
     // Record the inode number in the inode
     inode->i_ino = ino;
     inode->i_blksize = PAGE_SIZE;    /* This is the optimal IO size (for stat), not the fs block size */
     inode->i_blocks = 0;
     // Set i_mtime, i_atime and i_ctime to the current time
     inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
     memset(ei->i_data, 0, sizeof(ei->i_data));
     // Inherit the parent's flags, minus EXT2_BTREE_FL
     ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
     if (S_ISLNK(mode))
         ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
     /* dirsync is only applied to directories */
     if (!S_ISDIR(mode))
         ei->i_flags &= ~EXT2_DIRSYNC_FL;
     ei->i_faddr = 0;
     ei->i_frag_no = 0;
     ei->i_frag_size = 0;
     ei->i_file_acl = 0;
     ei->i_dir_acl = 0;
     ei->i_dtime = 0;
     ei->i_block_group = group;
     ei->i_next_alloc_block = 0;
     ei->i_next_alloc_goal = 0;
     ei->i_prealloc_block = 0;
     ei->i_prealloc_count = 0;
     ei->i_dir_start_lookup = 0;
     ei->i_state = EXT2_STATE_NEW;
     ext2_set_inode_flags(inode);
     // Take the next generation number under s_next_gen_lock
     spin_lock(&sbi->s_next_gen_lock);
     inode->i_generation = sbi->s_next_generation++;
     spin_unlock(&sbi->s_next_gen_lock);
     insert_inode_hash(inode);
 
     if (DQUOT_ALLOC_INODE(inode)) {
         DQUOT_DROP(inode);
         err = -ENOSPC;
         goto fail2;
     }
     err = ext2_init_acl(inode, dir);
     if (err) {
         DQUOT_FREE_INODE(inode);
         goto fail2;
     }
     // Mark the inode dirty
     mark_inode_dirty(inode);
     ext2_debug("allocating inode %lu\n", inode->i_ino);
     ext2_preread_inode(inode);
     return inode;
 
fail2:
     // Failure after the inode was set up: flag S_NOQUOTA, zero the link count, drop it
     inode->i_flags |= S_NOQUOTA;
     inode->i_nlink = 0;
     iput(inode);
     return ERR_PTR(err);
 
fail:
     // Failure before the inode was set up: mark it bad, then drop it
     make_bad_inode(inode);
     iput(inode);
     return ERR_PTR(err);
}
查找一个未使用的索引结点有一个规则,就是尽量使每个块组达到平衡.所以linux在ext2_sb_info结构中加了一个s_debts字段,用来表示每个块组中的文件与目录的分配情况.计算的方法是在find_group_orlov()(目录)和find_group_other()(其它类型的文件)中完成的.
每个目录都包含两个特殊的目录项“.”和“..”.单点代表其本身,双点代表父目录.这两个目录项的建立是在ext2_make_empty()中完成的.对应代码如下:
/*
 * ext2_make_empty - write the initial "." and ".." entries of a new directory.
 * @inode:  inode of the new directory
 * @parent: inode of its parent directory
 *
 * Fills the first chunk of page 0 of the directory's mapping with the two
 * entries and commits it.  Returns 0 on success, -ENOMEM when the page cannot
 * be obtained, or the error from prepare_write() / ext2_commit_chunk().
 */
int ext2_make_empty(struct inode *inode, struct inode *parent)
{
     struct address_space *mapping = inode->i_mapping;
     // Get the first page (index 0) of the mapping
     struct page *page = grab_cache_page(mapping, 0);
     unsigned chunk_size = ext2_chunk_size(inode);
     struct ext2_dir_entry_2 * de;
     int err;
     void *kaddr;
 
     if (!page)
         return -ENOMEM;
 
     // Call prepare_write() first, because the page will be written to the file system afterwards
     err = mapping->a_ops->prepare_write(NULL, page, 0, chunk_size);
     if (err) {
         unlock_page(page);
         goto fail;
     }
     // Temporarily map the page into kernel address space
     kaddr = kmap_atomic(page, KM_USER0);
 
     // First entry in the directory
     de = (struct ext2_dir_entry_2 *)kaddr;
     // Every directory has two entries that exist by default: "." and ".."
 
     // Add ".": its inode number refers to the directory itself
     de->name_len = 1;
     de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1));
     memcpy (de->name, ".\0\0", 4);
     de->inode = cpu_to_le32(inode->i_ino);
     ext2_set_de_type (de, inode);
 
     // Set up "..": it refers to the parent directory and its record
     // length covers the rest of the chunk
     de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1));
     de->name_len = 2;
     de->rec_len = cpu_to_le16(chunk_size - EXT2_DIR_REC_LEN(1));
     de->inode = cpu_to_le32(parent->i_ino);
     memcpy (de->name, "..\0", 4);
     ext2_set_de_type (de, inode);
 
     // Drop the temporary mapping
     kunmap_atomic(kaddr, KM_USER0);
     // Commit the modified chunk to the file system
     // NOTE(review): no unlock_page() on this path -- presumably
     // ext2_commit_chunk() unlocks the page; confirm
     err = ext2_commit_chunk(page, 0, chunk_size);
fail:
     // Done with the page: drop our reference to it
     page_cache_release(page);
     return err;
}
初始化完成之后,要将子目录插入到父目录所代表的空间中.它是由ext2_add_link()完成的.代码如下:
/*
 * ext2_add_link - add a directory entry for @inode to the parent of @dentry.
 * @dentry: dentry of the new entry; supplies the name and, through d_parent,
 *          the directory to modify
 * @inode:  inode the new entry refers to
 *
 * Scans the parent directory page by page for a slot large enough for the
 * new name: the end of the directory, an unused entry, or the slack behind a
 * live entry.  Returns 0 on success, -EEXIST when the name is already
 * present, -EIO on a zero-length entry, or the error from ext2_get_page(),
 * prepare_write() or ext2_commit_chunk().
 */
int ext2_add_link (struct dentry *dentry, struct inode *inode)
{
     // Inode of the parent directory
     struct inode *dir = dentry->d_parent->d_inode;
     const char *name = dentry->d_name.name;
     int namelen = dentry->d_name.len;
     unsigned chunk_size = ext2_chunk_size(dir);
     // Record size needed for the new entry
     unsigned reclen = EXT2_DIR_REC_LEN(namelen);
     unsigned short rec_len, name_len;
     struct page *page = NULL;
     ext2_dirent * de;
 
     // Number of pages the parent directory occupies (per dir_pages())
     unsigned long npages = dir_pages(dir);
     unsigned long n;
     char *kaddr;
     unsigned from, to;
     int err;
 
     /*
      * We take care of directory expansion in the same loop.
      * This code plays outside i_size, so it locks the page
      * to protect that region.
      */
 
     // Walk the directory's pages; n == npages is the page just past the current end
     for (n = 0; n <= npages; n++) {
         char *dir_end;
 
         page = ext2_get_page(dir, n);
         err = PTR_ERR(page);
         if (IS_ERR(page))
              goto out;
         lock_page(page);
         kaddr = page_address(page);
 
         // End of the directory data within this page.
        
         // ext2_last_byte (per the original note): a full page size when more than
         // a page remains, otherwise the size of the remainder
         dir_end = kaddr + ext2_last_byte(dir, n);
         de = (ext2_dirent *)kaddr;
         // Last position in the page where a record of reclen bytes still fits
         kaddr += PAGE_CACHE_SIZE - reclen;
         while ((char *)de <= kaddr) {
              // Reached the end of the directory data
              if ((char *)de == dir_end) {
                   /* We hit i_size */
                   name_len = 0;
                   rec_len = chunk_size;
                   de->rec_len = cpu_to_le16(chunk_size);
                   de->inode = 0;
                   goto got_it;
              }
 
              // A record length of zero is invalid
              if (de->rec_len == 0) {
                   ext2_error(dir->i_sb, __FUNCTION__,
                       "zero-length directory entry");
                   err = -EIO;
                   goto out_unlock;
              }
              err = -EEXIST;
              // The directory already contains an entry with the same name
              if (ext2_match (namelen, name, de))
                   goto out_unlock;
              // Despite its name, name_len holds the record size the existing entry needs
              name_len = EXT2_DIR_REC_LEN(de->name_len);
              rec_len = le16_to_cpu(de->rec_len);
              // de->inode == 0: this entry has been deleted
              // rec_len >= reclen: the old slot is large enough for the new entry
              if (!de->inode && rec_len >= reclen)
                   goto got_it;
              // This entry has spare room behind it (possibly because entries after it were deleted)
              if (rec_len >= name_len + reclen)
                   goto got_it;
              de = (ext2_dirent *) ((char *) de + rec_len);
         }
         unlock_page(page);
         ext2_put_page(page);
     }
     BUG();
     return -EINVAL;
 
got_it:
     // Byte range of the chosen record within the page
     from = (char*)de - (char*)page_address(page);
     to = from + rec_len;
     err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
     if (err)
         goto out_unlock;
     if (de->inode) {
         // The slot is a live entry with spare room:
         // split the record and put the new entry in its tail
         ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
         de1->rec_len = cpu_to_le16(rec_len - name_len);
         de->rec_len = cpu_to_le16(name_len);
         de = de1;
     }
     // Fill in the fields of the new directory entry
     de->name_len = namelen;
     memcpy (de->name, name, namelen);
     de->inode = cpu_to_le32(inode->i_ino);
     ext2_set_de_type (de, inode);
     // Commit the change so it is written to the file system
     err = ext2_commit_chunk(page, from, to);
     // Update the directory's timestamps
     dir->i_mtime = dir->i_ctime = CURRENT_TIME;
     EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
     mark_inode_dirty(dir);
     /* OFFSET_CACHE */
out_put:
     ext2_put_page(page);
out:
     return err;
out_unlock:
     // Error while the page is still locked: unlock, then share the put path
     unlock_page(page);
     goto out_put;
}
在这里,忽略了页面映射与文件系统驱动的交互过程。关于页面缓存后续再给出章节进行分析.
 
五:小结
在这一节里,以rootfs和ext2文件系统为例分析了目录的建立过程.只要对ext2文件系统的相关部分有所了解.理解这部份代码并不难.其中关于页面缓存部份以后再给出专题分析.详情请关注本站更新.
阅读(2919) | 评论(0) | 转发(2) |
给主人留下些什么吧!~~