一:前言
在用户空间中,建立目录所用的API为mkdir(),它在内核中的系统调用入口是sys_mkdir().今天跟踪一下
这个函数,来分析Linux文件系统中目录的建立过程.
二:sys_mkdir()
sys_mkdir()对应的代码如下:
/*
 * sys_mkdir - system-call entry point behind the user-space mkdir() API.
 * @pathname: user-space pointer to the path of the directory to create
 * @mode:     requested permission bits for the new directory
 *
 * Returns the result of vfs_mkdir() on the normal path, otherwise the error
 * produced by getname(), path_lookup() or lookup_create().
 */
asmlinkage long sys_mkdir(const char __user * pathname, int mode)
{
	struct nameidata nd;
	struct dentry *child;
	char *kname;
	int error;

	/* Copy the path string from user space into kernel space. */
	kname = getname(pathname);
	error = PTR_ERR(kname);
	if (IS_ERR(kname))
		return error;

	/* Resolve the parent directory first; it has to exist. */
	error = path_lookup(kname, LOOKUP_PARENT, &nd);
	if (!error) {
		/*
		 * Find the dentry of the last path component, allocating one
		 * if none exists yet.  lookup_create() takes the parent's
		 * i_sem, which is why it is dropped below even on failure.
		 */
		child = lookup_create(&nd, 1);
		error = PTR_ERR(child);
		if (!IS_ERR(child)) {
			/* Apply the umask unless the parent uses POSIX ACLs. */
			if (!IS_POSIXACL(nd.dentry->d_inode))
				mode &= ~current->fs->umask;
			/* Hand over to the filesystem-specific part. */
			error = vfs_mkdir(nd.dentry->d_inode, child, mode);
			/* Drop the reference taken on the child dentry. */
			dput(child);
		}
		up(&nd.dentry->d_inode->i_sem);
		/* Release what path_lookup() stored in nd. */
		path_release(&nd);
	}

	/* Free the kernel copy of the path name. */
	putname(kname);
	return error;
}
这个函数里面有几个重要的子函数. path_lookup()在前一篇文章中已经分析过了.如果不太了解,请参阅相关的部份.
lookup_create()的代码如下:
struct dentry *lookup_create(struct nameidata *nd, int is_dir)
{
// Body of lookup_create(): returns the dentry for the last path component
// held in nd (creating a fresh one if needed), or an ERR_PTR() on failure.
// The parent's i_sem is taken here and is still held on every return path,
// including the failure ones; the caller is expected to release it.
struct dentry *dentry;
// Serialize against concurrent operations on the parent directory.
down(&nd->dentry->d_inode->i_sem);
dentry = ERR_PTR(-EEXIST);
// The last component must be a normal name; any other last_type is
// reported as -EEXIST.
if (nd->last_type != LAST_NORM)
goto fail;
// Clear the LOOKUP_PARENT flag before looking up the last component itself.
nd->flags &= ~LOOKUP_PARENT;
// Look the name up under the parent; a new dentry is allocated when no
// existing one is found.
dentry = lookup_hash(&nd->last, nd->dentry);
// Lookup or allocation failed.
if (IS_ERR(dentry))
goto fail;
// Not creating a directory, the name is followed by a non-NUL character,
// and the dentry has no inode: fail with -ENOENT.
if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
goto enoent;
return dentry;
enoent:
// Drop the dentry reference obtained above before reporting the error.
dput(dentry);
dentry = ERR_PTR(-ENOENT);
fail:
return dentry;
}
lookup_hash() -> __lookup_hash():
/*
 * __lookup_hash - find or create the dentry for @name under directory @base.
 * @name: name to look up (may be re-hashed by the filesystem's d_hash hook)
 * @base: dentry of the directory being searched
 * @nd:   lookup context passed through to permission(), cached_lookup()
 *        and the filesystem's ->lookup()
 *
 * Returns a dentry (which may or may not already have an inode attached)
 * or an ERR_PTR(): the permission()/d_hash() error, or -ENOMEM when a new
 * dentry cannot be allocated.
 */
static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
{
	struct inode *dir = base->d_inode;
	struct dentry *found;
	struct dentry *fresh;
	int rc;

	/* The caller needs search (exec) permission on the directory. */
	rc = permission(dir, MAY_EXEC, nd);
	if (rc)
		return ERR_PTR(rc);

	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_op && base->d_op->d_hash) {
		rc = base->d_op->d_hash(base, name);
		if (rc < 0)
			return ERR_PTR(rc);
	}

	/* Try the dentry cache first. */
	found = cached_lookup(base, name, nd);
	if (found)
		return found;

	/* Cache miss: allocate a new dentry and ask the filesystem. */
	fresh = d_alloc(base, name);
	if (!fresh)
		return ERR_PTR(-ENOMEM);

	found = dir->i_op->lookup(dir, fresh, nd);
	if (!found)
		return fresh;

	/* ->lookup() supplied its own result; the new dentry is not needed. */
	dput(fresh);
	return found;
}
值得注意的是:经过上述的操作,返回的dentry有可能是原本就存在的.对这种情况是怎么排除的呢?继续看sys_mkdir()的另一个子函数:
/*
 * vfs_mkdir - filesystem-independent part of directory creation.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the directory to create
 * @mode:   permission bits; masked down to S_IRWXUGO|S_ISVTX
 *
 * Returns 0 on success, the may_create() or security-hook error,
 * -EPERM when the parent has no mkdir operation, or whatever the
 * filesystem's ->mkdir() returns.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	int rc;

	/* Sanity and permission checks on the parent/child pair. */
	rc = may_create(dir, dentry, NULL);
	if (rc)
		return rc;

	/* The parent must provide a mkdir operation. */
	if (!(dir->i_op && dir->i_op->mkdir))
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	rc = security_inode_mkdir(dir, dentry, mode);
	if (rc)
		return rc;

	DQUOT_INIT(dir);

	/* Delegate to the parent directory's filesystem. */
	rc = dir->i_op->mkdir(dir, dentry, mode);
	if (rc)
		return rc;

	/* Success: send the DN_CREATE directory notification and run the post hook. */
	inode_dir_notify(dir, DN_CREATE);
	security_inode_post_mkdir(dir, dentry, mode);
	return 0;
}
在这里看到,最终会调用父目录inode的i_op->mkdir操作.另外,对于上面说的相应结点已经存在的情况是在may_create()中检测的:
/*
 * may_create - check whether @child may be created inside @dir.
 * @dir:   inode of the parent directory
 * @child: dentry of the object to create
 * @nd:    lookup context handed to permission()
 *
 * Returns -EEXIST if @child already has an inode, -ENOENT if @dir is
 * marked dead, otherwise the result of the write+exec permission check.
 */
static inline int may_create(struct inode *dir, struct dentry *child,
			     struct nameidata *nd)
{
	int rc;

	if (child->d_inode) {
		/* A freshly allocated dentry has no inode; this one exists. */
		rc = -EEXIST;
	} else if (IS_DEADDIR(dir)) {
		/* The parent directory is no longer valid. */
		rc = -ENOENT;
	} else {
		rc = permission(dir, MAY_WRITE | MAY_EXEC, nd);
	}
	return rc;
}
Mkdir的大体架构就如此了.下面讨论一下rootfs和ext2中的目录创建.
三:rootfs的目录创建
在前一篇文章分析到.挂载rootfs时,对文件系统根目录的inode.i_op赋值如下:
/*
 * Inode operations installed on ramfs directories.  The .mkdir slot is
 * what vfs_mkdir() reaches through dir->i_op->mkdir; here it points at
 * ramfs_mkdir().  The simple_* entries are helpers defined outside this
 * excerpt.
 */
static struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
对应的mkdir操作入口是ramfs_mkdir():
/*
 * ramfs_mkdir - create a directory on ramfs.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the new directory
 * @mode:   permission bits; S_IFDIR is OR-ed in here
 *
 * Returns 0 on success or the error from ramfs_mknod().
 */
static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
{
	int err;

	/* Create the node as a directory. */
	err = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
	if (err)
		return err;

	/* Creation succeeded: bump the parent's link count. */
	dir->i_nlink++;
	return 0;
}
ramfs_mknod()的代码如下:
/*
 * ramfs_mknod - create a filesystem node on ramfs and bind it to @dentry.
 * @dir:    inode of the parent directory
 * @dentry: dentry that will refer to the new inode
 * @mode:   file type and permission bits
 * @dev:    device number, forwarded to ramfs_get_inode()
 *
 * Returns 0 on success, -ENOSPC if no inode could be obtained.
 */
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
	/* Allocate an inode for the new node. */
	struct inode *node = ramfs_get_inode(dir->i_sb, mode, dev);

	if (!node)
		return -ENOSPC;

	/* A set-gid parent passes its group on; directories also keep the bit. */
	if (dir->i_mode & S_ISGID) {
		node->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			node->i_mode |= S_ISGID;
	}

	/* Attach the inode to the dentry. */
	d_instantiate(dentry, node);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}
这个函数中的子函数我们都在前面已经分析过.请自行查阅本站的其它文档.其操作非常简单。就是分配一个inode。然后将inode 与dentry建立关联.因为rootfs是一个基于RAM的文件系统。其inode的分配就是在内存中创建一个inode空间,然后为其各项操作赋值而已.
四:ext2中的目录创建
经过上一章的分析可以看到.ext2文件系统根目录的inode.i_op被赋值为ext2_dir_inode_operations.其结构如下所示:
/*
 * Inode operations installed on ext2 directories.  vfs_mkdir() reaches
 * ext2_mkdir() through the .mkdir slot.  The extended-attribute hooks are
 * only present when CONFIG_EXT2_FS_XATTR is enabled.
 *
 * Fix: the initializer was missing its terminating semicolon.
 */
struct inode_operations ext2_dir_inode_operations = {
	.create		= ext2_create,
	.lookup		= ext2_lookup,
	.link		= ext2_link,
	.unlink		= ext2_unlink,
	.symlink	= ext2_symlink,
	.mkdir		= ext2_mkdir,
	.rmdir		= ext2_rmdir,
	.mknod		= ext2_mknod,
	.rename		= ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext2_listxattr,
	.removexattr	= generic_removexattr,
#endif
	.setattr	= ext2_setattr,
	.permission	= ext2_permission,
};
Mkdir对应的入口为ext2_mkdir().代码如下:
/*
 * ext2_mkdir - create a directory on ext2.
 * @dir:    inode of the parent directory
 * @dentry: dentry of the new directory
 * @mode:   permission bits; S_IFDIR is OR-ed in here
 *
 * Returns 0 on success, -EMLINK when the parent already has
 * EXT2_LINK_MAX links, or the error from ext2_new_inode(),
 * ext2_make_empty() or ext2_add_link().  On failure every count
 * adjustment made so far is undone.
 */
static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
{
	struct inode *child;
	int err;

	if (dir->i_nlink >= EXT2_LINK_MAX)
		return -EMLINK;

	/*
	 * Bump the parent's count up front; undone at undo_parent.
	 * NOTE(review): ext2_inc_count() is not shown here; presumably it
	 * increments i_nlink and marks the inode dirty - confirm.
	 */
	ext2_inc_count(dir);

	/* Allocate an on-disk inode for the new directory. */
	child = ext2_new_inode(dir, S_IFDIR | mode);
	err = PTR_ERR(child);
	if (IS_ERR(child))
		goto undo_parent;

	/* Install the directory operation tables. */
	child->i_op = &ext2_dir_inode_operations;
	child->i_fop = &ext2_dir_operations;

	/* Choose the address-space operations according to the NOBH option. */
	child->i_mapping->a_ops = test_opt(child->i_sb, NOBH) ?
				  &ext2_nobh_aops : &ext2_aops;

	/* Bump the new inode's count as well. */
	ext2_inc_count(child);

	/* Initialize the directory contents, then link it into the parent. */
	err = ext2_make_empty(child, dir);
	if (!err)
		err = ext2_add_link(dentry, child);
	if (err)
		goto undo_child;

	/* Bind the dentry to the new inode. */
	d_instantiate(dentry, child);
	return 0;

undo_child:
	ext2_dec_count(child);
	ext2_dec_count(child);
	iput(child);
undo_parent:
	ext2_dec_count(dir);
	return err;
}
逐个分析上面所涉及到的子函数.
在ext2中分配一个inode是由ext2_new_inode()完成的.它的代码如下:
/*
 * ext2_new_inode - allocate and initialize a new ext2 inode.
 * @dir:  parent directory; supplies the super block and inherited attributes
 * @mode: file type and permission bits of the new inode
 *
 * Picks a starting block group, claims a free bit in an inode bitmap
 * (moving on to the following groups if the chosen one is full), updates
 * the free-inode/used-directory accounting and fills in the in-core inode.
 *
 * Returns the new inode, or ERR_PTR(-ENOMEM / -ENOSPC / -EIO / the
 * ext2_init_acl() error) on failure.
 *
 * Changes relative to the quoted original:
 *  - the "scanned all block groups" -ENOSPC path now releases the last
 *    inode bitmap buffer instead of leaking it;
 *  - a group whose descriptor cannot be obtained (ext2_get_group_desc()
 *    returned NULL) is skipped instead of being dereferenced later.
 */
struct inode *ext2_new_inode(struct inode *dir, int mode)
{
	struct super_block *sb;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	int group, i;
	ino_t ino = 0;
	struct inode * inode;
	struct ext2_group_desc *gdp;
	struct ext2_super_block *es;
	struct ext2_inode_info *ei;
	struct ext2_sb_info *sbi;
	int err;

	sb = dir->i_sb;
	/* Allocate the in-core inode. */
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	/* ext2-private part of the inode. */
	ei = EXT2_I(inode);
	/* ext2-private part of the super block. */
	sbi = EXT2_SB(sb);
	/* The ext2 super block itself. */
	es = sbi->s_es;

	/* Choose a suitable block group to start from. */
	if (S_ISDIR(mode)) {
		if (test_opt(sb, OLDALLOC))
			group = find_group_dir(sb, dir);
		else
			group = find_group_orlov(sb, dir);
	} else
		group = find_group_other(sb, dir);

	if (group == -1) {
		err = -ENOSPC;
		goto fail;
	}

	/* Visit at most every group once, starting at the chosen one. */
	for (i = 0; i < sbi->s_groups_count; i++) {
		gdp = ext2_get_group_desc(sb, group, &bh2);
		if (!gdp) {
			/* No usable descriptor for this group: try the next. */
			if (++group == sbi->s_groups_count)
				group = 0;
			continue;
		}
		/* Drop the previous group's bitmap before reading this one. */
		brelse(bitmap_bh);
		bitmap_bh = read_inode_bitmap(sb, group);
		if (!bitmap_bh) {
			err = -EIO;
			goto fail;
		}
		ino = 0;

repeat_in_this_group:
		/* Find the first clear bit at or after ino. */
		ino = ext2_find_next_zero_bit((unsigned long *)bitmap_bh->b_data,
					      EXT2_INODES_PER_GROUP(sb), ino);
		if (ino >= EXT2_INODES_PER_GROUP(sb)) {
			/*
			 * Rare race: find_group_xx() decided that there were
			 * free inodes in this group, but by the time we tried
			 * to allocate one, they're all gone.  This can also
			 * occur because the counters which find_group_orlov()
			 * uses are approximate.  So just go and search the
			 * next block group.
			 */
			if (++group == sbi->s_groups_count)
				group = 0;
			continue;
		}
		/* Try to claim the bit. */
		if (ext2_set_bit_atomic(sb_bgl_lock(sbi, group),
					ino, bitmap_bh->b_data)) {
			/* we lost this inode */
			if (++ino >= EXT2_INODES_PER_GROUP(sb)) {
				/* this group is exhausted, try next group */
				if (++group == sbi->s_groups_count)
					group = 0;
				continue;
			}
			/* try to find free inode in the same group */
			goto repeat_in_this_group;
		}
		/* The bit is ours. */
		goto got;
	}

	/*
	 * Scanned all blockgroups.
	 */
	brelse(bitmap_bh);
	err = -ENOSPC;
	goto fail;
got:
	mark_buffer_dirty(bitmap_bh);
	if (sb->s_flags & MS_SYNCHRONOUS)
		sync_dirty_buffer(bitmap_bh);
	brelse(bitmap_bh);

	/* Convert the in-group bit index into a filesystem-wide inode number. */
	ino += group * EXT2_INODES_PER_GROUP(sb) + 1;
	/* Reject reserved inode numbers and numbers beyond the inode count. */
	if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext2_error (sb, "ext2_new_inode",
			    "reserved inode or inode > inodes count - "
			    "block_group = %d,inode=%lu", group,
			    (unsigned long) ino);
		err = -EIO;
		goto fail;
	}

	/* Update the global and per-group accounting. */
	percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
	if (S_ISDIR(mode))
		percpu_counter_inc(&sbi->s_dirs_counter);

	spin_lock(sb_bgl_lock(sbi, group));
	gdp->bg_free_inodes_count =
		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
	/* s_debts: raised (up to 255) per directory, lowered per other file. */
	if (S_ISDIR(mode)) {
		if (sbi->s_debts[group] < 255)
			sbi->s_debts[group]++;
		gdp->bg_used_dirs_count =
			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
	} else {
		if (sbi->s_debts[group])
			sbi->s_debts[group]--;
	}
	spin_unlock(sb_bgl_lock(sbi, group));

	sb->s_dirt = 1;
	mark_buffer_dirty(bh2);
	inode->i_uid = current->fsuid;
	if (test_opt (sb, GRPID))
		inode->i_gid = dir->i_gid;
	else if (dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current->fsgid;
	inode->i_mode = mode;

	/* Record the inode number. */
	inode->i_ino = ino;
	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
	inode->i_blocks = 0;
	/* All three timestamps start out as "now". */
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	memset(ei->i_data, 0, sizeof(ei->i_data));
	ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
	if (S_ISLNK(mode))
		ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
	/* dirsync is only applied to directories */
	if (!S_ISDIR(mode))
		ei->i_flags &= ~EXT2_DIRSYNC_FL;
	ei->i_faddr = 0;
	ei->i_frag_no = 0;
	ei->i_frag_size = 0;
	ei->i_file_acl = 0;
	ei->i_dir_acl = 0;
	ei->i_dtime = 0;
	ei->i_block_group = group;
	ei->i_next_alloc_block = 0;
	ei->i_next_alloc_goal = 0;
	ei->i_prealloc_block = 0;
	ei->i_prealloc_count = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_state = EXT2_STATE_NEW;
	ext2_set_inode_flags(inode);
	spin_lock(&sbi->s_next_gen_lock);
	inode->i_generation = sbi->s_next_generation++;
	spin_unlock(&sbi->s_next_gen_lock);
	insert_inode_hash(inode);

	if (DQUOT_ALLOC_INODE(inode)) {
		DQUOT_DROP(inode);
		err = -ENOSPC;
		goto fail2;
	}
	err = ext2_init_acl(inode, dir);
	if (err) {
		DQUOT_FREE_INODE(inode);
		goto fail2;
	}
	/* Mark the inode dirty. */
	mark_inode_dirty(inode);
	ext2_debug("allocating inode %lu\n", inode->i_ino);
	ext2_preread_inode(inode);
	return inode;

fail2:
	inode->i_flags |= S_NOQUOTA;
	inode->i_nlink = 0;
	iput(inode);
	return ERR_PTR(err);

fail:
	make_bad_inode(inode);
	iput(inode);
	return ERR_PTR(err);
}
查找一个未使用的索引结点有一个规则,就是尽量使各个块组达到平衡.所以linux在ext2_sb_info结构中加了一个s_debts字段,用来表示每个块组中文件与目录的分配情况.具体的计算方法是在find_group_orlov()(目录)和find_group_other()(其它类型的文件)中完成的.
每个目录都包含两个特殊的目录项 “.”和 “..”.单点代表目录本身,双点代表父目录.这两个目录项的建立是在ext2_make_empty()中完成的.对应代码如下:
/*
 * ext2_make_empty - write the initial "." and ".." entries of a new directory.
 * @inode:  the directory being created; "." records inode->i_ino
 * @parent: its parent directory; ".." records parent->i_ino
 *
 * Both entries are placed in the first chunk of page 0 of the directory's
 * mapping, which is then committed with ext2_commit_chunk().
 *
 * Returns the result of ext2_commit_chunk() on the normal path, -ENOMEM if
 * page 0 cannot be obtained, or the error from prepare_write().
 */
int ext2_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
// Get the first page (index 0) of the directory's mapping.
struct page *page = grab_cache_page(mapping, 0);
unsigned chunk_size = ext2_chunk_size(inode);
struct ext2_dir_entry_2 * de;
int err;
void *kaddr;
if (!page)
return -ENOMEM;
// Call prepare_write() first, because the page will be written back to the
// filesystem afterwards.
err = mapping->a_ops->prepare_write(NULL, page, 0, chunk_size);
if (err) {
// On this path the page is unlocked here before it is released below.
unlock_page(page);
goto fail;
}
// Temporarily map the page into kernel address space.
kaddr = kmap_atomic(page, KM_USER0);
// First entry of the directory.
de = (struct ext2_dir_entry_2 *)kaddr;
// Every directory has two default entries, "." and "..".
// Add ".", whose inode number refers to the directory itself.
de->name_len = 1;
de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1));
memcpy (de->name, ".\0\0", 4);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
// Add "..", which refers to the parent; its rec_len covers the rest of the chunk.
de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1));
de->name_len = 2;
de->rec_len = cpu_to_le16(chunk_size - EXT2_DIR_REC_LEN(1));
de->inode = cpu_to_le32(parent->i_ino);
memcpy (de->name, "..\0", 4);
// The entry type is taken from inode, not parent.
ext2_set_de_type (de, inode);
// Tear down the temporary mapping.
kunmap_atomic(kaddr, KM_USER0);
// Commit the modified chunk to the filesystem.
// NOTE(review): no unlock_page() on this path; presumably ext2_commit_chunk()
// unlocks the page - confirm against its definition.
err = ext2_commit_chunk(page, 0, chunk_size);
fail:
// Done with the page: drop our reference.
page_cache_release(page);
return err;
}
初始化完成之后,要将子目录插入父目录所代表的空间中.这是由ext2_add_link()完成的.代码如下:
/*
 * ext2_add_link - add a directory entry for @inode under the parent of @dentry.
 * @dentry: supplies the entry name and, via d_parent, the directory to modify
 * @inode:  inode the new entry will point at
 *
 * Scans the parent directory page by page for a slot that can hold the new
 * entry: an unused entry that is large enough, slack space behind a live
 * entry, or the end of the directory (in which case a new chunk is started).
 *
 * Returns the result of ext2_commit_chunk() on the normal path, -EEXIST if
 * an entry with the same name is found, -EIO on a zero-length entry, or the
 * error from ext2_get_page() / prepare_write().
 */
int ext2_add_link (struct dentry *dentry, struct inode *inode)
{
// Inode of the parent directory.
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned chunk_size = ext2_chunk_size(dir);
// Space needed by the new entry.
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
struct page *page = NULL;
ext2_dirent * de;
// Number of pages occupied by the parent directory.
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
unsigned from, to;
int err;
/*
* We take care of directory expansion in the same loop.
* This code plays outside i_size, so it locks the page
* to protect that region.
*/
// Walk the directory's pages; note the bound is n <= npages, i.e. one page
// past the current end is visited too.
for (n = 0; n <= npages; n++) {
char *dir_end;
page = ext2_get_page(dir, n);
err = PTR_ERR(page);
if (IS_ERR(page))
goto out;
lock_page(page);
kaddr = page_address(page);
// End of the directory data within this page.
// Per the original annotation, ext2_last_byte() yields a full page size when
// more than a page remains and the remaining size otherwise (helper not shown).
dir_end = kaddr + ext2_last_byte(dir, n);
de = (ext2_dirent *)kaddr;
// Last position in the page at which the new entry would still fit.
kaddr += PAGE_CACHE_SIZE - reclen;
while ((char *)de <= kaddr) {
// Reached the end of the directory data.
if ((char *)de == dir_end) {
/* We hit i_size */
name_len = 0;
rec_len = chunk_size;
de->rec_len = cpu_to_le16(chunk_size);
de->inode = 0;
goto got_it;
}
// A record length of zero is invalid.
if (de->rec_len == 0) {
ext2_error(dir->i_sb, __FUNCTION__,
"zero-length directory entry");
err = -EIO;
goto out_unlock;
}
err = -EEXIST;
// The directory already contains an entry with the same name.
if (ext2_match (namelen, name, de))
goto out_unlock;
// name_len: space the existing entry actually needs;
// rec_len: space it currently occupies.
name_len = EXT2_DIR_REC_LEN(de->name_len);
rec_len = le16_to_cpu(de->rec_len);
// de->inode == 0 marks an unused (deleted) entry;
// rec_len >= reclen means it is large enough to hold the new one.
if (!de->inode && rec_len >= reclen)
goto got_it;
// The live entry has enough slack behind it (possibly left by entries
// deleted after it) to hold the new one.
if (rec_len >= name_len + reclen)
goto got_it;
de = (ext2_dirent *) ((char *) de + rec_len);
}
unlock_page(page);
ext2_put_page(page);
}
BUG();
return -EINVAL;
got_it:
// Byte range within the page that is about to be modified.
from = (char*)de - (char*)page_address(page);
to = from + rec_len;
err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
if (err)
goto out_unlock;
if (de->inode) {
// Slack-space case: shrink the live entry to what it needs and carve the
// new entry out of the remainder.
ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
de1->rec_len = cpu_to_le16(rec_len - name_len);
de->rec_len = cpu_to_le16(name_len);
de = de1;
}
// Fill in the new directory entry.
de->name_len = namelen;
memcpy (de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
// Commit the change so it is written to the filesystem.
// NOTE(review): no unlock_page() on this path; presumably ext2_commit_chunk()
// unlocks the page - confirm against its definition.
err = ext2_commit_chunk(page, from, to);
// Update the directory's timestamps.
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
ext2_put_page(page);
out:
return err;
out_unlock:
unlock_page(page);
goto out_put;
}
在这里,忽略了页面映射与文件系统驱动的交互过程。关于页面缓存后续再给出章节进行分析.
五:小结
在这一节里,以rootfs和ext2文件系统为例分析了目录的建立过程.只要对ext2文件系统的相关部分有所了解.理解这部份代码并不难.其中关于页面缓存部份以后再给出专题分析.详情请关注本站更新.