fs/namespace.c
/*
 * mount(2) system call entry: copy each user-space argument into kernel
 * memory, call do_mount() on the kernel copies, then release the copies in
 * the reverse order of acquisition.
 *
 * Per the original annotations: copy_mount_options() and getname() both copy
 * at most one page from user space; getname() takes a PATH_MAX buffer from
 * names_cachep and stops at the terminating '\0', whereas
 * copy_mount_options() copies a whole page and hands back its start address.
 *
 * Returns the value of do_mount(), or the negative error of the first
 * argument copy that failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	unsigned long ktype;
	unsigned long kdev;
	unsigned long kdata;
	char *kdir;
	int err;

	/* File system type string. */
	err = copy_mount_options(type, &ktype);
	if (err < 0)
		return err;

	/* Mount point path name. */
	kdir = getname(dir_name);
	if (IS_ERR(kdir)) {
		err = PTR_ERR(kdir);
		goto free_type;
	}

	/* Name of the device to be mounted. */
	err = copy_mount_options(dev_name, &kdev);
	if (err < 0)
		goto put_dir;

	/* File-system specific option data. */
	err = copy_mount_options(data, &kdata);
	if (err < 0)
		goto free_dev;

	/* Everything is in kernel memory; do the mount under the big kernel lock. */
	lock_kernel();
	err = do_mount((char *)kdev, kdir, (char *)ktype,
		       flags, (void *)kdata);
	unlock_kernel();

	free_page(kdata);
free_dev:
	free_page(kdev);
put_dir:
	putname(kdir);
free_type:
	free_page(ktype);
	return err;
}
////////////////
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
*
* data is a (void *) that can point to any structure up to
* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
* information (or be NULL).
*
* Pre-0.97 versions of mount() didn't have a flags word.
* When the flags word was introduced its top half was required
* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
//验证挂载的选项Flags,以及获取并填充相应挂载目录dir_name的路径Path结构。 再根据挂载选项Flags来判断,挂载的动作。
/*
 * Validate the mount arguments, split the per-mountpoint flags out of
 * @flags, resolve @dir_name to a struct path, and dispatch to the handler
 * selected by the remaining flags (remount, bind, propagation change, move,
 * or a brand-new mount).
 *
 * All string/data arguments are kernel copies.  Returns 0 or a negative
 * errno from the sanity checks, the path lookup, the security hook, or the
 * selected handler.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
	      unsigned long flags, void *data_page)
{
	/* MS_* request bits that are recorded per mount point as MNT_* bits. */
	static const struct {
		unsigned long ms_bit;
		int mnt_bit;
	} per_mount_map[] = {
		{ MS_NOSUID,     MNT_NOSUID },     /* ignore suid and sgid bits */
		{ MS_NODEV,      MNT_NODEV },      /* no access to device special files */
		{ MS_NOEXEC,     MNT_NOEXEC },     /* no program execution */
		{ MS_NOATIME,    MNT_NOATIME },    /* this and the next two control atime updates */
		{ MS_NODIRATIME, MNT_NODIRATIME },
		{ MS_RELATIME,   MNT_RELATIME },
		{ MS_RDONLY,     MNT_READONLY },   /* read-only */
	};
	struct path path;
	int mnt_flags = 0;
	int retval;
	unsigned int i;

	/* Discard the legacy magic value in the top half of the flags word. */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/*
	 * Basic sanity checks: the mount point must be a non-NULL, non-empty
	 * string, and both names must contain a NUL terminator within the
	 * first PAGE_SIZE bytes.
	 */
	if (!dir_name || !*dir_name)
		return -EINVAL;
	if (!memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
		return -EINVAL;

	/* Force termination of the option data at the end of its page. */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* Separate the per-mountpoint flags. */
	for (i = 0; i < sizeof(per_mount_map) / sizeof(per_mount_map[0]); i++) {
		if (flags & per_mount_map[i].ms_bit)
			mnt_flags |= per_mount_map[i].mnt_bit;
	}

	/*
	 * These bits now live in mnt_flags (or are not accepted from the
	 * caller), so strip them from flags.  MS_RDONLY is deliberately left
	 * in place: it is not part of this mask.
	 */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);

	/* ... and get the mountpoint: resolve dir_name into path. */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* Pick one of the five kinds of mount operation. */
	if (flags & MS_REMOUNT) {
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	} else if (flags & MS_BIND) {
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	} else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) {
		retval = do_change_type(&path, flags);
	} else if (flags & MS_MOVE) {
		retval = do_move_mount(&path, dev_name);
	} else {
		/* The common case: path is the mount point, dev_name the device. */
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
	}
dput_out:
	path_put(&path);
	return retval;
}
· MS_MGC_VAL 和 MS_MGC_MSK是在以前的版本中定义的安装标志和掩码,现在的安装标志中已经不使用这些魔数了,因此,当还有这个魔数时,则丢弃它。
·
对参数dir_name和dev_name进行基本检查,注意“!dir_name”和“!*dir_name”之不同:前者检查指向字符串的指针是否为空,而后者检查字符串本身是否为空串。memchr()函数在指定长度(这里是PAGE_SIZE)的内存区域中寻找指定的字符,如果其中找不到结尾符“\0”,也是一种错误。前面已说过,对于基于网络的文件系统dev_name可以为空。
· 把安装标志为MS_NOSUID、MS_NOEXEC和MS_NODEV的三个标志位从flags分离出来,放在局部安装标志变量mnt_flags中。
· 函数kern_path()(在更早的内核版本中是path_init()和path_walk())寻找安装点的dentry数据结构,找到的vfsmount和dentry存放在局部变量path的mnt域和dentry域中。
· 如果flags中的MS_REMOUNT标志位为1,就表示所要求的只是改变一个原已安装设备的安装方式,例如从“只读“安装方式改为“可写”安装方式,这是通过调用do_remount()函数完成的。
·
如果flags中的MS_BIND标志位为1,就表示进行“绑定安装”(bind mount):把目录树中一个已经可以访问的目录或文件(由dev_name给出其路径)再挂接到另一个安装点上,使同一个对象可以从两个位置访问。注意,虽然实现函数名为do_loopback(),它与/dev目录中的/dev/loop0、/dev/loop1等回接(loop)块设备并不是一回事;后者是把常规文件当作块设备使用的一种机制。绑定安装通过调用do_loopback()来实现。
· 如果flags中的MS_MOVE标志位为1,就表示把一个已安装的设备可以移到另一个安装点,这是通过调用do_move_mount()函数来实现的。
·
如果不是以上几种情况(重新安装、绑定安装、改变传播类型、移动安装),那就是一般的安装请求,于是把安装点加入到目录树中,这是通过调用do_new_mount()函数实现的,而do_new_mount()首先调用do_kern_mount()函数形成一个安装点,该函数的代码在fs/super.c中:
////
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
*利用do_kern_mount为用户空间生成一个新的挂载,并do_add_mount把新安装加入到命名空间树上
*
*/
//// *path: 挂载目录, *name:挂载设备,*type:挂载文件系统类型
/*
 * Create a new mount for user space and ask for it to be added to the
 * namespace tree.
 *
 * @path:      mount point
 * @type:      file system type name (kernel copy, at most one page)
 * @flags:     remaining MS_* flags
 * @mnt_flags: per-mountpoint MNT_* flags
 * @name:      device name
 * @data:      file-system specific option data
 *
 * Returns -EINVAL for a missing or unterminated type string, -EPERM without
 * CAP_SYS_ADMIN, the error from do_kern_mount(), or the result of
 * do_add_mount().
 */
static int do_new_mount(struct path *path, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *new_mnt;	/* describes the mounted file system */

	/* The type string must exist and be NUL-terminated within one page. */
	if (!type)
		return -EINVAL;
	if (!memchr(type, 0, PAGE_SIZE))
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* NOTE(review): the original notes say 2.6.32 appears to take lock_kernel() here. */

	/* Build and fill in the vfsmount. */
	new_mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(new_mnt))
		return PTR_ERR(new_mnt);

	/* Hook it into the namespace tree. */
	return do_add_mount(new_mnt, path, mnt_flags, NULL);
}
///////fs/super.c
检查文件系统类型以决定安装操作要如何完成
/*
fstype:要安装的文件系统的类型名
flags:安装标志
name:存放文件系统的块设备路径名
data:指向传递给文件系统的read_super方法的附加数据的指针
*/
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
//get_fs_type()在文件系统类型链表中搜索并且确定存放在fstype参数中的名字的位置;
//返回局部变量type中的对应file_system_type描述//符的地址
////获取挂载文件系统的类型结构,
struct file_system_type *type = get_fs_type(fstype);
struct vfsmount *mnt;
if (!type)
.return ERR_PTR(-ENODEV);
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
mnt = fs_set_subtype(mnt, fstype);
put_filesystem(type);
return mnt;
}· 只有系统管理员才具有安装一个设备的权力,因此首先要检查当前进程是否具有这种权限。
· get_fs_type()函数根据具体文件系统的类型名,在已注册的file_system_type结构链表中找到相应的结构。
· alloc_vfsmnt()函数调用Slab分配器给类型为vfsmount结构的局部变量mnt分配空间,并进行相应的初始化。
· set_devname()函数设置设备名。
· 一般的文件系统类型要求有物理的设备作为其物质基础,如果fs_flags中的FS_REQUIRES_DEV标志位为1,说明这就是正常的文件系统类型,如Ext2、Minix等。对于这种文件系统类型,通过调用get_sb_bdev()从待安装设备上读其超级块。
· 如果fs_flags中的FS_SINGLE标志位为1,说明整个文件系统只有一个类型,也就是说,这是一种虚拟的文件系统类型。这种文件类型在安装了同类型的第一个“设备”、通过调用get_sb_single()创建了超级块super_block结构后,再安装的同类型设备就共享这个数据结构。但是像Ext2这样的文件系统类型在每个具体设备上都有一个超级块。
· 还有些文件系统类型的fs_flags中的FS_NOMOUNT、FS_REQUIRES_DEV以及FS_SINGLE标志位全都为0,那么这些所谓的文件系统其实是“虚拟的”,通常只是用来实现某种机制或者规程,所以根本就没有对应的物理设备。对于这样的文件系统类型都是通过get_sb_nodev()来生成一个super_block结构的。
· 如果文件类型fs_flags的FS_NOMOUNT标志位为1,说明根本就没有用户进行安装,因此,把超级块中的MS_NOUSER标志位置1。
· mnt->mnt_sb指向所安装设备的超级块sb;mnt->mnt_root指向其超级块的根sb->s_root,dget()函数把dentry的引用计数count加1;mnt->mnt_mountpoint也指向超级块的根,而mnt->mnt_parent指向自己。到此为止,仅仅形成了一个安装点,但还没有把这个安装点挂接在目录树上。
////////////////////////////
/*
 * Allocate a vfsmount for @name and have the file system type's get_sb()
 * method attach a super block to it.
 *
 * On success the new mount is its own parent, its mountpoint is its own
 * root, and the super block's s_umount has been released with up_write().
 * On failure an ERR_PTR() is returned and everything acquired here is
 * released again.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct vfsmount *new_mnt;
	char *sec_opts = NULL;
	int err;

	if (!type)
		return ERR_PTR(-ENODEV);

	/*
	 * Allocate a new mounted-file-system descriptor for the device name;
	 * per the original notes this also initialises mnt_devname and the
	 * list heads.
	 */
	new_mnt = alloc_vfsmnt(name);
	if (!new_mnt) {
		err = -ENOMEM;
		goto fail;
	}

	/* Text mount data gets a separate copy for the security module. */
	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
		sec_opts = alloc_secdata();
		if (!sec_opts) {
			err = -ENOMEM;
			goto fail_free_mnt;
		}

		err = security_sb_copy_data(data, sec_opts);
		if (err)
			goto fail_free_secdata;
	}

	/*
	 * Call the concrete file system's get_sb(), which is expected to
	 * allocate/initialise a super block and attach it to the vfsmount.
	 */
	err = type->get_sb(type, flags, name, data, new_mnt);
	if (err < 0)
		goto fail_free_secdata;
	BUG_ON(!new_mnt->mnt_sb);

	err = security_sb_kern_mount(new_mnt->mnt_sb, sec_opts);
	if (err)
		goto fail_drop_sb;

	/* Not attached anywhere yet: mountpoint is our own root, parent is ourselves. */
	new_mnt->mnt_mountpoint = new_mnt->mnt_root;
	new_mnt->mnt_parent = new_mnt;
	up_write(&new_mnt->mnt_sb->s_umount);
	free_secdata(sec_opts);
	return new_mnt;

fail_drop_sb:
	dput(new_mnt->mnt_root);
	up_write(&new_mnt->mnt_sb->s_umount);
	deactivate_super(new_mnt->mnt_sb);
fail_free_secdata:
	free_secdata(sec_opts);
fail_free_mnt:
	free_vfsmnt(new_mnt);
fail:
	return ERR_PTR(err);
}
//////////////////type->get_sb//////////////////////////////////////////
以sysfs文件系统为例fs/sysfs/mount.c
/*
 * File system type descriptor for sysfs: super blocks are obtained through
 * sysfs_get_sb() and torn down with kill_anon_super().
 */
static struct file_system_type sysfs_fs_type = {
	.name		= "sysfs",
	.get_sb		= sysfs_get_sb,
	.kill_sb	= kill_anon_super,
};
/*
 * get_sb() method of sysfs: delegate to get_sb_single() with
 * sysfs_fill_super() as the super block initialiser.  @dev_name is not used.
 */
static int sysfs_get_sb(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data,
			struct vfsmount *mnt)
{
	int err;

	err = get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
	return err;
}
/*
 * Obtain the super block for @fs_type via sget() with compare_single /
 * set_anon_super, initialise it with @fill_super if it has no root yet,
 * then remount it with @flags/@data and attach it to @mnt.
 *
 * Returns the result of simple_set_mnt(), or a negative error from sget()
 * or @fill_super.
 */
int get_sb_single(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt)
{
	struct super_block *sb;
	int silent;
	int err;

	/*
	 * Per the original notes: sget() searches the type's list of super
	 * blocks and returns a matching one, otherwise it allocates and
	 * initialises a new super block, links it into the file system's list
	 * and the global super block list, and returns that.
	 */
	sb = sget(fs_type, compare_single, set_anon_super, NULL);
	if (IS_ERR(sb))
		return PTR_ERR(sb);

	if (!sb->s_root) {
		/* No root yet: let the caller-supplied fill_super initialise it. */
		silent = (flags & MS_SILENT) ? 1 : 0;
		sb->s_flags = flags;
		err = fill_super(sb, data, silent);
		if (err) {
			up_write(&sb->s_umount);
			deactivate_super(sb);
			return err;
		}
		sb->s_flags |= MS_ACTIVE;
	}

	/* The return value of do_remount_sb() is ignored here, as in the original. */
	do_remount_sb(sb, flags, data, 0);
	return simple_set_mnt(mnt, sb);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////fs/filesystems.c
/*
 * Look up a registered file system by the first @len bytes of @name under
 * file_systems_lock and take a reference on its owning module.  Returns
 * NULL when nothing matches or the module reference cannot be taken.
 */
static struct file_system_type *get_fs_type_locked(const char *name,
						   unsigned len)
{
	struct file_system_type *found;

	read_lock(&file_systems_lock);
	found = *(find_filesystem(name, len));
	if (found && !try_module_get(found->owner))
		found = NULL;
	read_unlock(&file_systems_lock);

	return found;
}

/*
 * Find the file system type for @name, where @name may be "type" or
 * "type.subtype"; only the part before the first '.' is used for the lookup.
 * If the type is not registered, try loading a module of that name and look
 * again.  A name carrying a subtype is rejected unless the type sets
 * FS_HAS_SUBTYPE.
 *
 * Returns the type with a module reference held (release it with
 * put_filesystem()), or NULL.
 */
struct file_system_type *get_fs_type(const char *name)
{
	struct file_system_type *fs;
	const char *dot = strchr(name, '.');
	unsigned len;

	if (dot)
		len = dot - name;
	else
		len = strlen(name);

	fs = get_fs_type_locked(name, len);

	/* Load a module using the user mode module loader, then retry. */
	if (!fs && (request_module("%.*s", len, name) == 0))
		fs = get_fs_type_locked(name, len);

	if (fs && dot && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
		put_filesystem(fs);
		fs = NULL;
	}
	return fs;
}
/////////fs/filesystems.c
/* Drop the module reference held on behalf of file system type @fs. */
void put_filesystem(struct file_system_type *fs)
{
	struct module *owner = fs->owner;

	module_put(owner);
}
该函数的功能是将一个特定模块module的引用计数减1 ,这样当一个模块的引用计数因为不为0而不能从内核中卸载时,可以调用此函数一次或多次,实现对模块计数的清零,从而实现模块卸载。
/*
 * Decrement the current CPU's reference counter of @module; a NULL @module
 * is a no-op.  If the module is no longer live, wake the task recorded in
 * module->waiter.
 */
void module_put(struct module *module)
{
	unsigned int cpu;

	if (!module)
		return;

	/* The per-CPU counter update is bracketed by get_cpu()/put_cpu(). */
	cpu = get_cpu();
	local_dec(&module->ref[cpu].count);
	/* Maybe they're waiting for us to drop reference? */
	if (unlikely(!module_is_live(module)))
		wake_up_process(module->waiter);
	put_cpu();
}
///////被do_new_mount()调用、fs/namespace.c
/*
* add a mount into a namespace's mount tree
* - provide the option of adding the new mount to an expiration list
*/
/*
 * Add @newmnt to the namespace's mount tree at @path.
 *
 * @newmnt:    the mount to attach
 * @path:      where to attach it; may be advanced past mounts already
 *             stacked on it
 * @mnt_flags: MNT_* flags stored into @newmnt
 * @fslist:    optional expiration list to add @newmnt to (may be NULL)
 *
 * Runs with namespace_sem held for writing.  Returns 0 on success.  On any
 * failure the reference on @newmnt is dropped with mntput() and a negative
 * errno is returned: -EINVAL, -EBUSY, or the error from graft_tree().
 */
int do_add_mount(struct vfsmount *newmnt, struct path *path,
		 int mnt_flags, struct list_head *fslist)
{
	int err;

	/* Take the semaphore for writing: the namespace is about to be modified. */
	down_write(&namespace_sem);

	/*
	 * Something was mounted here while we slept.
	 *
	 * Reading a super block from a device takes a long time and the
	 * current process almost certainly sleeps while waiting, so another
	 * process may have mounted something on the same mount point first.
	 * d_mountpoint() detects that; follow_down() then advances to the
	 * root of what is mounted there, and the loop repeats until an
	 * unoccupied mount point is reached.
	 */
	while (d_mountpoint(path->dentry) &&
	       follow_down(&path->mnt, &path->dentry))
		;

	/* NOTE(review): check_mnt() presumably verifies the mount belongs to the caller's namespace - confirm. */
	err = -EINVAL;
	if (!check_mnt(path->mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	/* The root of the new mount must not be a symbolic link. */
	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt_flags = mnt_flags;

	/*
	 * graft_tree() attaches the new mount to the mount tree; per the
	 * original notes it inserts it into the namespace list, the hash
	 * table and the parent's list of children.
	 */
	err = graft_tree(newmnt, path);
	if (err)
		goto unlock;

	if (fslist) /* add to the specified expiration list */
		list_add_tail(&newmnt->mnt_expire, fslist);

	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput(newmnt);
	return err;
}
/////////fs/namespace.c
· 调用graft_tree()把mnt与安装树挂接起来,完成最终的安装。
static int graft_tree(struct vfsmount *mnt, struct path *path)
{
int err;
if (mnt->mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
if (S_ISDIR(path->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt_root->d_inode->i_mode))
return -ENOTDIR;
err = -ENOENT;
mutex_lock(&path->dentry->d_inode->i_mutex);
if (IS_DEADDIR(path->dentry->d_inode))
goto out_unlock;
err = security_sb_check_sb(mnt, path);
if (err)
goto out_unlock;
err = -ENOENT;
if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
mutex_unlock(&path->dentry->d_inode->i_mutex);
if (!err)
security_sb_post_addmount(mnt, path);
return err;
}
////////
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
* @parent_nd : if non-null, detach the source_mnt from its parent and
* store the parent mount and mountpoint dentry.
* (done when source_mnt is moved)
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
* ---------------------------------------------------------------------------
* | BIND MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (++) | shared (+) | shared(+++)| invalid |
* | | | | | |
* |non-shared| shared (+) | private | slave (*) | invalid |
* ***************************************************************************
* A bind operation clones the source mount and mounts the clone on the
* destination mount.
*
* (++) the cloned mount is propagated to all the mounts in the propagation
* tree of the destination mount and the cloned mount is added to
* the peer group of the source mount.
* (+) the cloned mount is created under the destination mount and is marked
* as shared. The cloned mount is added to the peer group of the source
* mount.
* (+++) the mount is propagated to all the mounts in the propagation tree
* of the destination mount and the cloned mount is made slave
* of the same master as that of the source mount. The cloned mount
* is marked as 'shared and slave'.
* (*) the cloned mount is made a slave of the same master as that of the
* source mount.
*
* ---------------------------------------------------------------------------
* | MOVE MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (+) | shared (+) | shared(+++) | invalid |
* | | | | | |
* |non-shared| shared (+*) | private | slave (*) | unbindable |
* ***************************************************************************
*
* (+) the mount is moved to the destination. And is then propagated to
* all the mounts in the propagation tree of the destination mount.
* (+*) the mount is moved to the destination.
* (+++) the mount is moved to the destination and is then propagated to
* all the mounts belonging to the destination mount's propagation tree.
* the mount is marked as 'shared and slave'.
* (*) the mount continues to be a slave at the new location.
*
* if the source mount is a tree, the operations explained above is
* applied to each mount in the tree.
* Must be called without spinlocks held, since this function can sleep
* in allocations.
*/
fs/namespace.c
/*
 * Attach the mount tree rooted at source_mnt to the location given by path.
 * When parent_path is non-NULL, source_mnt is first detached from its
 * current parent (a move); otherwise it is attached as a new mount.
 * Returns 0 on success, or the error from invent_group_ids() /
 * propagate_mnt().
 */
static int attach_recursive_mnt(struct vfsmount *source_mnt,
struct path *path, struct path *parent_path)
{
/* Mounts handed back by propagate_mnt(); committed further down. */
LIST_HEAD(tree_list);
struct vfsmount *dest_mnt = path->mnt;
struct dentry *dest_dentry = path->dentry;
struct vfsmount *child, *p;
int err;
/* A shared destination needs group ids on the source tree first. */
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
goto out;
}
err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
if (err)
goto out_cleanup_ids;
/* Attaching under a shared mount: mark every mount of the source tree shared. */
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
/* All tree modifications below happen under vfsmount_lock. */
spin_lock(&vfsmount_lock);
if (parent_path) {
/* Move: unhook from the old parent, hook in at the new location. */
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, path);
touch_mnt_namespace(current->nsproxy->mnt_ns);
} else {
/* New attachment: set the mountpoint, then commit the tree. */
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
}
/* Unlink each entry of tree_list from the list and commit it as well. */
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
spin_unlock(&vfsmount_lock);
return 0;
out_cleanup_ids:
/* Undo invent_group_ids(), which only ran for a shared destination. */
if (IS_MNT_SHARED(dest_mnt))
cleanup_group_ids(source_mnt, NULL);
out:
return err;
}