Chinaunix首页 | 论坛 | 博客
  • 博客访问: 2762181
  • 博文数量: 79
  • 博客积分: 30130
  • 博客等级: 大将
  • 技术积分: 2608
  • 用 户 组: 普通用户
  • 注册时间: 2007-10-22 14:58
个人简介

博所搬至http://xiaogr.com

文章存档

2015年(2)

2009年(3)

2008年(56)

2007年(18)

分类: LINUX

2008-05-16 18:08:55

------------------------------------------
本文系本站原创,欢迎转载!
转载请注明出处:http://ericxiao.cublog.cn/
------------------------------------------
文件系统是操作系统的一个重要的功能,linux提供了对各种文件系统的支持,具有极高的扩展性.文件系统这一部份也是linux内核中难以理解的一部份,因为它与很多子系统有关.有时候还需要涉及到具体的磁盘分区格式.下面的代码分析以ext2格式为基础.在分析的过程中会遇到块设备操作的一些API,暂且将它放至一边。块设备的操作会以单独的专题做讨论。
一:与文件系统有关的数据结构
1.1:VFS相关的数据结构
在文件系统中,涉及到最多的就是super_block,inode,dentry这几个结构。先列出这几个结构中各成员所代表的含义,具体的作用等到代码遇到再进行分析。这几个结构在内核中的定义如下:
struct super_block {
//用来形成一个链表
     struct list_head   s_list;       /* Keep this first */
     //super_block所对应的设备
     dev_t              s_dev;        /* search index; _not_ kdev_t */
     //以字节为单位的块大小
     unsigned long      s_blocksize;
     unsigned long      s_old_blocksize;
     //以位为单位的块大小
     unsigned char      s_blocksize_bits;
     //“脏”标志
     unsigned char      s_dirt;
     //文件大小的上限
     unsigned long long s_maxbytes;   /* Max file size */
     //所属的文件系统
     struct file_system_type *s_type;
     //super_block的操作
     struct super_operations *s_op;
     //磁盘限额的方法
     struct dquot_operations *dq_op;
     //配置磁盘限额的方法
     struct quotactl_ops    *s_qcop;
     //导出的方法
     struct export_operations *s_export_op;
     //所对应的标志
     unsigned long      s_flags;
     //文件系统的魔数
     unsigned long      s_magic;
     //目录登录点
     struct dentry      *s_root;
     //为避免竞争,所用的rw_semaphore
     struct rw_semaphore    s_umount;
     struct semaphore   s_lock;
     int           s_count;
     //文件系统的同步标志
     int           s_syncing;
     //尚末将文件系统同步
     int           s_need_sync_fs;
     //活动引用计数
     atomic_t      s_active;
     //安全模块
     void                    *s_security;
     struct xattr_handler   **s_xattr;
     //脏节点链表
     struct list_head   s_dirty; /* dirty inodes */
     //回写链表
     struct list_head   s_io;         /* parked for writeback */
     //匿名分区
     struct hlist_head  s_anon;       /* anonymous dentries for (nfs) exporting */
     //被分区的文件链表
     struct list_head   s_files;
     //相关的块设备
     struct block_device    *s_bdev;
     //该类型的文件系统
     struct list_head   s_instances;
     struct quota_info  s_dquot; /* Diskquota specific options */
     //如果该标志被置位,则禁止使用该文件系统(可能需要等待某些操作的完成,例如同步)
     int           s_frozen;
     //等待队列
     wait_queue_head_t  s_wait_unfrozen;
     //设备的名称
     char s_id[32];                   /* Informational name */
     //指向特定的文件系统的信息(这个是一个统一的结构,毕竟不同的文件系统有很多不同的信息)
     void               *s_fs_info;   /* Filesystem private info */
 
     /*
      * The next field is for VFS *only*. No filesystems have any business
      * even looking at it. You had been warned.
      */
      //删除时使用的信号量
     struct semaphore s_vfs_rename_sem;   /* Kludge */
}
 
struct inode {
     //散列表
     struct hlist_node  i_hash;
     //用来形成链表
     struct list_head   i_list;
     //目录项链表
     struct list_head   i_dentry;
     //索引节点号
     unsigned long      i_ino;
     //引用计数
     atomic_t      i_count;
     //访问权根控制
     umode_t            i_mode;
     //硬链接数目
     unsigned int       i_nlink;
     //使用者id
     uid_t              i_uid;
     //使用者的gid
     gid_t              i_gid;
     //所属的设备
     dev_t              i_rdev;
     //以字节为单位的文件大小
     loff_t             i_size;
     //最后访问时间
     struct timespec        i_atime;
     //最后修改时间
     struct timespec        i_mtime;
     //最后改变时间
     struct timespec        i_ctime;
     //以位为单位的块大小
     unsigned int       i_blkbits;
     //以字节为单位的块大小
     unsigned long      i_blksize;
     //版本号
     unsigned long      i_version;
     //文件的块数
     unsigned long      i_blocks;
     //使用的字节数
     unsigned short          i_bytes;
     //如果该成员被置为1```则这个inode表示的是一个套接字
     unsigned char      i_sock;
     //索引结点的自旋锁
     spinlock_t         i_lock;  /* i_blocks, i_bytes, maybe i_size */
     //索引结点的信号量
     struct semaphore   i_sem;
     struct rw_semaphore    i_alloc_sem;
     //索引结点的操作列表
     struct inode_operations *i_op;
     //索引结点所属文件的操作列表
     struct file_operations *i_fop;  /* former ->i_op->default_file_ops */
     //索引结点的所属super_block
     struct super_block *i_sb;
     //文件锁链表
     struct file_lock   *i_flock;
     //把向所属的页面缓存
     struct address_space   *i_mapping;
     struct address_space   i_data;
#ifdef CONFIG_QUOTA
     struct dquot       *i_dquot[MAXQUOTAS];
#endif
     /* These three should probably be a union */
     //块设备链表
     struct list_head   i_devices;
     //管道信息
     struct pipe_inode_info *i_pipe;
     //所属块设备
     struct block_device    *i_bdev;
     //所属的字符设备
     struct cdev        *i_cdev;
     int           i_cindex;
     //索引结点的版本号
     __u32             i_generation;
     //目录通知掩码
     unsigned long      i_dnotify_mask; /* Directory notify events */
     //目录通知
     struct dnotify_struct  *i_dnotify; /* for directory notifications */
     //状态标志
     unsigned long      i_state;
     //首次修改时间
     unsigned long      dirtied_when; /* jiffies of first dirtying */
     //文件系统标志
     unsigned int       i_flags;
     //写者计数
     atomic_t      i_writecount;
     //安全模块
     void          *i_security;
     //文件的特殊信息
     union {
         void     *generic_ip;
     } u;
#ifdef __NEED_I_SIZE_ORDERED
     seqcount_t         i_size_seqcount;
#endif
}
 
struct dentry {
     //引用计数
     atomic_t d_count;
     //目录项标识
     unsigned int d_flags;       /* protected by d_lock */
     //单目录项
     spinlock_t d_lock;     /* per dentry lock */
     //相关的inode
     struct inode *d_inode;      /* Where the name belongs to - NULL is
                        * negative */
     /*
      * The next three fields are touched by __d_lookup.  Place them here
      * so they all fit in a 16-byte range, with 16-byte alignment.
      */
      //父目录中的目录结构
     struct dentry *d_parent;    /* parent directory */
     //散队表头
     struct hlist_head *d_bucket;     /* lookup hash bucket */
     //目录项的名字
     struct qstr d_name;
     //LRU链表
     struct list_head d_lru;     /* LRU list */
     //父目录的子目录链表
     struct list_head d_child;   /* child of parent list */
     //目录的子目录链表
     struct list_head d_subdirs; /* our children */
     //索引结点的别名链表
     struct list_head d_alias;   /* inode alias list */
     //重新生效的时间
     unsigned long d_time;       /* used by d_revalidate */
     //目录项的操作列表
     struct dentry_operations *d_op;
     //目录项的所属super_block
     struct super_block *d_sb;   /* The root of the dentry tree */
    
     void *d_fsdata;             /* fs-specific data */
     void * d_extra_attributes;  /* TUX-specific data */
     //RCU锁
     struct rcu_head d_rcu;
     struct dcookie_struct *d_cookie; /* cookie, if any */
     //所属散列表
     struct hlist_node d_hash;   /* lookup hash list */
     //是否有文件系统被挂载到此目录下
     int d_mounted;
     //短文件名
     unsigned char d_iname[DNAME_INLINE_LEN_MIN];   /* small names */
}
 
在这里要注意的是,不管目录还是普通文件在文件系统中都是对应的文件,只是文件的类型不一样,都有一个dentry项,dentry项对应一个inode.inode和dentry都有一个成员指向文件系统的super_block.
 
1.2:与进程相关的结构
在task_struct的定义中,包含两个与文件系统相关的成员:fs和files.它们的结构定义如下所示:
/*
 * Per-process filesystem context: root, working and alternate-root
 * directories together with the mounts they live on.
 */
struct fs_struct {
     // reference count of this structure
     atomic_t count;
     // read/write lock
     rwlock_t lock;
     // default file access permission mask
     int umask;
     // the process's root directory, current directory and alternate root
     struct dentry * root, * pwd, * altroot;
     // mounts corresponding to the root, current and alternate-root directories
     struct vfsmount * rootmnt, * pwdmnt, * altrootmnt;
};
其中,在x86平台中,altroot与altrootmnt的值为空.
 
files对应的数据结构为files_struct.它的结构定义如下:
struct files_struct {
     //结构体的使用计数
        atomic_t count;
        spinlock_t file_lock;     /* Protects all the below members.  Nests inside tsk->alloc_lock */
     //文件对象数的上限
        int max_fds;
     //文件描述符的上限
        int max_fdset;
     //下一个文件描述符
        int next_fd;
     //全部文件对象数组
        struct file ** fd;      /* current fd array */
     //exec()关闭的文件描述符
        fd_set *close_on_exec;
     //打开的文件描述符指针
        fd_set *open_fds;
     //exe()关闭的初始化文件
        fd_set close_on_exec_init;
     //文件描述符的初始集合
        fd_set open_fds_init;
     //默认的文件对象数组
        struct file * fd_array[NR_OPEN_DEFAULT];
}
 
 
二:路径名的查找
在应用中,经常有为给定的路径寻找结点的操作,例如cd /home/eric/kernel_study.这个操作是经常需要用到的操作,在进行深入的文件系统学习前,有必要先了解一下这个操作的实现.
在内核中,path_lookup()用来查找一个给定路径所对应的文件结点。它的代码如下:
/*
 * path_lookup - resolve the path string @name into @nd.
 *
 * @name:  path to resolve; a leading '/' makes it an absolute path.
 * @flags: LOOKUP_* flags, copied into nd->flags.
 * @nd:    receives the starting point and, after link_path_walk(), the result.
 *
 * Returns 0 when __emul_lookup_dentry() resolved the name through the
 * alternate root; otherwise returns whatever link_path_walk() returns.
 */
int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
{
     int retval;

     /* Before the walk starts, the last component type is LAST_ROOT. */
     nd->last_type = LAST_ROOT; /* if there are only slashes... */
     nd->flags = flags;
     nd->depth = 0;

     /* Take the read lock so current->fs is not changed underneath us. */
     read_lock(&current->fs->lock);
     /* A leading '/' means an absolute path, e.g. /home/eric */
     if (*name=='/') {
         /* Per the article, fs->altroot is NULL on x86. */
         if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
              nd->mnt = mntget(current->fs->altrootmnt);
              nd->dentry = dget(current->fs->altroot);
              /* The lock is dropped around the emulation lookup. */
              read_unlock(&current->fs->lock);
              if (__emul_lookup_dentry(name,nd))
                   return 0;
              read_lock(&current->fs->lock);
         }
         /* Start the search from the process's root directory. */
         nd->mnt = mntget(current->fs->rootmnt);
         nd->dentry = dget(current->fs->root);
     } else {
         /* Relative path, e.g. eric/kernel_study: start from pwd. */
         nd->mnt = mntget(current->fs->pwdmnt);
         nd->dentry = dget(current->fs->pwd);
     }
     read_unlock(&current->fs->lock);
     /* No symbolic links have been followed yet in this lookup. */
     current->total_link_count = 0;
     retval = link_path_walk(name, nd);
     if (unlikely(current->audit_context
              && nd && nd->dentry && nd->dentry->d_inode))
         audit_inode(name,
                  nd->dentry->d_inode->i_ino,
                  nd->dentry->d_inode->i_rdev);
     return retval;
}
这个函数有三个参数:name表示路径的字符串。Flag表示搜索的标志.如下所示:
 
// if the last component is a symbolic link, follow it
#define LOOKUP_FOLLOW       1
// the last component must be a directory
#define LOOKUP_DIRECTORY    2
// more components remain to be looked up
#define LOOKUP_CONTINUE     4
// look up the parent directory of the last component
#define LOOKUP_PARENT       16
// do not search the alternate root; per the article, unused on x86
#define LOOKUP_NOALT        32
// the lookup must not be interrupted (per the article; in do_lookup()
// a non-zero atomic argument makes it return -EWOULDBLOCKIO instead of
// calling real_lookup() or d_revalidate)
#define LOOKUP_ATOMIC       64
 
/*
 * Intent data
 */
 // purpose of the lookup
#define LOOKUP_OPEN         (0x0100)
#define LOOKUP_CREATE       (0x0200)
#define LOOKUP_ACCESS       (0x0400)
 
最后一个参数nameidata用来存放中间信息和搜索的结果.它的结构如下:
/* Holds the intermediate state and the result of a path lookup. */
struct nameidata {
     // dentry reached by the most recent lookup step
     struct dentry *dentry;
     // mount that dentry belongs to
     struct vfsmount *mnt;
     // name of the last component searched
     struct qstr   last;
     // lookup flags; same as the flags argument of path_lookup()
     unsigned int  flags;
     // type of the last component (LAST_ROOT, LAST_NORM, LAST_DOT, LAST_DOTDOT)
     int      last_type;
     // symbolic-link nesting depth of the lookup
     unsigned depth;
     char *saved_names[MAX_NESTED_LINKS + 1];

     /* Intent data */
     union {
         struct open_intent open;
     } intent;
};
上述代码会判断是一个相对路径还是绝对路径。将其信息设置在nd->dentry和nd->mnt中后转入link_path_walk()进行搜索.代码如下所示:
/*
 * Walk the path string @name one component at a time, starting from
 * nd->dentry / nd->mnt, updating @nd as each component is resolved.
 * Returns 0 on success. On the error paths that break out of the loop
 * path_release(nd) is called before the error is returned; when
 * do_follow_link() fails, the error is returned without that call.
 */
int fastcall link_path_walk(const char * name, struct nameidata *nd)
{
     struct path next;
     struct inode *inode;
     int err, atomic;
     unsigned int lookup_flags = nd->flags;
 
     // atomic is non-zero when the lookup flags contain LOOKUP_ATOMIC
     atomic = (lookup_flags & LOOKUP_ATOMIC);
 
     // skip the leading '/' characters of the path
     while (*name=='/')
         name++;
     // nothing left after the slashes: the walk is already finished
     if (!*name)
         goto return_reval;
 
     // inode of the directory being searched
     inode = nd->dentry->d_inode;
     // when nd->depth is non-zero, lookup_flags is replaced by LOOKUP_FOLLOW
     if (nd->depth)
         lookup_flags = LOOKUP_FOLLOW;
 
     /* At this point we know we have a real path component. */
     for(;;) {
         unsigned long hash;
         struct qstr this;
         unsigned int c;
 
         // check that we have the required permission on the directory
         err = exec_permission_lite(inode, nd);
         if (err == -EAGAIN) {
              err = permission(inode, MAY_EXEC, nd);
         }
         if (err)
              break;
 
         // record the current component in `this`; for eric/kernel_study/fs
         // the first component stored is "eric"
         this.name = name;
         c = *(const unsigned char *)name;
 
         hash = init_name_hash();
         // hash the component name, stopping at the terminating NUL or at
         // the end of the component (the next '/')
         do {
              name++;
              hash = partial_name_hash(c, hash);
              c = *(const unsigned char *)name;
         } while (c && (c != '/'));
         this.len = name - (const char *) this.name;
         this.hash = end_name_hash(hash);
 
         /* remove trailing slashes? */
         // !c: this is the last component and it has no trailing '/',
         // e.g. /home/eric
         if (!c)
              goto last_component;
         while (*++name == '/');
         if (!*name)
              // the last component is followed by '/', e.g. /home/eric/
              goto last_with_slashes;
 
         /*
          * "." and ".." are special - ".." especially so because it has
          * to be able to know about the current root directory and
          * parent relationships.
          */
          // the component starts with '.'
          // "." is the current directory, ".." is the parent directory
         if (this.name[0] == '.') switch (this.len) {
              default:
                   break;
              case 2: 
                   if (this.name[1] != '.')
                       break;
                   follow_dotdot(&nd->mnt, &nd->dentry);
                   inode = nd->dentry->d_inode;
                   /* fallthrough */
              case 1:
                   // "." (or ".." after follow_dotdot): go on to the next component
                   continue;
         }
         /*
          * See if the low-level filesystem might want
          * to use its own hash..
          */
 
         // the filesystem supplies its own hash function
         if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
              err = nd->dentry->d_op->d_hash(nd->dentry, &this);
              if (err < 0)
                   break;
         }
         // set LOOKUP_CONTINUE: this is not the last component
         nd->flags |= LOOKUP_CONTINUE;
         /* This does the actual lookups.. */
         // look `this` up inside nd->dentry; on success the matching
         // dentry is stored in next
         err = do_lookup(nd, &this, &next, atomic);
         // the lookup failed
         if (err)
              break;
         /* Check mountpoints.. */
         // handle a filesystem mounted on this directory
         follow_mount(&next.mnt, &next.dentry);
 
         err = -ENOENT;
 
         // validate the inode of the component just found
         inode = next.dentry->d_inode;
         if (!inode)
              goto out_dput;
         err = -ENOTDIR;
         if (!inode->i_op)
              goto out_dput;
 
         // the component has a follow_link operation, i.e. it is a link
         if (inode->i_op->follow_link) {
              mntget(next.mnt);
              err = do_follow_link(next.dentry, nd);
              dput(next.dentry);
              mntput(next.mnt);
              if (err)
                   goto return_err;
              err = -ENOENT;
              inode = nd->dentry->d_inode;
              if (!inode)
                   break;
              err = -ENOTDIR;
              if (!inode->i_op)
                   break;
         } else {
         // otherwise drop nd->dentry and move nd on to the values in next
              dput(nd->dentry);
              nd->mnt = next.mnt;
              nd->dentry = next.dentry;
         }
         err = -ENOTDIR;
         // the component has no lookup operation, so it cannot be searched
         if (!inode->i_op->lookup)
              break;
         // carry on with the next component
         continue;
         /* here ends the main loop */
 
// the last component ends with '/'
last_with_slashes:
         // add the LOOKUP_FOLLOW | LOOKUP_DIRECTORY flags
         lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
// the last component
last_component:
         nd->flags &= ~LOOKUP_CONTINUE;
         // with LOOKUP_PARENT the last component is not looked up; the
         // nameidata returned then describes the parent directory
         if (lookup_flags & LOOKUP_PARENT)
              goto lookup_parent;
 
         // without LOOKUP_PARENT the last component still has to be resolved
         if (this.name[0] == '.') switch (this.len) {
              default:
                   break;
              case 2: 
                   if (this.name[1] != '.')
                       break;
                   follow_dotdot(&nd->mnt, &nd->dentry);
                   inode = nd->dentry->d_inode;
                   /* fallthrough */
              case 1:
                   goto return_reval;
         }
         if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
              err = nd->dentry->d_op->d_hash(nd->dentry, &this);
              if (err < 0)
                   break;
         }
         err = do_lookup(nd, &this, &next, atomic);
         if (err)
              break;
         follow_mount(&next.mnt, &next.dentry);
         inode = next.dentry->d_inode;
         if ((lookup_flags & LOOKUP_FOLLOW)
             && inode && inode->i_op && inode->i_op->follow_link) {
              mntget(next.mnt);
              err = do_follow_link(next.dentry, nd);
              dput(next.dentry);
              mntput(next.mnt);
              if (err)
                   goto return_err;
              inode = nd->dentry->d_inode;
         } else {
              dput(nd->dentry);
              nd->mnt = next.mnt;
              nd->dentry = next.dentry;
         }
         err = -ENOENT;
         if (!inode)
              break;
         if (lookup_flags & LOOKUP_DIRECTORY) {
              err = -ENOTDIR;
              if (!inode->i_op || !inode->i_op->lookup)
                   break;
         }
         goto return_base;
lookup_parent:
         nd->last = this;
 
         // set last_type according to the name of the last component
         nd->last_type = LAST_NORM;
         if (this.name[0] != '.')
              goto return_base;
         if (this.len == 1)
              nd->last_type = LAST_DOT;
         else if (this.len == 2 && this.name[1] == '.')
              nd->last_type = LAST_DOTDOT;
         else
              goto return_base;
return_reval:
         /*
          * We bypassed the ordinary revalidation routines.
          * We may need to check the cached dentry for staleness.
          */
         if (nd->dentry && nd->dentry->d_sb &&
             (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
              err = -ESTALE;
              /* Note: we do not d_invalidate() */
              if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
                   break;
         }
return_base:
         return 0;
out_dput:
         dput(next.dentry);
         break;
     }
     path_release(nd);
return_err:
     return err;
}
上面的代码对最后一个结点进行了特殊的处理。如果定义了LOOKUP_PARENT,那只要搜索到倒数第二个结点就可以了。对于没有定义LOOKUP_PARENT的情况,都跟循环体中的处理是一样的.对于在搜索中遇到的特殊情况,分情况罗列如下:
2.1返回到上一级目录
如果路径中含有‘..’,则需要返回当前节点的上一级目录.这个过程是在follow_dotdot()中处理的。它的代码如下:
static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry)
{
     while(1) {
         struct vfsmount *parent;
         struct dentry *old = *dentry;
 
                read_lock(¤t->fs->lock);
 
         //已经到进程的根目录       
         if (*dentry == current->fs->root &&
             *mnt == current->fs->rootmnt) {
                        read_unlock(¤t->fs->lock);
              break;
         }
                read_unlock(¤t->fs->lock);
         spin_lock(&dcache_lock);
         //当前目录不是文件系统的挂载点。直接找其父目录就OK
         if (*dentry != (*mnt)->mnt_root) {
              *dentry = dget((*dentry)->d_parent);
              spin_unlock(&dcache_lock);
              dput(old);
              break;
         }
         spin_unlock(&dcache_lock);
         spin_lock(&vfsmount_lock);
 
         //当前目录是文件系统的挂载点。再往上的话就会到一个新的文件系统
 
         //那就取mnt为挂载点的vfsmount
         //dentry为挂载点的mountpoint
         //然后再循环一次
         parent = (*mnt)->mnt_parent;
         if (parent == *mnt) {
              spin_unlock(&vfsmount_lock);
              break;
         }
         mntget(parent);
         *dentry = dget((*mnt)->mnt_mountpoint);
         spin_unlock(&vfsmount_lock);
         dput(old);
         mntput(*mnt);
         *mnt = parent;
     }
 
     //如果找到的目录(要找的上一级目录)是一个挂载点。跳转到了最后一次挂载的文件系统
     follow_mount(mnt, dentry);
}
1):当前已经为用户的root目录了,不能再往上面走了,直接返回当前目录.
2):如果当前目录不是挂载目录,直接转到它的父目录就可以了。Dentry->d_parent指向它的父目录。注意这里的判断目录是否为挂载目录的条件(*dentry != (*mnt)->mnt_root).条件为当前目录不是文件系统的根目录。在后面可以看到,如果目录是挂载目录,则会替换为了文件系统的根目录.
3):如果当前目录是一个挂载目录,将目录替换为它的挂载目录,然后重新循环。mnt->mnt_parent指向挂载之前的文件系统. mnt->mnt_mountpoint会指向挂载的目录.
Follow_mount()是一个很重要的操作,它涉及到文件系统的挂载点处理.它的代码如下:
/*
 * While *dentry is a mountpoint, replace *mnt / *dentry with the mount
 * found by lookup_mnt() and that mount's root dentry.
 * Returns 1 if at least one mount was crossed, 0 otherwise.
 */
static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
     int res = 0;
     // d_mountpoint(): per the article, true when dentry->d_mounted is set,
     // i.e. a filesystem is mounted on this directory
     while (d_mountpoint(*dentry)) {
         // look the mount up in the hash table, keyed by the mountpoint's
         // vfsmount and dentry
         struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
         if (!mounted)
              break;
         mntput(*mnt);
         *mnt = mounted;
         dput(*dentry);
         *dentry = dget(mounted->mnt_root);
         res = 1;
     }
     return res;
}
这段代码涉及到文件系统挂载方面的知识点,看不太懂的等分析完文件系统的挂载后再来看这段代码.首先,在判断完目录下有挂载点之后,以挂载点的vfsmount与dentry到vfsmount的存储哈希表中找相应的vfsmount,如果找到了,就将mnt ,dentry转为挂载文件系统的mnt与dentry。然后再判断dentry下是否有文件系统的挂载,然后再做相同的处理.这样做是因为同一个目录下可以挂载多个不同的文件系统。例如在/mnt/ext2/可以先挂载一个ext2的磁盘,然后再挂载一个fat32的U盘。这时候,如果cd /mnt/ext2用ls查看只会看到fat32格式U盘里面的内容。对应到上面的处理过程中,当搜索到/mnt/ext2的时候,就会判断这个目录是否有挂载文件系统,这里的判断是成立的,就会将mnt与dentry转换成ext2磁盘的挂载dentry与mnt。然后循环再判断dentry是否有文件系统的挂载,这样的判断还会是成立的,因为之后又挂载了一个fat32.然后又会将dentry与mnt设置成fat32文件系统的dentry与vfsmount.
同理,在上面的情况中,如果有/mnt/ext2/..这样的路径的查找.因为该目录下挂载了文件系统,又是在文件系统的根目录下,所以就有dentry == mnt->mnt_root.然后将dentry与mnt设为其挂载点(mountpoint)的目录与文件系统.这样就会找到ext2的磁盘挂载目录.然后循环之后,又满足dentry == mnt->mnt_root.同样又会将dentry与mnt设为挂载点的相应信息.经过这样的处理之后,就会到了“干净”的/mnt/ext2/目录.这样做主要是为了保证当前目录与上一级目录在同一个文件系统中.
讲述到这里之后,顺便以lookup_mnt()来分析一下vfsmount是怎么样组织的.
首先vfsmount的结构如下:
struct vfsmount
{
     struct list_head mnt_hash;
     //父文件系统
     struct vfsmount *mnt_parent;     /* fs we are mounted on */
     //该文件系统的挂载目录
     struct dentry *mnt_mountpoint;   /* dentry of mountpoint */
     //该文件系统的根目录项
     struct dentry *mnt_root;    /* root of the mounted tree */
     //文件系统对应的超级块
     struct super_block *mnt_sb; /* pointer to superblock */
     struct list_head mnt_mounts;     /* list of children, anchored here */
     struct list_head mnt_child; /* and going through their mnt_child */
     atomic_t mnt_count;
     int mnt_flags;
     int mnt_expiry_mark;        /* true if marked for expiry */
     //设备名称
     char *mnt_devname;     /* Name of device e.g. /dev/dsk/hda1 */
     struct list_head mnt_list;
     struct list_head mnt_fslink;     /* link in fs-specific expiry list */
     struct namespace *mnt_namespace; /* containing namespace */
}
Lookup_mnt()的代码如下:
/*
 * Find the vfsmount whose parent is @mnt and whose mountpoint is @dentry.
 * Returns the mount with a reference taken via mntget(), or NULL if no
 * entry in the hash chain matches.
 */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
     // pick the hash chain from the mountpoint's vfsmount and dentry
     struct list_head * head = mount_hashtable + hash(mnt, dentry);
     struct list_head * tmp = head;
     struct vfsmount *p, *found = NULL;
 
     spin_lock(&vfsmount_lock);
     // walk the collision chain looking for a matching vfsmount;
     // arriving back at head means the chain has been exhausted
     for (;;) {
         tmp = tmp->next;
         p = NULL;
         if (tmp == head)
              break;
         p = list_entry(tmp, struct vfsmount, mnt_hash);
         if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
              found = mntget(p);
              break;
         }
     }
     spin_unlock(&vfsmount_lock);
     return found;
}
从上面可以看到,所有的vfsmount都是存储在mount_hashtable[]中,它以挂载点的vfsmount和dentry为关键字计算哈希值.
2.2:目录中的查找
在目录中进行有关结点的查找是由do_lookup()完成的,它的代码如下:
/*
     Look one path component up in a directory, trying the dentry cache
     first and falling back to the filesystem.

     nd:      holds the vfsmount and dentry of the directory being searched
     name:    name of the entry to look for
     path:    receives the result of the search
     atomic:  non-zero when LOOKUP_ATOMIC was set, otherwise 0

     Returns 0 on success, -EWOULDBLOCKIO when atomic is set and the
     cache alone cannot answer, or the error encoded by real_lookup().
   */
static int do_lookup(struct nameidata *nd, struct qstr *name,
              struct path *path, int atomic)
{
     struct vfsmount *mnt = nd->mnt;
     // __d_lookup: search the dentry cache
     struct dentry *dentry = __d_lookup(nd->dentry, name);
 
     // not found in the dentry cache: go to need_lookup
     if (!dentry)
         goto need_lookup;
     // the dentry defines d_revalidate: use it to check the cached entry
     if (dentry->d_op && dentry->d_op->d_revalidate)
         goto need_revalidate;
     // store the result of the search in path
done:
     path->mnt = mnt;
     path->dentry = dentry;
     return 0;
 
need_lookup:
     // in atomic mode only the dentry cache may be consulted
     if (atomic)
         return -EWOULDBLOCKIO;
     // ask the filesystem itself; the entry may simply not have been
     // read into the cache yet
     dentry = real_lookup(nd->dentry, name, nd);
     if (IS_ERR(dentry))
         goto fail;
     goto done;
 
need_revalidate:
     // validate the cached dentry
     if (atomic)
         return -EWOULDBLOCKIO;
     if (dentry->d_op->d_revalidate(dentry, nd))
         goto done;
     // revalidation failed: if d_invalidate() returns non-zero the dentry
     // is still used, otherwise it is dropped and looked up afresh
     if (d_invalidate(dentry))
         goto done;
     dput(dentry);
     goto need_lookup;
 
fail:
     return PTR_ERR(dentry);
}
该函数先会从目录项缓存中查找,如果没有找到,则会到文件系统对应的设备中查找。分成两个部份来分析这个操作.
2.2.1:在目录缓存中查找结点
因为操作系统经常会对文件系统中的数据进行定位,如果每次操作都从磁盘中读取有关目录信息,会降低整个操作系统的效率.基于此,为了提高效率,文件的inode与dentry都会存放在一个缓存区中.在看代码之前先想一下,到缓存区中查找dentry的关键字是什么?一方面它要包含它所在的位置,另一方面要包含它的名称.
__d_lookup()的代码如下所示:
/*
 * Search the dentry hash table for a child of @parent named @name.
 * Returns the dentry with d_count incremented, or NULL if there is no
 * hashed match. The hash chain is walked under rcu_read_lock(); each
 * candidate is re-checked under its own d_lock.
 */
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
     unsigned int len = name->len;
     unsigned int hash = name->hash;
     const unsigned char *str = name->name;
 
     // mix the parent into the hash and locate the bucket in dentry_hashtable
     struct hlist_head *head = d_hash(parent,hash);
     struct dentry *found = NULL;
     struct hlist_node *node;
 
     rcu_read_lock();
 
     // walk the hash chain
     hlist_for_each_rcu(node, head) {
         struct dentry *dentry;
         struct qstr *qstr;
 
         // get the dentry that contains this hlist node
         dentry = hlist_entry(node, struct dentry, d_hash);
 
         smp_rmb();
 
         // cheap checks before taking the lock
         if (dentry->d_name.hash != hash)
              continue;
         if (dentry->d_parent != parent)
              continue;
 
         spin_lock(&dentry->d_lock);
 
         /*
          * If lookup ends up in a different bucket due to concurrent
          * rename, fail it
          */
         if (unlikely(dentry->d_bucket != head))
              goto terminate;
 
         /*
          * Recheck the dentry after taking the lock - d_move may have
          * changed things.  Don't bother checking the hash because we're
          * about to compare the whole name anyway.
          */
         if (dentry->d_parent != parent)
              goto next;
 
         qstr = rcu_dereference(&dentry->d_name);
 
         // the parent supplies its own name-comparison function
         if (parent->d_op && parent->d_op->d_compare) {
              if (parent->d_op->d_compare(parent, qstr, name))
                   goto next;
         } else {
              if (qstr->len != len)
                   goto next;
              if (memcmp(qstr->name, str, len))
                   goto next;
         }
 
         // d_unhashed(): per the article, true when the dentry is not in
         // the hash table; only a hashed dentry is returned
         if (!d_unhashed(dentry)) {
              atomic_inc(&dentry->d_count);
              found = dentry;
         }
 
         // the name matched; if the dentry was unhashed, found stays NULL
         // and the lookup fails
terminate:
         spin_unlock(&dentry->d_lock);
         break;
next:
         spin_unlock(&dentry->d_lock);
     }
     rcu_read_unlock();
 
     return found;
}
d_hash()的操作如下所示:
/*
 * Map (parent dentry, name hash) to a bucket of dentry_hashtable.
 * The parent's address is folded into the name hash so that the same name
 * under different directories can land in different buckets.
 */
static inline struct hlist_head *d_hash(struct dentry *parent,
                       unsigned long hash)
{
     // fold the parent pointer into the hash
     hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
     // mix the high bits down before masking
     hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
     return dentry_hashtable + (hash & D_HASHMASK);
}
可见,目录项缓存是存放在dentry_hashtable[]中,它以要查找目录的dentry与它本身的名字计算得出的hash值为关键字进行匹配。这样就包含了“文件的位置”与“文件的名称”这两个信息.
在上面的代码中,我们要注意的是,在有的文件系统中,定义了文件名称的匹配操作,这样是因为可能因为文件系统命名方式的差异,导致了名称对比的不同操作。例如,不同的文件系统有不同的文件名称长度限制.
2.2.2:在具体的文件系统中进行文件的查找
这个操作是在real_lookup()中完成的,它的代码如下:
/*
 * Look @name up in directory @parent by asking the filesystem, with the
 * directory's i_sem held. Returns the dentry, or an ERR_PTR() value
 * (-ENOMEM if d_alloc() fails, -ENOENT if a cached entry fails
 * revalidation and is invalidated).
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
     struct dentry * result;
     // inode of the directory
     struct inode *dir = parent->d_inode;
 
     down(&dir->i_sem);
      // while we waited for the semaphore another process may have
      // operated on this directory inode, so check the cache again
      // for an already-built dentry
     result = d_lookup(parent, name);
     // still not cached: the filesystem has to be asked
     if (!result) {
         // allocate a new dentry for this directory and name
         struct dentry * dentry = d_alloc(parent, name);
         result = ERR_PTR(-ENOMEM);
         if (dentry) {
              // use the directory inode's lookup method to find the entry
              result = dir->i_op->lookup(dir, dentry, nd);
              // non-NULL return from ->lookup(): drop the dentry we just
              // allocated and pass that value back (the article describes
              // this as the failure case)
              if (result)
                   dput(dentry);
              else
                   // NULL return: hand back the new dentry
                   result = dentry;
         }
         up(&dir->i_sem);
         return result;
     }
 
     /*
      * Uhhuh! Nasty case: the cache was re-populated while
      * we waited on the semaphore. Need to revalidate.
      */
 
     up(&dir->i_sem);
     // found in the cache after all: it still has to be revalidated
     if (result->d_op && result->d_op->d_revalidate) {
         if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
              dput(result);
              result = ERR_PTR(-ENOENT);
         }
     }
     return result;
}
先来看分配一个新的dentry的过程,这个过程在d_alloc()中,代码如下:
/*
 * Allocate and initialise a dentry named @name under @parent (which may
 * be NULL). The new dentry starts with d_count 1 and DCACHE_UNHASHED set,
 * and has no inode. Returns NULL if either allocation fails.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
     struct dentry *dentry;
     char *dname;
 
     // allocate the dentry structure from its slab cache
     dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
     if (!dentry)
         return NULL;
 
     // a name longer than DNAME_INLINE_LEN-1 does not fit inline,
     // so a separate buffer is allocated to hold it
     if (name->len > DNAME_INLINE_LEN-1) {
         dname = kmalloc(name->len + 1, GFP_KERNEL);
         if (!dname) {
              kmem_cache_free(dentry_cache, dentry);
              return NULL;
         }
     } else  {
         // otherwise the inline buffer dentry->d_iname is used
         dname = dentry->d_iname;
     }   
     dentry->d_name.name = dname;
 
     dentry->d_name.len = name->len;
     dentry->d_name.hash = name->hash;
     // copy the name and NUL-terminate it
     memcpy(dname, name->name, name->len);
     dname[name->len] = 0;
 
     // initialise the remaining members
     atomic_set(&dentry->d_count, 1);
     dentry->d_flags = DCACHE_UNHASHED;
     dentry->d_lock = SPIN_LOCK_UNLOCKED;
     dentry->d_inode = NULL;
     dentry->d_parent = NULL;
     dentry->d_sb = NULL;
     dentry->d_op = NULL;
     dentry->d_fsdata = NULL;
     dentry->d_extra_attributes = NULL;
     dentry->d_mounted = 0;
     dentry->d_cookie = NULL;
     dentry->d_bucket = NULL;
     INIT_HLIST_NODE(&dentry->d_hash);
     INIT_LIST_HEAD(&dentry->d_lru);
     INIT_LIST_HEAD(&dentry->d_subdirs);
     INIT_LIST_HEAD(&dentry->d_alias);
 
     // set d_parent (taking a reference on the parent) and inherit d_sb
     // from it; a parentless dentry just gets an empty d_child
     if (parent) {
         dentry->d_parent = dget(parent);
         dentry->d_sb = parent->d_sb;
     } else {
         INIT_LIST_HEAD(&dentry->d_child);
     }
 
     spin_lock(&dcache_lock);
     // link the new entry into the parent's d_subdirs list
     if (parent)
         list_add(&dentry->d_child, &parent->d_subdirs);
     dentry_stat.nr_dentry++;
     spin_unlock(&dcache_lock);
 
     return dentry;
}
在这里值得注意的是对文件名称长度的处理.在本文的开头部份分析过dentry的结构。在dentry结构中,有一个叫d_iname的成员。如下所示:
struct dentry {
……
……
struct qstr d_name;
……
……
unsigned char d_iname[DNAME_INLINE_LEN_MIN];
}
#define DNAME_INLINE_LEN_MIN 36
如果文件名称长度没有超过d_iname的范围,就直接使d_name.name指向d_iname.如果文件名称超长,则新分配一个合适大小的空间,将d_name.name指向这个空间.
在上面的d_alloc()中,主要指定了dentry的super_block为父目录的super_block,并将dentry挂入父目录的d_subdirs.
调用d_alloc()分配了dentry之后,就会调用inode->i_op->lookup()在具体的文件系统中进行文件的查找了.关于具体文件系统的信息,先把问题搁到这里,等分析文件系统的挂载时,涉及到具体的文件系统再来分析这个操作.
2.2.3:符号链接的文件查找
符号链接的查找是在do_follow_link中完成的.它的代码如下:
/*
 * Follow the symbolic link @dentry, continuing the walk in @nd.
 * Returns -ELOOP (after path_release(nd)) when the nesting limit or the
 * total limit of 40 links is reached; a non-zero result from
 * security_inode_follow_link() is returned the same way. Otherwise
 * returns the result of the link's follow_link operation or of
 * __vfs_follow_link().
 */
static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
{
     int err = -ELOOP;
 
     // current->link_count: current nesting depth of link resolution
     if (current->link_count >= MAX_NESTED_LINKS)
         goto loop;
     // current->total_link_count: total number of symbolic links met so far
     if (current->total_link_count >= 40)
         goto loop;
     BUG_ON(nd->depth >= MAX_NESTED_LINKS);
     // presumably gives other processes a chance to run if a reschedule
     // is pending -- confirm against cond_resched()
     cond_resched();
     err = security_inode_follow_link(dentry, nd);
     if (err)
         goto loop;
     // bump the counters
     current->link_count++;
     current->total_link_count++;
     nd->depth++;
     // update the timestamp (presumably the access time, going by the
     // helper's name)
     touch_atime(nd->mnt, dentry);
     // per the article, clears nd->saved_names[nd->depth]
     nd_set_link(nd, NULL);
     // read the path stored in the link; per the article, on success it
     // is left in nd->saved_names[nd->depth]
     err = dentry->d_inode->i_op->follow_link(dentry, nd);
     if (!err) {
         char *s = nd_get_link(nd);
         if (s)
              err = __vfs_follow_link(nd, s);
         // call the put_link operation if the inode provides one
         if (dentry->d_inode->i_op->put_link)
              dentry->d_inode->i_op->put_link(dentry, nd);
     }
     current->link_count--;
     nd->depth--;
     return err;
loop:
     path_release(nd);
     return err;
}
先举个例子 /home/eric/eg是指向/mnt/hgfs的符号链接.如果要查找的路径是/home/eric/eg/fs.内核在读到eg这个分量的时候,就要读取它所链接到的文件,也就是/mnt/hgfs这个路径,然后再对这个路径继续解析下去.
很显然,上面是两个查找目录的过程,先是查找/home/eric/eg.然后转去查找/mnt/hgfs.找到这个目录对应的dentry之后,再转到这个dentry下面查找fs.对于这样的操作,很明显是一个递归的过程.
为了避免陷入无限的递归,linux内核对递归层次做出了限制.每次嵌套一次,current->link_count加1.每次退出嵌套时,current->link_count减1.当current->link_count达到MAX_NESTED_LINKS时,退出递归.
另外,为了避免恶意用户设置大量的符号链接,linux内核对符号链接的总数也进行限制.current->total_link_count存放了遇到的符号链接的数目,如果这个数目达到40,也会停止解析.
解析一个符号链接,首先要读取这个符号链接所指向的对象,这是由inode->i_op-> follow_link()完成的.
读取到链接对象之后,会调用__vfs_follow_link().转入看下这个操作.
/*
 * Resolve the target path of a symbolic link.
 *
 * @nd:   lookup state; walking continues from here for relative targets
 * @link: link target string, or an ERR_PTR-encoded error
 *
 * Returns 0 on success or a negative errno.  When @link is an error
 * pointer, @nd is released and the encoded error is returned.
 */
static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
     int res = 0;
     char *name;
     if (IS_ERR(link))
         goto fail;
 
     /*
      * An absolute path (starting with '/'): drop the current position and
      * restart from the root, resetting nd's dentry and vfsmount
      */
     if (*link == '/') {
         path_release(nd);
         if (!walk_init_root(link, nd))
              /* weird __emul_prefix() stuff did it */
              goto out;
     }
     /* Walk the link target with link_path_walk(); this is where the recursion happens */
     res = link_path_walk(link, nd);
out:
     /* Only the outermost (depth 0), successful, LAST_NORM case needs the copy below */
     if (nd->depth || res || nd->last_type!=LAST_NORM)
         return res;
     /*
      * If it is an iterative symlinks resolution in open_namei() we
      * have to copy the last component. And all that crap because of
      * bloody create() on broken symlinks. Furrfu...
      */
     name = __getname();
     if (unlikely(!name)) {
         path_release(nd);
         return -ENOMEM;
     }
     strcpy(name, nd->last.name);
     nd->last.name = name;
     return 0;
fail:
     path_release(nd);
     return PTR_ERR(link);
}
同理,在上述代码中遇到的依赖于特定文件系统的操作,我们先把它搁开. *^_^*
 
三:文件系统的挂载
在实际应用中,通常会将特定格式的硬件设备挂载到linux里,使linux能够对其进行读写.其实,在linux中,文件系统也包含有虚拟文件系统,这些文件系统一般都是存放在RAM中的。例如:ramfs , sysfs等.
文件系统会挂载到一个特定的点上,那这个点又是怎么生成的呢?我们先从系统初始化时,总根的挂载说起.
3.1:根文件系统的挂载
内核启动到start_kernel()会调用mnt_init().从这个函数说起:
/*
 * Boot-time initialisation of the mount infrastructure: creates the
 * vfsmount slab cache, allocates and initialises the mount hash table,
 * initialises sysfs, registers fs_subsys, registers rootfs and finally
 * builds the initial mount tree.
 *
 * @mempages: not used by the code shown here
 */
void __init mnt_init(unsigned long mempages)
{
     struct list_head *d;
     unsigned int nr_hash;
     int i;
     int err;
 
     init_rwsem(&namespace_sem);
 
     /* Create the slab cache for struct vfsmount */
     mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
              0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
     /* The vfsmount hash array occupies one page */
     mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
 
     if (!mount_hashtable)
         panic("Failed to allocate mount hash table\n");
 
     /*
      * Find the power-of-two list-heads that can fit into the allocation..
      * We don't guarantee that "sizeof(struct list_head)" is necessarily
      * a power-of-two.
      */
 
     /* Number of list heads that fit in the page */
     nr_hash = PAGE_SIZE / sizeof(struct list_head);
 
     /* hash_bits ends up as the index of the highest set bit of nr_hash */
     hash_bits = 0;
     do {
         hash_bits++;
     } while ((nr_hash >> hash_bits) != 0);
     hash_bits--;
 
     /*
      * Re-calculate the actual number of entries and the mask
      * from the number of bits we can fit.
      */
 
     /* Round the table size down to a power of two */
     nr_hash = 1UL << hash_bits;
     /* Mask used to reduce a hash value to a table index */
     hash_mask = nr_hash - 1;
 
     printk("Mount-cache hash table entries: %d\n", nr_hash);
 
     /* And initialize the newly allocated array */
     /* Every bucket starts out as an empty list */
     d = mount_hashtable;
     i = nr_hash;
     do {
         INIT_LIST_HEAD(d);
         d++;
         i--;
     } while (i);
     /* Initialise sysfs; a failure is only logged */
     err = sysfs_init();
     if (err)
         printk(KERN_WARNING "%s: sysfs_init error: %d\n",
              __FUNCTION__, err);
     err = subsystem_register(&fs_subsys);
     if (err)
         printk(KERN_WARNING "%s: subsystem_register error: %d\n",
              __FUNCTION__, err);
 
     /* Register the rootfs filesystem type */
     init_rootfs();
     /* Build the initial mount tree */
     init_mount_tree();
}
转到init_rootfs()中:
/*
 * Register the rootfs filesystem type.
 *
 * Returns whatever register_filesystem() returns for rootfs_fs_type.
 */
int __init init_rootfs(void)
{
     int status;

     status = register_filesystem(&rootfs_fs_type);
     return status;
}
其实它是注册了rootfs的文件系统。转进去看下代码:
/*
 * Add a filesystem type to the list of registered filesystems.
 *
 * @fs: the type to register; its name must not contain '.'
 *
 * Returns 0 on success, or -EBUSY when @fs is already linked into a list
 * (fs->next is set) or a type with the same name is already registered.
 */
int register_filesystem(struct file_system_type * fs)
{
     struct file_system_type **slot;
     int status;

     BUG_ON(strchr(fs->name, '.'));
     if (fs->next)
         return -EBUSY;

     /* Start with an empty list of superblocks for this type */
     INIT_LIST_HEAD(&fs->fs_supers);

     write_lock(&file_systems_lock);
     /* slot is either the link holding a same-named type or the list tail */
     slot = find_filesystem(fs->name, strlen(fs->name));
     if (!*slot) {
         /* Not registered yet: append at the tail */
         *slot = fs;
         status = 0;
     } else {
         /* A type with this name already exists */
         status = -EBUSY;
     }
     write_unlock(&file_systems_lock);

     return status;
}
其中find_filesystem()的代码如下:
/*
 * Search the file_systems list for a type whose name equals the first
 * @len bytes of @name.
 *
 * Returns the address of the link that points at the matching entry, or
 * the address of the terminating NULL link when there is no match (so the
 * caller can append there).
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
     struct file_system_type **link = &file_systems;

     while (*link) {
         /* Match on equal length first, then on content */
         if (strlen((*link)->name) == len &&
             strncmp((*link)->name, name, len) == 0)
              return link;
         link = &(*link)->next;
     }
     /* Ran off the end: link is the tail slot */
     return link;
}
综合上面所说的,init_rootfs()仅是将rootfs_fs_type挂到了file_systems链表上.
init_mount_tree()的代码如下:
/*
 * Mount rootfs, wrap it in the initial mount namespace and make its root
 * both the working directory and the root directory of the current task.
 * Panics if rootfs cannot be mounted or the namespace cannot be allocated.
 */
static void __init init_mount_tree(void)
{
     struct mnt_namespace *root_ns;
     struct vfsmount *rootfs_mnt;

     /* Mount the rootfs filesystem */
     rootfs_mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
     if (IS_ERR(rootfs_mnt))
         panic("Can't create rootfs");

     root_ns = kmalloc(sizeof(*root_ns), GFP_KERNEL);
     if (!root_ns)
         panic("Can't allocate initial namespace");

     /* Fresh namespace: one reference, no events, empty mount list */
     atomic_set(&root_ns->count, 1);
     root_ns->event = 0;
     init_waitqueue_head(&root_ns->poll);
     INIT_LIST_HEAD(&root_ns->list);

     /* Tie the rootfs mount and the namespace together */
     list_add(&rootfs_mnt->mnt_list, &root_ns->list);
     rootfs_mnt->mnt_ns = root_ns;
     root_ns->root = rootfs_mnt;

     init_task.nsproxy->mnt_ns = root_ns;
     get_mnt_ns(root_ns);

     /* Both cwd and root of the current task are the root of rootfs */
     set_fs_pwd(current->fs, rootfs_mnt, rootfs_mnt->mnt_root);
     set_fs_root(current->fs, rootfs_mnt, rootfs_mnt->mnt_root);
}
注意在这里的current进程是init进程。后续的进程全是它的子进程,也就是说后面进程继承了它的当前目录与root目录信息.
Rootfs的挂载是在do_kern_mount()中完成的。它的代码如下:
/*
 * Mount a filesystem given the name of its type.
 *
 * @fstype: filesystem type name, looked up with get_fs_type()
 * @flags:  mount flags passed on to vfs_kern_mount()
 * @name:   device name passed on to vfs_kern_mount()
 * @data:   filesystem-specific mount data
 *
 * Returns the new vfsmount, or an ERR_PTR (-ENODEV when the type is
 * unknown).  The type reference taken by the lookup is dropped before
 * returning.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
     struct vfsmount *mounted;
     /* Look up the filesystem type by name */
     struct file_system_type *fs = get_fs_type(fstype);

     if (!fs)
         return ERR_PTR(-ENODEV);

     /* The actual mount work */
     mounted = vfs_kern_mount(fs, flags, name, data);
     if (!IS_ERR(mounted)) {
         /* Fill in the subtype only when the type uses one and none is set */
         if ((fs->fs_flags & FS_HAS_SUBTYPE) &&
             !mounted->mnt_sb->s_subtype)
              mounted = fs_set_subtype(mounted, fstype);
     }

     /* Drop the reference on the filesystem type */
     put_filesystem(fs);
     return mounted;
}
vfs_kern_mount()的代码如下:
/*
 * Create a vfsmount for a filesystem of the given type.
 *
 * @type:  filesystem type; a NULL type yields ERR_PTR(-ENODEV)
 * @flags: mount flags handed to type->get_sb()
 * @name:  device name; used for the vfsmount and handed to get_sb()
 * @data:  filesystem-specific mount data, may be NULL
 *
 * Returns the new vfsmount on success or an ERR_PTR-encoded errno.  On
 * success the mount is its own parent and its mountpoint is its own root.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
     struct vfsmount *mnt;
     char *secdata = NULL;
     int error;
 
     if (!type)
         return ERR_PTR(-ENODEV);
 
     error = -ENOMEM;
     /* Allocate the vfsmount and set mnt->mnt_devname = name */
     mnt = alloc_vfsmnt(name);
     if (!mnt)
         goto out;
 
     /* Security data is only allocated and copied when mount data was supplied */
     if (data) {
         secdata = alloc_secdata();
         if (!secdata)
              goto out_mnt;
 
         error = security_sb_copy_data(type, data, secdata);
         if (error)
              goto out_free_secdata;
     }
 
     /*
      * Call type->get_sb() to create the super_block together with the
      * root dentry, inode and related information
      */
     error = type->get_sb(type, flags, name, data, mnt);
     if (error < 0)
         goto out_free_secdata;
     BUG_ON(!mnt->mnt_sb);
 
     error = security_sb_kern_mount(mnt->mnt_sb, secdata);
     if (error)
         goto out_sb;
 
     /* The mountpoint is the root directory of the filesystem itself */
     mnt->mnt_mountpoint = mnt->mnt_root;
     /* The mount is its own parent */
     mnt->mnt_parent = mnt;
     /* Release s_umount (presumably taken for writing inside get_sb()) */
     up_write(&mnt->mnt_sb->s_umount);
     free_secdata(secdata);
     return mnt;
out_sb:
     dput(mnt->mnt_root);
     up_write(&mnt->mnt_sb->s_umount);
     deactivate_super(mnt->mnt_sb);
out_free_secdata:
     free_secdata(secdata);
out_mnt:
     free_vfsmnt(mnt);
out:
     return ERR_PTR(error);
}
在上面会调用type->get_sb()来填充一些关键的信息。Rootfs的file_system_type定义如下:
/*
 * Filesystem type descriptor for rootfs: superblocks are created by
 * rootfs_get_sb() and torn down by kill_litter_super().
 */
static struct file_system_type rootfs_fs_type = {
     .name         = "rootfs",
     .get_sb       = rootfs_get_sb,
     .kill_sb      = kill_litter_super,
};
Rootfs_get_sb()代码如下:
/*
 * get_sb method of rootfs: delegates to get_sb_nodev() with
 * ramfs_fill_super() as the fill callback and MS_NOUSER added to @flags.
 * @dev_name is not used.
 */
static int rootfs_get_sb(struct file_system_type *fs_type,
     int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
     int mount_flags = flags | MS_NOUSER;

     return get_sb_nodev(fs_type, mount_flags, data, ramfs_fill_super, mnt);
}
内核为get_sb提供了一个统一的初始化接口,具体有关文件系统的消息填充放在一个回调函数里。在内核中get_sb()有以下几个API:
get_sb_bdev():挂载一个基于块设备的文件系统
get_sb_nodev():挂载一个不存在于磁盘的文件系统
get_sb_single():挂载一个与其它挂载共享的文件系统
get_sb_nodev()的代码如下:
struct super_block *get_sb_nodev(struct file_system_type *fs_type,
     int flags, void *data,
     int (*fill_super)(struct super_block *, void *, int))
{
     int error;
     //分配并初始化一个super_block,并分配一个虚拟设备号
     struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
     if (IS_ERR(s))
         return s;
 
     s->s_flags = flags;
 
     //调用回调函数填充具体的信息
     error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
     if (error) {
         up_write(&s->s_umount);
         deactivate_super(s);
         return ERR_PTR(error);
     }
     //置MS_ACTIVE标志
     s->s_flags |= MS_ACTIVE;
     return s;
}
Rootfs调用get_sb_nodev()所用的回调函数为ramfs_fill_super().代码如下:
/*
 * Fill in a ramfs superblock and create its root directory.
 *
 * @sb:     superblock to initialise
 * @data:   unused
 * @silent: unused
 *
 * Returns 0 on success or -ENOMEM if the root inode or root dentry cannot
 * be allocated.
 */
static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
{
     struct inode *root_inode;
     struct dentry *root_dentry;

     /* Generic superblock fields */
     sb->s_maxbytes = MAX_LFS_FILESIZE;
     sb->s_blocksize = PAGE_CACHE_SIZE;
     sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
     sb->s_magic = RAMFS_MAGIC;
     sb->s_op = &ramfs_ops;

     /* Root directory inode, mode 0755 */
     root_inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
     if (!root_inode)
         return -ENOMEM;

     /* Dentry that serves as the root directory of the filesystem */
     root_dentry = d_alloc_root(root_inode);
     if (root_dentry) {
         sb->s_root = root_dentry;
         return 0;
     }

     /* No dentry: give the inode back */
     iput(root_inode);
     return -ENOMEM;
}
在这里,需要留意sb->s_op的赋值。Rootfs的inode节点是在ramfs_get_inode()分配并初始化的。代码如下:
/*
 * Allocate and initialise a ramfs inode.
 *
 * @sb:   superblock the inode belongs to
 * @mode: file type and permission bits
 * @dev:  device number, only used for special files
 *
 * Returns the new inode, or NULL when new_inode() fails.
 */
struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
{
     struct inode *ino = new_inode(sb);

     if (!ino)
         return ino;

     /* Common attributes */
     ino->i_mode = mode;
     ino->i_uid = current->fsuid;
     ino->i_gid = current->fsgid;
     ino->i_blksize = PAGE_CACHE_SIZE;
     ino->i_blocks = 0;
     ino->i_mapping->a_ops = &ramfs_aops;
     ino->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
     /* Sample the clock once so that all three timestamps are identical */
     ino->i_ctime = CURRENT_TIME;
     ino->i_mtime = ino->i_ctime;
     ino->i_atime = ino->i_mtime;

     /* Operation tables depend on the file type */
     switch (mode & S_IFMT) {
     case S_IFREG:
         /* Regular file */
         ino->i_op = &ramfs_file_inode_operations;
         ino->i_fop = &ramfs_file_operations;
         break;
     case S_IFDIR:
         /* Directory */
         ino->i_op = &ramfs_dir_inode_operations;
         ino->i_fop = &simple_dir_operations;

         /* directory inodes start off with i_nlink == 2 (for "." entry) */
         ino->i_nlink += 1;
         break;
     case S_IFLNK:
         /* Symbolic link */
         ino->i_op = &page_symlink_inode_operations;
         break;
     default:
         /* Anything else is set up as a special file */
         init_special_inode(ino, mode, dev);
         break;
     }
     return ino;
}
Rootfs文件系统根结点的dentry是在d_alloc_root()中分配并初始化的。代码如下:
struct dentry * d_alloc_root(struct inode * root_inode)
{
     struct dentry *res = NULL;
 
     if (root_inode) {
         //这就是我们所看到的 “/”
         static const struct qstr name = { .name = "/", .len = 1 };
 
         res = d_alloc(NULL, &name);
         if (res) {
              //d_sb:文件超级块
              res->d_sb = root_inode->i_sb;
              res->d_parent = res;
              d_instantiate(res, root_inode);
         }
     }
     return res;
}
注意上面rootfs的文件系统根目录是’/’.这也是我们平时在shell里看到的’/’了.
 
3.2:ext2文件系统的挂载
挂载完根目录之后,我们就可以进行具体的文件系统的挂载了。以ext2为例做说明.mount对应的系统调用入口是sys_mount().它的代码如下:
/*
 * mount(2) system call entry point.
 *
 * Copies the filesystem type, directory name, device name and mount data
 * from user space, calls do_mount() under the big kernel lock and then
 * releases every copy again.
 *
 * Returns the result of do_mount(), or the error from whichever copy
 * failed.
 */
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
                char __user * type, unsigned long flags,
                void __user * data)
{
     int retval;
     unsigned long data_page;
     unsigned long type_page;
     unsigned long dev_page;
     char *dir_page;
 
     /* Copy the arguments from user space into kernel space */
 
     /* copy type */
     retval = copy_mount_options (type, &type_page);
     if (retval < 0)
         return retval;
 
     dir_page = getname(dir_name);
     retval = PTR_ERR(dir_page);
     if (IS_ERR(dir_page))
         goto out1;
 
     /* copy dev_name */
     retval = copy_mount_options (dev_name, &dev_page);
     if (retval < 0)
         goto out2;
 
     /* copy data */
     retval = copy_mount_options (data, &data_page);
     if (retval < 0)
         goto out3;
 
     lock_kernel();
     /* The actual mount work */
     retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
                flags, (void*)data_page);
     unlock_kernel();
 
     /* Release the copies in reverse order of acquisition */
     free_page(data_page);
 
out3:
     free_page(dev_page);
out2:
     putname(dir_page);
out1:
     free_page(type_page);
     return retval;
}
Do_mount()的代码如下:
/*
 * Kernel-side implementation of mount: validates the arguments, looks up
 * the mount point and dispatches to remount, bind, move or new-mount
 * handling depending on @flags.
 *
 * @dev_name:  device name, may be NULL
 * @dir_name:  path of the mount point
 * @type_page: filesystem type name
 * @flags:     MS_* mount flags
 * @data_page: filesystem-specific data, may be NULL
 *
 * Returns 0 on success or a negative errno.
 */
long do_mount(char * dev_name, char * dir_name, char *type_page,
           unsigned long flags, void *data_page)
{
     struct nameidata nd;
     int retval = 0;
     int mnt_flags = 0;
 
     /* Discard magic */
     if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
         flags &= ~MS_MGC_MSK;
 
     /* Basic sanity checks */
 
     /* Strings must be non-empty where required and terminated within PAGE_SIZE bytes */
     if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
         return -EINVAL;
     if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
         return -EINVAL;
 
     /* Force termination of the data page */
     if (data_page)
         ((char *)data_page)[PAGE_SIZE - 1] = 0;
 
     /* Separate the per-mountpoint flags */
     if (flags & MS_NOSUID)
         mnt_flags |= MNT_NOSUID;
     if (flags & MS_NODEV)
         mnt_flags |= MNT_NODEV;
     if (flags & MS_NOEXEC)
         mnt_flags |= MNT_NOEXEC;
     flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
 
     /* ... and get the mountpoint */
     /* Look up the mount point */
     retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
 
     /* Lookup failed (for example the directory does not exist): bail out */
     if (retval)
         return retval;
 
     retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
     if (retval)
         goto dput_out;
 
     /* Remount */
     if (flags & MS_REMOUNT)
         retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
                       data_page);
     /* Bind mount */
     else if (flags & MS_BIND)
         retval = do_loopback(&nd, dev_name, flags & MS_REC);
     /* Move an already mounted filesystem to another location */
     else if (flags & MS_MOVE)
         retval = do_move_mount(&nd, dev_name);
     else
         /* Brand-new mount */
         retval = do_new_mount(&nd, type_page, flags, mnt_flags,
                         dev_name, data_page);
dput_out:
     path_release(&nd);
     return retval;
}
通常执行的是一个全新的安装,即会转入到do_new_mount().代码如下:
static int do_new_mount(struct nameidata *nd, char *type, int flags,
              int mnt_flags, char *name, void *data)
{
     struct vfsmount *mnt;
 
     //参数有效性判断
     if (!type || !memchr(type, 0, PAGE_SIZE))
         return -EINVAL;
 
     /* we need capabilities... */
     //判断是否有相应的权限
     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
     //具体的挂载过程
     mnt = do_kern_mount(type, flags, name, data);
     if (IS_ERR(mnt))
         return PTR_ERR(mnt);
     //将vfsmount 加至挂载树
     return do_add_mount(mnt, nd, mnt_flags, NULL);
}
Do_kern_mount()的代码在根目录挂载一节已经分析过了.do_add_mount()代码如下所示:
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
          int mnt_flags, struct list_head *fslist)
{
     int err;
 
     down_write(¤t->namespace->sem);
     /* Something was mounted here while we slept */
     //可能在安装的时候又有设备挂到下面去了
     //重新遍历一下挂载目录下的安装结点
     while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
         ;
     err = -EINVAL;
     if (!check_mnt(nd->mnt))
         goto unlock;
 
     /* Refuse the same filesystem on the same mount point */
     err = -EBUSY;
     //在同一个挂载点上挂载相同的文件系统
     if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
         nd->mnt->mnt_root == nd->dentry)
         goto unlock;
 
     err = -EINVAL;
     //文件系统的根目录是一个链接?
     if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
         goto unlock;
 
     newmnt->mnt_flags = mnt_flags;
     //将newmnt加至namespace->list ,将mnt加至hash表,加至父文件系统的子链表
     err = graft_tree(newmnt, nd);
 
     if (err == 0 && fslist) {
         /* add to the specified expiration list */
         spin_lock(&vfsmount_lock);
         list_add_tail(&newmnt->mnt_fslink, fslist);
         spin_unlock(&vfsmount_lock);
     }
 
unlock:
     up_write(¤t->namespace->sem);
     mntput(newmnt);
     return err;
}
 
如果挂载的是ext2文件系统,在do_kern_mount()中就会调用file_system_type -> get_sb().
Ext2的file_system_type定义如下:
/*
 * Filesystem type descriptor for ext2: superblocks are created by
 * ext2_get_sb() and destroyed by kill_block_super(); FS_REQUIRES_DEV is
 * set in fs_flags.
 */
static struct file_system_type ext2_fs_type = {
     .owner        = THIS_MODULE,
     .name         = "ext2",
     .get_sb       = ext2_get_sb,
     .kill_sb      = kill_block_super,
     .fs_flags     = FS_REQUIRES_DEV,
};
相应的get_sb入口为ext2_get_sb():
/*
 * get_sb method of ext2: mounts a block-device based filesystem through
 * get_sb_bdev(), with ext2_fill_super() as the fill callback.
 *
 * NOTE(review): vfs_kern_mount() as listed in this article calls get_sb()
 * with a fifth vfsmount argument and treats the result as an int; this
 * listing looks like it was taken from a different kernel revision -
 * confirm.
 */
static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
     int flags, const char *dev_name, void *data)
{
     struct super_block *sb;

     sb = get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
     return sb;
}
get_sb_bdev()涉及到块设备方面的东西,先把它放开。
ext2_get_sb()的代码比较简单.相应的回调函数为ext2_fill_super():
/*
 * Read the ext2 superblock from the device behind @sb and set up the
 * in-memory super_block: private ext2_sb_info, block size, feature checks,
 * per-group geometry, group descriptor buffers and the root dentry.
 *
 * @sb:     VFS superblock to fill in
 * @data:   mount option string
 * @silent: when non-zero, the "can't find ext2 filesystem" and
 *          "unsupported blocksize" messages are suppressed
 *
 * Returns 0 on success, -ENOMEM if the ext2_sb_info cannot be allocated
 * and -EINVAL for every other failure.
 */
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
{
     struct buffer_head * bh;
     struct ext2_sb_info * sbi;
     struct ext2_super_block * es;
     struct inode *root;
     unsigned long block;
     /* Superblock location taken from the mount options */
     unsigned long sb_block = get_sb_block(&data);
     unsigned long logic_sb_block;
     unsigned long offset = 0;
     unsigned long def_mount_opts;
     int blocksize = BLOCK_SIZE;
     int db_count;
     int i, j;
     __le32 features;
 
     /* sbi is the filesystem-private part of the super_block (s_fs_info) */
     sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
     if (!sbi)
         return -ENOMEM;
     sb->s_fs_info = sbi;
     memset(sbi, 0, sizeof(*sbi));
 
     /*
      * See what the current blocksize for the device is, and
      * use that as the blocksize.  Otherwise (or if the blocksize
      * is smaller than the default) use the default.
      * This is important for devices that have a hardware
      * sectorsize that is larger than the default.
      */
 
     /* Presumably sets sb->s_blocksize and sb->s_blocksize_bits as a side effect */
     blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
     if (!blocksize) {
         printk ("EXT2-fs: unable to set blocksize\n");
         goto failed_sbi;
     }
 
     /*
      * If the superblock doesn't start on a hardware sector boundary,
      * calculate the offset. 
      */
      /* data may contain "sb=XXX", which gives the block number of the superblock */
 
     /* Block size differs from the default: convert sb_block into block + offset */
     if (blocksize != BLOCK_SIZE) {
         logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
         offset = (sb_block*BLOCK_SIZE) % blocksize;
     } else {
         logic_sb_block = sb_block;
     }
 
     /* Read the block that holds the superblock into a buffer */
     if (!(bh = sb_bread(sb, logic_sb_block))) {
         printk ("EXT2-fs: unable to read superblock\n");
         goto failed_sbi;
     }
     /*
      * Note: s_es must be initialized as soon as possible because
      *       some ext2 macro-instructions depend on its value
      */
     es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
     sbi->s_es = es;
     sb->s_magic = le16_to_cpu(es->s_magic);
     sb->s_flags |= MS_ONE_SECOND;
     /* Give up if the magic number is not the ext2 one */
     if (sb->s_magic != EXT2_SUPER_MAGIC) {
         if (!silent)
              printk ("VFS: Can't find ext2 filesystem on dev %s.\n",
                   sb->s_id);
         goto failed_mount;
     }
 
     /* Set defaults before we parse the mount options */
     /* Apply the default mount options recorded in the on-disk superblock */
     def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
     if (def_mount_opts & EXT2_DEFM_DEBUG)
         set_opt(sbi->s_mount_opt, DEBUG);
     if (def_mount_opts & EXT2_DEFM_BSDGROUPS)
         set_opt(sbi->s_mount_opt, GRPID);
     if (def_mount_opts & EXT2_DEFM_UID16)
         set_opt(sbi->s_mount_opt, NO_UID32);
     if (def_mount_opts & EXT2_DEFM_XATTR_USER)
         set_opt(sbi->s_mount_opt, XATTR_USER);
     if (def_mount_opts & EXT2_DEFM_ACL)
         set_opt(sbi->s_mount_opt, POSIX_ACL);
    
     if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
         set_opt(sbi->s_mount_opt, ERRORS_PANIC);
     else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
         set_opt(sbi->s_mount_opt, ERRORS_RO);
 
     sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
     sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
 
     /* Parse the mount options passed in by the caller */
     if (!parse_options ((char *) data, sbi))
         goto failed_mount;
 
     sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
         ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
          MS_POSIXACL : 0);
 
     if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
         (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
          EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
          EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U)))
         printk("EXT2-fs warning: feature flags set on rev 0 fs, "
                "running e2fsck is recommended\n");
     /*
      * Check feature flags regardless of the revision level, since we
      * previously didn't change the revision level when setting the flags,
      * so there is a chance incompat flags are set on a rev 0 filesystem.
      */
     features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP);
     if (features) {
         printk("EXT2-fs: %s: couldn't mount because of "
                "unsupported optional features (%x).\n",
                sb->s_id, le32_to_cpu(features));
         goto failed_mount;
     }
     if (!(sb->s_flags & MS_RDONLY) &&
         (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){
         printk("EXT2-fs: %s: couldn't mount RDWR because of "
                "unsupported optional features (%x).\n",
                sb->s_id, le32_to_cpu(features));
         goto failed_mount;
     }
 
     /* Block size = BLOCK_SIZE shifted left by s_log_block_size */
     blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
    
     /* If the blocksize doesn't match, re-read the thing.. */
     /* The block size set on the super_block differs from the on-disk value: */
     /* switch to the on-disk value and read the ext2 superblock again */
     if (sb->s_blocksize != blocksize) {
         brelse(bh);
 
         if (!sb_set_blocksize(sb, blocksize)) {
              printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n");
              goto failed_sbi;
         }
 
         logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
         offset = (sb_block*BLOCK_SIZE) % blocksize;
         bh = sb_bread(sb, logic_sb_block);
         if(!bh) {
              printk("EXT2-fs: Couldn't read superblock on "
                     "2nd try.\n");
              goto failed_sbi;
         }
         es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
         sbi->s_es = es;
         if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
              printk ("EXT2-fs: Magic mismatch, very weird !\n");
              goto failed_mount;
         }
     }
 
     /* Largest file size allowed for this block size */
     sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits);
 
     if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {
         sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE;
         sbi->s_first_ino = EXT2_GOOD_OLD_FIRST_INO;
     } else {
         sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
         sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
         /* Inode size must be at least the old size, a power of two and fit in a block */
         if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) ||
             (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
             (sbi->s_inode_size > blocksize)) {
              printk ("EXT2-fs: unsupported inode size: %d\n",
                   sbi->s_inode_size);
              goto failed_mount;
         }
     }
     sbi->s_frag_size = EXT2_MIN_FRAG_SIZE <<
                      le32_to_cpu(es->s_log_frag_size);
     if (sbi->s_frag_size)
         sbi->s_frags_per_block = sb->s_blocksize /
                              sbi->s_frag_size;
     else
         /* Zero fragment size: clear the magic so the check further down fails */
         sb->s_magic = 0;
     sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
     sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
     sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 
     /* Inodes per block = block size / inode size */
     sbi->s_inodes_per_block = sb->s_blocksize /
                          EXT2_INODE_SIZE(sb);
     /* Inode table blocks per block group = inodes per group / inodes per block */
     sbi->s_itb_per_group = sbi->s_inodes_per_group /
                           sbi->s_inodes_per_block;
     /* Group descriptors per block = block size / size of a group descriptor */
     sbi->s_desc_per_block = sb->s_blocksize /
                        sizeof (struct ext2_group_desc);
     /* Buffer head that holds this superblock */
     sbi->s_sbh = bh;
     /* State of the filesystem */
     sbi->s_mount_state = le16_to_cpu(es->s_state);
     sbi->s_addr_per_block_bits =
         log2 (EXT2_ADDR_PER_BLOCK(sb));
     sbi->s_desc_per_block_bits =
         log2 (EXT2_DESC_PER_BLOCK(sb));
     if (sb->s_magic != EXT2_SUPER_MAGIC) {
         if (!silent)
              printk ("VFS: Can't find an ext2 filesystem on dev "
                   "%s.\n",
                   sb->s_id);
         goto failed_mount;
     }
     if (sb->s_blocksize != bh->b_size) {
         if (!silent)
              printk ("VFS: Unsupported blocksize on dev "
                   "%s.\n", sb->s_id);
         goto failed_mount;
     }
 
     if (sb->s_blocksize != sbi->s_frag_size) {
         printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n",
              sbi->s_frag_size, sb->s_blocksize);
         goto failed_mount;
     }
 
     /* Per-group counts must not exceed the number of bits in one block */
     if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
         printk ("EXT2-fs: #blocks per group too big: %lu\n",
              sbi->s_blocks_per_group);
         goto failed_mount;
     }
     if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
         printk ("EXT2-fs: #fragments per group too big: %lu\n",
              sbi->s_frags_per_group);
         goto failed_mount;
     }
     if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
         printk ("EXT2-fs: #inodes per group too big: %lu\n",
              sbi->s_inodes_per_group);
         goto failed_mount;
     }
 
     /* Number of block groups in the filesystem, rounded up */
     sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
                           le32_to_cpu(es->s_first_data_block) +
                          EXT2_BLOCKS_PER_GROUP(sb) - 1) /
                          EXT2_BLOCKS_PER_GROUP(sb);
    
     /* Blocks needed for the group descriptors = groups / descriptors per block, rounded up */
     /* (one descriptor per block group) */
     db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
            EXT2_DESC_PER_BLOCK(sb);
 
     /* Note: one buffer_head pointer per descriptor block */
     sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
     if (sbi->s_group_desc == NULL) {
         printk ("EXT2-fs: not enough memory\n");
         goto failed_mount;
     }
     percpu_counter_init(&sbi->s_freeblocks_counter);
     percpu_counter_init(&sbi->s_freeinodes_counter);
     percpu_counter_init(&sbi->s_dirs_counter);
     bgl_lock_init(&sbi->s_blockgroup_lock);
 
     /* sbi->s_debts is an array with one entry per block group */
     sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
                     GFP_KERNEL);
     if (!sbi->s_debts) {
         printk ("EXT2-fs: not enough memory\n");
         goto failed_mount_group_desc;
     }
     memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
 
    
     for (i = 0; i < db_count; i++) {
 
         /* Locate the i-th descriptor block; */
         /* the group descriptors are stored after the superblock */
         block = descriptor_loc(sb, logic_sb_block, i);
         sbi->s_group_desc[i] = sb_bread(sb, block);
         if (!sbi->s_group_desc[i]) {
              /* Release the descriptor buffers read so far */
              for (j = 0; j < i; j++)
                   brelse (sbi->s_group_desc[j]);
              printk ("EXT2-fs: unable to read group descriptors\n");
              goto failed_mount_group_desc;
         }
     }
     if (!ext2_check_descriptors (sb)) {
         printk ("EXT2-fs: group descriptors corrupted!\n");
         db_count = i;
         goto failed_mount2;
     }
     sbi->s_gdb_count = db_count;
     get_random_bytes(&sbi->s_next_generation, sizeof(u32));
     spin_lock_init(&sbi->s_next_gen_lock);
     /*
      * set up enough so that it can read an inode
      */
     sb->s_op = &ext2_sops;
     sb->s_export_op = &ext2_export_ops;
     sb->s_xattr = ext2_xattr_handlers;
     /* The ext2 root directory lives in inode EXT2_ROOT_INO */
     root = iget(sb, EXT2_ROOT_INO);
     /* Create the root dentry of the filesystem and bind it to the root inode */
     sb->s_root = d_alloc_root(root);
     if (!sb->s_root) {
         iput(root);
         printk(KERN_ERR "EXT2-fs: get root inode failed\n");
         goto failed_mount2;
     }
     /* The root must be a non-empty directory */
     if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
         dput(sb->s_root);
         sb->s_root = NULL;
         printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
         goto failed_mount2;
     }
     if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
         ext2_warning(sb, __FUNCTION__,
              "mounting ext3 filesystem as ext2\n");
     ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
     percpu_counter_mod(&sbi->s_freeblocks_counter,
                   ext2_count_free_blocks(sb));
     percpu_counter_mod(&sbi->s_freeinodes_counter,
                   ext2_count_free_inodes(sb));
     percpu_counter_mod(&sbi->s_dirs_counter,
                   ext2_count_dirs(sb));
     return 0;
     /* Error unwinding: each label releases what was acquired before the jump */
failed_mount2:
     for (i = 0; i < db_count; i++)
         brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
     kfree(sbi->s_group_desc);
     if (sbi->s_debts)
         kfree(sbi->s_debts);
failed_mount:
     brelse(bh);
failed_sbi:
     sb->s_fs_info = NULL;
     kfree(sbi);
     return -EINVAL;
}
这段代码里关于具体设备的读值部份将放在块设备驱动中做专题讲述。上面的操作主要是初始化了super_block中指向具体文件系统的信息部份。其中的信息如下图所示:
 
 
在上面的代码中,需要注意的是,文件系统的根目录对应的inode是在第二个inode.inode的设置是下列代码完成的:
root = iget(sb, EXT2_ROOT_INO);
跟踪进iget():
/*
 * Get the in-memory inode numbered @ino on superblock @sb.
 *
 * When iget_locked() hands back a new inode (I_NEW set), it is filled in
 * through sb->s_op->read_inode() and then unlocked.
 *
 * Returns the inode, or NULL if iget_locked() returned NULL.
 */
static inline struct inode *iget(struct super_block *sb, unsigned long ino)
{
         /* Look up or allocate the inode */
         struct inode *found = iget_locked(sb, ino);

         /* Nothing to fill in for a missing or an already set-up inode */
         if (!found || !(found->i_state & I_NEW))
                   return found;

         /* New inode: let the filesystem read its contents */
         sb->s_op->read_inode(found);
         unlock_new_inode(found);
         return found;
}
而ext2的super_block的操作是由下列代码设置的:
sb->s_op = &ext2_sops;
sb->s_export_op = &ext2_export_ops;
sb->s_xattr = ext2_xattr_handlers;
相应的,ext2_sops如下所示:
static struct super_operations ext2_sops = {
     .alloc_inode  = ext2_alloc_inode,
     .destroy_inode     = ext2_destroy_inode,
     .read_inode   = ext2_read_inode,
     .write_inode  = ext2_write_inode,
     .put_inode    = ext2_put_inode,
     .delete_inode = ext2_delete_inode,
     .put_super    = ext2_put_super,
     .write_super  = ext2_write_super,
     .statfs       = ext2_statfs,
     .remount_fs   = ext2_remount,
     .clear_inode  = ext2_clear_inode,
}
其read_inode对应的接口为ext2_read_inode().代码如下:
/*
 * ext2_read_inode - fill in an in-memory inode from its on-disk ext2 inode.
 * @inode: inode to initialise; inode->i_ino and inode->i_sb select what to read.
 *
 * Reads the raw inode via ext2_get_inode(), copies its fields into @inode and
 * the ext2-private ext2_inode_info, then picks the inode/file/address-space
 * operation tables according to the file type in i_mode.
 *
 * If the raw inode cannot be obtained, or it looks like a deleted inode,
 * @inode is turned into a bad inode with make_bad_inode().
 */
void ext2_read_inode (struct inode * inode)
{
     struct ext2_inode_info *ei = EXT2_I(inode);
     // inode number of the inode being read
     ino_t ino = inode->i_ino;
     struct buffer_head * bh;
     // locate the on-disk inode for this number; bh receives the buffer that holds it
     struct ext2_inode * raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
     int n;
 
#ifdef CONFIG_EXT2_FS_POSIX_ACL
     ei->i_acl = EXT2_ACL_NOT_CACHED;
     ei->i_default_acl = EXT2_ACL_NOT_CACHED;
#endif
     // ext2_get_inode() reports failure with an ERR_PTR value (and leaves bh NULL)
     if (IS_ERR(raw_inode))
         goto bad_inode;
 
     // initialise the in-memory inode from the on-disk (little-endian) fields
     inode->i_mode = le16_to_cpu(raw_inode->i_mode);
     inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
     inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
     // unless the NO_UID32 mount option is set, merge in the high 16 bits of uid/gid
     if (!(test_opt (inode->i_sb, NO_UID32))) {
         inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
         inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
     }
     inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
     inode->i_size = le32_to_cpu(raw_inode->i_size);
     inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
     inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
     inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
     // only whole seconds are loaded from disk here; nanoseconds start at zero
     inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
     ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
     /* We now have enough fields to check if the inode was active or not.
      * This is needed because nfsd might try to access dead inodes
      * the test is that same one that e2fsck uses
      * NeilBrown 1999oct15
      */
     if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
         /* this inode is deleted */
         brelse (bh);
         goto bad_inode;
     }
     inode->i_blksize = PAGE_SIZE;    /* This is the optimal IO size (for stat), not the fs block size */
     inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
     ei->i_flags = le32_to_cpu(raw_inode->i_flags);
     ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
     ei->i_frag_no = raw_inode->i_frag;
     ei->i_frag_size = raw_inode->i_fsize;
     ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
     ei->i_dir_acl = 0;
     // regular files: i_size_high supplies the upper 32 bits of the size;
     // every other type loads i_dir_acl instead
     if (S_ISREG(inode->i_mode))
         inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
     else
         ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
     // reset i_dtime; its on-disk value was only consulted by the deleted-inode test above
     ei->i_dtime = 0;
     inode->i_generation = le32_to_cpu(raw_inode->i_generation);
     ei->i_state = 0;
     ei->i_next_alloc_block = 0;
     ei->i_next_alloc_goal = 0;
     ei->i_prealloc_count = 0;
     // block group holding this inode (inode numbers start at 1)
     ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
     ei->i_dir_start_lookup = 0;
 
     /*
      * NOTE! The in-memory inode i_data array is in little-endian order
      * even on big-endian machines: we do NOT byteswap the block numbers!
      */
     for (n = 0; n < EXT2_N_BLOCKS; n++)
         ei->i_data[n] = raw_inode->i_block[n];
 
     // regular file
     if (S_ISREG(inode->i_mode)) {
         inode->i_op = &ext2_file_inode_operations;
         inode->i_fop = &ext2_file_operations;
         if (test_opt(inode->i_sb, NOBH))
              inode->i_mapping->a_ops = &ext2_nobh_aops;
         else
              inode->i_mapping->a_ops = &ext2_aops;
     }
     // directory
     else if (S_ISDIR(inode->i_mode)) {
         inode->i_op = &ext2_dir_inode_operations;
         inode->i_fop = &ext2_dir_operations;
         if (test_opt(inode->i_sb, NOBH))
              inode->i_mapping->a_ops = &ext2_nobh_aops;
         else
              inode->i_mapping->a_ops = &ext2_aops;
     }
     // symbolic link
     else if (S_ISLNK(inode->i_mode)) {
         // fast symlinks get their own i_op and no address-space operations are set here
         if (ext2_inode_is_fast_symlink(inode))
              inode->i_op = &ext2_fast_symlink_inode_operations;
         else {
              inode->i_op = &ext2_symlink_inode_operations;
              if (test_opt(inode->i_sb, NOBH))
                   inode->i_mapping->a_ops = &ext2_nobh_aops;
              else
                   inode->i_mapping->a_ops = &ext2_aops;
         }
     } else {
         // any other file type: special inode; the device number is decoded from
         // i_block[0] (old encoding) when that is non-zero, else from i_block[1] (new encoding)
         inode->i_op = &ext2_special_inode_operations;
         if (raw_inode->i_block[0])
              init_special_inode(inode, inode->i_mode,
                 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
         else
              init_special_inode(inode, inode->i_mode,
                 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
     }
     // done with the raw inode: release the buffer that holds it
     brelse (bh);
     ext2_set_inode_flags(inode);
     return;
    
bad_inode:
     make_bad_inode(inode);
     return;
}
从文件系统中读取索引结点号对应的信息是通过ext2_get_inode()完成的。它的代码如下:
/*
 * ext2_get_inode - locate the on-disk inode numbered @ino.
 * @sb:  superblock of the filesystem
 * @ino: inode number to look up
 * @p:   receives the buffer_head of the block containing the inode;
 *       set to NULL on every failure path
 *
 * Returns a pointer into the block buffer at the inode's position, or
 * ERR_PTR(-EINVAL) for an invalid inode number, or ERR_PTR(-EIO) when the
 * group descriptor or the inode-table block cannot be obtained.
 */
static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
                       struct buffer_head **p)
{
     struct ext2_group_desc *desc;
     struct buffer_head *ibh;
     unsigned long group;
     unsigned long table_off;
     unsigned long blk;
     unsigned long in_block;

     *p = NULL;

     /* Reject numbers below the first usable inode (except the root) or past the inode count. */
     if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) ||
         ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) {
          ext2_error(sb, "ext2_get_inode", "bad inode number: %lu",
                 (unsigned long) ino);
          return ERR_PTR(-EINVAL);
     }

     /* Inode numbers start at 1, hence (ino - 1) when deriving the group. */
     group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
     desc = ext2_get_group_desc(sb, group, &ibh);
     if (!desc)
          return ERR_PTR(-EIO);

     /* Byte offset of the inode inside its group's inode table. */
     table_off = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb);

     /* Block of the inode table that contains those bytes. */
     blk = le32_to_cpu(desc->bg_inode_table) +
         (table_off >> EXT2_BLOCK_SIZE_BITS(sb));

     ibh = sb_bread(sb, blk);
     if (!ibh) {
          ext2_error(sb, "ext2_get_inode",
                 "unable to read inode block - inode=%lu, block=%lu",
                 (unsigned long) ino, blk);
          return ERR_PTR(-EIO);
     }

     *p = ibh;

     /* Remaining offset within that single block. */
     in_block = table_off & (EXT2_BLOCK_SIZE(sb) - 1);
     return (struct ext2_inode *) (ibh->b_data + in_block);
}
// 取得对应块组号的组描述符
struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
                            unsigned int block_group,
                            struct buffer_head ** bh)
{
     unsigned long group_desc;
     unsigned long offset;
     struct ext2_group_desc * desc;
     struct ext2_sb_info *sbi = EXT2_SB(sb);
 
     //块组号大于块组总数,出错退出
     if (block_group >= sbi->s_groups_count) {
         ext2_error (sb, "ext2_get_group_desc",
                  "block_group >= groups_count - "
                  "block_group = %d, groups_count = %lu",
                  block_group, sbi->s_groups_count);
 
         return NULL;
     }
 
     //块组号/每块中的组描述符数 可以计算出是在那一个BH
     group_desc = block_group / EXT2_DESC_PER_BLOCK(sb);
     //在BH中的偏程
     offset = block_group % EXT2_DESC_PER_BLOCK(sb);
 
     //对应的BH为空.出错退出
     if (!sbi->s_group_desc[group_desc]) {
         ext2_error (sb, "ext2_get_group_desc",
                  "Group descriptor not loaded - "
                  "block_group = %d, group_desc = %lu, desc = %lu",
                   block_group, group_desc, offset);
         return NULL;
     }
 
     //取得组描述符所在的BH
     desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data;
     if (bh)
         *bh = sbi->s_group_desc[group_desc];
     //加上在组中的偏移量即为所求的组描述符
     return desc + offset;
}
至此,对文件系统的挂载就完成了。关于文件系统其它操作的实现,请继续关注本站更新。
四:ext2中文件的查找
现在,就可以来看下文件查找所遗留的问题了。在文件查找里曾分析到:如果文件所对应的目录项不在缓存里的话,就会调用文件系统对应的lookup操作。在上面已经看到了对inode的设置:如果是目录的话,就会将i_op设置为ext2_dir_inode_operations。它的定义如下:
struct inode_operations ext2_dir_inode_operations = {
     .create       = ext2_create,
     .lookup       = ext2_lookup,
     .link         = ext2_link,
     .unlink       = ext2_unlink,
     .symlink = ext2_symlink,
     .mkdir        = ext2_mkdir,
     .rmdir        = ext2_rmdir,
     .mknod        = ext2_mknod,
     .rename       = ext2_rename,
#ifdef CONFIG_EXT2_FS_XATTR
     .setxattr = generic_setxattr,
     .getxattr = generic_getxattr,
     .listxattr    = ext2_listxattr,
     .removexattr  = generic_removexattr,
#endif
     .setattr = ext2_setattr,
     .permission   = ext2_permission,
}
相应的lookup接口为 ext2_lookup().代码如下:
static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
     struct inode * inode;
     ino_t ino;
 
     //判断文件名是否超长
     if (dentry->d_name.len > EXT2_NAME_LEN)
         return ERR_PTR(-ENAMETOOLONG);
 
     //取得dentry所在的索引结点号
     ino = ext2_inode_by_name(dir, dentry);
     inode = NULL;
    
     if (ino) {
         //取得索引结点号所对应的inode
         inode = iget(dir->i_sb, ino);
         if (!inode)
              return ERR_PTR(-EACCES);
     }
     //将inode和dentry关联起来
     if (inode)
         return d_splice_alias(inode, dentry);
     d_add(dentry, inode);
     return NULL;
}
具体的查找过程是在ext2_inode_by_name()完成的。代码如下:
/*
 * ext2_inode_by_name - find the inode number recorded for @dentry's name in @dir.
 *
 * Returns the inode number stored in the matching directory entry, or 0
 * when ext2_find_entry() finds no entry. The page handed back by
 * ext2_find_entry() is unmapped and released before returning.
 */
ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
{
     struct page *page;
     struct ext2_dir_entry_2 *entry;
     ino_t ino;

     entry = ext2_find_entry (dir, dentry, &page);
     if (!entry)
          return 0;

     /* The on-disk inode field is little-endian. */
     ino = le32_to_cpu(entry->inode);

     /* Drop the mapping and the reference on the page holding the entry. */
     kunmap(page);
     page_cache_release(page);

     return ino;
}
转入ext2_find_entry():
struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
              struct dentry *dentry, struct page ** res_page)
{
     const char *name = dentry->d_name.name;
     int namelen = dentry->d_name.len;
     unsigned reclen = EXT2_DIR_REC_LEN(namelen);
     unsigned long start, n;
     //计算文件大小所占的页面
     unsigned long npages = dir_pages(dir);
     struct page *page = NULL;
     struct ext2_inode_info *ei = EXT2_I(dir);
     ext2_dirent * de;
 
     if (npages == 0)
         goto out;
 
     /* OFFSET_CACHE */
     *res_page = NULL;
 
     //起始搜索的页面号
     start = ei->i_dir_start_lookup;
     //超出了大小,将其置0
     if (start >= npages)
         start = 0;
     n = start;
     //逐页面的读取inode代码的文件
     do {
         char *kaddr;
         //一次读一个页面
         page = ext2_get_page(dir, n);
         if (!IS_ERR(page)) {
              kaddr = page_address(page);
              de = (ext2_dirent *) kaddr;
              //本页面的最后一个文件
              kaddr += ext2_last_byte(dir, n) - reclen;
              //遍历这个页面的所有目录
              while ((char *) de <= kaddr) {
                   //长度非法,出错退出
                   if (de->rec_len == 0) {
                       ext2_error(dir->i_sb, __FUNCTION__,
                            "zero-length directory entry");
                       ext2_put_page(page);
                       goto out;
                   }
                   //名称是否匹配
                   if (ext2_match (namelen, name, de))
                       goto found;
                   //取得下一个目录
                   de = ext2_next_entry(de);
              }
 
              // 释放页面
              ext2_put_page(page);
         }
         if (++n >= npages)
              n = 0;
     } while (n != start);
out:
     return NULL;
 
found:
     *res_page = page;
     //更改最近搜索的页面
     ei->i_dir_start_lookup = n;
     return de;
}
至此,就完成了一个搜索的过程。如果搜索成功,ext2_inode_by_name()就会返回文件对应的索引结点号。
找到相应的inode之后,会调用d_splice_alias()使dentry与inode关联起来.这部份代码比较简单,可以自行了解.
五:小结
本节主要以ext2文件系统为例来描述了文件系统的挂载。读者需要自行了解有关ext2布局的知识.文件系统是一个非常繁杂的子系统。里面涉及到的东西需要分解成很多的专题。不可能在一节中完全讲述的很清楚。后续专题请关注本站更新.
阅读(7180) | 评论(3) | 转发(7) |
给主人留下些什么吧!~~

chinaunix网友2011-06-07 13:53:25

非常非常的厉害

chinaunix网友2011-06-07 13:53:25

非常非常的厉害

chinaunix网友2011-06-07 13:53:25

非常非常的厉害