Chinaunix首页 | 论坛 | 博客
  • 博客访问: 349586
  • 博文数量: 102
  • 博客积分: 3140
  • 博客等级: 中校
  • 技术积分: 680
  • 用 户 组: 普通用户
  • 注册时间: 2008-12-28 11:44
个人简介

开阔未来

文章分类

全部博文(102)

文章存档

2015年(10)

2014年(1)

2013年(1)

2012年(4)

2011年(8)

2010年(24)

2009年(51)

2008年(3)

我的朋友

分类: LINUX

2010-08-28 16:05:39

文件读写--页面缓冲(Page Cache)的管理

R.wen

一、本文分析文件的读写过程。当用户进程发出一个read()系统调用时,它首先通过VFS层到磁盘缓存(disk cache)中去查找相应的文件块有没有已经被缓存起来,如果有,则不需要再次从设备中去读,直接从缓存中拷贝给用户缓冲区就可以了;否则它就要先分配一个缓冲页面,并且将其加入到对应的inode节点的address_space中,再调用address_space的readpage()函数,通过submit_bio()向设备发送一个请求,将所需的文件块从设备中读取出来存放在先前分配的缓冲页面中,最后再从该页面中将所需数据拷贝到用户缓冲区。

(图1)

二、页面缓冲(Page Cache)的管理

页面缓冲的核心数据结构是struct address_space 

/* Forward declaration; struct address_space below only stores a pointer to it. */
struct backing_dev_info;

/*
 * Core page-cache structure: links an owner (host) to the radix tree that
 * holds its cached pages (page_tree) and to the operations table (a_ops)
 * whose readpage() method fills a cached page from the device.
 */
struct address_space {

       struct inode           *host;            /* owner: inode, block_device */

       struct radix_tree_root    page_tree;       /* radix tree of all pages */

       rwlock_t        tree_lock;       /* and rwlock protecting it */

       unsigned int           i_mmap_writable;/* count VM_SHARED mappings */

       struct prio_tree_root      i_mmap;         /* tree of private and shared mappings */

       struct list_head       i_mmap_nonlinear;/*list VM_NONLINEAR mappings */

       spinlock_t              i_mmap_lock; /* protect tree, count, list */

       unsigned int           truncate_count;      /* Cover race condition with truncate */

       unsigned long         nrpages; /* number of total pages */

       pgoff_t                  writeback_index;/* writeback starts here */

       const struct address_space_operations *a_ops;   /* methods */

       unsigned long         flags;             /* error bits/gfp mask */

       struct backing_dev_info *backing_dev_info; /* device readahead, etc */

       spinlock_t              private_lock;   /* for use by the address_space */

       struct list_head       private_list;     /* ditto: for use by the address_space */

       struct address_space     *assoc_mapping;    /* ditto: for use by the address_space */

} __attribute__((aligned(sizeof(long))));

如下图2,缓冲页面是通过一棵基数树(Radix Tree)来管理的,这是一种简单但非常高效的树结构。

(图2)

由图2可以看到,当RADIX_TREE_MAP_SHIFT为6(即每个节点有2^6=64个slot)且树高是1时,它可以寻址大小为64个页面(256KB)的文件,同样,当树高为2时,它可以寻址64*64个页面(16MB)大小的文件,如此下去,在32位的系统中,树高最多为6级(最高级只有2位:32-6*5=2),所以它可以寻址2^32-1个页面大小的文件,约为16TB大小,所以目前来说已经足够了。

基数树的遍历也很简单,且类似于虚拟线性地址的转换过程。只要给定树根及文件偏移,就可以找到相应的缓存页面。再如图2右,如果在文件中的偏移为131个页面,这个偏移值的高6位就是第一级偏移,而低6位就是在第二级的偏移,依此类推。如对于偏移值131(二进制10000011),高6位值是131>>6 = 2,所以它在第一级的偏移是2,而在第2级的偏移就是低6位(000011),值为3,即偏移为3,所以得到的结果如图2右方所示。

/*
 * Radix tree geometry.
 *
 * Each node has RADIX_TREE_MAP_SIZE slots: 2^6 = 64 normally, or 2^4 = 16
 * when CONFIG_BASE_SMALL is non-zero.
 */
#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)
#define RADIX_TREE_MAP_SIZE      (1UL << RADIX_TREE_MAP_SHIFT)

/* Number of tag bitmaps kept in every node. */
#define RADIX_TREE_MAX_TAGS 2

/*
 * Number of longs needed to hold one bit per slot, rounded up.  With 64
 * slots this is 2 when BITS_PER_LONG is 32 and 1 when it is 64 (it is
 * not 64).  Nothing may follow the line-continuation backslash: a trailing
 * comment there cuts the macro body off.
 */
#define RADIX_TREE_TAG_LONGS \
       ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

/*
 * NOTE(review): the text after these listings describes the first structure
 * as the tree root, yet this is a duplicate of the struct radix_tree_node
 * definition that appears again two listings below. Presumably
 * struct radix_tree_root was meant here; confirm against the kernel headers.
 *
 * One node of the radix tree: a fixed-size array of slots plus per-tag
 * bitmaps with one bit per slot.
 */
struct radix_tree_node {

       unsigned int    height;            /* Height from the bottom */

       unsigned int    count;             /* presumably the number of slots in use -- TODO confirm */

       struct rcu_head      rcu_head;

       void        *slots[RADIX_TREE_MAP_SIZE];   /* RADIX_TREE_MAP_SIZE pointers, one per slot */

       /* One bitmap of RADIX_TREE_TAG_LONGS longs for each of the RADIX_TREE_MAX_TAGS tags. */
       unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];

};

/*
 * A (node, offset) pair. The accompanying article describes this structure
 * as the one used for path lookup in the tree.
 */
struct radix_tree_path {

       struct radix_tree_node *node;

       int offset;   /* presumably the slot index inside @node -- TODO confirm */

};

/*
 * One node of the radix tree: a fixed-size array of slots plus per-tag
 * bitmaps with one bit per slot.
 */
struct radix_tree_node {

       unsigned int    height;            /* Height from the bottom */

       unsigned int    count;             /* presumably the number of slots in use -- TODO confirm */

       struct rcu_head      rcu_head;

       void        *slots[RADIX_TREE_MAP_SIZE];   /* RADIX_TREE_MAP_SIZE pointers, one per slot */

       /* One bitmap of RADIX_TREE_TAG_LONGS longs for each of the RADIX_TREE_MAX_TAGS tags. */
       unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];

};

以上是相关的几个数据结构,第一个为树根结点结构,第二个用于路径查找,第三个就是树的节点结构。

注意节点结构中的tags域,这是一个典型的用空间换时间的应用。它是一个二维数组,用于记录该节点下面的子节点有没有相应的标志。目前RADIX_TREE_MAX_TAGS为2,表示只记录两个标志,其中tags[0]对应PAGECACHE_TAG_DIRTY,tags[1]对应PAGECACHE_TAG_WRITEBACK。它表示,如果当前节点tags[0]中某个slot对应的位为1,那么该slot下面的子树中就存在带PAGECACHE_TAG_DIRTY标志的节点,否则这个子树分枝就不存在这样的节点,就不必再查找这个子树了。比如在查找PG_dirty的页面时,就不需要遍历整个树,而可以跳过那些tags[0]中对应位为0的子树,这样就提高了查找效率。

三、文件读过程

我们先看标准的读过程。

1、准备工作。通过VFS层,及一些初始化操作,为真正的读操作做准备。

首先是用户进程通过read系统调用发出一个读请求:

/*
 * read() system call entry point.
 *
 * Resolves @fd to a struct file, reads up to @count bytes into the user
 * buffer @buf starting at the file's current position, and stores the
 * updated position back into the file.
 *
 * Returns whatever vfs_read() returns, or -EBADF when @fd does not
 * resolve to a file.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
        int fput_needed;
        struct file *file = fget_light(fd, &fput_needed);
        loff_t pos;
        ssize_t ret;

        /* No file behind this descriptor: nothing to release either. */
        if (!file)
                return -EBADF;

        /* Work on a private copy of the position and write it back afterwards. */
        pos = file_pos_read(file);
        ret = vfs_read(file, buf, count, &pos);
        file_pos_write(file, pos);

        fput_light(file, fput_needed);
        return ret;
}

然后通过VFS层操作:

/*
 * VFS-level read (abridged excerpt; "……" marks code left out by the article).
 *
 * Verifies the requested area and the read permission, then dispatches to
 * the file's ->read method, or to do_sync_read() when the file has none.
 *
 * Returns the negative value from rw_verify_area(), the non-zero value from
 * security_file_permission(), or otherwise the result of the read call.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)

{

       ssize_t ret;

       …… // sanity checks (elided in this excerpt)

       ret = rw_verify_area(READ, file, pos, count);

       if (ret >= 0) {

              /* On success the return value replaces the requested byte count. */
              count = ret;

              ret = security_file_permission (file, MAY_READ);

              if (!ret) {

                     /* Prefer the filesystem's own ->read; fall back to the sync wrapper. */
                     if (file->f_op->read)

                            ret = file->f_op->read(file, buf, count, pos);

                     else

                            ret = do_sync_read(file, buf, count, pos);

                     ……

              }

       }

       return ret;

}

对于ext2文件系统,有:

/*
 * File operations table for ext2 files.
 *
 * Note that .read is do_sync_read and .aio_read is generic_file_aio_read,
 * so a read() on an ext2 file goes through do_sync_read().
 */
const struct file_operations ext2_file_operations = {

       .llseek            = generic_file_llseek,

       .read              = do_sync_read,

       .write             = do_sync_write,

       .aio_read = generic_file_aio_read,

       .aio_write       = generic_file_aio_write,

       .ioctl              = ext2_ioctl,

#ifdef CONFIG_COMPAT

       /* Only present when CONFIG_COMPAT is defined. */
       .compat_ioctl = ext2_compat_ioctl,

#endif

       .mmap           = generic_file_mmap,

       .open             = generic_file_open,

       .release    = ext2_release_file,

       .fsync            = ext2_sync_file,

       .sendfile = generic_file_sendfile,

       .splice_read    = generic_file_splice_read,

       .splice_write   = generic_file_splice_write,

};

所以它执行的是:

/*
 * Synchronous read implemented on top of the file's ->aio_read method.
 *
 * Wraps the user buffer @buf/@len in a one-segment iovec, issues the read
 * through a kiocb that starts at *@ppos, and waits until the request is
 * finished. On return *@ppos holds the position recorded in the kiocb.
 *
 * Returns the result of ->aio_read, or of wait_on_sync_kiocb() when the
 * request was queued.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        /* A single segment that covers the whole user buffer. */
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct kiocb kiocb;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        kiocb.ki_left = len;

        /* Re-issue the request for as long as the method asks for a retry. */
        while ((ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos)) == -EIOCBRETRY)
                wait_on_retry_sync_kiocb(&kiocb);

        /* The request was queued: wait for it and use its final result. */
        if (ret == -EIOCBQUEUED)
                ret = wait_on_sync_kiocb(&kiocb);

        *ppos = kiocb.ki_pos;
        return ret;
}

可以看到,它最后还是调用了aio_read()接口函数来完成读操作,即在2.6中,aio_read()为同步和异步读操作的通用接口。由上可以看到,对于ext2,它是generic_file_aio_read():

/**
 * generic_file_aio_read - generic filesystem read routine
 * @iocb:       kernel I/O control block
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @pos: current file position
 *
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 *
 * Abridged excerpt: "……" marks code left out by the article. Each non-empty
 * iovec segment is read through do_generic_file_read() with
 * file_read_actor() as the copy routine.
 *
 * Returns the total number of bytes written to the segments, or the first
 * error when nothing was transferred before it occurred.
 */

ssize_t

generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,

              unsigned long nr_segs, loff_t pos)

{

       struct file *filp = iocb->ki_filp;

       ssize_t retval;

       unsigned long seg;

       /* NOTE(review): count is not assigned in the code shown; presumably the elided checks compute it. */
       size_t count;

       /* The file position is tracked in the kiocb, not in @pos. */
       loff_t *ppos = &iocb->ki_pos;

       …….// sanity checks (elided in this excerpt)

       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */

       if (filp->f_flags & O_DIRECT) {

              ……// direct I/O path, skipped in this walkthrough

       }

       retval = 0;

       if (count) {

              for (seg = 0; seg < nr_segs; seg++) {

                     read_descriptor_t desc; // read descriptor for this segment

                     desc.written = 0;

                     desc.arg.buf = iov[seg].iov_base;

                     desc.count = iov[seg].iov_len;

                     /* Skip empty segments. */
                     if (desc.count == 0)

                            continue;

                     desc.error = 0;

                     do_generic_file_read(filp,ppos,&desc,file_read_actor);

                     retval += desc.written;

                     if (desc.error) {

                            /* Report the error only if no bytes were transferred so far. */
                            retval = retval ?: desc.error;

                            break;

                     }

              }

       }

/* Not referenced in the code shown; presumably the target of gotos in the elided parts. */
out:

       return retval;

}

static inline void do_generic_file_read(struct file * filp, loff_t *ppos,

                                   read_descriptor_t * desc,

                                   read_actor_t actor)

{

       do_generic_mapping_read(filp->f_mapping,

                            &filp->f_ra,

                            filp,

                            ppos,

                            desc,

                            actor);

}

2、读入操作。完成了上面的准备工作,下一步就是执行读操作的核心函数do_generic_mapping_read()。这是一个比较复杂的函数,里面有大量的goto跳转,但还是比较清晰的。

       它工作过程可以描述如下:

a.       如果所要读取的文件在页面缓存中,则跳转到步骤d

b.       文件还没有被缓冲,所以要从设备中去读取,首先分配一个页面,并将这个页面链入到相应的address_space中去

c.       然后调用address_space中的readpage()函数,去从设备中读出一个页面大小的数据到这个页面缓存中。

d.       检查PageUptodate(page),确认页面中的数据是否是最新的;如果不是,则锁住页面并转到步骤c重新从设备读取。

e.       调用由参数传入的actor函数指针,在此为file_read_actor(),将数据从页面缓存中拷贝到用户缓冲区。

f.        如果请求读取的数据长度已完成,则函数返回,否则跳转到步骤a重复执行。

先看看file_read_actor()

/*
 * Read actor: copies up to @size bytes, starting @offset bytes into @page,
 * to the user buffer described by @desc, then advances the descriptor.
 * (Abridged excerpt; "……" marks code left out by the article.)
 *
 * On a partial copy desc->error is set to -EFAULT.
 * Returns the number of bytes actually copied.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page,

                     unsigned long offset, unsigned long size)

{

       char *kaddr;

       unsigned long left, count = desc->count;

       /* Never copy more than the descriptor still asks for. */
       if (size > count)

              size = count;

……

       /* Do it the slow way */

       kaddr = kmap(page);

       left = __copy_to_user(desc->arg.buf, kaddr + offset, size); // copy the data to user space

       kunmap(page);

       /* left is treated as the number of bytes that were NOT copied. */
       if (left) {

              size -= left;

              desc->error = -EFAULT;

       }

/* Not referenced in the code shown; presumably jumped to from the elided part. */
success:

       /* Account for the copied bytes: fewer remaining, more written, buffer advanced. */
       desc->count = count - size;

       desc->written += size;

       desc->arg.buf += size;

       return size;

}

/**
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * Abridged excerpt: "……" marks code left out by the article.
 *
 * Walks the file page by page starting at *ppos. For each page it looks the
 * page up in the page cache, allocates and reads it in when it is missing or
 * not up to date, and hands it to @actor to copy the data out. The loop ends
 * when the descriptor is satisfied, the end of the file is reached, or an
 * error is recorded in desc->error.
 *
 * On exit *ppos is advanced past the consumed bytes and the local readahead
 * state is written back to *_ra.
 */

void do_generic_mapping_read(struct address_space *mapping,

                          struct file_ra_state *_ra,

                          struct file *filp,

                          loff_t *ppos,

                          read_descriptor_t *desc,

                          read_actor_t actor)

{

       struct inode *inode = mapping->host;

       unsigned long index;        /* index of the page currently being read */

       unsigned long end_index;    /* index of the last page of the file */

       unsigned long offset;       /* byte offset inside the current page */

       unsigned long last_index;   /* first page index past the requested range */

       unsigned long next_index;

       unsigned long prev_index;

       loff_t isize;

       struct page *cached_page;   /* page allocated but not yet inserted into the cache */

       int error;

       /* Work on a local copy of the readahead state; it is stored back at out:. */
       struct file_ra_state ra = *_ra;

       cached_page = NULL;

       index = *ppos >> PAGE_CACHE_SHIFT;

       next_index = index;

       prev_index = ra.prev_page;

       last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;

       offset = *ppos & ~PAGE_CACHE_MASK;

       isize = i_size_read(inode);

       /* Empty file: nothing to read. */
       if (!isize)

              goto out;

       end_index = (isize - 1) >> PAGE_CACHE_SHIFT;

       for (;;) {

              struct page *page;

              unsigned long nr, ret;

              /* nr is the maximum number of bytes to copy from this page */

              nr = PAGE_CACHE_SIZE;

              if (index >= end_index) {

                     /* Past the last page of the file. */
                     if (index > end_index)

                            goto out;

                     /* Last page: only the bytes up to the file size are valid. */
                     nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

                     if (nr <= offset) {

                            goto out;

                     }

              }

              nr = nr - offset;

              cond_resched();

              if (index == next_index)

                     next_index = page_cache_readahead(mapping, &ra, filp,

                                   index, last_index - index);

find_page:

              page = find_get_page(mapping, index); // look the page up in the page cache

              if (unlikely(page == NULL)) {

                     handle_ra_miss(mapping, &ra, index);

                     goto no_cached_page; // not found

              }

              if (!PageUptodate(page)) // is the page content up to date?

                     goto page_not_up_to_date;

page_ok: // a usable cached page is available

              ret = actor(desc, page, offset, nr); // copy the data to the user buffer

              // advance offset/index past the bytes just consumed

              offset += ret;

              index += offset >> PAGE_CACHE_SHIFT;

              offset &= ~PAGE_CACHE_MASK;

              page_cache_release(page);

              if (ret == nr && desc->count)

                     continue; // not finished yet: go round the loop again

              goto out; // finished

page_not_up_to_date:

              /* Get exclusive access to the page ... */

              lock_page(page);

              /* Did it get truncated before we got the lock? */

              if (!page->mapping) {

                     unlock_page(page);

                     page_cache_release(page);

                     continue;

              }

              /* Did somebody else fill it already? */

              if (PageUptodate(page)) {

                     unlock_page(page);

                     goto page_ok;

              }

readpage: // read from the device

              /* Start the actual read. The read will unlock the page. */

              error = mapping->a_ops->readpage(filp, page); // the actual read

              ……             

              /* nr is the maximum number of bytes to copy from this page */

              nr = PAGE_CACHE_SIZE;

              if (index == end_index) {

                     nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

                     if (nr <= offset) {

                            page_cache_release(page);

                            goto out;

                     }

              }

              nr = nr - offset;

              goto page_ok;

/* Not referenced in the code shown; presumably the target of gotos in the elided section above. */
readpage_error:

              /* UHHUH! A synchronous read error occurred. Report it */

              desc->error = error;

              page_cache_release(page);

              goto out;

no_cached_page: // allocate a new page and link it into the page cache tree

              /*
               * Ok, it wasn't cached, so we need to create a new
               * page..
               */

              if (!cached_page) {

                     cached_page = page_cache_alloc_cold(mapping);

                     if (!cached_page) {

                            desc->error = -ENOMEM;

                            goto out;

                     }

              }

              /*
               * NOTE(review): the result stored in error is not checked in this
               * excerpt before falling through to readpage, and find_page is
               * never jumped to in the code shown. Presumably the handling of a
               * failed insertion was trimmed by the article; confirm against
               * the kernel source.
               */
              error = add_to_page_cache_lru(cached_page, mapping,

                                          index, GFP_KERNEL);

              page = cached_page;

              cached_page = NULL;

              goto readpage;

       }

out:

       /* Publish the updated readahead state and file position. */
       *_ra = ra;

       *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

       /* Release a page that was allocated but never handed over to the cache. */
       if (cached_page)

              page_cache_release(cached_page);

       if (filp)

              file_accessed(filp);

}

3、从设备读取

对于不同的文件系统有不同的address_space,而且有不同的address_space_operations,对于ext2文件系统来说,这个是如下一个结构:

/*
 * Address-space operations table for ext2.
 *
 * .readpage is ext2_readpage, which is the method invoked through
 * mapping->a_ops->readpage() to read one page from the device.
 */
const struct address_space_operations ext2_aops = {

       .readpage               = ext2_readpage,

       .readpages             = ext2_readpages,

       .writepage             = ext2_writepage,

       .sync_page            = block_sync_page,

       .prepare_write        = ext2_prepare_write,

       .commit_write              = generic_commit_write,

       .bmap                   = ext2_bmap,

       .direct_IO              = ext2_direct_IO,

       .writepages            = ext2_writepages,

       .migratepage          = buffer_migrate_page,

};

可见,这个readpage()便是ext2_readpage(),它负责从设备中读取一个页面。

/*
 * ->readpage method for ext2: hands @page to the generic mpage_readpage()
 * together with ext2_get_block as the block-mapping callback.
 * @file is not used here.
 *
 * Returns the value returned by mpage_readpage().
 */
static int ext2_readpage(struct file *file, struct page *page)
{
        int status = mpage_readpage(page, ext2_get_block);

        return status;
}

/*
 * Read a single page: build a bio for @page with do_mpage_readpage(), using
 * @get_block for block mapping, and submit it as a READ if one was returned.
 *
 * This isn't called much at all.
 *
 * Always returns 0.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
        sector_t last_block_in_bio = 0;
        struct bio *bio;

        /* Start from an unmapped buffer head before it is handed to the helper. */
        clear_buffer_mapped(&map_bh);

        /* No bio exists yet (NULL) and exactly one page is requested. */
        bio = do_mpage_readpage(NULL, page, 1, &last_block_in_bio,
                                &map_bh, &first_logical_block, get_block);
        if (bio != NULL)
                mpage_bio_submit(READ, bio);

        return 0;
}

这个函数最终将读请求转成submit_bio(),之后就是通用块层的事情了。

阅读(1490) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~

chinaunix网友2010-08-30 21:28:38

Download More than 1000 free IT eBooks: http://free-ebooks.appspot.com