Chinaunix首页 | 论坛 | 博客
  • 博客访问: 513940
  • 博文数量: 80
  • 博客积分: 1475
  • 博客等级: 上尉
  • 技术积分: 1047
  • 用 户 组: 普通用户
  • 注册时间: 2010-04-01 22:58
文章分类

全部博文(80)

文章存档

2012年(3)

2010年(77)

我的朋友

分类: LINUX

2010-05-09 15:24:44

八:VFS层的I/O操作
VFS层是与用户界面直接交互的接口,在这一节里,我们将分为读、写两部分来介绍VFS层的操作以及它与上层应用的交互.
8.1:文件的读操作
在用户空间,读文件操作的常用函数为read()。对应在系统空间的调用入口是sys_read().它的代码如下:
/*
 * read(2) system-call entry point.
 *
 * Looks up the struct file behind fd, reads through vfs_read() starting at
 * the position obtained from file_pos_read(), and stores the position that
 * vfs_read() left behind with file_pos_write().
 *
 * Returns the result of vfs_read(), or -EBADF when fd has no file.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
     int fput_needed;
     struct file *file = fget_light(fd, &fput_needed);
     ssize_t nread;
     loff_t offset;

     /* Nothing is open on this descriptor. */
     if (!file)
         return -EBADF;

     /* Read using a private copy of the file position ... */
     offset = file_pos_read(file);
     nread = vfs_read(file, buf, count, &offset);
     /* ... and publish the updated position afterwards. */
     file_pos_write(file, offset);

     /* fput_needed was filled in by fget_light() above. */
     fput_light(file, fput_needed);

     return nread;
}
从进程中取得文件描述符后和文件当前的操作位置后会调用vfs_read()执行具体的操作过程.它的代码如下:
/*
 * Validate a read request and hand it to the file's read method.
 *
 * Returns -EBADF when the file was not opened with FMODE_READ, -EINVAL when
 * it has neither a ->read nor an ->aio_read method, the non-zero result of
 * locks_verify_area() or security_file_permission() when one of them
 * refuses, and otherwise whatever the read method returns.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
     struct inode *inode = file->f_dentry->d_inode;
     ssize_t nread;

     if (!(file->f_mode & FMODE_READ))
         return -EBADF;
     if (!file->f_op)
         return -EINVAL;
     if (!file->f_op->read && !file->f_op->aio_read)
         return -EINVAL;

     /* Ask whether the range starting at *pos may be read. */
     nread = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count);
     if (nread)
         return nread;

     /* Permission check for reading. */
     nread = security_file_permission (file, MAY_READ);
     if (nread)
         return nread;

     /*
      * Prefer ->read. When only ->aio_read exists, do_sync_read() is
      * used instead.
      */
     if (file->f_op->read)
         nread = file->f_op->read(file, buf, count, pos);
     else
         nread = do_sync_read(file, buf, count, pos);

     /* Something was actually read: raise a DN_ACCESS notification. */
     if (nread > 0)
         dnotify_parent(file->f_dentry, DN_ACCESS);

     return nread;
}
从上面看到,最终会调用file的相关操作(file->f_op)来完成文件的读操作.记得我们在文件的打开一节中分析了文件的打开过程。在打开文件过程中,file->f_op会被赋值为inode->i_fop.对于ext2文件系统,inode的相关信息如下:
         inode->i_fop = &ext2_file_operations;
struct file_operations ext2_file_operations = {
     .llseek       = generic_file_llseek,
     .read         = generic_file_read,
     .write        = generic_file_write,
     .aio_read = generic_file_aio_read,
     .aio_write    = generic_file_aio_write,
     .ioctl        = ext2_ioctl,
     .mmap         = generic_file_mmap,
     .open         = generic_file_open,
     .release = ext2_release_file,
     .fsync        = ext2_sync_file,
     .readv        = generic_file_readv,
     .writev       = generic_file_writev,
     .sendfile = generic_file_sendfile,
}
相应文件读操作入口为generic_file_read():
/*
 * Synchronous read implemented on top of __generic_file_aio_read().
 *
 * Wraps the single user buffer in a one-element iovec, issues the request
 * with a synchronous kiocb and, if the request was only queued
 * (-EIOCBQUEUED), waits for it to complete.
 *
 * Returns the result of the read, or of the wait when the read was queued.
 */
ssize_t
generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
     /* The user buffer, described as a single segment. */
     struct iovec user_seg = { .iov_base = buf, .iov_len = count };
     /* Control block that tracks completion of this request. */
     struct kiocb sync_iocb;
     ssize_t done;

     init_sync_kiocb(&sync_iocb, filp);
     done = __generic_file_aio_read(&sync_iocb, &user_seg, 1, ppos);

     /* Finished (or failed) immediately: nothing to wait for. */
     if (done != -EIOCBQUEUED)
         return done;

     /* The request was queued; block until it completes. */
     return wait_on_sync_kiocb(&sync_iocb);
}
__generic_file_aio_read()是一个很重要的函数,它是读操作的入口。代码如下:
/*
 * Core of the generic read path: validate the iovec array, then read either
 * with direct I/O (O_DIRECT) or, segment by segment, through
 * do_generic_file_read().
 *
 *   iocb:    control block; iocb->ki_filp is the file being read
 *   iov:     array of user-space destination segments
 *   nr_segs: number of entries in iov
 *   ppos:    in/out file position
 *
 * Returns the number of bytes read; -EINVAL if a segment length or the
 * running total is negative when viewed as ssize_t; -EFAULT if the first
 * segment fails access_ok(); -EIOCBQUEUED for a non-failing O_DIRECT request
 * on a non-synchronous iocb; or, on the buffered path, the error of the
 * first failing segment when nothing at all was read.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
         unsigned long nr_segs, loff_t *ppos)
{
     struct file *filp = iocb->ki_filp;
     ssize_t retval;
     unsigned long seg;
     size_t count;

     count = 0;
     for (seg = 0; seg < nr_segs; seg++) {
         const struct iovec *iv = &iov[seg];

         /*
          * If any segment has a negative length, or the cumulative
          * length ever wraps negative then return -EINVAL.
          */
         count += iv->iov_len;
         if (unlikely((ssize_t)(count|iv->iov_len) < 0))
              return -EINVAL;
         /* The segment must be writable user memory. */
         if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
              continue;
         if (seg == 0)
              return -EFAULT;
         /* Keep only the segments in front of the bad one ... */
         nr_segs = seg;
         /* ... and take the bad segment's length out of the total. */
         count -= iv->iov_len;  /* This segment is no good */
         break;
     }

     /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
     if (filp->f_flags & O_DIRECT) {
         loff_t pos = *ppos, size;
         struct address_space *mapping;
         struct inode *inode;

         mapping = filp->f_mapping;
         inode = mapping->host;
         retval = 0;
         if (!count)
              goto out; /* skip atime */
         size = i_size_read(inode);
         if (pos < size) {
              retval = generic_file_direct_IO(READ, iocb,
                            iov, pos, nr_segs);
              if (retval >= 0 && !is_sync_kiocb(iocb))
                   retval = -EIOCBQUEUED;
              if (retval > 0)
                   *ppos = pos + retval;
         }
         file_accessed(filp);
         goto out;
     }

     /* Buffered path: count is the total number of bytes requested. */
     retval = 0;
     if (count) {
         for (seg = 0; seg < nr_segs; seg++) {
              /* Per-segment read state handed to the actor. */
              read_descriptor_t desc;

              desc.written = 0;
              desc.arg.buf = iov[seg].iov_base;
              desc.count = iov[seg].iov_len;
              /* Empty segment: nothing to transfer. */
              if (desc.count == 0)
                   continue;
              desc.error = 0;
              do_generic_file_read(filp,ppos,&desc,file_read_actor,0);
              /* desc.written is what this segment actually received. */
              retval += desc.written;
              /*
               * Stop at the first error, even when earlier segments
               * already delivered data. Only report the error itself
               * when nothing was read at all. Otherwise return the
               * byte count so the caller sees a short read.
               */
              if (desc.error) {
                   if (!retval)
                        retval = desc.error;
                   break;
              }
              /*
               * The segment was not filled completely (short read).
               * Moving on to the next segment would leave a gap in
               * the caller's buffers, so stop here.
               */
              if (desc.count > 0)
                   break;
         }
     }
out:
     return retval;
}
这里有种特殊情况,当文件是用直接I/O模式打开时(文件描述符带有O_DIRECT标志),就会采用直接I/O而跳过页高速缓存区。这样的情况我们在之后再讨论.
对于普通模式的情况,将会对每一个段调用do_generic_file_read()来完成I/O操作。这个函数的代码如下:
do_generic_file_read() -> do_generic_mapping_read():
/*
 * Feed file data to the reader described by desc, page by page, going
 * through the page cache and calling ->readpage for pages that are missing
 * or not up to date.
 *
 *   mapping:  address_space (page cache) of the file
 *   _ra:      readahead state; copied into a local on entry, written back
 *             on exit
 *   filp:     the open file; file_accessed() is only called when it is
 *             non-NULL
 *   ppos:     in: position to start at; out: position after the read
 *   desc:     read descriptor (remaining count, error)
 *   actor:    routine that consumes bytes from a page and returns how many
 *             it used
 *   nonblock: when non-zero, no readahead is issued and a missing or
 *             not-up-to-date page ends the loop with
 *             desc->error = -EWOULDBLOCKIO instead of starting I/O
 */
void do_generic_mapping_read(struct address_space *mapping,
                   struct file_ra_state *_ra,
                   struct file *filp,
                   loff_t *ppos,
                   read_descriptor_t *desc,
                   read_actor_t actor,
                   int nonblock)
{
     struct inode *inode = mapping->host;
     unsigned long index, end_index, offset;
     loff_t isize;
     struct page *cached_page;
     int error;
     struct file_ra_state ra = *_ra;
 
     cached_page = NULL;
     // Index of the page-cache page that holds *ppos
     index = *ppos >> PAGE_CACHE_SHIFT;
     // Byte offset of *ppos inside that page
     offset = *ppos & ~PAGE_CACHE_MASK;
 
     // Current file size; an empty file has nothing to read
     isize = i_size_read(inode);
     if (!isize)
         goto out;
 
     // Index of the page that holds the last byte of the file
     end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
     for (;;) {
         struct page *page;
         unsigned long nr, ret;
 
         /* nr is the maximum number of bytes to copy from this page */
         nr = PAGE_CACHE_SIZE;
         if (index >= end_index) {
              // Past the last page of the file: nothing left
              if (index > end_index)
                   goto out;
 
              // Here index == end_index: nr becomes the number of valid
              // bytes in the last page
              nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
              // The position is already at or beyond the end of the data
              if (nr <= offset) {
                   goto out;
              }
         }
 
         // Bytes available in this page from offset onwards
         nr = nr - offset;
 
         // Scheduling point
         cond_resched();
 
         // Readahead is only issued for blocking reads
         if (!nonblock)
              page_cache_readahead(mapping, &ra, filp, index);
 
find_page:
         // Look the page up in the page cache
         page = find_get_page(mapping, index);
         if (unlikely(page == NULL)) {
              // Not cached. A non-blocking caller gives up here.
              if (nonblock) {
                   desc->error = -EWOULDBLOCKIO;
                   break;
              }
              handle_ra_miss(mapping, &ra, index);
              goto no_cached_page;
         }
 
         // The page is in the cache.
 
         // If it is up to date it can be copied straight away; otherwise
         // its contents must first be read from the filesystem.
         if (!PageUptodate(page)) {
 
              // A non-blocking caller drops the page and gives up
              if (nonblock) {
                   page_cache_release(page);
                   desc->error = -EWOULDBLOCKIO;
                   break;
              }
              goto page_not_up_to_date;
         }
page_ok:
 
         /* If users can be writing to this page using arbitrary
          * virtual addresses, take care about potential aliasing
          * before reading the page on the kernel side.
          */
         if (mapping_writably_mapped(mapping))
              flush_dcache_page(page);
 
         /*
          * Mark the page accessed if we read the beginning.
          */
         if (!offset)
              mark_page_accessed(page);
 
         /*
          * Ok, we have the page, and it's up-to-date, so
          * now we can copy it to user space...
          *
          * The actor routine returns how many bytes were actually used..
          * NOTE! This may not be the same as how much of a user buffer
          * we filled up (we may be padding etc), so we can only update
          * "pos" here (the actor routine has to update the user buffer
          * pointers and the remaining count).
          */
 
         // Hand nr bytes at offset to the actor; ret is how many it used.
         // Then advance index/offset by ret.
         ret = actor(desc, page, offset, nr);
         offset += ret;
         index += offset >> PAGE_CACHE_SHIFT;
         offset &= ~PAGE_CACHE_MASK;
 
         page_cache_release(page);
         // Everything offered was consumed and the reader wants more:
         // move on to the next page
         if (ret == nr && desc->count)
              continue;
         // Otherwise the read is finished (done, short copy, or error)
         goto out;
 
page_not_up_to_date:
         /* Get exclusive access to the page ... */
         lock_page(page);
 
         /* Did it get unhashed before we got the lock? */
         // The page was removed from the mapping while we waited for the
         // lock: unlock it, drop our reference and retry the iteration,
         // which looks the page up again.
         if (!page->mapping) {
              unlock_page(page);
              page_cache_release(page);
              continue;
         }
 
         /* Did somebody else fill it already? */
         // If so, no read is needed; copy it out via page_ok.
         if (PageUptodate(page)) {
              unlock_page(page);
              goto page_ok;
         }
 
// Read the page in from the filesystem
readpage:
         /* Start the actual read. The read will unlock the page. */
 
         error = mapping->a_ops->readpage(filp, page);
 
         // ->readpage failed
         if (unlikely(error))
              goto readpage_error;
 
         // Not up to date yet: wait until the page is no longer locked.
         // Per the comment at readpage: above, the read unlocks the page.
         if (!PageUptodate(page)) {
              wait_on_page_locked(page);
 
              // Unlocked but still not up to date: treat it as an I/O error
              if (!PageUptodate(page)) {
                   error = -EIO;
                   goto readpage_error;
              }
         }
 
         /*
          * i_size must be checked after we have done ->readpage.
          *
          * Checking i_size after the readpage allows us to calculate
          * the correct value for "nr", which means the zero-filled
          * part of the page is not copied back to userspace (unless
          * another truncate extends the file - this is desired though).
          */
         isize = i_size_read(inode);
         end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 
         // The file is now empty or ends before this page
         if (unlikely(!isize || index > end_index)) {
              page_cache_release(page);
              goto out;
         }
 
         /* nr is the maximum number of bytes to copy from this page */
         // Recompute nr against the re-read file size
         nr = PAGE_CACHE_SIZE;
         if (index == end_index) {
              nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
              if (nr <= offset) {
                   page_cache_release(page);
                   goto out;
              }
         }
         nr = nr - offset;
         goto page_ok;
 
readpage_error:
         /* UHHUH! A synchronous read error occurred. Report it */
         desc->error = error;
         page_cache_release(page);
         goto out;
 
no_cached_page:
         /*
          * Ok, it wasn't cached, so we need to create a new
          * page..
          */
 
         // Allocate a page unless one is left over from an earlier
         // attempt that lost the insertion race
         if (!cached_page) {
              cached_page = page_cache_alloc_cold(mapping);
              if (!cached_page) {
                   desc->error = -ENOMEM;
                   goto out;
              }
         }
 
         // Insert the new page into the page cache and the LRU.
         // NOTE(review): presumably the page comes back locked
         // (PG_locked), since the readpage: path expects the read to
         // unlock it - confirm against add_to_page_cache_lru().
         error = add_to_page_cache_lru(cached_page, mapping,
                            index, GFP_KERNEL);
         if (error) {
              // Someone else inserted a page at this index first: look
              // it up again (cached_page is kept for reuse)
              if (error == -EEXIST)
                   goto find_page;
              desc->error = error;
              goto out;
         }
         page = cached_page;
         cached_page = NULL;
         goto readpage;
     }
 
out:
     // Write the readahead state back to the caller
     *_ra = ra;
 
     // Report the position reached
     *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
     // Release a page that was allocated but never inserted
     if (cached_page)
         page_cache_release(cached_page);
     if (filp)
         file_accessed(filp);
}
如果参数nonblock不为0,则不会进行预读,并且当页面不在页高速缓存中或者页面内容不是最新时,直接以-EWOULDBLOCKIO出错返回,而不会发起I/O。在这里的调用中nonblock为零,所以每次循环都会先调用page_cache_readahead()进行预读。关于预读的操作,我们之后再给出分析.
在这个操作中,有这样几种可能的情况:
1:如果要访问的页面在页高速缓存中,而且已经被更新(含有PG_uptodate标志).只需要直接将其copy到用户空间即可.
2:序号对应的页面不在高速缓存中,那就需要在页高速缓存中增加序号对应的页面。然后从文件系统中读取数据到这个页面上.再拷贝到用户空间。
3:序号对应的页面在高速缓存中,但数据不是最新的.这就需要缓存页与文件系统进行同步.再将页面拷贝到用户空间.
对于2和3,它们有一部分是相同的,即从文件系统中读数据的过程。我们只需要分析第2种情况。对应的代码片段如下:
void do_generic_mapping_read(struct address_space *mapping,
                   struct file_ra_state *_ra,
                   struct file *filp,
                   loff_t *ppos,
                   read_descriptor_t *desc,
                   read_actor_t actor,
                   int nonblock)
{
     ……
     page = find_get_page(mapping, index);
         if (unlikely(page == NULL)) {
              //没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页
              if (nonblock) {
                   desc->error = -EWOULDBLOCKIO;
                   break;
              }
              handle_ra_miss(mapping, &ra, index);
              goto no_cached_page;
         }
     ……
     ……
 
}
handle_ra_miss()主要对文件的预读进行调整,在这里不进行分析,待分析预读机制的时候再来详细分析.
如果页面高速缓存中不存在此页面就会跳转到no_cached_page:
no_cached_page:
         /*
          * Ok, it wasn't cached, so we need to create a new
          * page..
          */
 
         //在页缓区中没有相关的缓存页
 
         //新分匹一个页面
         if (!cached_page) {
              cached_page = page_cache_alloc_cold(mapping);
              if (!cached_page) {
                   desc->error = -ENOMEM;
                   goto out;
              }
         }
 
         //将分得的页加到页缓存区和LRU
         // TODO:在将新页面插入页缓存区域中,会将页面标志设置为PG_locked
         error = add_to_page_cache_lru(cached_page, mapping,
                            index, GFP_KERNEL);
         if (error) {
              if (error == -EEXIST)
                   goto find_page;
              desc->error = error;
              goto out;
         }
         page = cached_page;
         cached_page = NULL;
         goto readpage;
在这里,会首先调用page_cache_alloc_cold()分配一个页面。然后调用add_to_page_cache_lru()将页面插入页高速缓存并加入lru.然后跳转到readpage。这也是第3种情况对应的处理:
 
//读取页面
readpage:
         /* Start the actual read. The read will unlock the page. */
 
         //到这里的话,实际的读取过程开始了 ^_^
         error = mapping->a_ops->readpage(filp, page);
在这里会看到,最终会调用页高速缓存的readpage方法进行读取操作。
 
文件页高速缓存的readpage操作
同理,还是以ext2文件系统为例来分析。在open的时候,它将页高速缓存对应的各项操作设置如下:
inode->i_mapping->a_ops = &ext2_aops;
/*
 * Address-space (page cache) operation table for ext2. The read entries
 * (.readpage/.readpages) and most write entries are ext2_* functions;
 * .sync_page and .commit_write are the shared block_sync_page and
 * generic_commit_write.
 */
struct address_space_operations ext2_aops = {
     .readpage     = ext2_readpage,
     .readpages         = ext2_readpages,
     .writepage         = ext2_writepage,
     .sync_page         = block_sync_page,
     .prepare_write         = ext2_prepare_write,
     .commit_write      = generic_commit_write,
     .bmap              = ext2_bmap,
     .direct_IO         = ext2_direct_IO,
     .writepages        = ext2_writepages,
};
对应的入口为ext2_readpage:
/*
 * ->readpage for ext2: forwards to mpage_readpage(), supplying
 * ext2_get_block as the block-mapping callback. The file argument is not
 * used.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
     return mpage_readpage(page, ext2_get_block);
}
这是一个封装的函数,采用一个回调函数做为参数.该回调函数将相对于文件起始的块号转换为文件系统的逻辑块号.
mpage_readpage()的代码如下:
/*
 * Read a single page: let do_mpage_readpage() describe the page as a bio
 * and submit whatever bio it hands back.
 *
 * Always returns 0.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
     sector_t last_block = 0;
     struct bio *pending;

     /* Start without a bio; do_mpage_readpage() allocates one if needed. */
     pending = do_mpage_readpage(NULL, page, 1, &last_block, get_block);

     /* A bio is still outstanding for this page: send it off. */
     if (pending != NULL)
         mpage_bio_submit(READ, pending);

     return 0;
}
mpage_bio_submit()这个操作中有一部份代码在之前已经分析过了。剩余的代码很简单。这里不做分析.
do_mpage_readpage()的代码如下:
/*
 * Map one page of a file onto disk blocks and add it to a read bio.
 *
 *   bio:               bio being built by the caller, or NULL
 *   page:              the page to read
 *   nr_pages:          used (capped by bio_get_nr_vecs()) to size a newly
 *                      allocated bio
 *   last_block_in_bio: in/out, last disk block already covered by bio
 *   get_block:         callback that maps a file block number into a
 *                      buffer_head
 *
 * Returns the bio still under construction, or NULL if there is none.
 * Anything that does not fit the "mapped, contiguous blocks, optionally
 * followed by a hole" pattern goes to confused:, which submits the pending
 * bio and falls back to block_read_full_page().
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
              sector_t *last_block_in_bio, get_block_t get_block)
{
     struct inode *inode = page->mapping->host;
     const unsigned blkbits = inode->i_blkbits;
     // Number of blocks in one page
     const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
     // Block size in bytes
     const unsigned blocksize = 1 << blkbits;
     sector_t block_in_file;
     sector_t last_block;
     sector_t blocks[MAX_BUF_PER_PAGE];
     unsigned page_block;
     unsigned first_hole = blocks_per_page;
     struct block_device *bdev = NULL;
     struct buffer_head bh;
     int length;
     int fully_mapped = 1;
 
     // The page already carries buffers: take the buffer-based path
     if (page_has_buffers(page))
         goto confused;
 
     // File block number of the first block in this page
     block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
     // Number of blocks covered by the file size, rounded up
     last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
 
     bh.b_page = page;
     // Walk every block of the page
     for (page_block = 0; page_block < blocks_per_page;
                   page_block++, block_in_file++) {
         bh.b_state = 0;
         if (block_in_file < last_block) {
              // Map the file block into bh; on error use the fallback
              if (get_block(inode, block_in_file, &bh, 0))
                   goto confused;
         }
 
     // Unmapped block (e.g. a hole, or past last_block): remember the
     // first one and keep scanning
         if (!buffer_mapped(&bh)) {
              fully_mapped = 0;
              if (first_hole == blocks_per_page)
                   first_hole = page_block;
              continue;
         }
 
         /* some filesystems will copy data into the page during
          * the get_block call, in which case we don't want to
          * read it again.  map_buffer_to_page copies the data
          * we just collected from get_block into the page's buffers
          * so readpage doesn't have to repeat the get_block call
          */
         if (buffer_uptodate(&bh)) {
              map_buffer_to_page(page, &bh, page_block);
              goto confused;
         }
    
         if (first_hole != blocks_per_page)
              goto confused;         /* hole -> non-hole */
 
         /* Contiguous blocks? */
         // A block that does not directly follow the previous one on
         // disk cannot be covered by this bio
         if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
              goto confused;
         blocks[page_block] = bh.b_blocknr;
         bdev = bh.b_bdev;
     }
 
     if (first_hole != blocks_per_page) {
         // Zero the page from the first unmapped block to its end
         char *kaddr = kmap_atomic(page, KM_USER0);
         memset(kaddr + (first_hole << blkbits), 0,
                   PAGE_CACHE_SIZE - (first_hole << blkbits));
         flush_dcache_page(page);
         kunmap_atomic(kaddr, KM_USER0);
         // No mapped block at all: the page is complete without any I/O
         if (first_hole == 0) {
              SetPageUptodate(page);
              unlock_page(page);
              goto out;
         }
     } else if (fully_mapped) {
         // Every block is mapped: flag the page as mapped to disk
         SetPageMappedToDisk(page);
     }
 
     /*
      * This page will go to BIO.  Do we need to send this BIO off first?
      */
     if (bio && (*last_block_in_bio != blocks[0] - 1))
         bio = mpage_bio_submit(READ, bio);
 
alloc_new:
     if (bio == NULL) {
         // Allocate a bio starting at this page's first block; the shift
         // by (blkbits - 9) converts blocks to 512-byte sectors
         bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
                   min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
                   GFP_KERNEL);
         if (bio == NULL)
              goto confused;
     }
 
     // Only the bytes before the first hole need to be read
     length = first_hole << blkbits;
     // Add the page to the bio; if it does not fit, submit the bio and
     // retry with a fresh one
     if (bio_add_page(bio, page, length, 0) < length) {
         bio = mpage_bio_submit(READ, bio);
         goto alloc_new;
     }
 
     // Submit now at a buffer boundary or when the page ends in a hole;
     // otherwise remember the last block so a following page can extend
     // the bio
     if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
         bio = mpage_bio_submit(READ, bio);
     else
         *last_block_in_bio = blocks[blocks_per_page - 1];
out:
     return bio;
 
confused:
     // Fallback: flush the pending bio, then read the page block by
     // block unless it is already up to date
     if (bio)
         bio = mpage_bio_submit(READ, bio);
     if (!PageUptodate(page))
             block_read_full_page(page, get_block);
     else
         unlock_page(page);
     goto out;
}
这段代码实际上做了一个小小的优化。它会判断要提交的块缓存区是不是连续的。如果是连续的就可以将它们放一个bio中。然后提交到通用块设备层。如果不是连续的,对于每一个块缓存区都要提交一次.
对于连续条件的bio提交很好理解,代码也很容易.重点分析对于不连续的块的处理。
在上面的代码中可以看到,对于不连续块是通过block_read_full_page()来处理的.代码如下:
/*
 * Read a whole page through its buffer_heads, one block at a time.
 *
 *   page:      the page to fill; must be locked
 *   get_block: callback that maps a file block number into a buffer_head
 *
 * Stage one collects the buffers that need I/O (zero-filling unmapped
 * ones), stage two locks them and marks them for async read, stage three
 * submits them. Always returns 0; a get_block() failure is recorded with
 * SetPageError().
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
     struct inode *inode = page->mapping->host;
     sector_t iblock, lblock;
     struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
     unsigned int blocksize;
     int nr, i;
     int fully_mapped = 1;
 
     // The caller must hold the page lock
     if (!PageLocked(page))
         PAGE_BUG(page);
     // Block size in bytes
     blocksize = 1 << inode->i_blkbits;
     // Attach empty buffers if the page has none yet
     if (!page_has_buffers(page))
         create_empty_buffers(page, blocksize, 0);
     // First buffer_head of the page's buffer ring
     head = page_buffers(page);
 
     // File block number of the first block in this page
     iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
     // Number of blocks covered by the file size, rounded up
     lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
     bh = head;
     nr = 0;
     i = 0;
 
     do {
         // Already up to date: no I/O needed for this buffer
         if (buffer_uptodate(bh))
              continue;
         // Buffer not yet mapped to a disk block
         if (!buffer_mapped(bh)) {
              fully_mapped = 0;
              if (iblock < lblock) {
                   // Map the file block; on failure flag the page
                   if (get_block(inode, iblock, bh, 0))
                       SetPageError(page);
              }
              // Still unmapped (e.g. a hole in the file): zero this
              // block's part of the page and mark the buffer up to date
              if (!buffer_mapped(bh)) {
                   void *kaddr = kmap_atomic(page, KM_USER0);
                   memset(kaddr + i * blocksize, 0, blocksize);
                   flush_dcache_page(page);
                   kunmap_atomic(kaddr, KM_USER0);
                   set_buffer_uptodate(bh);
                   continue;
              }
              /*
               * get_block() might have updated the buffer
               * synchronously
               */
              if (buffer_uptodate(bh))
                   continue;
         }
 
         // This buffer needs to be read: queue it in arr
         arr[nr++] = bh;
     } while (i++, iblock++, (bh = bh->b_this_page) != head);
 
     // Every buffer was already mapped: flag the page as mapped to disk
     if (fully_mapped)
         SetPageMappedToDisk(page);
 
     // Nothing to read
     if (!nr) {
         /*
          * All buffers are uptodate - we can set the page uptodate
          * as well. But not if get_block() returned an error.
          */
         if (!PageError(page))
              SetPageUptodate(page);
         unlock_page(page);
         return 0;
     }
 
     /* Stage two: lock the buffers */
     for (i = 0; i < nr; i++) {
         bh = arr[i];
         lock_buffer(bh);
         mark_buffer_async_read(bh);
     }
 
     /*
      * Stage 3: start the IO.  Check for uptodateness
      * inside the buffer lock in case another process reading
      * the underlying blockdev brought it uptodate (the sct fix).
      */
 
     // One submit_bh() per queued buffer
     for (i = 0; i < nr; i++) {
         bh = arr[i];
         if (buffer_uptodate(bh))
              end_buffer_async_read(bh, 1);
         else
              submit_bh(READ, bh);
     }
     return 0;
}
从上面的代码中可以看到,对于不连续的读操作,会对每一个需要读取的块缓存区反复调用submit_bh()来完成.
 
8.2:文件的写操作
在用户空间中,用户的写操作接口为write.对应系统调用的入口为sys_write().
代码如下:
/*
 * write(2) system-call entry point.
 *
 * Looks up the struct file behind fd, writes through vfs_write() starting
 * at the position obtained from file_pos_read(), and stores the position
 * that vfs_write() left behind with file_pos_write().
 *
 * Returns the result of vfs_write(), or -EBADF when fd has no file.
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
{
     int fput_needed;
     struct file *file = fget_light(fd, &fput_needed);
     ssize_t nwritten;
     loff_t offset;

     /* Nothing is open on this descriptor. */
     if (!file)
         return -EBADF;

     /* Write using a private copy of the file position ... */
     offset = file_pos_read(file);
     nwritten = vfs_write(file, buf, count, &offset);
     /* ... and publish the updated position afterwards. */
     file_pos_write(file, offset);

     /* fput_needed was filled in by fget_light() above. */
     fput_light(file, fput_needed);

     return nwritten;
}
上面的代码与读操作差不多,都是取文件描述符和当前文件,操作完后,更新文件指针位置.
vfs_write()代码如下:
/*
 * Validate a write request and hand it to the file's write method.
 *
 * Returns -EBADF when the file was not opened with FMODE_WRITE, -EINVAL
 * when it has neither a ->write nor an ->aio_write method, the non-zero
 * result of locks_verify_area() or security_file_permission() when one of
 * them refuses, and otherwise whatever the write method returns.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
     struct inode *inode = file->f_dentry->d_inode;
     ssize_t nwritten;

     /* Not opened for writing. */
     if (!(file->f_mode & FMODE_WRITE))
         return -EBADF;
     /* No operation table, or a table without any write method. */
     if (!file->f_op)
         return -EINVAL;
     if (!file->f_op->write && !file->f_op->aio_write)
         return -EINVAL;

     /* Ask whether the range starting at *pos may be written. */
     nwritten = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count);
     if (nwritten)
         return nwritten;

     /* Permission check for writing. */
     nwritten = security_file_permission (file, MAY_WRITE);
     if (nwritten)
         return nwritten;

     /*
      * Prefer ->write. When only ->aio_write exists, do_sync_write() is
      * used instead.
      */
     if (file->f_op->write)
         nwritten = file->f_op->write(file, buf, count, pos);
     else
         nwritten = do_sync_write(file, buf, count, pos);

     /* Something was actually written: raise a DN_MODIFY notification. */
     if (nwritten > 0)
         dnotify_parent(file->f_dentry, DN_MODIFY);

     return nwritten;
}
对于大部份情况,写操作会由file->f_op->write完成.在ext2文件系统中,此接口对应的函数为:
/*
 * Generic ->write implementation.
 *
 * Wraps the user buffer in a one-element iovec and performs the write via
 * generic_file_write_nolock() while holding inode->i_sem. If data was
 * written and either the file has O_SYNC set or IS_SYNC(inode) is true,
 * the range just written is pushed out with sync_page_range().
 *
 * Returns the result of the write, or the negative result of
 * sync_page_range() when that step fails.
 */
ssize_t generic_file_write(struct file *file, const char __user *buf,
                 size_t count, loff_t *ppos)
{
     struct address_space *mapping = file->f_mapping;
     struct inode *inode = mapping->host;
     struct iovec user_seg = { .iov_base = (void __user *)buf,
                       .iov_len = count };
     ssize_t nwritten;
     ssize_t sync_err;

     /* The write itself runs under the inode semaphore. */
     down(&inode->i_sem);
     nwritten = generic_file_write_nolock(file, &user_seg, 1, ppos);
     up(&inode->i_sem);

     /* Nothing written, or an error: no sync step. */
     if (nwritten <= 0)
         return nwritten;
     /* Synchronous behaviour was not requested. */
     if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
         return nwritten;

     /* *ppos is already past the written data, so step back by nwritten. */
     sync_err = sync_page_range(inode, mapping, *ppos - nwritten, nwritten);
     if (sync_err < 0)
         return sync_err;

     return nwritten;
}
如果打开文件时带有O_SYNC标志,或者文件系统带有SYNC标志,都会将缓存中的数据直接写到文件系统上.
generic_file_write_nolock()最终会转入generic_file_aio_write_nolock(),代码如下:
/*
 * Common write path (the name indicates the caller does the locking;
 * generic_file_write() holds inode->i_sem around its write call).
 *
 *   iocb:    control block; iocb->ki_filp is the file being written
 *   iov:     array of user-space source segments
 *   nr_segs: number of entries in iov
 *   ppos:    in/out file position
 *
 * Validates the segments, runs generic_write_checks(), removes suid,
 * updates the inode time, then writes with direct I/O (O_DIRECT) and/or
 * through generic_file_buffered_write().
 *
 * Returns the number of bytes written if non-zero, otherwise err;
 * -EINVAL / -EFAULT for bad segments.
 */
ssize_t
generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                   unsigned long nr_segs, loff_t *ppos)
{
     struct file *file = iocb->ki_filp;
     struct address_space * mapping = file->f_mapping;
     size_t ocount;         /* original count */
     size_t count;      /* after file limit checks */
     struct inode *inode = mapping->host;
     unsigned long seg;
     loff_t        pos;
     ssize_t       written;
     ssize_t       err;
 
     ocount = 0;
     for (seg = 0; seg < nr_segs; seg++) {
         const struct iovec *iv = &iov[seg];
 
         /*
          * If any segment has a negative length, or the cumulative
          * length ever wraps negative then return -EINVAL.
          */
         ocount += iv->iov_len;
         if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
              return -EINVAL;
         // The segment must be readable user memory
         if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
              continue;
         if (seg == 0)
              return -EFAULT;
         // Keep only the segments in front of the bad one
         nr_segs = seg;
         ocount -= iv->iov_len; /* This segment is no good */
         break;
     }
 
     // count: total number of bytes to write
     count = ocount;
     // pos: position to start writing at
     pos = *ppos;
 
     /* We can write back this queue in page reclaim */
     current->backing_dev_info = mapping->backing_dev_info;
     written = 0;
 
     // Detailed checks on the request; may adjust pos and count
     err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
     if (err)
         goto out;
 
     if (count == 0)
         goto out;
 
     err = remove_suid(file->f_dentry);
     if (err)
         goto out;
 
     // Update the inode's timestamps
     inode_update_time(inode, 1);
 
     /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
     if (unlikely(file->f_flags & O_DIRECT)) {
         written = generic_file_direct_write(iocb, iov,
                   &nr_segs, pos, ppos, count, ocount);
         // Error, or everything written directly: done
         if (written < 0 || written == count)
              goto out;
         /*
          * direct-io write to a hole: fall through to buffered I/O
          * for completing the rest of the request.
          */
         pos += written;
         count -= written;
     }
 
     written = generic_file_buffered_write(iocb, iov, nr_segs,
              pos, ppos, count, written);
out:
     current->backing_dev_info = NULL;
     // A non-zero byte count (or negative result) takes precedence over err
     return written ? written : err;
}
如果文件打开时带有了O_DIRECT标志,则会跳过文件缓存直接将数据写到文件系统中。对于O_DIRECT的操作我们在之后再做总结。对于一般的情况,都会转入到generic_file_buffered_write():
/*
 * Write through the page cache, one page-sized piece per loop iteration:
 * grab the page, ->prepare_write, copy from user space, ->commit_write.
 *
 *   iocb:    control block; iocb->ki_filp is the file being written
 *   iov:     array of user-space source segments
 *   nr_segs: number of entries in iov
 *   pos:     file position to start at
 *   ppos:    out: receives the position reached
 *   count:   number of bytes still to write
 *   written: bytes already written by a preceding direct-I/O attempt; also
 *            used as the starting offset into the first segment
 *
 * Returns the total number of bytes written if non-zero, otherwise status.
 */
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
         unsigned long nr_segs, loff_t pos, loff_t *ppos,
         size_t count, ssize_t written)
{
     struct file *file = iocb->ki_filp;
     struct address_space * mapping = file->f_mapping;
     struct address_space_operations *a_ops = mapping->a_ops;
     struct inode *inode = mapping->host;
     long     status = 0;
     struct page   *page;
     struct page   *cached_page = NULL;
     size_t        bytes;
     struct pagevec     lru_pvec;
     const struct iovec *cur_iov = iov; /* current iovec */
     size_t        iov_base = 0;    /* offset in the current iovec */
     char __user   *buf;
 
     pagevec_init(&lru_pvec, 0);
 
     buf = iov->iov_base + written;   /* handle partial DIO write */
     do {
         unsigned long index;
         unsigned long offset;
         size_t copied;
 
         // offset: byte offset inside the page
         offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
         // index: page number in the page cache
         index = pos >> PAGE_CACHE_SHIFT;
         // bytes: room left in this page from offset ...
         bytes = PAGE_CACHE_SIZE - offset;
         // ... capped at what is still to be written
         if (bytes > count)
              bytes = count;
 
         /*
          * Bring in the user page that we will copy from _first_.
          * Otherwise there's a nasty deadlock on copying from the
          * same page as we're writing to, without it being marked
          * up-to-date.
          */
         fault_in_pages_readable(buf, bytes);
 
         // Get the page for index from the page cache. NULL is treated
         // as out of memory.
         page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
         if (!page) {
              status = -ENOMEM;
              break;
         }
 
         // Let the filesystem prepare [offset, offset+bytes) of the page
          status = a_ops->prepare_write(file, page, offset, offset+bytes);
         if (unlikely(status)) {
              loff_t isize = i_size_read(inode);
              /*
               * prepare_write() may have instantiated a few blocks
               * outside i_size.  Trim these off again.
               */
              unlock_page(page);
              page_cache_release(page);
              if (pos + bytes > isize)
                   vmtruncate(inode, isize);
              break;
         }
 
         // Copy the user data into the page. The single-segment case
         // copies straight from buf; otherwise walk the iovec.
         if (likely(nr_segs == 1))
              copied = filemap_copy_from_user(page, offset,
                                 buf, bytes);
         else
              copied = filemap_copy_from_user_iovec(page, offset,
                            cur_iov, iov_base, bytes);
         flush_dcache_page(page);
 
         // Let the filesystem commit the range just copied
         status = a_ops->commit_write(file, page, offset, offset+bytes);
         if (likely(copied > 0)) {
              // commit_write returned 0: count the copied bytes
              if (!status)
                   status = copied;
 
              if (status >= 0) {
                   // Advance all cursors by the amount accepted
                   written += status;
                   count -= status;
                   pos += status;
                   buf += status;
                   if (unlikely(nr_segs > 1))
                       filemap_set_next_iovec(&cur_iov,
                                 &iov_base, status);
              }
         }
         // Short copy from user space: report -EFAULT unless an error
         // is already pending
         if (unlikely(copied != bytes))
              if (status >= 0)
                   status = -EFAULT;
         unlock_page(page);
         mark_page_accessed(page);
         page_cache_release(page);
         if (status < 0)
              break;
         balance_dirty_pages_ratelimited(mapping);
         cond_resched();
     } while (count);
     *ppos = pos;
 
     // Release a spare page left behind by __grab_cache_page()
     if (cached_page)
         page_cache_release(cached_page);
 
     /*
      * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
      */
     if (likely(status >= 0)) {
         if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
              if (!a_ops->writepage || !is_sync_kiocb(iocb))
                   status = generic_osync_inode(inode, mapping,
                            OSYNC_METADATA|OSYNC_DATA);
         }
     }
    
     /*
      * If we get here for O_DIRECT writes then we must have fallen through
      * to buffered writes (block instantiation inside i_size).  So we sync
      * the file data here, to try to honour O_DIRECT expectations.
      */
     if (unlikely(file->f_flags & O_DIRECT) && written)
         status = filemap_write_and_wait(mapping);
 
     pagevec_lru_add(&lru_pvec);
     // A non-zero byte count takes precedence over status
     return written ? written : status;
}
从上面的代码可以看出:对于写操作,会先到高速缓存中取对应的page。然后调用a_ops->prepare_write()。然后将要写的数据拷贝到缓存区页上,接着调用a_ops->commit_write()。下面我们分别分析这两个操作.
8.2.1:页高速缓存的prepare_write()操作
Ext2系统对应的入口为:
/*
 * ext2's prepare_write hook: hands the page and the byte range [from, to)
 * to the generic block_prepare_write(), passing ext2_get_block as the
 * block-mapping callback. The file argument is not used here.
 * Returns whatever block_prepare_write() returns.
 */
static int
ext2_prepare_write(struct file *file, struct page *page,
              unsigned from, unsigned to)
{
     int rc;

     rc = block_prepare_write(page, from, to, ext2_get_block);
     return rc;
}
这里是一个封装函数。对于块设备来说,不同的只是后面所带的函数指针,这样的函数结构我们在读操作中也见过。ext2_get_block()函数的操作为,将对应文件的块号转换为文件系统的逻辑块号.
转入block_prepare_write():
/*
 * Prepare the byte range [from, to) of a page for writing.
 *
 * Looks up the owning inode through page->mapping->host and delegates to
 * __block_prepare_write(). Returns 0 on success; on failure the page's
 * uptodate flag is cleared and the error code is passed back unchanged.
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,
              get_block_t *get_block)
{
     struct inode *host = page->mapping->host;
     int rc;

     rc = __block_prepare_write(host, page, from, to, get_block);
     if (rc == 0)
         return 0;

     /* preparation failed: the page can no longer be treated as uptodate */
     ClearPageUptodate(page);
     return rc;
}
__block_prepare_write()的操作为:
/*
 * Prepare the buffers of a locked page for a write that covers the byte
 * range [from, to) inside the page.
 *
 * Every buffer_head attached to the page is visited (empty buffers are
 * created first if the page has none). Buffers overlapping the range are
 * mapped through get_block() when they are unmapped; buffers that come back
 * flagged new get the bytes outside the range zeroed; partially covered
 * buffers that are neither uptodate nor delayed are read in, and those reads
 * are waited for before returning.
 *
 * Returns 0 on success, the get_block() error if mapping fails (after
 * zeroing and dirtying any buffer in the range still flagged new), or -EIO
 * when a submitted read did not leave its buffer uptodate.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
         unsigned from, unsigned to, get_block_t *get_block)
{
     unsigned block_start, block_end;
     sector_t block;
     int err = 0;
     unsigned blocksize, bbits;
     /* wait[] collects the buffers for which a read was submitted below */
     struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
 
     BUG_ON(!PageLocked(page));
     BUG_ON(from > PAGE_CACHE_SIZE);
     BUG_ON(to > PAGE_CACHE_SIZE);
     BUG_ON(from > to);
 
     // block size, derived from the inode's block-size shift
     blocksize = 1 << inode->i_blkbits;
     if (!page_has_buffers(page))
         create_empty_buffers(page, blocksize, 0);
     head = page_buffers(page);
 
     bbits = inode->i_blkbits;
     // file-relative number of the first block held by this page
     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
     /*
      * block_start/block_end are byte offsets of the current buffer within
      * the page; "!block_start" lets the first iteration run even though
      * bh == head at that point.
      */
     for(bh = head, block_start = 0; bh != head || !block_start;
         block++, block_start=block_end, bh = bh->b_this_page) {
         block_end = block_start + blocksize;
 
         // buffers lying entirely outside [from, to)
         // TODO (original author's note): this in fact relies on a precondition:
         // the block size must be a multiple of 512 and a power of two
         if (block_end <= from || block_start >= to) {
              if (PageUptodate(page)) {
                   if (!buffer_uptodate(bh))
                        set_buffer_uptodate(bh);
              }
              continue;
         }
         if (buffer_new(bh))
              clear_buffer_new(bh);
         if (!buffer_mapped(bh)) {
              // NOTE(review): the final argument (1) presumably asks get_block()
              // to allocate a missing block, so the file may grow here -- confirm
              err = get_block(inode, block, bh, 1);
              if (err)
                   goto out;
              // BH_New marks a buffer whose block was newly allocated by the
              // get_block() call above and has not been accessed yet
              if (buffer_new(bh)) {
                   clear_buffer_new(bh);
                   unmap_underlying_metadata(bh->b_bdev,
                                 bh->b_blocknr);
                   // the whole page is already uptodate, so flag this buffer too
                   if (PageUptodate(page)) {
                        set_buffer_uptodate(bh);
                        continue;
                   }
                   // the write covers only part of this new block:
                   // zero the bytes of the block that lie outside [from, to)
                   if (block_end > to || block_start < from) {
                        void *kaddr;
 
                        kaddr = kmap_atomic(page, KM_USER0);
                        if (block_end > to)
                             memset(kaddr+to, 0,
                                  block_end-to);
                        if (block_start < from)
                             memset(kaddr+block_start,
                                  0, from-block_start);
                        flush_dcache_page(page);
                        kunmap_atomic(kaddr, KM_USER0);
                   }
                   continue;
              }
         }
         if (PageUptodate(page)) {
              if (!buffer_uptodate(bh))
                   set_buffer_uptodate(bh);
              continue;
         }
 
         // a buffer that is only partially overwritten and is not uptodate
         // (and not delayed) must be read in first
         if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
              (block_start < from || block_end > to)) {
              ll_rw_block(READ, 1, &bh);
              *wait_bh++=bh;
         }
     }
     /*
      * If we issued read requests - let them complete.
      */
      // wait for the I/O of every buffer submitted above
     while(wait_bh > wait) {
         wait_on_buffer(*--wait_bh);
         if (!buffer_uptodate(*wait_bh))
              return -EIO;
     }
     return 0;
out:
     /*
      * Zero out any newly allocated blocks to avoid exposing stale
      * data.  If BH_New is set, we know that the block was newly
      * allocated in the above loop.
      */
     bh = head;
     block_start = 0;
     do {
         block_end = block_start+blocksize;
         /* skip buffers before the range, stop at the first one past it */
         if (block_end <= from)
              goto next_bh;
         if (block_start >= to)
              break;
         if (buffer_new(bh)) {
              void *kaddr;
 
              clear_buffer_new(bh);
              kaddr = kmap_atomic(page, KM_USER0);
              memset(kaddr+block_start, 0, bh->b_size);
              kunmap_atomic(kaddr, KM_USER0);
              set_buffer_uptodate(bh);
              mark_buffer_dirty(bh);
         }
next_bh:
         block_start = block_end;
         bh = bh->b_this_page;
     } while (bh != head);
     return err;
}
相对于读操作,写操作可能更加复杂,因为写操作要动态调整文件的大小。文件大小的调整过程是在ext2_get_block()这个回调函数中完成的。
prepare_write操作完成了对缓存页必要的初始化和文件大小的扩充.
真正将数据写到文件系统上是在commit_write()中完成的:
int generic_commit_write(struct file *file, struct page *page,
         unsigned from, unsigned to)
{
     struct inode *inode = page->mapping->host;
     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
     __block_commit_write(inode,page,from,to);
     /*
      * No need to use i_size_read() here, the i_size
      * cannot change under us because we hold i_sem.
      */
      //如果文件被扩大了.更改inode->i_size
     if (pos > inode->i_size) {
         i_size_write(inode, pos);
         mark_inode_dirty(inode);
     }
     return 0;
}
经过上面的分析,我们知道,在调用commit_write()之前,已经将要写的数据拷贝到了页缓冲区.
__block_commit_write()的代码如下:
/*
 * Mark the buffers of a page that overlap the byte range [from, to) as
 * uptodate and dirty.
 *
 * Buffers outside the range are left alone, but they are checked: if none
 * of them is stale, the whole page is flagged uptodate as well.
 * Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
         unsigned from, unsigned to)
{
     struct buffer_head *first = page_buffers(page);
     struct buffer_head *cur = first;
     unsigned blocksize = 1 << inode->i_blkbits;
     unsigned start = 0;
     int all_uptodate = 1;

     /* walk the circular buffer list once; start/end are page offsets */
     do {
         unsigned end = start + blocksize;

         if (start < to && end > from) {
              /* overlaps the written range */
              set_buffer_uptodate(cur);
              mark_buffer_dirty(cur);
         } else if (!buffer_uptodate(cur)) {
              /* untouched buffer that is still stale */
              all_uptodate = 0;
         }

         start = end;
         cur = cur->b_this_page;
     } while (cur != first);

     /*
      * A partial write may happen to leave every buffer uptodate. Noticing
      * that here saves a pointless readpage() on the next read().
      */
     if (all_uptodate)
         SetPageUptodate(page);
     return 0;
}
在上面的代码中,我们看到,只是把块缓存区置为了“脏”,并没有真正地将数据写到文件系统中,那是什么时候完成这个写的过程的呢?
记得我们在分析pdflush线程组的时候,曾经介绍过“回写陈旧的页面”。没错,就是在那里,旧页面被回写到了文件系统.
在那一节,我们遗留下了两个问题。即mapping->a_ops->writepages和mapping->a_ops->writepage的操作。我们在这一节里详细的分析一下.
 
8.2.1: mapping->a_ops->writepages()操作
对于ext2来说,它的mapping各项操作赋值为:
/*
 * Excerpt of ext2's address_space operations table; "……" stands for fields
 * left out of this listing. Only the two write-back hooks are shown.
 */
struct address_space_operations ext2_aops = {
……
.writepage         = ext2_writepage,
.writepages        = ext2_writepages,
……
}
相应的,writepages入口为ext2_writepages():
/*
 * ext2's writepages hook: forwards the mapping and the writeback control
 * block to mpage_writepages(), with ext2_get_block as the block-mapping
 * callback. Returns the result of mpage_writepages().
 */
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
     int rc;

     rc = mpage_writepages(mapping, wbc, ext2_get_block);
     return rc;
}
mpage_writepages()就是我们在pdflush线程组中曾经分析过的子函数.在这里不再赘述.
 
8.2.2: mapping->a_ops->writepage()操作
相应的入口为ext2_writepage():
/*
 * ext2's writepage hook: writes one page through block_write_full_page(),
 * with ext2_get_block as the block-mapping callback.
 * Returns the result of block_write_full_page().
 */
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
     int rc;

     rc = block_write_full_page(page, ext2_get_block, wbc);
     return rc;
}
转入block_write_full_page()。该函数最终会调用__block_write_full_page()来完成实际的回写操作,其代码如下:
/*
 * Write back the dirty buffers of a locked page.
 *
 * Three passes are made over the page's circular buffer list:
 *   1. buffers past the last block of the file are cleaned; unmapped dirty
 *      buffers are mapped through get_block();
 *   2. each mapped buffer is locked (or the page is redirtied when a
 *      non-blocking attempt cannot take the lock) and dirty buffers are
 *      flagged for asynchronous write;
 *   3. after the page has been put under writeback and unlocked, the
 *      flagged buffers are submitted.
 *
 * Returns 0 on the normal path. If get_block() fails, the mapped dirty
 * buffers are still submitted (recover path), the page is flagged with an
 * error, and the get_block() error code is returned.
 *
 * The page index is widened to sector_t before it is shifted into a block
 * number, so the shift is carried out in the width of sector_t rather than
 * that of page->index.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
              get_block_t *get_block, struct writeback_control *wbc)
{
     int err;
     sector_t block;
     sector_t last_block;
     struct buffer_head *bh, *head;
     int nr_underway = 0;	/* number of buffers handed to submit_bh() */
 
     BUG_ON(!PageLocked(page));
 
     /* number of the last block inside the file */
     last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
 
     /* the page has no buffers yet: attach dirty, uptodate ones */
     if (!page_has_buffers(page)) {
         create_empty_buffers(page, 1 << inode->i_blkbits,
                       (1 << BH_Dirty)|(1 << BH_Uptodate));
     }
 
     /*
      * Be very careful.  We have no exclusion from __set_page_dirty_buffers
      * here, and the (potentially unmapped) buffers may become dirty at
      * any time.  If a buffer becomes dirty here after we've inspected it
      * then we just miss that fact, and the page stays dirty.
      *
      * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
      * handle that here by just cleaning them.
      */
 
     /*
      * File-relative number of the first block in this page. The cast
      * makes the shift happen in sector_t, so a narrower page index
      * cannot wrap around.
      */
     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
     /* first buffer_head of the page's circular list */
     head = page_buffers(page);
     bh = head;
 
     /*
      * Get all the dirty buffers mapped to disk addresses and
      * handle any aliases from the underlying blockdev's mapping.
      */
     do {
         /* this block lies beyond the last block of the file */
         if (block > last_block) {
              /*
               * mapped buffers outside i_size will occur, because
               * this page can be outside i_size when there is a
               * truncate in progress.
               */
              /*
               * The buffer was zeroed by block_write_full_page()
               */
              clear_buffer_dirty(bh);
              set_buffer_uptodate(bh);
         } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
              /* map the file-relative block number onto this buffer */
              err = get_block(inode, block, bh, 1);
              if (err)
                   goto recover;
              if (buffer_new(bh)) {
                   /* blockdev mappings never come here */
                   clear_buffer_new(bh);
                   unmap_underlying_metadata(bh->b_bdev,
                                 bh->b_blocknr);
              }
         }
         bh = bh->b_this_page;
         block++;
     } while (bh != head);
 
     do {
         /* take a reference; it is dropped in the submission loop below */
         get_bh(bh);
         /* unmapped buffers are not written */
         if (!buffer_mapped(bh))
              continue;
         /*
          * If it's a fully non-blocking write attempt and we cannot
          * lock the buffer then redirty the page.  Note that this can
          * potentially cause a busy-wait loop from pdflush and kswapd
          * activity, but those code paths have their own higher-level
          * throttling.
          */
         /* lock the buffer before touching its state */
         if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
              lock_buffer(bh);
         } else if (test_set_buffer_locked(bh)) {
              /*
               * sync_mode is WB_SYNC_NONE and the caller asked for a
               * non-blocking write, but the buffer is already locked:
               * redirty the page and skip this buffer.
               */
              redirty_page_for_writepage(wbc, page);
              continue;
         }
         /* a dirty buffer is flagged for async write, a clean one is unlocked */
         if (test_clear_buffer_dirty(bh)) {
              mark_buffer_async_write(bh);
         } else {
              unlock_buffer(bh);
         }
     } while ((bh = bh->b_this_page) != head);
 
     /*
      * The page and its buffers are protected by PageWriteback(), so we can
      * drop the bh refcounts early.
      */
     BUG_ON(PageWriteback(page));
     /* put the page under writeback before unlocking it */
     set_page_writeback(page);
     unlock_page(page);
 
     /* submit every buffer that was flagged for async write */
     do {
         struct buffer_head *next = bh->b_this_page;
         if (buffer_async_write(bh)) {
              submit_bh(WRITE, bh);
              nr_underway++;
         }
         put_bh(bh);
         bh = next;
     } while (bh != head);
 
     err = 0;
done:
     if (nr_underway == 0) {
         /*
          * The page was marked dirty, but the buffers were
          * clean.  Someone wrote them back by hand with
          * ll_rw_block/submit_bh.  A rare case.
          */
         int uptodate = 1;
         do {
              if (!buffer_uptodate(bh)) {
                   uptodate = 0;
                   break;
              }
              bh = bh->b_this_page;
         } while (bh != head);
         if (uptodate)
              SetPageUptodate(page);
         end_page_writeback(page);
         /*
          * The page and buffer_heads can be released at any time from
          * here on.
          */
         wbc->pages_skipped++;  /* We didn't write this page */
     }
     return err;
 
recover:
     /*
      * ENOSPC, or some other error.  We may already have added some
      * blocks to the file, so we need to write these out to avoid
      * exposing stale data.
      * The page is currently locked and not marked for writeback
      */
     bh = head;
     /* Recovery: lock and submit the mapped buffers */
     do {
         get_bh(bh);
         if (buffer_mapped(bh) && buffer_dirty(bh)) {
              lock_buffer(bh);
              mark_buffer_async_write(bh);
         } else {
              /*
               * The buffer may have been set dirty during
               * attachment to a dirty page.
               */
              clear_buffer_dirty(bh);
         }
     } while ((bh = bh->b_this_page) != head);
     SetPageError(page);
     BUG_ON(PageWriteback(page));
     set_page_writeback(page);
     unlock_page(page);
     do {
         struct buffer_head *next = bh->b_this_page;
         if (buffer_async_write(bh)) {
              clear_buffer_dirty(bh);
              submit_bh(WRITE, bh);
              nr_underway++;
         }
         put_bh(bh);
         bh = next;
     } while (bh != head);
     goto done;
}
该函数会遍历页面中的块缓存区,然后将脏的块缓存区写回文件系统.
阅读(6114) | 评论(0) | 转发(3) |
给主人留下些什么吧!~~