Category: LINUX

2014-06-06 10:26:58

Kernel 3.10 source code analysis: the file write flow

1. Basic principles
A file write is normally initiated from user space and falls into two parts overall:
1) From user space to the page cache
The user-space process first opens the target file with open(), which returns an fd.
The process then calls the write() library function, passing the fd as an argument.
The write() library function eventually invokes the corresponding system call, sys_write, at which point execution enters the kernel.
--------boundary between user space and kernel space------------------------
In the kernel, the operation defined by the specific filesystem is invoked; for ext2 the write entry is ext2_file_operations->aio_write->generic_file_aio_write.
Because applications are usually not sensitive to write latency while disk operations are slow, writes are normally deferred to improve performance. This is the so-called "delayed write" (write-back caching).
The basic idea: data is first written into the page cache and the pages are marked dirty; the pdflush/flusher kernel threads later write the dirty pages back, either periodically or on demand. A write can return as soon as the data reaches the page cache, without waiting for the data to actually reach the disk. Since the page cache (memory) is far faster to write than the disk, this greatly improves write performance; however, because the moment at which data actually reaches the disk is indeterminate, data-consistency problems can arise under abnormal conditions such as power loss.
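
From user space, the practical consequence is that a successful write() only means the data is in the page cache; durability requires an explicit fsync()/fdatasync(). A minimal sketch (the path /tmp/wb-demo is just an example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char buf[] = "hello, page cache\n";
    int fd = open("/tmp/wb-demo", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) { perror("open"); return 1; }

    /* write() returns once the data has been copied into the page cache;
     * the pages are only marked dirty, nothing has reached the disk yet. */
    if (write(fd, buf, strlen(buf)) < 0) { perror("write"); return 1; }

    /* fsync() forces the dirty pages (and metadata) out to storage, so the
     * data survives a power loss from this point on. */
    if (fsync(fd) < 0) { perror("fsync"); return 1; }

    close(fd);
    return 0;
}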

2) From the page cache to the physical disk
Dirty data in the page cache is written back to disk by the pdflush/flusher kernel threads, or on demand when needed (e.g. when memory is tight and being reclaimed).
Write-back starts in the mapping layer; after the corresponding processing, a bio is constructed and submitted, entering the generic block layer.
--------boundary between the mapping layer and the generic block layer----------------
The generic block layer builds requests and hands them to the I/O scheduler layer.
--------boundary between the generic block layer and the I/O scheduler layer----------------
The I/O scheduler layer merges and sorts requests according to the chosen I/O scheduling algorithm, then places them on the request queue of the corresponding block device, entering the block device driver layer. A common block device driver layer is SCSI, i.e. the request enters the SCSI layer.
--------boundary between the I/O scheduler layer and the block device driver layer-------------
The SCSI layer itself consists of three layers: the SCSI upper-level drivers, the SCSI mid layer, and the SCSI low-level drivers.
After a request enters the SCSI layer, it is first handled by the SCSI upper-level drivers and the mid layer, which mainly build SCSI commands according to the SCSI protocol, and finally hand the request down to the SCSI low-level driver.
The SCSI low-level driver, e.g. mptsas, processes the requests on the request queue one by one, mainly building the final commands according to the characteristics of the hardware and handling the interaction with the hardware. The commands are ultimately submitted to the disk hardware (more precisely, to its firmware).
--------boundary between the SCSI layer and the hardware (firmware) layer--------------
When the hardware finishes the I/O request, it raises an interrupt to notify the SCSI low-level driver (the corresponding interrupt handler was registered during initialization).
The ISR calls the SCSI layer's scsi_done interface to do the corresponding processing.
scsi_done calls blk_complete_request, defined by the upper (block) layer, which raises the corresponding softirq.
The softirq performs the remaining work: error handling, cleaning up the request, cancelling timers, waking up waiting processes, and so on.
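
To make the hand-off from the mapping layer to the generic block layer concrete, here is a minimal, hedged sketch of how 3.10-era kernel code builds and submits a bio for a single page. The device, sector and completion callback are illustrative placeholders, not code taken from this write path:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical completion callback; in 3.10 bi_end_io takes the bio and an
 * error code. Real callers update page/buffer state here. */
static void demo_end_io(struct bio *bio, int err)
{
    /* error handling and page state updates would go here */
    bio_put(bio);
}

/* Sketch: write one page to 'bdev' starting at 'sector' (512-byte units). */
static int demo_submit_page(struct block_device *bdev, struct page *page,
                            sector_t sector)
{
    struct bio *bio = bio_alloc(GFP_NOIO, 1);   /* room for one bio_vec */

    if (!bio)
        return -ENOMEM;

    bio->bi_bdev   = bdev;
    bio->bi_sector = sector;      /* renamed to bi_iter.bi_sector in 3.14+ */
    bio->bi_end_io = demo_end_io;

    bio_add_page(bio, page, PAGE_SIZE, 0);

    /* Enter the generic block layer: the bio is turned into (or merged with)
     * a request and queued for the I/O scheduler. */
    submit_bio(WRITE, bio);
    return 0;
}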

2. Writing data into the page cache
From entering the kernel to the data landing in the page cache, the rough call flow is shown below (a sketch of the file_operations wiring follows the tree):

ext2_file_operations->aio_write->generic_file_aio_write
    __generic_file_aio_write
      generic_file_buffered_write
        generic_perform_write
          address_space->a_ops->write_begin // prepare for the write, e.g. obtain the mapping between file logical block numbers and device logical block numbers via get_block
            block_write_begin
              grab_cache_page_write_begin // look up the page in the page cache; if it is not found, allocate a new page in the page cache
              __block_write_begin
                create_page_buffers // if the page does not yet have buffer_heads attached to it, create them
                ext2_get_block // obtain the mapping between file logical block numbers and device logical block numbers; the result is stored in the buffer_heads (page->buffers).
                               // When the I/O is submitted later, this mapping decides whether the whole page or only some of its blocks needs to be read/written.
                ll_rw_block // if a bh is already mapped (a disk block is associated with it) but the block is not uptodate, re-read it from disk
          iov_iter_copy_from_user_atomic // copy the data from user space into the kernel (page cache)
          address_space->a_ops->write_end
            generic_write_end
              block_write_end // "submit" the I/O; nothing is actually submitted here, only flags (e.g. dirty) and state are set.
                              // The real submission is done by the flusher threads, periodically or on demand.
                __block_commit_write
                mark_inode_dirty
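
The entry point at the top of the tree is wired up through the filesystem's file_operations. A hedged sketch, modeled on ext2_file_operations in 3.10 (ext2 actually uses its own ext2_fsync, which wraps generic_file_fsync):

#include <linux/fs.h>

/* For buffered writes, .write = do_sync_write packages the request into a
 * kiocb and calls .aio_write, i.e. generic_file_aio_write, which is where
 * the analysis below starts. */
static const struct file_operations demo_file_operations = {
    .llseek         = generic_file_llseek,
    .read           = do_sync_read,
    .write          = do_sync_write,
    .aio_read       = generic_file_aio_read,
    .aio_write      = generic_file_aio_write,
    .mmap           = generic_file_mmap,
    .fsync          = generic_file_fsync,
    .splice_read    = generic_file_splice_read,
    .splice_write   = generic_file_splice_write,
};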

3. Page write-back
The rough flow of writing data back from the page cache is shown below. Write-back can be triggered by fsync, sync, the flusher threads, and balance_dirty_pages. A sketch of the ->writepages wiring follows the tree.
  address_space->ext2_aops->ext2_writepages
    mpage_writepages
      write_cache_pages
        __mpage_writepage
          mpage_alloc
          mpage_bio_submit // submit as a bio, the standard path
          mapping->a_ops->writepage(page, wbc) // write via buffers, one block at a time, when the page cannot be written out as a whole
            ext2_writepage
              block_write_full_page
                block_write_full_page_endio
                  __block_write_full_page
                    create_page_buffers // if the page does not yet have buffer_heads attached to it, create them
                    ext2_get_block // obtain the mapping between file logical block numbers and device logical block numbers
                    submit_bh // submit the I/O buffer by buffer
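
For a simple block filesystem, the ->writepages entry at the top of this tree is just a thin wrapper that delegates everything to mpage_writepages() together with the filesystem's block-mapping callback. A hedged sketch modeled on ext2 in 3.10 (demo_get_block stands in for ext2_get_block):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mpage.h>

/* Placeholder for the filesystem's block-mapping routine (ext2_get_block
 * for ext2). */
static int demo_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

/* Sketch of ->writepages: the page iteration, bio construction and
 * submission are all done inside mpage_writepages(). */
static int demo_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
    return mpage_writepages(mapping, wbc, demo_get_block);
}

static const struct address_space_operations demo_aops = {
    .writepages = demo_writepages,
    /* .writepage, .write_begin, .write_end, ... */
};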

4. Code analysis
The key functions in the write path are analyzed below:
generic_file_aio_write->__generic_file_aio_write


ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 unsigned long nr_segs, loff_t *ppos)
{
    struct file *file = iocb->ki_filp;
    struct address_space *mapping = file->f_mapping;
    size_t ocount;        /* original count */
    size_t count;         /* after file limit checks */
    struct inode *inode = mapping->host;
    loff_t pos;
    ssize_t written;
    ssize_t err;

    ocount = 0;
    err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
    if (err)
        return err;

    count = ocount;
    pos = *ppos;

    /* We can write back this queue in page reclaim */
    current->backing_dev_info = mapping->backing_dev_info;
    written = 0;

    err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
    if (err)
        goto out;

    if (count == 0)
        goto out;

    err = file_remove_suid(file);
    if (err)
        goto out;

    err = file_update_time(file);
    if (err)
        goto out;

    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    /* If O_DIRECT is set, take the direct I/O path, bypassing the page cache
     * and buffers. Fixme: does it block until the I/O completes? Is it
     * guaranteed to succeed? */
    if (unlikely(file->f_flags & O_DIRECT)) {
        loff_t endbyte;
        ssize_t written_buffered;
        /* enter the direct I/O path */
        written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                            ppos, count, ocount);
        if (written < 0 || written == count)
            goto out;
        /*
         * direct-io write to a hole: fall through to buffered I/O
         * for completing the rest of the request.
         */
        /* If the DIO write failed or hit some other exception, the remainder
         * has to go through the buffered path, so DIO is not guaranteed to
         * complete on its own. */
        pos += written;
        count -= written;
        written_buffered = generic_file_buffered_write(iocb, iov,
                        nr_segs, pos, ppos, count,
                        written);
        /*
         * If generic_file_buffered_write() retuned a synchronous error
         * then we want to return the number of bytes which were
         * direct-written, or the error code if that was zero. Note
         * that this differs from normal direct-io semantics, which
         * will return -EFOO even if some bytes were written.
         */
        if (written_buffered < 0) {
            err = written_buffered;
            goto out;
        }

        /*
         * We need to ensure that the page cache pages are written to
         * disk and invalidated to preserve the expected O_DIRECT
         * semantics.
         */
        /* first write the dirty data back */
        endbyte = pos + written_buffered - written - 1;
        err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
        if (err == 0) {
            written = written_buffered;
            /* invalidate the mapping */
            invalidate_mapping_pages(mapping,
                         pos >> PAGE_CACHE_SHIFT,
                         endbyte >> PAGE_CACHE_SHIFT);
        } else {
            /*
             * We don't know how much we wrote, so just return
             * the number of bytes which were direct-written
             */
        }
    } else {
        /* buffered write path: the data goes through the page cache / buffers */
        written = generic_file_buffered_write(iocb, iov, nr_segs,
                pos, ppos, count, written);
    }
out:
    current->backing_dev_info = NULL;
    return written ? written : err;
}
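
Relating the O_DIRECT branch above back to user space: it is taken only when the file was opened with O_DIRECT, which also imposes alignment requirements on the user buffer, offset and length (typically the logical block size of the device). A hedged user-space sketch (the path and the 4096-byte alignment are illustrative assumptions):

#define _GNU_SOURCE           /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const size_t len = 4096;  /* assumed to cover the device block size */
    void *buf;
    int fd;

    /* O_DIRECT bypasses the page cache, so the kernel takes the
     * generic_file_direct_write() branch instead of the buffered path. */
    fd = open("/tmp/dio-demo", O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0644);
    if (fd < 0) { perror("open"); return 1; }

    /* The buffer must be suitably aligned for O_DIRECT. */
    if (posix_memalign(&buf, 4096, len)) { perror("posix_memalign"); return 1; }
    memset(buf, 'A', len);

    if (write(fd, buf, len) < 0)
        perror("write");      /* e.g. EINVAL when the alignment is wrong */

    free(buf);
    close(fd);
    return 0;
}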

generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write


static ssize_t generic_perform_write(struct file *file,
                struct iov_iter *i, loff_t pos)
{
    struct address_space *mapping = file->f_mapping;
    const struct address_space_operations *a_ops = mapping->a_ops;
    long status = 0;
    ssize_t written = 0;
    unsigned int flags = 0;

    /*
     * Copies from kernel address space cannot fail (NFSD is a big user).
     */
    if (segment_eq(get_fs(), KERNEL_DS))
        flags |= AOP_FLAG_UNINTERRUPTIBLE;

    do {
        struct page *page;
        unsigned long offset;    /* Offset into pagecache page */
        unsigned long bytes;     /* Bytes to write to page */
        size_t copied;           /* Bytes copied from user */
        void *fsdata;

        offset = (pos & (PAGE_CACHE_SIZE - 1));
        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                        iov_iter_count(i));

again:
        /*
         * Bring in the user page that we will copy from _first_.
         * Otherwise there's a nasty deadlock on copying from the
         * same page as we're writing to, without it being marked
         * up-to-date.
         *
         * Not only is this an optimisation, but it is also required
         * to check that the address is actually valid, when atomic
         * usercopies are used, below.
         */
        if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
            status = -EFAULT;
            break;
        }
        /* Prepare for the write, e.g. obtain the mapping between file
         * logical block numbers and device block numbers via get_block.
         * write_begin is filesystem-specific; for ext2 it is
         * ext2_write_begin.
         */
        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                        &page, &fsdata);
        if (unlikely(status))
            break;

        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);
        /* disable page faults; in practice this disables preemption */
        pagefault_disable();
        /* copy the data from user space into the page-cache page */
        copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
        /* re-enable page faults / preemption */
        pagefault_enable();
        flush_dcache_page(page);
        /* mark the page as accessed */
        mark_page_accessed(page);
        /* "Submit" the I/O; for ext2 this is ext2_write_end. No real
         * submission is done, only dirty flags and state are set. */
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                        page, fsdata);
        if (unlikely(status < 0))
            break;
        copied = status;
        /* reschedule if needed */
        cond_resched();

        iov_iter_advance(i, copied);
        if (unlikely(copied == 0)) {
            /*
             * If we were unable to copy any data at all, we must
             * fall back to a single segment length write.
             *
             * If we didn't fallback here, we could livelock
             * because not all segments in the iov can be copied at
             * once without a pagefault.
             */
            bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                        iov_iter_single_seg_count(i));
            goto again;
        }
        pos += copied;
        written += copied;
        /* check whether the number of dirty pages exceeds the limit;
         * if so, throttle the writer and kick off write-back */
        balance_dirty_pages_ratelimited(mapping);
        /*
         * Check for a pending SIGKILL and return an error immediately if
         * there is one. This appears to be a newer addition that allows the
         * process to be killed while inside the write path; before it, the
         * write path did not handle signals, and since the process is
         * usually in D state during I/O, it could not be killed.
         */
        if (fatal_signal_pending(current)) {
            status = -EINTR;
            break;
        }
    } while (iov_iter_count(i));

    return written ? written : status;
}
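
For a simple block filesystem, the write_begin/write_end pair called above is a thin wrapper around the generic block-layer helpers. A hedged sketch modeled on ext2 in 3.10 (demo_get_block again stands in for the filesystem's block-mapping routine; ext2's extra error handling for short writes is omitted):

#include <linux/buffer_head.h>
#include <linux/fs.h>

static int demo_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

/* ->write_begin: find or allocate the page-cache page, attach buffer_heads
 * and map them to disk blocks through the filesystem's get_block. */
static int demo_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
{
    return block_write_begin(mapping, pos, len, flags, pagep, demo_get_block);
}

/* ->write_end: mark the buffers/page dirty and update i_size; the actual
 * disk I/O happens later, during write-back. */
static int demo_write_end(struct file *file, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata)
{
    return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
}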

generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write->ext2_write_begin->block_write_begin


int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
        unsigned flags, struct page **pagep, get_block_t *get_block)
{
    pgoff_t index = pos >> PAGE_CACHE_SHIFT;
    struct page *page;
    int status;

    /* look up the page in the page cache; if it is not found, allocate a
     * new page in the page cache */
    page = grab_cache_page_write_begin(mapping, index, flags);
    if (!page)
        return -ENOMEM;
    /* at this point the page was either found in the page cache or newly
     * allocated */
    status = __block_write_begin(page, pos, len, get_block);
    if (unlikely(status)) {
        unlock_page(page);
        page_cache_release(page);
        page = NULL;
    }

    *pagep = page;
    return status;
}

generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write->ext2_write_begin->block_write_begin->__block_write_begin


int __block_write_begin(struct page *page, loff_t pos, unsigned len,
        get_block_t *get_block)
{
    unsigned from = pos & (PAGE_CACHE_SIZE - 1);
    unsigned to = from + len;
    struct inode *inode = page->mapping->host;
    unsigned block_start, block_end;
    sector_t block;
    int err = 0;
    unsigned blocksize, bbits;
    struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;

    BUG_ON(!PageLocked(page));
    BUG_ON(from > PAGE_CACHE_SIZE);
    BUG_ON(to > PAGE_CACHE_SIZE);
    BUG_ON(from > to);

    /* allocate buffer_heads for the page; they hold the mapping between
     * file logical blocks and disk logical blocks */
    head = create_page_buffers(page, inode, 0);
    blocksize = head->b_size;
    bbits = block_size_bits(blocksize);
    /* convert the page index into the block number of the page's first block */
    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

    for (bh = head, block_start = 0; bh != head || !block_start;
         block++, block_start = block_end, bh = bh->b_this_page) {
        block_end = block_start + blocksize;
        if (block_end <= from || block_start >= to) {
            if (PageUptodate(page)) {
                if (!buffer_uptodate(bh))
                    set_buffer_uptodate(bh);
            }
            continue;
        }
        /* clear the BH_New flag on newly created buffers */
        if (buffer_new(bh))
            clear_buffer_new(bh);
        /* if the buffer is not mapped yet, i.e. no mapping between the file
         * logical block and a disk logical block has been established */
        if (!buffer_mapped(bh)) {
            WARN_ON(bh->b_size != blocksize);
            /* Call the get_block callback that was passed in (ext2_get_block
             * for ext2) to obtain the mapping between the file logical block
             * and the disk logical block. get_block does not read file data
             * from disk; it only resolves the mapping (through direct or
             * indirect block information). If the mapping does not exist
             * yet, it is created (e.g. indirect blocks are allocated; the
             * last argument of 1 means "create"). The resulting mapping is
             * stored in the bh.
             */
            err = get_block(inode, block, bh, 1);
            if (err)
                break;
            /* Fixme: BH_New was cleared above, so why test it again here?
             * (get_block may set BH_New when it has just allocated a new
             * block.) */
            if (buffer_new(bh)) {
                unmap_underlying_metadata(bh->b_bdev,
                            bh->b_blocknr);
                if (PageUptodate(page)) {
                    clear_buffer_new(bh);
                    set_buffer_uptodate(bh);
                    mark_buffer_dirty(bh);
                    continue;
                }
                if (block_end > to || block_start < from)
                    zero_user_segments(page,
                        to, block_end,
                        block_start, from);
                continue;
            }
        }
        if (PageUptodate(page)) {
            if (!buffer_uptodate(bh))
                set_buffer_uptodate(bh);
            continue;
        }
        /*
         * If the bh is already mapped (a disk block is associated with it)
         * but the block is not uptodate, and the write does not cover the
         * whole block, read it in from disk first.
         */
        if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
            !buffer_unwritten(bh) &&
            (block_start < from || block_end > to)) {
            ll_rw_block(READ, 1, &bh);
            *wait_bh++ = bh;
        }
    }
    /*
     * If we issued read requests - let them complete.
     */
    /* if reads were issued above, wait for them to complete */
    while (wait_bh > wait) {
        wait_on_buffer(*--wait_bh);
        if (!buffer_uptodate(*wait_bh))
            err = -EIO;
    }
    if (unlikely(err))
        page_zero_new_buffers(page, from, to);
    return err;
}
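
To make the block arithmetic above concrete, here is a small self-contained user-space sketch of the same computation with assumed values (4096-byte pages, 1024-byte blocks). It shows which blocks of a page a write of [pos, pos+len) touches, and which of them only partially, i.e. which buffer_heads need get_block plus a prior read before the copy:

#include <stdio.h>

#define PAGE_SHIFT 12                    /* assume 4096-byte pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned long pos = 5000, len = 300; /* example: 300 bytes at offset 5000 */
    unsigned blocksize = 1024, bbits = 10;

    unsigned long index = pos >> PAGE_SHIFT;           /* page index: 1 */
    unsigned from = pos & (PAGE_SIZE - 1);             /* 904 */
    unsigned to = from + len;                          /* 1204 */
    /* block number of the page's first block, as in __block_write_begin: 4 */
    unsigned long long block = (unsigned long long)index << (PAGE_SHIFT - bbits);

    unsigned block_start, block_end;
    for (block_start = 0; block_start < PAGE_SIZE;
         block_start = block_end, block++) {
        block_end = block_start + blocksize;
        if (block_end <= from || block_start >= to)
            continue;                                  /* block untouched by this write */
        printf("file block %llu covers bytes [%u,%u) of the page: needs get_block%s\n",
               block, block_start, block_end,
               (block_start < from || block_end > to) ?
               " and a read-modify-write" : "");
    }
    return 0;
}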