Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1940647
  • 博文数量: 1000
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 7921
  • 用 户 组: 普通用户
  • 注册时间: 2013-08-20 09:23
个人简介

storage R&D guy.

文章分类

全部博文(1000)

文章存档

2019年(5)

2017年(47)

2016年(38)

2015年(539)

2014年(193)

2013年(178)

分类: LINUX

2015-05-20 13:55:47

kernel 3.10内核源码分析--file read流程

1、基本原理
文件读取流程通常从用户态发起,总体流程为:
用户态进程先open指定文件,返回fd
用户态进程调用read库函数接口,fd作为入参
read库函数接口最终调用read对应的系统调用接口,sys_read,由此进入内核流程
--------用户态和内核态分界------------------------
内核态调用特定文件系统定义的对应相关操作接口,如ext2对应的read接口为ext2_file_operations->aio_read,由此进入mapping层(VFS层)
mapping层在进行相应处理后,构造并提交bio,进入通用块层。
--------mapping层和通用块层分界----------------
通用块层构造request,提交到IO调度层
--------通用块层和IO调度层分界----------------
IO调度层对request按特定的IO调度算法进行合并和排序后,将request放入对应块设备的request queue,由此进入块设备驱动层。常用的块设备驱动层为SCSI,即进入SCSI层。
--------IO调度层和块设备驱动层分界-------------
SCSI层又分3层:SCSI上层驱动、SCSI中间层和SCSI底层驱动。
request进入SCSI层后,先由SCSI上层驱动和中间层进行处理,主要是按SCSI协议规范构造SCSI命令,最终将请求提交到SCSI底层驱动。
SCSI底层驱动,如mptsas,对request queue中的request进行逐一处理,主要是根据硬件特性构造最终的命令,并负责与硬件的交互。构造的命令最终提交到磁盘硬件(准确的说,应该是固件)。
--------SCSI层和硬件(固件)层的分界--------------
硬件处理完IO请求后,产生中断通知SCSI底层驱动(初始化时预先注册了相应的中断)。
ISR中调用SCSI层的scsi_done接口,进行相应的处理。
scsi_done调用上层(块设备层)定义的blk_complete_request接口,并触发相应的软中断
软中断中进行后续的处理:包括错误处理、request的清理、定时器的清理、唤醒等待的进程等。

2、mapping层的read流程
mapping层的read流程(以ext2文件系统为例)如下:
ext2_file_operations->read->do_sync_read
ext2_file_operations->aio_read          
  generic_file_aio_read
    do_generic_file_read
      find_get_page //从page cache中查找page
      address_space->a_ops->readpage // 如果page cache中没找到,或者找到的页不是uptodate的,就从磁盘中读取。
        ext2_readpage
          mpage_readpage
            do_mpage_readpage
              ext2_get_block // 获取文件逻辑块号和设备物理块号的对应关系,结果保存在buffer_head中(page->buffers),后续提交IO时,可以根据该对应关系确定是读写整页,还是只读写其中的部分block。
              mpage_bio_submit // 正常的IO提交流程
                submit_bio
              block_read_full_page //当读取的block不连续、文件中存在hole时,以buffer方式读,即按block来读,不以page方式读,可能只读取page中的部分block
                submit_bh // 提交请求, 以buffer方式,按block读
      file_read_actor //将相应page拷贝到用户态

3、代码分析
重要流程的代码分析如下。
generic_file_aio_read->do_generic_file_read()

点击(此处)折叠或打开

  1. /*
  2.   * Main worker for page-cache based file reads: for each page covered by the request it finds the page in the page cache (reading it in if necessary) and passes it to actor, which copies the data from kernel space to user space.
  3.   */
  4. static void do_generic_file_read(struct file *filp, loff_t *ppos,
  5.         read_descriptor_t *desc, read_actor_t actor)
  6. {
  7.     struct address_space *mapping = filp->f_mapping;
  8.     struct inode *inode = mapping->host;
  9.     struct file_ra_state *ra = &filp->f_ra;
  10.     pgoff_t index;
  11.     pgoff_t last_index;
  12.     pgoff_t prev_index;
  13.     unsigned long offset; /* offset into pagecache page */
  14.     unsigned int prev_offset;
  15.     int error;
  16.     /* Convert the file position into page-cache index/offset form. */
  17.     index = *ppos >> PAGE_CACHE_SHIFT;
  18.     prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
  19.     prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
  20.     last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
  21.     offset = *ppos & ~PAGE_CACHE_MASK;

  22.     for (;;) {
  23.         struct page *page;
  24.         pgoff_t end_index;
  25.         loff_t isize;
  26.         unsigned long nr, ret;
  27.         /* Give the scheduler a chance to run if a reschedule is pending. */
  28.         cond_resched();
  29. find_page:
  30.         // Look up the page at the given index in the page cache (the file's address_space).
  31.         page = find_get_page(mapping, index);
  32.         if (!page) {
  33.             // Not found in the page cache: start synchronous readahead for the requested range.
  34.             page_cache_sync_readahead(mapping,
  35.                     ra, filp,
  36.                     index, last_index - index);
  37.             /* Look the page up in the page cache again. */
  38.             page = find_get_page(mapping, index);
  39.             if (unlikely(page == NULL))
  40.                 goto no_cached_page;
  41.         }
  42.         if (PageReadahead(page)) {
  43.             /* The page carries the readahead flag: start asynchronous readahead (expected to return once the reads are submitted, without waiting for them -- TODO confirm). */
  44.             page_cache_async_readahead(mapping,
  45.                     ra, filp, page,
  46.                     index, last_index - index);
  47.         }
  48.         /* The page found in the page cache is not up to date: it must be read from disk, unless the part being read is already valid. */
  49.         if (!PageUptodate(page)) {
  50.             if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
  51.                     !mapping->a_ops->is_partially_uptodate)
  52.                 goto page_not_up_to_date;
  53.             /* Try to take the page lock without sleeping; on failure fall back to the blocking lock further down. */
  54.             if (!trylock_page(page))
  55.                 goto page_not_up_to_date;
  56.             /* Did it get truncated before we got the lock? */
  57.             if (!page->mapping)
  58.                 goto page_not_up_to_date_locked;
  59.             if (!mapping->a_ops->is_partially_uptodate(page,
  60.                                 desc, offset))
  61.                 goto page_not_up_to_date_locked;
  62.             unlock_page(page);
  63.         }
  64. // Reached with a cached page whose contents (or at least the part being read) are up to date, i.e. consistent with what is on disk.
  65. page_ok:
  66.         /*
  67.          * i_size must be checked after we know the page is Uptodate.
  68.          *
  69.          * Checking i_size after the check allows us to calculate
  70.          * the correct value for "nr", which means the zero-filled
  71.          * part of the page is not copied back to userspace (unless
  72.          * another truncate extends the file - this is desired though).
  73.          */

  74.         // Re-read the inode size to bound how much of this page may be copied.
  75.         isize = i_size_read(inode);
  76.         end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
  77.         if (unlikely(!isize || index > end_index)) {
  78.             page_cache_release(page);
  79.             goto out;
  80.         }

  81.         /* nr is the maximum number of bytes to copy from this page */
  82.         nr = PAGE_CACHE_SIZE;
  83.         if (index == end_index) {
  84.             nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
  85.             if (nr <= offset) {
  86.                 page_cache_release(page);
  87.                 goto out;
  88.             }
  89.         }
  90.         nr = nr - offset;

  91.         /* If users can be writing to this page using arbitrary
  92.          * virtual addresses, take care about potential aliasing
  93.          * before reading the page on the kernel side.
  94.          */
  95.         if (mapping_writably_mapped(mapping))
  96.             flush_dcache_page(page);

  97.         /*
  98.          * When a sequential read accesses a page several times,
  99.          * only mark it as accessed the first time.
  100.          */
  101.         /*
  102.          * Set the page's accessed mark; LRU page reclaim checks it to judge how active the page is (second chance).
  103.          * When the same page is read repeatedly in sequence, it is only marked on the first read.
  104.          */
  105.         if (prev_index != index || offset != prev_offset)
  106.             mark_page_accessed(page);
  107.         prev_index = index;

  108.         /*
  109.          * Ok, we have the page, and it's up-to-date, so
  110.          * now we can copy it to user space...
  111.          *
  112.          * The actor routine returns how many bytes were actually used..
  113.          * This may not be the same as how much of a user buffer
  114.          * we filled up (we may be padding etc), so we can only update
  115.          * "pos" here (the actor routine has to update the user buffer
  116.          * pointers and the remaining count).
  117.          */
  118.         // Call actor (in practice file_read_actor) to copy the page contents into the user-space buffer.
  119.         ret = actor(desc, page, offset, nr);
  120.         /* Advance the position by the number of bytes consumed. */
  121.         offset += ret;
  122.         index += offset >> PAGE_CACHE_SHIFT;
  123.         offset &= ~PAGE_CACHE_MASK;
  124.         prev_offset = offset;

  125.         page_cache_release(page);
  126.         if (ret == nr && desc->count)
  127.             continue;
  128.         goto out;

  129. // The page was found in the page cache but is not up to date.
  130. page_not_up_to_date:
  131.         /* Get exclusive access to the page ... */
  132.         /* Key synchronization point: the page lock is taken here before readpage is issued. The read unlocks the page when it finishes, so the second lock_page_killable() after readpage blocks until the I/O is done. */
  133.         error = lock_page_killable(page);
  134.         if (unlikely(error))
  135.             goto readpage_error;
  136. /* The page lock is already held here (via trylock_page above or lock_page_killable just before), so it is not taken again. */
  137. page_not_up_to_date_locked:
  138.         /* Did it get truncated before we got the lock? */
  139.         if (!page->mapping) {
  140.             unlock_page(page);
  141.             page_cache_release(page);
  142.             continue;
  143.         }

  144.         /* Did somebody else fill it already? */
  145.         if (PageUptodate(page)) {
  146.             unlock_page(page);
  147.             goto page_ok;
  148.         }
  149. // Read the data (one page) from backing storage.
  150. readpage:
  151.         /*
  152.          * A previous I/O error may have been due to temporary
  153.          * failures, eg. multipath errors.
  154.          * PG_error will be set again if readpage fails.
  155.          */
  156.         ClearPageError(page);
  157.         /* Start the actual read. The read will unlock the page. */
  158.         /*
  159.          * Invoke the address_space readpage operation to perform the actual read.
  160.          * The implementation is filesystem specific, e.g. ext2_readpage for ext2.
  161.          */
  162.         error = mapping->a_ops->readpage(filp, page);

  163.         if (unlikely(error)) {
  164.             if (error == AOP_TRUNCATED_PAGE) {
  165.                 page_cache_release(page);
  166.                 goto find_page;
  167.             }
  168.             goto readpage_error;
  169.         }

  170.         if (!PageUptodate(page)) {
  171.             /*
  172.              * Key synchronization point: read is synchronous and the page was locked before readpage was issued, so the reading process blocks here until the read completes,
  173.              * i.e. until the data has arrived from disk and the completion path (interrupt -> bio_endio) unlocks the page, which finally wakes this process up.
  174.              */
  175.             error = lock_page_killable(page);
  176.             if (unlikely(error))
  177.                 goto readpage_error;
  178.             /* At this point the read should have completed; unless the page was invalidated or the I/O failed, it is now up to date. */
  179.             if (!PageUptodate(page)) {
  180.                 if (page->mapping == NULL) {
  181.                     /*
  182.                      * invalidate_mapping_pages got it
  183.                      */
  184.                     /* Release the page lock. */
  185.                     unlock_page(page);
  186.                     page_cache_release(page);
  187.                     goto find_page;
  188.                 }
  189.                 /* Release the page lock. */
  190.                 unlock_page(page);
  191.                 shrink_readahead_size_eio(filp, ra);
  192.                 error = -EIO;
  193.                 goto readpage_error;
  194.             }
  195.             /* Release the page lock. */
  196.             unlock_page(page);
  197.         }
  198.         // Jump to page_ok, where actor copies the page to user space.
  199.         goto page_ok;

  200. readpage_error:
  201.         /* A synchronous read error occurred. Report it */
  202.         desc->error = error;
  203.         page_cache_release(page);
  204.         goto out;

  205. // The page was not found in the page cache, so a new page has to be allocated and inserted.
  206. no_cached_page:
  207.         /*
  208.          * Ok, it wasn't cached, so we need to create a new
  209.          * page..
  210.          */
  211.         // Allocate a new page via page_cache_alloc_cold().
  212.         page = page_cache_alloc_cold(mapping);
  213.         if (!page) {
  214.             desc->error = -ENOMEM;
  215.             goto out;
  216.         }
  217.         // Insert the newly allocated page into the page cache (and the LRU).
  218.         error = add_to_page_cache_lru(page, mapping,
  219.                         index, GFP_KERNEL);
  220.         if (error) {
  221.             page_cache_release(page);
  222.             if (error == -EEXIST)
  223.                 goto find_page;
  224.             desc->error = error;
  225.             goto out;
  226.         }
  227.         // Jump to readpage to read the page from backing storage.
  228.         goto readpage;
  229.     }

  230. out:
  231.     ra->prev_pos = prev_index;
  232.     ra->prev_pos <<= PAGE_CACHE_SHIFT;
  233.     ra->prev_pos |= prev_offset;
  234.     // Write the updated file position back to the caller.
  235.     *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
  236.     /* Update the file's atime (access time). */
  237.     file_accessed(filp);
  238. }

generic_file_aio_read->do_generic_file_read->ext2_readpage->mpage_readpage->do_mpage_readpage()

点击(此处)折叠或打开

  1. static struct bio *
  2. do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
  3.         sector_t *last_block_in_bio, struct buffer_head *map_bh,
  4.         unsigned long *first_logical_block, get_block_t get_block)
  5. {
  6.     struct inode *inode = page->mapping->host;
  7.     const unsigned blkbits = inode->i_blkbits;
  8.     const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
  9.     const unsigned blocksize = 1 << blkbits;
  10.     sector_t block_in_file;
  11.     sector_t last_block;
  12.     sector_t last_block_in_file;
  13.     sector_t blocks[MAX_BUF_PER_PAGE];
  14.     unsigned page_block;
  15.     unsigned first_hole = blocks_per_page;
  16.     struct block_device *bdev = NULL;
  17.     int length;
  18.     int fully_mapped = 1;
  19.     unsigned nblocks;
  20.     unsigned relative_block;
  21.     /* Overview: map this page's blocks with get_block(); if they form one contiguous on-disk run the page is added to the bio, otherwise (or if the page already has buffers) fall back to block_read_full_page() via 'confused'. Returns the bio still pending, or whatever mpage_bio_submit() returned. */
  22.     if (page_has_buffers(page))
  23.         goto confused;

  24.     block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
  25.     last_block = block_in_file + nr_pages * blocks_per_page;
  26.     last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
  27.     if (last_block > last_block_in_file)
  28.         last_block = last_block_in_file;
  29.     page_block = 0;

  30.     /*
  31.      * Map blocks using the result from the previous get_blocks call first.
  32.      */
  33.     nblocks = map_bh->b_size >> blkbits;
  34.     if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
  35.             block_in_file < (*first_logical_block + nblocks)) {
  36.         unsigned map_offset = block_in_file - *first_logical_block;
  37.         unsigned last = nblocks - map_offset;

  38.         for (relative_block = 0; ; relative_block++) {
  39.             if (relative_block == last) {
  40.                 clear_buffer_mapped(map_bh);
  41.                 break;
  42.             }
  43.             if (page_block == blocks_per_page)
  44.                 break;
  45.             blocks[page_block] = map_bh->b_blocknr + map_offset +
  46.                         relative_block;
  47.             page_block++;
  48.             block_in_file++;
  49.         }
  50.         bdev = map_bh->b_bdev;
  51.     }

  52.     /*
  53.      * Then do more get_blocks calls until we are done with this page.
  54.      */
  55.     map_bh->b_page = page;
  56.     while (page_block < blocks_per_page) {
  57.         map_bh->b_state = 0;
  58.         map_bh->b_size = 0;

  59.         if (block_in_file < last_block) {
  60.             map_bh->b_size = (last_block-block_in_file) << blkbits;
  61.             /* Call the supplied get_block callback (ext2_get_block for ext2) to obtain the
  62.              * mapping between file logical blocks and on-disk blocks. get_block does not
  63.              * read any file data from disk; it only looks up the mapping (for ext2, via the
  64.              * direct/indirect block information). If the mapping does not exist yet, none is
  65.              * created (e.g. no indirect block is allocated): the last argument 0 means "do not
  66.              * create", unlike the write path, which creates mappings. The result is stored in map_bh.
  67.              */
  68.             if (get_block(inode, block_in_file, map_bh, 0))
  69.                 goto confused;
  70.             *first_logical_block = block_in_file;
  71.         }

  72.         if (!buffer_mapped(map_bh)) {
  73.             fully_mapped = 0;
  74.             if (first_hole == blocks_per_page)
  75.                 first_hole = page_block;
  76.             page_block++;
  77.             block_in_file++;
  78.             continue;
  79.         }

  80.         /* some filesystems will copy data into the page during
  81.          * the get_block call, in which case we don't want to
  82.          * read it again. map_buffer_to_page copies the data
  83.          * we just collected from get_block into the page's buffers
  84.          * so readpage doesn't have to repeat the get_block call
  85.          */
  86.         if (buffer_uptodate(map_bh)) {
  87.             map_buffer_to_page(page, map_bh, page_block);
  88.             goto confused;
  89.         }
  90.     
  91.         if (first_hole != blocks_per_page)
  92.             goto confused;        /* hole -> non-hole */

  93.         /* Contiguous blocks? */
  94.         if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
  95.             goto confused;
  96.         nblocks = map_bh->b_size >> blkbits;
  97.         for (relative_block = 0; ; relative_block++) {
  98.             if (relative_block == nblocks) {
  99.                 clear_buffer_mapped(map_bh);
  100.                 break;
  101.             } else if (page_block == blocks_per_page)
  102.                 break;
  103.             blocks[page_block] = map_bh->b_blocknr+relative_block;
  104.             page_block++;
  105.             block_in_file++;
  106.         }
  107.         bdev = map_bh->b_bdev;
  108.     }

  109.     if (first_hole != blocks_per_page) {
  110.         zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
  111.         if (first_hole == 0) {
  112.             SetPageUptodate(page);
  113.             unlock_page(page);
  114.             goto out;
  115.         }
  116.     } else if (fully_mapped) {
  117.         SetPageMappedToDisk(page);
  118.     }

  119.     if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
  120.      cleancache_get_page(page) == 0) {
  121.         SetPageUptodate(page);
  122.         goto confused;
  123.     }

  124.     /*
  125.      * This page will go to BIO. Do we need to send this BIO off first?
  126.      */
  127.     if (bio && (*last_block_in_bio != blocks[0] - 1))
  128.         bio = mpage_bio_submit(READ, bio);
  129. // Read through a bio, a whole page at a time; this is the main path for file reads.
  130. alloc_new:
  131.     if (bio == NULL) {
  132.         /* Allocate a bio: blocks[0] << (blkbits - 9) is the first sector number (512-byte units);
  133.          * min_t(int, nr_pages, bio_get_nr_vecs(bdev)) presumably sizes the bio in vectors/pages rather than sectors -- TODO confirm against mpage_alloc.
  134.          */
  135.         bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
  136.                  min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
  137.                 GFP_KERNEL);
  138.         if (bio == NULL)
  139.             goto confused;
  140.     }

  141.     length = first_hole << blkbits;
  142.     /* Add the page to be read to the bio. */
  143.     if (bio_add_page(bio, page, length, 0) < length) {
  144.         /* NOTE(review): taken when the page could not be added to the current bio; that bio is submitted and a fresh one is allocated. The original author was unsure whether this path is ever hit. */
  145.         bio = mpage_bio_submit(READ, bio);
  146.         goto alloc_new;
  147.     }

  148.     relative_block = block_in_file - *first_logical_block;
  149.     nblocks = map_bh->b_size >> blkbits;
  150.     /* Decide whether the bio has to be submitted right away. */
  151.     if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
  152.      (first_hole != blocks_per_page))
  153.      /* mpage_bio_submit() is expected to return NULL once the bio is submitted -- TODO confirm. */
  154.         bio = mpage_bio_submit(READ, bio);
  155.     else
  156.         *last_block_in_bio = blocks[blocks_per_page - 1];
  157. out:
  158.     /* Return to the caller; a bio that is still pending is left for the caller to submit. */
  159.     return bio;

  160. confused:
  161.     if (bio)
  162.         bio = mpage_bio_submit(READ, bio);
  163.     if (!PageUptodate(page))
  164.             /* Reached when the blocks being read are not contiguous or the file has a hole
  165.              * (among other cases); the normal path goes through mpage_bio_submit and reads a whole page via a bio.
  166.              * Here the page is read through buffers instead, i.e. block by block rather than as a page, possibly
  167.              * reading only some blocks of the page; presumably the request ends up in submit_bh -- TODO confirm.
  168.              */
  169.      block_read_full_page(page, get_block);
  170.     else
  171.         unlock_page(page);
  172.     goto out;
  173. }
阅读(1040) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~