kernel 3.10内核源码分析--file read流程
1、基本原理
文件读取流程通常从用户态发起,总体流程为:
用户态进程先open指定文件,返回fd
用户态进程调用read库函数接口,fd作为入参
read库函数接口最终调用read对应的系统调用接口,sys_read,由此进入内核流程
--------用户态和内核态分界------------------------
sys_read经vfs_read调用具体文件系统在file_operations中注册的读接口:如ext2先走ext2_file_operations->read(即do_sync_read),再由其调用ext2_file_operations->aio_read(即generic_file_aio_read),由此进入mapping层(VFS层)
mapping层在进行相应处理后,构造并提交bio,进入通用块层。
--------mapping层和通用块层分界----------------
通用块层构造request,提交到IO调度层
--------通用块层和IO调度层分界----------------
IO调度层对request按特定的IO调度算法进行合并和排序后,将request放入对应块设备的request queue,由此进入块设备驱动层。常用的块设备驱动层为SCSI,即进入SCSI层。
--------IO调度层和块设备驱动层分界-------------
SCSI层又分3层:SCSI上层驱动、SCSI中间层和SCSI底层驱动。
request进入SCSI层后,先由SCSI上层驱动和中间层进行处理,主要是按SCSI协议规范构造SCSI命令,最终将请求提交到SCSI底层驱动。
SCSI底层驱动,如mptsas,对request queue中的request进行逐一处理,主要是根据硬件特性构造最终的命令,并负责与硬件的交互。构造的命令最终提交到磁盘硬件(准确地说,应该是固件)。
--------SCSI层和硬件(固件)层的分界--------------
硬件处理完IO请求后,产生中断通知SCSI底层驱动(初始化时预先注册了相应的中断)。
ISR中调用SCSI层的scsi_done接口,进行相应的处理。
scsi_done调用上层(块设备层)定义的blk_complete_request接口,并触发相应的软中断
软中断中进行后续的处理:包括错误处理、request的清理、定时器的清理、唤醒等待的进程等。
2、mapping层的read流程
mapping层的read流程(以ext2文件系统为例)如下:
ext2_file_operations->read->do_sync_read
  ext2_file_operations->aio_read
    generic_file_aio_read
      do_generic_file_read
        find_get_page // 从page cache中查找page
        address_space->a_ops->readpage // 如果page cache中没找到,或者找到的页不是uptodate的,就从磁盘中读取。
          ext2_readpage
            mpage_readpage
              do_mpage_readpage
                ext2_get_block // 获取文件逻辑块号和设备物理块号的对应关系,结果保存在buffer_head中(page->buffers),后续提交IO时,可以根据该对应关系确定是读写整页,还是只读写其中的部分block。
                mpage_bio_submit // 正常的IO提交流程
                  submit_bio
                block_read_full_page // 当读取的block不连续、文件中存在hole时,以buffer来读?即按block来读,不以page方式读,可能只读取page中的部分block
                  submit_bh // 提交请求,以buffer方式,按block读
        file_read_actor // 将相应page拷贝到用户态
3、代码分析
重要流程的代码分析如下。
generic_file_aio_read->do_generic_file_read()
-
/*
-
* Main worker for a buffered file read: walks the page cache one page at a
* time starting at *ppos and passes each up-to-date page to actor, which
* copies the data out (to user space on this path). Progress and errors are
* reported through desc; *ppos and the readahead state (ra->prev_pos) are
* written back on exit.
-
*/
-
static void do_generic_file_read(struct file *filp, loff_t *ppos,
-
read_descriptor_t *desc, read_actor_t actor)
-
{
-
struct address_space *mapping = filp->f_mapping;
-
struct inode *inode = mapping->host;
-
struct file_ra_state *ra = &filp->f_ra;
-
pgoff_t index;
-
pgoff_t last_index;
-
pgoff_t prev_index;
-
unsigned long offset; /* offset into pagecache page */
-
unsigned int prev_offset;
-
int error;
-
/* Split the byte positions into a page index and an offset within that page */
-
index = *ppos >> PAGE_CACHE_SHIFT;
-
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
-
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
-
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-
offset = *ppos & ~PAGE_CACHE_MASK;
-
-
for (;;) {
-
struct page *page;
-
pgoff_t end_index;
-
loff_t isize;
-
unsigned long nr, ret;
-
/* Give the scheduler a chance to run before handling each page */
-
cond_resched();
-
find_page:
-
// Look up the page at this index in the file's page cache (its address_space)
-
page = find_get_page(mapping, index);
-
if (!page) {
-
// Cache miss: trigger synchronous readahead for the requested range, then retry the lookup.
// NOTE(review): the original annotation said this call blocks until the readahead completes;
// the helper is not shown here and the wait appears to happen later on the page lock - confirm.
-
page_cache_sync_readahead(mapping,
-
ra, filp,
-
index, last_index - index);
-
/* Look in the page cache again */
-
page = find_get_page(mapping, index);
-
if (unlikely(page == NULL))
-
goto no_cached_page;
-
}
-
if (PageReadahead(page)) {
-
/* The page carries the readahead marker: start asynchronous readahead. NOTE(review): presumably returns once the reads are submitted, without blocking - confirm. */
-
page_cache_async_readahead(mapping,
-
ra, filp, page,
-
index, last_index - index);
-
}
-
/* The cached page is not up to date: it has to be (re)read before it can be copied out (this is a read, not a writeback) */
-
if (!PageUptodate(page)) {
-
/* With block size == page size, or no is_partially_uptodate hook, skip the partial check and take the locked path */
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
-
!mapping->a_ops->is_partially_uptodate)
-
goto page_not_up_to_date;
-
/* Try to take the page lock without sleeping; on failure fall back to the sleeping lock at page_not_up_to_date */
-
if (!trylock_page(page))
-
goto page_not_up_to_date;
-
/* Did it get truncated before we got the lock? */
-
if (!page->mapping)
-
goto page_not_up_to_date_locked;
-
if (!mapping->a_ops->is_partially_uptodate(page,
-
desc, offset))
-
goto page_not_up_to_date_locked;
-
unlock_page(page);
-
}
-
// The page was found in the page cache and the part we need is up to date
-
page_ok:
-
/*
-
* i_size must be checked after we know the page is Uptodate.
-
*
-
* Checking i_size after the check allows us to calculate
-
* the correct value for "nr", which means the zero-filled
-
* part of the page is not copied back to userspace (unless
-
* another truncate extends the file - this is desired though).
-
*/
-
-
// Re-read the file size and derive the index of the last page that holds data
-
isize = i_size_read(inode);
-
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-
if (unlikely(!isize || index > end_index)) {
-
page_cache_release(page);
-
goto out;
-
}
-
-
/* nr is the maximum number of bytes to copy from this page */
-
nr = PAGE_CACHE_SIZE;
-
if (index == end_index) {
-
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-
if (nr <= offset) {
-
page_cache_release(page);
-
goto out;
-
}
-
}
-
nr = nr - offset;
-
-
/* If users can be writing to this page using arbitrary
-
* virtual addresses, take care about potential aliasing
-
* before reading the page on the kernel side.
-
*/
-
if (mapping_writably_mapped(mapping))
-
flush_dcache_page(page);
-
-
/*
-
* When a sequential read accesses a page several times,
-
* only mark it as accessed the first time.
-
*/
-
/*
-
* Mark the page accessed; per the original annotation, LRU reclaim uses this flag to judge how active a page is (second chance).
-
* The call is skipped when this read resumes exactly where the previous one stopped inside the same page.
-
*/
-
if (prev_index != index || offset != prev_offset)
-
mark_page_accessed(page);
-
prev_index = index;
-
-
/*
-
* Ok, we have the page, and it's up-to-date, so
-
* now we can copy it to user space...
-
*
-
* The actor routine returns how many bytes were actually used..
-
* This may not be the same as how much of a user buffer
-
* we filled up (we may be padding etc), so we can only update
-
* "pos" here (the actor routine has to update the user buffer
-
* pointers and the remaining count).
-
*/
-
// Call actor (file_read_actor on this path) to copy the page contents into the user buffer
-
ret = actor(desc, page, offset, nr);
-
/* Advance the position by the number of bytes the actor consumed */
-
offset += ret;
-
index += offset >> PAGE_CACHE_SHIFT;
-
offset &= ~PAGE_CACHE_MASK;
-
prev_offset = offset;
-
-
page_cache_release(page);
-
/* Keep going only if the actor took everything offered and the caller still wants more */
if (ret == nr && desc->count)
-
continue;
-
goto out;
-
-
// The page was found in the page cache but is not up to date
-
page_not_up_to_date:
-
/* Get exclusive access to the page ... */
-
/* Key synchronisation point: take the page lock, sleeping (killably) while someone else holds it. NOTE(review): the holder is presumably an in-flight read that unlocks the page when it completes - confirm. */
-
error = lock_page_killable(page);
-
if (unlikely(error))
-
goto readpage_error;
-
/* Entered with the page lock already held (taken by trylock_page above), so no locking here */
-
page_not_up_to_date_locked:
-
/* Did it get truncated before we got the lock? */
-
if (!page->mapping) {
-
unlock_page(page);
-
page_cache_release(page);
-
continue;
-
}
-
-
/* Did somebody else fill it already? */
-
if (PageUptodate(page)) {
-
unlock_page(page);
-
goto page_ok;
-
}
-
// Read one page of data from backing storage
-
readpage:
-
/*
-
* A previous I/O error may have been due to temporary
-
* failures, eg. multipath errors.
-
* PG_error will be set again if readpage fails.
-
*/
-
ClearPageError(page);
-
/* Start the actual read. The read will unlock the page. */
-
/*
-
* Invoke the address_space's readpage operation to perform the actual read.
-
* The implementation is filesystem specific, e.g. ext2_readpage for ext2.
-
*/
-
error = mapping->a_ops->readpage(filp, page);
-
-
if (unlikely(error)) {
-
if (error == AOP_TRUNCATED_PAGE) {
-
page_cache_release(page);
-
goto find_page;
-
}
-
goto readpage_error;
-
}
-
-
if (!PageUptodate(page)) {
-
/*
-
* Key synchronisation point. The read is synchronous from the caller's view: the page was locked when
* readpage was started and the read unlocks it on completion, so re-taking the lock here sleeps
-
* until the I/O is done. Per the original annotation the wake-up path is: disk interrupt, bio_endio,
* unlock_page, which finally wakes this task.
-
*/
-
error = lock_page_killable(page);
-
if (unlikely(error))
-
goto readpage_error;
-
/* We hold the lock again, so the read has finished; the page is expected to be up to date unless it was invalidated or the read failed */
-
if (!PageUptodate(page)) {
-
if (page->mapping == NULL) {
-
/*
-
* invalidate_mapping_pages got it
-
*/
-
/* Release the page lock */
-
unlock_page(page);
-
page_cache_release(page);
-
goto find_page;
-
}
-
/* Release the page lock */
-
unlock_page(page);
-
shrink_readahead_size_eio(filp, ra);
-
error = -EIO;
-
goto readpage_error;
-
}
-
/* Release the page lock */
-
unlock_page(page);
-
}
-
// Jump to page_ok so that actor copies the page to user space
-
goto page_ok;
-
-
readpage_error:
-
/* A synchronous read error occurred. Report it */
-
desc->error = error;
-
page_cache_release(page);
-
goto out;
-
-
// No page at this index in the page cache: allocate one and insert it
-
no_cached_page:
-
/*
-
* Ok, it wasn't cached, so we need to create a new
-
* page..
-
*/
-
// Allocate a new page with the cache-cold allocator variant
-
page = page_cache_alloc_cold(mapping);
-
if (!page) {
-
desc->error = -ENOMEM;
-
goto out;
-
}
-
// Insert the newly allocated page into the page cache (and the LRU)
-
error = add_to_page_cache_lru(page, mapping,
-
index, GFP_KERNEL);
-
if (error) {
-
page_cache_release(page);
-
/* NOTE(review): -EEXIST presumably means another task inserted a page at this index first; retry the lookup */
if (error == -EEXIST)
-
goto find_page;
-
desc->error = error;
-
goto out;
-
}
-
// Jump to readpage to fill the new page from backing storage
-
goto readpage;
-
}
-
-
out:
-
/* Rebuild ra->prev_pos from the last page index and in-page offset reached */
ra->prev_pos = prev_index;
-
ra->prev_pos <<= PAGE_CACHE_SHIFT;
-
ra->prev_pos |= prev_offset;
-
// Write the new file position back to the caller
-
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
-
/* Update the file's access time (atime) */
-
file_accessed(filp);
-
}
generic_file_aio_read->do_generic_file_read->ext2_readpage->mpage_readpage->do_mpage_readpage()
-
/*
 * Map the blocks of one page to device blocks via get_block and, when the
 * mapped blocks are contiguous on disk (holes allowed only at the tail of the
 * page), add the page to a read bio. bio may be NULL or a bio still being
 * built by earlier calls; map_bh, *first_logical_block and *last_block_in_bio
 * carry mapping state from one call to the next.
 *
 * A page that is entirely a hole is zero-filled, marked uptodate and unlocked
 * without any I/O. Every case this function cannot handle jumps to
 * "confused", which submits the pending bio and reads the page through
 * block_read_full_page instead.
 *
 * Returns the bio still being built, or the value returned by
 * mpage_bio_submit if the bio was submitted here.
 */
static struct bio *
-
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
-
sector_t *last_block_in_bio, struct buffer_head *map_bh,
-
unsigned long *first_logical_block, get_block_t get_block)
-
{
-
struct inode *inode = page->mapping->host;
-
const unsigned blkbits = inode->i_blkbits;
-
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
-
const unsigned blocksize = 1 << blkbits;
-
sector_t block_in_file;
-
sector_t last_block;
-
sector_t last_block_in_file;
-
sector_t blocks[MAX_BUF_PER_PAGE];
-
unsigned page_block;
-
unsigned first_hole = blocks_per_page;
-
struct block_device *bdev = NULL;
-
int length;
-
int fully_mapped = 1;
-
unsigned nblocks;
-
unsigned relative_block;
-
-
/* A page that already has buffer heads attached is left to the buffer-based path */
if (page_has_buffers(page))
-
goto confused;
-
-
/* Block range covered by this request, clamped to the last block of the file */
block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
-
last_block = block_in_file + nr_pages * blocks_per_page;
-
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
-
if (last_block > last_block_in_file)
-
last_block = last_block_in_file;
-
page_block = 0;
-
-
/*
-
* Map blocks using the result from the previous get_blocks call first.
-
*/
-
nblocks = map_bh->b_size >> blkbits;
-
if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
-
block_in_file < (*first_logical_block + nblocks)) {
-
unsigned map_offset = block_in_file - *first_logical_block;
-
unsigned last = nblocks - map_offset;
-
-
for (relative_block = 0; ; relative_block++) {
-
if (relative_block == last) {
-
clear_buffer_mapped(map_bh);
-
break;
-
}
-
if (page_block == blocks_per_page)
-
break;
-
blocks[page_block] = map_bh->b_blocknr + map_offset +
-
relative_block;
-
page_block++;
-
block_in_file++;
-
}
-
bdev = map_bh->b_bdev;
-
}
-
-
/*
-
* Then do more get_blocks calls until we are done with this page.
-
*/
-
map_bh->b_page = page;
-
while (page_block < blocks_per_page) {
-
map_bh->b_state = 0;
-
map_bh->b_size = 0;
-
-
if (block_in_file < last_block) {
-
map_bh->b_size = (last_block-block_in_file) << blkbits;
-
/* Call the get_block callback passed in (ext2_get_block for ext2) to obtain the
-
* mapping from file logical blocks to on-disk blocks. get_block does not read any
-
* file data here; it only looks the mapping up (per the original annotation, from
-
* the direct/indirect block information). The last argument is 0 (do not create), so
-
* a missing mapping is not created (e.g. no indirect block is allocated); per the
* original annotation this differs from the write path, which creates the mapping.
-
* The mapping obtained is stored in map_bh.
-
*/
-
if (get_block(inode, block_in_file, map_bh, 0))
-
goto confused;
-
*first_logical_block = block_in_file;
-
}
-
-
/* Unmapped block (hole): remember the first one and move on to the next block */
if (!buffer_mapped(map_bh)) {
-
fully_mapped = 0;
-
if (first_hole == blocks_per_page)
-
first_hole = page_block;
-
page_block++;
-
block_in_file++;
-
continue;
-
}
-
-
/* some filesystems will copy data into the page during
-
* the get_block call, in which case we don't want to
-
* read it again. map_buffer_to_page copies the data
-
* we just collected from get_block into the page's buffers
-
* so readpage doesn't have to repeat the get_block call
-
*/
-
if (buffer_uptodate(map_bh)) {
-
map_buffer_to_page(page, map_bh, page_block);
-
goto confused;
-
}
-
-
if (first_hole != blocks_per_page)
-
goto confused; /* hole -> non-hole */
-
-
/* Contiguous blocks? */
-
if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
-
goto confused;
-
nblocks = map_bh->b_size >> blkbits;
-
for (relative_block = 0; ; relative_block++) {
-
if (relative_block == nblocks) {
-
clear_buffer_mapped(map_bh);
-
break;
-
} else if (page_block == blocks_per_page)
-
break;
-
blocks[page_block] = map_bh->b_blocknr+relative_block;
-
page_block++;
-
block_in_file++;
-
}
-
bdev = map_bh->b_bdev;
-
}
-
-
/* Zero the page from the first hole to its end; a page that is all hole needs no I/O at all */
if (first_hole != blocks_per_page) {
-
zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
-
if (first_hole == 0) {
-
SetPageUptodate(page);
-
unlock_page(page);
-
goto out;
-
}
-
} else if (fully_mapped) {
-
SetPageMappedToDisk(page);
-
}
-
-
/* NOTE(review): presumably cleancache_get_page() == 0 means the page was filled from cleancache, so no disk read is needed - confirm */
if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
-
cleancache_get_page(page) == 0) {
-
SetPageUptodate(page);
-
goto confused;
-
}
-
-
/*
-
* This page will go to BIO. Do we need to send this BIO off first?
-
*/
-
if (bio && (*last_block_in_bio != blocks[0] - 1))
-
bio = mpage_bio_submit(READ, bio);
-
// Read the page as a whole through a bio; per the original annotation this is the main path for file reads.
-
alloc_new:
-
if (bio == NULL) {
-
/* Allocate a bio. blocks[0] << (blkbits - 9) converts the first block number into a 512-byte sector number.
-
* min_t(int, nr_pages, bio_get_nr_vecs(bdev)) caps the size request at what bio_get_nr_vecs reports.
* NOTE(review): the original annotation called this value the sector count; it looks like a count of
* page vectors rather than sectors - confirm against mpage_alloc.
-
*/
-
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-
min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
-
GFP_KERNEL);
-
if (bio == NULL)
-
goto confused;
-
}
-
-
/* Only the bytes before the first hole are read; with no hole this is the whole page */
length = first_hole << blkbits;
-
/* Add the page to be read to the bio */
-
if (bio_add_page(bio, page, length, 0) < length) {
-
/* FIXME (original author): probably never reached? NOTE(review): taken when the bio cannot accept length more bytes; it is submitted and a fresh bio is allocated. */
-
bio = mpage_bio_submit(READ, bio);
-
goto alloc_new;
-
}
-
-
relative_block = block_in_file - *first_logical_block;
-
nblocks = map_bh->b_size >> blkbits;
-
/* Decide whether the bio has to be submitted right away: at a mapping boundary, or when the page ends in a hole */
-
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
-
(first_hole != blocks_per_page))
-
/* NOTE(review): per the original annotation mpage_bio_submit returns NULL after submitting - confirm */
-
bio = mpage_bio_submit(READ, bio);
-
else
-
*last_block_in_bio = blocks[blocks_per_page - 1];
-
out:
-
/* Return to the caller, which is responsible for submitting a bio that is still pending */
-
return bio;
-
-
confused:
-
if (bio)
-
bio = mpage_bio_submit(READ, bio);
-
if (!PageUptodate(page))
-
/* Reached when the page cannot be read as one contiguous run: blocks not contiguous,
-
* a mapped block after a hole, buffers already attached, and so on. Instead of the
-
* normal whole-page bio path, the page is read through buffer heads, block by block,
-
* so possibly only some blocks of the page are read; per the original annotation the
* requests are finally issued through submit_bh.
-
*/
-
block_read_full_page(page, get_block);
-
else
-
unlock_page(page);
-
goto out;
-
}
阅读(4466) | 评论(0) | 转发(4) |