Q:lock_buffer(bh)被调用时,是否意味着PageLocked(bh->page)=1?
A:不一定。__bread()、ll_rw_block()等函数对buffer做同步IO时,可以在不锁page的情况下直接调用lock_buffer();只有在要对buffer_head做异步IO的情况下,才需要先锁住page。
Q:buffers和cached的区别?
A:buffers和cached至少会在3个地方出现:top,free,/proc/meminfo
buffers = nr_blockdev_pages()
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
buffers表示直接读取块设备时,缓存在块设备inode的基数树(radix tree,即bdev->bd_inode->i_mapping)中的页,一般来说,这些数据是文件系统使用__bread()读取的元数据
执行dd if=/dev/sda of=/dev/zero bs=1M count=200,可以明显的看到buffers在增大
cached表示除buffers外,kernel缓存的文件数据,一般来说,是文件系统的文件数据
cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - buffers;
执行dd if=~/foo of=/dev/zero bs=1M count=200,可以明显的看到cached在增大
函数调用过程:
sys_read()
file = fget_light(fd, &fput_needed);
loff_t pos = file_pos_read(file);
vfs_read(file, buf, count, &pos);
file->f_op->read()
do_sync_read()
generic_file_aio_read()
do_generic_file_read()=>
几个重要的函数:
/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position; written back on exit with the position
 *        reached by the read
 * @desc: read_descriptor; desc->count is consulted to decide how far to
 *        read and whether to continue, desc->error receives any error code
 * @actor: read method, called once per uptodate page to consume its data
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Outline of one loop iteration: look the page up in the page cache
 * (kicking off readahead, or allocating and inserting a new page when it is
 * missing), make sure the needed part is uptodate (reading it via
 * ->readpage() if necessary), clamp the copy length against i_size, hand
 * the page to @actor, then advance index/offset.  The function returns
 * nothing; failures are reported through desc->error.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
read_descriptor_t *desc, read_actor_t actor)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
int error;
// page-cache index of the page that contains *ppos
index = *ppos >> PAGE_CACHE_SHIFT;
// position of the previous read, split into page index and in-page offset
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
// index one past the last page touched by this request (rounded up)
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;// offset within the page (*ppos & 0xFFF when pages are 4 KiB)
for (;;) {
struct page *page;
pgoff_t end_index;
loff_t isize;
unsigned long nr, ret;
cond_resched();
find_page:
// if the page is found, its reference count has been raised by the lookup
page = find_get_page(mapping, index);
if (!page) {
// not cached: start synchronous readahead, then look again
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
// the partial-uptodate shortcut only applies when a page holds several
// blocks and the filesystem provides ->is_partially_uptodate()
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
// page is already locked by someone else: go and wait for the lock,
// i.e. for the pending page I/O to finish
if (!trylock_page(page))
goto page_not_up_to_date;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))// e.g. block_is_partially_uptodate()
goto page_not_up_to_date_locked;
unlock_page(page);
}
page_ok:
// at this point the page is unlocked and the part we need to read is uptodate
// (annotator's note) blocks of the page that are holes have been zero-filled;
// holes do not prevent the page from being uptodate
/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
// empty file, or the page lies entirely beyond EOF: nothing to copy
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
// last page of the file: only the bytes up to i_size are valid
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
page_cache_release(page);
goto out;
}
}
nr = nr - offset;
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
// copy data from the page to the user-space buffer
ret = actor(desc, page, offset, nr); // e.g. file_read_actor
// advance the position; a full page consumed carries into index
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
// drop the reference taken by find_get_page()
page_cache_release(page);
// keep going only if the actor consumed everything offered and more is wanted
if (ret == nr && desc->count)
continue;
goto out;
page_not_up_to_date:
/* Get exclusive access to the page ... */
if (lock_page_killable(page))
goto readpage_eio;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
/* Did somebody else fill it already? */
// i.e. another thread read the page in between the !PageUptodate(page) check
// above and the moment lock_page_killable() returned
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
readpage:// on reaching this label the page is in the page-cache radix tree and PG_locked is set
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);// e.g. ext2_readpage
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
// mapping->a_ops->readpage() is asynchronous, so the read may not have finished yet;
// lock_page_killable() waits until the read completes and the page gets unlocked
if (lock_page_killable(page))
goto readpage_eio;
// the read has completed but the page is still not uptodate: unless the page
// was invalidated in the meantime, this must be an I/O error
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_inode_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
goto readpage_eio;
}
unlock_page(page);
}
goto page_ok;
readpage_eio:
error = -EIO;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
desc->error = -ENOMEM;
goto out;
}
// (annotator's note) the page is locked first and then inserted into the
// page-cache radix tree, which is what the readpage label requires
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
// somebody else inserted a page at this index first: retry the lookup
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
goto readpage;
}
out:
// write back the readahead state (previous position) and the new file position
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_CACHE_SHIFT;
ra->prev_pos |= prev_offset;
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
// NOTE(review): filp was already dereferenced at the top of the function
// (filp->f_mapping), so this NULL check cannot be false here
if (filp)
file_accessed(filp);
}
/*
* This is the worker routine which does all the work of mapping the disk
* blocks and constructs largest possible bios, submits them for IO if the
* blocks are not contiguous on the disk.
*
* We pass a buffer_head back and forth and use its buffer_mapped() flag to
* represent the validity of its disk mapping and to decide when to do the next
* get_block() call.
*/
// If all file blocks of the page are contiguous on disk, the page does not need
// to be turned into a buffer page: it is added directly to a BIO and read.
//
// Parameters as used below:
//   bio                  - BIO under construction, or NULL; the (possibly new) BIO is returned
//   page                 - page to read; unlocked here only on the paths that finish it
//                          without queueing it on a BIO
//   nr_pages             - number of pages the caller still intends to read; bounds last_block
//                          and the size of a newly allocated BIO
//   last_block_in_bio    - in/out: disk block number of the last block added to bio
//   map_bh               - in/out: mapping result carried over between calls
//   first_logical_block  - in/out: file block at which map_bh's mapping starts
//   get_block            - filesystem callback that maps file blocks to disk blocks
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
sector_t *last_block_in_bio, struct buffer_head *map_bh,
unsigned long *first_logical_block, get_block_t get_block)
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
unsigned nblocks;
unsigned relative_block;
// (annotator's note) the page must be locked on entry; on the BIO path it is
// unlocked by the read-completion callback mpage_end_io_read()
// a page that already has buffers is handed to the block-by-block path
if (page_has_buffers(page))
goto confused;
// number of the first file block to read
block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
// exclusive end of the file-block range to read (covers nr_pages pages)
last_block = block_in_file + nr_pages * blocks_per_page;
// total number of blocks in the file (i_size rounded up to a whole block)
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
/*
* Map blocks using the result from the previous get_blocks call first.
*/
nblocks = map_bh->b_size >> blkbits;// NOTE(review): the original annotation claims mpage_readpage() does not initialize map_bh -- the caller is not visible here, verify
if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
block_in_file < (*first_logical_block + nblocks)) {
// distance of this page's first block from the start of the cached mapping,
// and the number of mapped blocks still available from there
unsigned map_offset = block_in_file - *first_logical_block;
unsigned last = nblocks - map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
// cached mapping exhausted: force a fresh get_block() later
clear_buffer_mapped(map_bh);
break;
}
if (page_block == blocks_per_page)
break;
blocks[page_block] = map_bh->b_blocknr + map_offset +
relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
/*
* Then do more get_blocks calls until we are done with this page.
*/
map_bh->b_page = page;
while (page_block < blocks_per_page) { // page_block was set to 0 above and advanced by the cached-mapping loop
map_bh->b_state = 0;
map_bh->b_size = 0;
if (block_in_file < last_block) {
// request a mapping for the whole remaining range up to last_block in one call
map_bh->b_size = (last_block-block_in_file) << blkbits;
// ask the filesystem whether the range starting at file block block_in_file,
// of length map_bh->b_size, is mapped
if (get_block(inode, block_in_file, map_bh, 0))// e.g. ext2_get_block()
goto confused;
*first_logical_block = block_in_file;
}
// current block is not mapped (hole or beyond EOF): note it and move on
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
page_block++;
block_in_file++;
clear_buffer_mapped(map_bh);// appears redundant: this branch is only entered when buffer_mapped(map_bh) is false
continue;
}
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
* read it again. map_buffer_to_page copies the data
* we just collected from get_block into the page's buffers
* so readpage doesn't have to repeat the get_block call
*/
if (buffer_uptodate(map_bh)) {
// (annotator's note) map_bh points to a buffer_head on the caller's stack, so its
// mapping information has to be copied into the page's own buffer_head
map_buffer_to_page(page, map_bh, page_block);
goto confused;
}
// a hole followed by a mapped block is treated as non-contiguous
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
// the on-disk block numbers are not contiguous
if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
goto confused;
nblocks = map_bh->b_size >> blkbits;// one get_block() call may have mapped several contiguous blocks
// (annotator's note) nblocks is not compared against last_block-block_in_file here,
// because a page ending at EOF is still treated as contiguous
for (relative_block = 0; ; relative_block++) {
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
} else if (page_block == blocks_per_page)
break;
// record the on-disk block number of each block of the page
blocks[page_block] = map_bh->b_blocknr+relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
// handle the mapped -> hole case
// here every block from first_hole onwards must be a hole, because the
// hole -> mapped case jumps to the confused label,
// so the page can safely be zeroed from first_hole to its end
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
if (first_hole == 0) {// no block of the page is mapped: nothing to read
SetPageUptodate(page);
unlock_page(page);
goto out;
}
} else if (fully_mapped) {
// only when every block of the page is mapped to disk; holes and EOF do not count
SetPageMappedToDisk(page);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
// the last disk block of the previous call and the first disk block of this
// page are not adjacent, so they cannot be merged into the same BIO
if (bio && (*last_block_in_bio != blocks[0] - 1))
bio = mpage_bio_submit(READ, bio);
alloc_new:
if (bio == NULL) {
// blocks[0] << (blkbits - 9) converts the block number into 512-byte sectors
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
GFP_KERNEL);
if (bio == NULL)
goto confused;
}
// only the mapped part of the page (up to first_hole) is read from disk
length = first_hole << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
// BIO could not take the page: submit it and retry with a new one
bio = mpage_bio_submit(READ, bio);
goto alloc_new;
}
// the last file block handled is a boundary block (annotator's note: the next
// file block is reached through a different indirect block, so it is not
// contiguous on disk), or the page ends in a hole/EOF: submit now
if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
bio = mpage_bio_submit(READ, bio);
else
*last_block_in_bio = blocks[blocks_per_page - 1];
out:
return bio;
confused:
// fallback: flush any pending BIO, then read this page through its buffer_heads
if (bio)
bio = mpage_bio_submit(READ, bio);
if (!PageUptodate(page))
block_read_full_page(page, get_block);// read block by block
else
unlock_page(page);
goto out;
}
/*
* Generic "read page" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
* Reads the page asynchronously --- the unlock_buffer() and
* set/clear_buffer_uptodate() functions propagate buffer state into the
* page struct once IO has completed.
*/
/**
 * Handles the case where the blocks of the page are not contiguous on disk:
 * 1. turn the page into a buffer page (attach buffer_heads if it has none)
 * 2. for every buffer_head that needs it, call submit_bh() to read from disk
 *
 * @page: locked page to read (BUG_ON fires if it is not locked)
 * @get_block: filesystem callback used to map a file block to a disk block
 *
 * Returns 0 on every path visible here; a get_block() failure is recorded
 * with SetPageError() rather than returned.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize;
int nr, i;
int fully_mapped = 1;
BUG_ON(!PageLocked(page));
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
// file block number of the first block of this page
iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
// number of blocks in the file, i.e. the first block number beyond EOF
lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;
// Stage one: walk the page's circular buffer list and collect in arr[] the
// buffers that need disk I/O.  Note that `continue` inside do/while jumps to
// the loop condition, so i, iblock and bh are still advanced.
do {
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
int err = 0;
fully_mapped = 0;
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
// ask the filesystem whether file block iblock is mapped
err = get_block(inode, iblock, bh, 0);
if (err)
SetPageError(page);
}
if (!buffer_mapped(bh)) {// still unmapped (hole or beyond EOF): zero this part of the page
zero_user(page, i * blocksize, blocksize);
if (!err)
set_buffer_uptodate(bh);
continue;
}
/*
* get_block() might have updated the buffer
* synchronously
*/
// reportedly reiserfs does this
if (buffer_uptodate(bh))
continue;
}
// this bh needs disk I/O
arr[nr++] = bh;// here buffer_mapped(bh)==1 && buffer_uptodate(bh)==0
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)// stays 1 only if no non-uptodate buffer was found unmapped in the loop above
SetPageMappedToDisk(page);
// no block needs to be read from disk
if (!nr) {
/*
* All buffers are uptodate - we can set the page uptodate
* as well. But not if get_block() returned an error.
*/
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
return 0;
}
/* Stage two: lock the buffers */
for (i = 0; i < nr; i++) {
bh = arr[i];
lock_buffer(bh);
mark_buffer_async_read(bh);
}
/*
* Stage 3: start the IO. Check for uptodateness
* inside the buffer lock in case another process reading
* the underlying blockdev brought it uptodate (the sct fix).
*/
// Original annotator's reasoning: if buffer_uptodate(bh) is set here, the bh
// cannot have been brought uptodate by an asynchronous read, for two reasons:
// 1. an asynchronous read of a buffer_head requires the page lock, and the page is
// already locked when block_read_full_page() is entered, so no other process can
// start asynchronous I/O on these buffer_heads after that point;
// 2. supposing another process had started asynchronous I/O on the bh, that I/O would
// have had to finish before the buffer_uptodate(bh) test; its completion callback
// end_buffer_async_read() clears BH_async_read, so calling end_buffer_async_read(bh, 1)
// in this loop would trip BUG_ON(!buffer_async_read(bh)).
// Hence the bh must have been read synchronously, e.g. by ll_rw_block() or __bread().
for (i = 0; i < nr; i++) {
bh = arr[i];
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
submit_bh(READ, bh);
}
return 0;
}
阅读(2341) | 评论(0) | 转发(0) |