sys_read()缓冲读文件-Mars007-ChinaUnix博客

逻辑、细节mars007.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

Mars007

博客访问： 168668
博文数量： 22
博客积分： 126
博客等级：入伍新兵
技术积分： 459
用户组：普通用户
注册时间： 2010-10-26 21:14

文章分类

全部博文（22）

未分配的博文（22）

文章存档

2013年（22）

我的朋友

相关博文

sys_read()缓冲读文件

分类： LINUX

2013-01-03 13:06:26

Q:lock_buffer(bh)被调用时,是否意味着PageLocked(bh->page)=1?
A:NO.__bread(),ll_rw_block()函数对buffer做同步IO,可以在不锁page的情况下直接lock_buffer().只有在要对buffer_head做异步IO的情况下,才需要先锁page

Q:buffers和cached的区别?
A:buffers和cached至少会在3个地方出现:top,free,/proc/meminfo
buffers = nr_blockdev_pages()
   // Excerpt of the function body: adds up nrpages over every entry on all_bdevs.
   struct block_device *bdev;
   long ret = 0;
   // bdev_lock is held for the whole walk of the all_bdevs list
   spin_lock(&bdev_lock);
   list_for_each_entry(bdev, &all_bdevs, bd_list) {
       // page count of the address_space attached to this block device's inode
       ret += bdev->bd_inode->i_mapping->nrpages;
   }
   spin_unlock(&bdev_lock);
   return ret;
buffers表示直接读取块设备,缓存在块设备基树中的数据,一般来说,这些数据是文件系统使用__bread()读取的元数据
执行dd if=/dev/sda of=/dev/null bs=1M count=200,可以明显地看到buffers在增大
cached表示除buffers外,kernel缓存的文件数据,一般来说,是文件系统的文件数据
cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - buffers;
执行dd if=$HOME/foo of=/dev/null bs=1M count=200,可以明显地看到cached在增大

函数调用过程:
sys_read()
   file = fget_light(fd, &fput_needed);
   loff_t pos = file_pos_read(file);
   vfs_read(file, buf, count, &pos);
       file->f_op->read()
       do_sync_read()
           generic_file_aio_read()
               do_generic_file_read()=>
几个重要的函数:
/**
 * do_generic_file_read - generic file read routine
 * @filp:    the file to read
 * @ppos:    current file position
 * @desc:    read_descriptor
 * @actor:    read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Walks the page cache of filp->f_mapping one page at a time, starting at
 * *ppos. Each up-to-date page is handed to @actor; a page that is missing
 * or not up to date is first read in through ->readpage(). Failures are
 * reported through desc->error (the function itself returns nothing).
 * On exit *ppos and ra->prev_pos are rewritten from the final
 * index/offset and prev_index/prev_offset values.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
        read_descriptor_t *desc, read_actor_t actor)
{
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index;
    pgoff_t last_index;
    pgoff_t prev_index;
    unsigned long offset;      /* offset into pagecache page */
    unsigned int prev_offset;
    int error;

    // page-cache index of the page that contains *ppos
    index = *ppos >> PAGE_CACHE_SHIFT;
    prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
    prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
    // index one past the last page touched by this request (rounded up)
    last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
    offset = *ppos & ~PAGE_CACHE_MASK;// offset within the page (*ppos & 0xFFF when pages are 4 KiB)

    for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        unsigned long nr, ret;

        cond_resched();
find_page:
        // on a hit, find_get_page() takes a reference on the page
        // (dropped again with page_cache_release() below)
        page = find_get_page(mapping, index);
        if (!page) {
            // cache miss: start synchronous readahead, then look again
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))
                goto no_cached_page;
        }
        if (PageReadahead(page)) {
            page_cache_async_readahead(mapping,
                    ra, filp, page,
                    index, last_index - index);
        }
        if (!PageUptodate(page)) {
            // a partial-uptodate check only makes sense when a block is
            // smaller than a page and the mapping provides the hook
            if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                    !mapping->a_ops->is_partially_uptodate)
                goto page_not_up_to_date;
            // page is already locked by someone else: go and wait for
            // the page IO to finish
            if (!trylock_page(page))
                goto page_not_up_to_date;
            if (!mapping->a_ops->is_partially_uptodate(page,
                                desc, offset))// e.g. block_is_partially_uptodate()
                goto page_not_up_to_date_locked;
            unlock_page(page);
        }
page_ok:
        // At this point this function no longer holds the page lock and
        // the part of the page that is about to be read is up to date.
        // (annotator's note) If blocks behind the page are holes, the
        // matching part of the page is zero-filled; holes do not prevent
        // the page from being marked up to date.
        /*
         * i_size must be checked after we know the page is Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */

        isize = i_size_read(inode);
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        // empty file, or the requested page lies beyond EOF: nothing to copy
        if (unlikely(!isize || index > end_index)) {
            page_cache_release(page);
            goto out;
        }

        /* nr is the maximum number of bytes to copy from this page */
        nr = PAGE_CACHE_SIZE;
        if (index == end_index) {
            // last page of the file: only the bytes up to i_size are valid
            nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
            if (nr <= offset) {
                page_cache_release(page);
                goto out;
            }
        }
        nr = nr - offset;

        /* If users can be writing to this page using arbitrary
         * virtual addresses, take care about potential aliasing
         * before reading the page on the kernel side.
         */
        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        /*
         * When a sequential read accesses a page several times,
         * only mark it as accessed the first time.
         */
        if (prev_index != index || offset != prev_offset)
            mark_page_accessed(page);
        prev_index = index;

        /*
         * Ok, we have the page, and it's up-to-date, so
         * now we can copy it to user space...
         *
         * The actor routine returns how many bytes were actually used..
         * NOTE! This may not be the same as how much of a user buffer
         * we filled up (we may be padding etc), so we can only update
         * "pos" here (the actor routine has to update the user buffer
         * pointers and the remaining count).
         */
        // copy data from the page into the user-space buffer
        ret = actor(desc, page, offset, nr);    // e.g. file_read_actor
        // advance the position; offset may spill into the next page
        offset += ret;
        index += offset >> PAGE_CACHE_SHIFT;
        offset &= ~PAGE_CACHE_MASK;
        prev_offset = offset;
        // drop the reference taken by find_get_page()
        page_cache_release(page);
        // keep going only if the whole chunk was consumed and more is wanted
        if (ret == nr && desc->count)
            continue;
        goto out;

page_not_up_to_date:
        /* Get exclusive access to the page ... */
        if (lock_page_killable(page))
            goto readpage_eio;

page_not_up_to_date_locked:
        /* Did it get truncated before we got the lock? */
        if (!page->mapping) {
            unlock_page(page);
            page_cache_release(page);
            continue;
        }

        /* Did somebody else fill it already? */
        // i.e. another thread read the page in between the
        // if (!PageUptodate(page)) test above and the moment
        // lock_page_killable() returned
        if (PageUptodate(page)) {
            unlock_page(page);
            goto page_ok;
        }

readpage:// (annotator's note) on reaching this label the page is in the radix tree and PG_locked is set
        /* Start the actual read. The read will unlock the page. */
        error = mapping->a_ops->readpage(filp, page);// e.g. ext2_readpage

        if (unlikely(error)) {
            if (error == AOP_TRUNCATED_PAGE) {
                page_cache_release(page);
                goto find_page;
            }
            goto readpage_error;
        }

        if (!PageUptodate(page)) {
            // ->readpage() is asynchronous, so the read of this page may
            // not have finished yet; taking the page lock waits for it
            if (lock_page_killable(page))
                goto readpage_eio;
            // the read has completed but the page is still not up to
            // date: either it was invalidated or an IO error occurred
            if (!PageUptodate(page)) {
                if (page->mapping == NULL) {
                    /*
                     * invalidate_inode_pages got it
                     */
                    unlock_page(page);
                    page_cache_release(page);
                    goto find_page;
                }
                unlock_page(page);
                shrink_readahead_size_eio(filp, ra);
                goto readpage_eio;
            }
            unlock_page(page);
        }

        goto page_ok;

readpage_eio:
        error = -EIO;
readpage_error:
        /* UHHUH! A synchronous read error occurred. Report it */
        desc->error = error;
        page_cache_release(page);
        goto out;

no_cached_page:
        /*
         * Ok, it wasn't cached, so we need to create a new
         * page..
         */
        page = page_cache_alloc_cold(mapping);
        if (!page) {
            desc->error = -ENOMEM;
            goto out;
        }
        // (annotator's note) the page is locked first and then inserted
        // into the radix tree
        error = add_to_page_cache_lru(page, mapping,
                        index, GFP_KERNEL);
        if (error) {
            page_cache_release(page);
            // -EEXIST is not treated as a failure here: go back and look
            // the page up again
            if (error == -EEXIST)
                goto find_page;
            desc->error = error;
            goto out;
        }
        goto readpage;
    }

out:
    // remember where this read stopped: page index in the high bits,
    // in-page offset in the low bits
    ra->prev_pos = prev_index;
    ra->prev_pos <<= PAGE_CACHE_SHIFT;
    ra->prev_pos |= prev_offset;

    *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
    if (filp)
        file_accessed(filp);
}

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 *
 * Returns the bio the caller should keep using for the next page; every
 * submission path below stores the result of mpage_bio_submit() back
 * into it.
 */
// If the file blocks behind the page are all contiguous on disk, the page
// does not need to be turned into a buffer page: it is added to the BIO
// directly and read from there.
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
        sector_t *last_block_in_bio, struct buffer_head *map_bh,
        unsigned long *first_logical_block, get_block_t get_block)
{
    struct inode *inode = page->mapping->host;
    const unsigned blkbits = inode->i_blkbits;
    const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
    const unsigned blocksize = 1 << blkbits;
    sector_t block_in_file;
    sector_t last_block;
    sector_t last_block_in_file;
    sector_t blocks[MAX_BUF_PER_PAGE];
    unsigned page_block;
    unsigned first_hole = blocks_per_page;
    struct block_device *bdev = NULL;
    int length;
    int fully_mapped = 1;
    unsigned nblocks;
    unsigned relative_block;

    // (annotator's note) the page must be locked on entry; it is unlocked
    // in the read-completion callback mpage_end_io_read()

    // a page that already carries buffers takes the block-by-block path
    if (page_has_buffers(page))
        goto confused;

    // file-relative number of the first block to read
    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
    // one past the last block of the nr_pages window (clamped to EOF below)
    last_block = block_in_file + nr_pages * blocks_per_page;
    // total number of blocks in the file, rounded up
    last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
    if (last_block > last_block_in_file)
        last_block = last_block_in_file;
    page_block = 0;

    /*
     * Map blocks using the result from the previous get_blocks call first.
     */
    nblocks = map_bh->b_size >> blkbits;// annotator's note: mpage_readpage() reportedly does not initialise map_bh - TODO confirm against the caller
    if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
            block_in_file < (*first_logical_block + nblocks)) {
        unsigned map_offset = block_in_file - *first_logical_block;
        unsigned last = nblocks - map_offset;

        for (relative_block = 0; ; relative_block++) {
            if (relative_block == last) {
                // the cached mapping is used up
                clear_buffer_mapped(map_bh);
                break;
            }
            if (page_block == blocks_per_page)
                break;
            blocks[page_block] = map_bh->b_blocknr + map_offset +
                        relative_block;
            page_block++;
            block_in_file++;
        }
        bdev = map_bh->b_bdev;
    }

    /*
     * Then do more get_blocks calls until we are done with this page.
     */
    map_bh->b_page = page;
    while (page_block < blocks_per_page) {    // page_block starts at 0 unless the cached mapping above already filled some slots
        map_bh->b_state = 0;
        map_bh->b_size = 0;

        if (block_in_file < last_block) {
            // ask for everything up to last_block in one go
            map_bh->b_size = (last_block-block_in_file) << blkbits;
            // let the filesystem report whether the range starting at file
            // block block_in_file, of length map_bh->b_size, is mapped
            if (get_block(inode, block_in_file, map_bh, 0))// e.g. ext2_get_block()
                goto confused;
            *first_logical_block = block_in_file;
        }
        // current block is not mapped (a hole, or past EOF): note it and
        // move on to the next block
        if (!buffer_mapped(map_bh)) {
            fully_mapped = 0;
            if (first_hole == blocks_per_page)
                first_hole = page_block;
            page_block++;
            block_in_file++;
            clear_buffer_mapped(map_bh);// annotator's note: looks redundant, buffer_mapped(map_bh) was just tested false
            continue;
        }

        /* some filesystems will copy data into the page during
         * the get_block call, in which case we don't want to
         * read it again. map_buffer_to_page copies the data
         * we just collected from get_block into the page's buffers
         * so readpage doesn't have to repeat the get_block call
         */
        if (buffer_uptodate(map_bh)) {
            // (annotator's note) map_bh points at a buffer_head on the
            // caller's stack, so its mapping information has to be copied
            // into the page's own buffer_head
            map_buffer_to_page(page, map_bh, page_block);
            goto confused;
        }
        // a mapped block after a hole is treated as non-contiguous
        if (first_hole != blocks_per_page)
            goto confused;        /* hole -> non-hole */

        /* Contiguous blocks? */
        // disk block numbers are not consecutive
        if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
            goto confused;
        nblocks = map_bh->b_size >> blkbits;// a single get_block() call may have mapped several consecutive blocks
        // (annotator's note) nblocks is not compared against
        // last_block-block_in_file here, because EOF is handled like a
        // contiguous page
        for (relative_block = 0; ; relative_block++) {
            if (relative_block == nblocks) {
                clear_buffer_mapped(map_bh);
                break;
            } else if (page_block == blocks_per_page)
                break;
            // record the disk block number for every block slot of the page
            blocks[page_block] = map_bh->b_blocknr+relative_block;
            page_block++;
            block_in_file++;
        }
        bdev = map_bh->b_bdev;
    }

    // Handle the non-hole -> hole case.
    // Everything from first_hole onwards must be unmapped at this point,
    // because a hole -> non-hole transition jumps to the confused label,
    // so zeroing the page from first_hole to its end is safe.
    if (first_hole != blocks_per_page) {
        zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
        if (first_hole == 0) {// no block of the page is mapped at all
            SetPageUptodate(page);
            unlock_page(page);
            goto out;
        }
    } else if (fully_mapped) {
        // only when every block of the page is mapped on disk; holes and
        // blocks past EOF do not count
        SetPageMappedToDisk(page);
    }

    /*
     * This page will go to BIO. Do we need to send this BIO off first?
     */
    // the last block already queued in the bio and the first block of
    // this page are not adjacent on disk, so they cannot share one BIO
    if (bio && (*last_block_in_bio != blocks[0] - 1))
        bio = mpage_bio_submit(READ, bio);

alloc_new:
    if (bio == NULL) {
        // blocks[0] << (blkbits - 9) converts the block number into
        // 512-byte sector units
        bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
                min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
                GFP_KERNEL);
        if (bio == NULL)
            goto confused;
    }

    // only the mapped prefix of the page (up to first_hole) is read
    length = first_hole << blkbits;
    if (bio_add_page(bio, page, length, 0) < length) {
        // bio is full: send it off and retry with a fresh one
        bio = mpage_bio_submit(READ, bio);
        goto alloc_new;
    }

    // (annotator's note) the last file block handled here is either a
    // boundary block (the next block hangs off a different indirect
    // block, so it is not adjacent on disk) or the page ended in a
    // hole/EOF: submit right away
    if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
        bio = mpage_bio_submit(READ, bio);
    else
        *last_block_in_bio = blocks[blocks_per_page - 1];
out:
    return bio;

confused:
    // fallback: flush whatever has been queued, then read this page
    // through the buffer_head path
    if (bio)
        bio = mpage_bio_submit(READ, bio);
    if (!PageUptodate(page))
            block_read_full_page(page, get_block);// read block by block
    else
        unlock_page(page);
    goto out;
}

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */

/**
 * Handles the case where the page's buffer_heads do not map to
 * contiguous blocks on disk:
 * 1. turn the page into a buffer page;
 * 2. for every bh that needs it, call submit_bh() to read it from disk.
 *
 * The page must be locked on entry (BUG_ON otherwise). Always returns 0;
 * a get_block() failure is recorded with SetPageError() instead.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
    struct inode *inode = page->mapping->host;
    sector_t iblock, lblock;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    unsigned int blocksize;
    int nr, i;
    int fully_mapped = 1;

    BUG_ON(!PageLocked(page));
    blocksize = 1 << inode->i_blkbits;
    if (!page_has_buffers(page))
        create_empty_buffers(page, blocksize, 0);
    head = page_buffers(page);

    // file-relative number of the first block of this page
    iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    // number of blocks in the file (i_size rounded up), i.e. the first
    // block number past EOF
    lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
    bh = head;
    nr = 0;
    i = 0;

    /* Stage one: collect in arr[] the buffers that need disk IO */
    do {
        if (buffer_uptodate(bh))
            continue;

        if (!buffer_mapped(bh)) {
            int err = 0;

            fully_mapped = 0;
            if (iblock < lblock) {
                WARN_ON(bh->b_size != blocksize);
                // let the filesystem report whether file block iblock is mapped
                err = get_block(inode, iblock, bh, 0);
                if (err)
                    SetPageError(page);
            }
            if (!buffer_mapped(bh)) {// still unmapped (hole or past EOF): zero this block's part of the page
                zero_user(page, i * blocksize, blocksize);
                if (!err)
                    set_buffer_uptodate(bh);
                continue;
            }
            /*
             * get_block() might have updated the buffer
             * synchronously
             */
            // (annotator's note) reiserfs is said to do this
            if (buffer_uptodate(bh))
                continue;
        }
        // this bh needs disk IO
        arr[nr++] = bh;// here buffer_mapped(bh)==1 && buffer_uptodate(bh)==0
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    if (fully_mapped)// still 1 only if no buffer examined above was found unmapped
        SetPageMappedToDisk(page);
    // nothing has to be read from disk
    if (!nr) {
        /*
         * All buffers are uptodate - we can set the page uptodate
         * as well. But not if get_block() returned an error.
         */
        if (!PageError(page))
            SetPageUptodate(page);
        unlock_page(page);
        return 0;
    }

    /* Stage two: lock the buffers */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        lock_buffer(bh);
        mark_buffer_async_read(bh);
    }

    /*
     * Stage 3: start the IO. Check for uptodateness
     * inside the buffer lock in case another process reading
     * the underlying blockdev brought it uptodate (the sct fix).
     */
    // (annotator's reasoning) If buffer_uptodate(bh) is set here, the bh
    // cannot have been filled by an asynchronous read, for two reasons:
    // 1. an asynchronous buffer_head read needs the page lock first, and
    //    the page was already locked when block_read_full_page() started,
    //    so no other process can have started async IO on this bh since;
    // 2. supposing some other process had started async IO on the bh, that
    //    IO would have to complete before the if (buffer_uptodate(bh))
    //    test; its completion callback end_buffer_async_read() clears
    //    BH_async_read, so end_buffer_async_read(bh, 1) in the loop below
    //    would trip BUG_ON(!buffer_async_read(bh)).
    // Hence the bh must have been read synchronously, by something like
    // ll_rw_block() or __bread().
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        if (buffer_uptodate(bh))
            end_buffer_async_read(bh, 1);
        else
            submit_bh(READ, bh);
    }
    return 0;
}

阅读(2408) | 评论(0) | 转发(0) |

上一篇：对dentry的理解

下一篇：sys_write()缓冲写文件源码分析

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6