8: I/O Operations at the VFS Layer
The VFS layer is the interface that interacts directly with user space. In this section we look at the VFS-layer operations and their interaction with the layers above, in two parts: reading and writing.
8.1: Reading a File
In user space, the usual function for reading a file is read(). The corresponding entry point in kernel space is sys_read(). Its code is as follows:
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
//look up the file object for fd in the current process
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
//pos now holds the current file position
ret = vfs_read(file, buf, count, &pos);
//update the current file position
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
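Before going further, here is a minimal user-space sketch of the path above; it is an illustration only, and the file name is an arbitrary example:
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
int main(void)
{
char buf[4096];
ssize_t n;
//any readable file; the path is just an example
int fd = open("/etc/hostname", O_RDONLY);
if (fd < 0)
return 1;
//each read() traps into sys_read(), which looks up the struct file,
//calls vfs_read() at the current position, then advances the position
while ((n = read(fd, buf, sizeof(buf))) > 0)
fwrite(buf, 1, n, stdout);
close(fd);
return n < 0 ? 1 : 0;
}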
After fetching the file object from the process and reading the current file position, sys_read() calls vfs_read() to carry out the actual operation. Its code is as follows:
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
struct inode *inode = file->f_dentry->d_inode;
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
return -EINVAL;
//check mandatory locks: is a read allowed on this region?
ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count);
if (!ret) {
//permission check
ret = security_file_permission (file, MAY_READ);
if (!ret) {
//if a read method exists, call it
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
//otherwise go through aio_read via do_sync_read()
ret = do_sync_read(file, buf, count, pos);
//ret: number of bytes read
if (ret > 0)
//generate a dnotify event
dnotify_parent(file->f_dentry, DN_ACCESS);
}
}
return ret;
}
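The FMODE_READ check above can be observed directly from user space: reading a descriptor that was opened write-only fails with EBADF. A small sketch (the temp-file path is hypothetical):
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
int main(void)
{
char buf[16];
int fd = open("/tmp/ebadf_demo", O_WRONLY | O_CREAT, 0644);
if (fd < 0)
return 1;
//vfs_read() rejects the request because FMODE_READ is not set
if (read(fd, buf, sizeof(buf)) < 0 && errno == EBADF)
printf("read on an O_WRONLY fd fails with EBADF\n");
close(fd);
return 0;
}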
As we can see, the read is ultimately carried out through the file object's methods. Recall from the section on opening a file that during open, file->f_op is assigned from inode->i_fop. For the ext2 filesystem this is set up as follows:
inode->i_fop = &ext2_file_operations;
struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
.read = generic_file_read,
.write = generic_file_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.ioctl = ext2_ioctl,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = ext2_release_file,
.fsync = ext2_sync_file,
.readv = generic_file_readv,
.writev = generic_file_writev,
.sendfile = generic_file_sendfile,
};
The corresponding read entry point is generic_file_read():
ssize_t
generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
//user-space address and length, wrapped in a single iovec
struct iovec local_iov = { .iov_base = buf, .iov_len = count };
//tracks the completion state
struct kiocb kiocb;
ssize_t ret;
//kiocb.ki_key=KIOCB_SYNC_KEY; kiocb.ki_filp=filp;kiocb.ki_obj=current;
init_sync_kiocb(&kiocb, filp);
//returns the number of bytes transferred
ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
//the operation was queued asynchronously: wait for it
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
//return the number of bytes completed
return ret;
}
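Note that generic_file_read() wraps the single user buffer into one iovec. The readv() system call (generic_file_readv in the operations table above) reaches the same __generic_file_aio_read() with several segments; a user-space sketch:
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>
int main(void)
{
char a[8], b[8];
//two segments: each becomes one "seg" in __generic_file_aio_read()
struct iovec iov[2] = {
{ .iov_base = a, .iov_len = sizeof(a) },
{ .iov_base = b, .iov_len = sizeof(b) },
};
int fd = open("/etc/hostname", O_RDONLY); /* example path */
ssize_t n;
if (fd < 0)
return 1;
n = readv(fd, iov, 2);
printf("readv returned %zd bytes\n", n);
close(fd);
return 0;
}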
__generic_file_aio_read() is an important function: it is the real entry point of the read path. Its code is as follows:
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
size_t count;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
count += iv->iov_len;
if (unlikely((ssize_t)(count|iv->iov_len) < 0))
return -EINVAL;
//verify that the iov_len bytes starting at iv->iov_base are accessible
if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
//nr_segs: number of valid segments
nr_segs = seg;
//the last segment was invalid: subtract its length from the total
count -= iv->iov_len; /* This segment is no good */
break;
}
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
//O_DIRECT: transfer the data directly, bypassing the page cache
if (filp->f_flags & O_DIRECT) {
loff_t pos = *ppos, size;
struct address_space *mapping;
struct inode *inode;
mapping = filp->f_mapping;
inode = mapping->host;
retval = 0;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
if (retval >= 0 && !is_sync_kiocb(iocb))
retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
}
file_accessed(filp);
goto out;
}
//count: total number of bytes to read
retval = 0;
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
//read_descriptor_t: a descriptor recording the state of the read
read_descriptor_t desc;
desc.written = 0;
desc.arg.buf = iov[seg].iov_base;
desc.count = iov[seg].iov_len;
//nothing to transfer for this segment: continue with the next iovec
if (desc.count == 0)
continue;
desc.error = 0;
//call do_generic_file_read() for each segment
do_generic_file_read(filp,ppos,&desc,file_read_actor,0);
//desc.written: bytes copied to user space
//update retval
retval += desc.written;
if (!retval) {
retval = desc.error;
break;
}
}
}
out:
return retval;
}
There is one special case here: when the file was opened in direct I/O mode (the descriptor carries the O_DIRECT flag), the data is transferred directly and the page cache is bypassed. We come back to this case later.
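From user space, this branch is selected simply by opening with O_DIRECT. A hedged sketch (the path is hypothetical, and the exact alignment requirement depends on the filesystem and kernel version, typically the logical block size):
#define _GNU_SOURCE /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(void)
{
void *buf;
ssize_t n;
int fd = open("/tmp/direct_demo", O_RDONLY | O_DIRECT);
if (fd < 0)
return 1;
//O_DIRECT bypasses the page cache, so the user buffer must be aligned
if (posix_memalign(&buf, 4096, 4096))
return 1;
n = read(fd, buf, 4096);
printf("direct read returned %zd\n", n);
free(buf);
close(fd);
return 0;
}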
In the ordinary buffered mode, do_generic_file_read() is called for each segment to perform the I/O. Its code is as follows:
do_generic_file_read() simply forwards to do_generic_mapping_read():
/*
mapping: the page cache (address_space) of the file
_ra: the file_ra_state (readahead state) associated with filp
filp: the open file object
ppos: the current file position
desc: the read descriptor
actor: the routine that copies the data from kernel space to user space
nonblock: if nonzero, the read must not block (readahead is skipped)
*/
void do_generic_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
loff_t *ppos,
read_descriptor_t *desc,
read_actor_t actor,
int nonblock)
{
struct inode *inode = mapping->host;
unsigned long index, end_index, offset;
loff_t isize;
struct page *cached_page;
int error;
struct file_ra_state ra = *_ra;
cached_page = NULL;
//page index: which page-cache page holds the current position
index = *ppos >> PAGE_CACHE_SHIFT;
//offset of the first requested byte within that page
offset = *ppos & ~PAGE_CACHE_MASK;
//size of the file behind this inode
isize = i_size_read(inode);
if (!isize)
goto out;
//index of the last page of the file
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
//nr: size of a page-cache page
nr = PAGE_CACHE_SIZE;
if (index >= end_index) {
//index > end_index: past the last page, nothing left to read
if (index > end_index)
goto out;
//at this point index == end_index:
//nr becomes the number of valid bytes in the last page
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
//offset is the position within the page, nr is where the file ends
//within this last page; nr <= offset means the file is fully consumed
if (nr <= offset) {
goto out;
}
}
//nr - offset: bytes left to handle in this page
nr = nr - offset;
//if the current process has its reschedule flag set, call schedule() once
cond_resched();
//file readahead
if (!nonblock)
page_cache_readahead(mapping, &ra, filp, index);
find_page:
//look up the cache page for the current position
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
//not found: the page cache holds no page for this index
if (nonblock) {
desc->error = -EWOULDBLOCKIO;
break;
}
handle_ra_miss(mapping, &ra, index);
goto no_cached_page;
}
//the page was found in the page cache.
//check PG_uptodate: if it is set, there is no need to read
//from the device
if (!PageUptodate(page)) {
//PG_uptodate not set: the page contents are invalid and must be read in from the filesystem
if (nonblock) {
page_cache_release(page);
desc->error = -EWOULDBLOCKIO;
break;
}
goto page_not_up_to_date;
}
page_ok:
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* Mark the page accessed if we read the beginning.
*/
if (!offset)
mark_page_accessed(page);
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
//copy from the page to user space; returns the number of bytes copied
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
//ret == nr: the whole remainder of the page was copied, so no error occurred
if (ret == nr && desc->count)
continue;
//otherwise we are done
goto out;
page_not_up_to_date:
/* Get exclusive access to the page ... */
//data is about to be read from the filesystem into this page: lock it
lock_page(page);
/* Did it get unhashed before we got the lock? */
//while we waited for the lock another process may have removed the
//page from the page cache. If so, unlock it, drop the reference and
//loop: the next lookup fails and a fresh page gets allocated
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
/* Did somebody else fill it already? */
//someone else may have filled the page while we waited for the lock.
//If so, go back to page_ok and copy the data straight out
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
//read the page in
readpage:
/* Start the actual read. The read will unlock the page. */
//at this point the actual read begins
error = mapping->a_ops->readpage(filp, page);
//read error: bail out
if (unlikely(error))
goto readpage_error;
//if PG_uptodate is still not set, wait until the page is unlocked.
//The page stays locked while its contents are read in from the
//filesystem; only on completion is it unlocked and the waiters woken
if (!PageUptodate(page)) {
wait_on_page_locked(page);
//if the page still lacks PG_uptodate, an error must have occurred
if (!PageUptodate(page)) {
error = -EIO;
goto readpage_error;
}
}
/*
* i_size must be checked after we have done ->readpage.
*
* Checking i_size after the readpage allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
//the file size is invalid or the current position is past the end
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
//recompute nr, the bytes left to copy from this page
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
page_cache_release(page);
goto out;
}
}
nr = nr - offset;
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
//no cached page for this index:
//allocate a new one
if (!cached_page) {
cached_page = page_cache_alloc_cold(mapping);
if (!cached_page) {
desc->error = -ENOMEM;
goto out;
}
}
//insert the new page into the page cache and onto the LRU list.
//Note: insertion leaves the new page PG_locked
error = add_to_page_cache_lru(cached_page, mapping,
index, GFP_KERNEL);
if (error) {
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
page = cached_page;
cached_page = NULL;
goto readpage;
}
out:
*_ra = ra;
//ppos: the final read position
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
if (cached_page)
page_cache_release(cached_page);
if (filp)
file_accessed(filp);
}
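The index/offset arithmetic at the top of this function is worth a worked example. The sketch below assumes 4 KB pages (PAGE_CACHE_SHIFT == 12, as on x86) and mirrors the computation of index, offset, end_index and nr:
#include <stdio.h>
int main(void)
{
unsigned long long ppos = 9000; /* current file position */
unsigned long long isize = 10000; /* file size */
unsigned long page_shift = 12, page_size = 1UL << 12;
unsigned long index = ppos >> page_shift; /* -> 2 */
unsigned long offset = ppos & (page_size - 1); /* -> 808 */
unsigned long end_index = (isize - 1) >> page_shift; /* -> 2 */
//valid bytes in the last page: ((isize - 1) & ~PAGE_CACHE_MASK) + 1
unsigned long nr = ((isize - 1) & (page_size - 1)) + 1; /* -> 1808 */
//nr - offset = 1000, exactly the bytes left between ppos and isize
printf("index=%lu offset=%lu end_index=%lu nr=%lu left=%lu\n",
index, offset, end_index, nr, nr - offset);
return 0;
}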
If the nonblock parameter is nonzero, the function must not block: readahead is skipped, and -EWOULDBLOCKIO may be returned instead of waiting. In the path we are following, do_generic_file_read() passes nonblock as 0, so page_cache_readahead() is invoked; the readahead machinery itself is analyzed in a later section.
Within this function there are three possible situations:
1: The page to be accessed is already in the page cache and is up to date (PG_uptodate is set). It is simply copied to user space.
2: There is no page in the page cache for this index. A page must be added to the page cache at that index, the data read into it from the filesystem, and the result copied to user space.
3: The page for this index is in the page cache, but its data is not current. The page must first be synchronized with the filesystem and then copied to user space.
Cases 2 and 3 share a common part, namely reading the data in from the filesystem, so let us start with case 2. The corresponding code fragment is:
void do_generic_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
loff_t *ppos,
read_descriptor_t *desc,
read_actor_t actor,
int nonblock)
{
……
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
//not found: the page cache holds no page for this index
if (nonblock) {
desc->error = -EWOULDBLOCKIO;
break;
}
handle_ra_miss(mapping, &ra, index);
goto no_cached_page;
}
……
……
}
handle_ra_miss() mainly adjusts the file readahead state; we skip it here and return to it when the readahead mechanism is analyzed in detail.
If the page is not present in the page cache, control jumps to no_cached_page:
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
//no cached page for this index:
//allocate a new one
if (!cached_page) {
cached_page = page_cache_alloc_cold(mapping);
if (!cached_page) {
desc->error = -ENOMEM;
goto out;
}
}
//insert the new page into the page cache and onto the LRU list.
//Note: insertion leaves the new page PG_locked
error = add_to_page_cache_lru(cached_page, mapping,
index, GFP_KERNEL);
if (error) {
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
page = cached_page;
cached_page = NULL;
goto readpage;
A page is first allocated with page_cache_alloc_cold(), then inserted into the page cache and onto the LRU list with add_to_page_cache_lru(), and control jumps to readpage. That label is also where the handling of case 3 ends up:
//read the page in
readpage:
/* Start the actual read. The read will unlock the page. */
//at this point the actual read begins
error = mapping->a_ops->readpage(filp, page);
As the code shows, the read is ultimately performed by the readpage method of the page cache's address_space operations.
The readpage operation of the file's page cache
As before, we take ext2 as the example. At open time, the address_space operations of the inode's page cache are set up as follows:
inode->i_mapping->a_ops = &ext2_aops;
struct address_space_operations ext2_aops = {
.readpage = ext2_readpage,
.readpages = ext2_readpages,
.writepage = ext2_writepage,
.sync_page = block_sync_page,
.prepare_write = ext2_prepare_write,
.commit_write = generic_commit_write,
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
};
The corresponding entry point is ext2_readpage:
static int ext2_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, ext2_get_block);
}
This is a wrapper that takes a callback as its parameter; the callback converts a block number relative to the start of the file into a logical block number on the filesystem.
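The effect of such a get_block callback can be observed from user space through the FIBMAP ioctl, which returns the filesystem logical block backing a given file-relative block (root privileges are usually required). A hedged sketch:
#include <fcntl.h>
#include <linux/fs.h> /* FIBMAP, FIGETBSZ */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int fd, blocksize, block = 0; /* file-relative block 0 */
if (argc < 2)
return 1;
fd = open(argv[1], O_RDONLY);
if (fd < 0)
return 1;
//the same file-block -> fs-block mapping ext2_get_block() computes
if (ioctl(fd, FIGETBSZ, &blocksize) == 0 &&
ioctl(fd, FIBMAP, &block) == 0)
printf("file block 0 -> fs block %d (blocksize %d)\n", block, blocksize);
close(fd);
return 0;
}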
The code of mpage_readpage() is as follows:
int mpage_readpage(struct page *page, get_block_t get_block)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
//translate the read request into a bio structure
bio = do_mpage_readpage(bio, page, 1,
&last_block_in_bio, get_block);
//submit the bio
if (bio)
mpage_bio_submit(READ, bio);
return 0;
}
Part of mpage_bio_submit() was already analyzed earlier; the remaining code is simple and is not covered here.
The code of do_mpage_readpage() is as follows:
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
sector_t *last_block_in_bio, get_block_t get_block)
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
//number of blocks in one page
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
//block size in bytes
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
struct buffer_head bh;
int length;
int fully_mapped = 1;
//if the page already has buffers attached, jump to confused and update the page's buffers directly
if (page_has_buffers(page))
goto confused;
//page index * blocks per page = first file block covered by this page
block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
//last block of the file: file size divided by the block size, rounded up
last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
bh.b_page = page;
//iterate over the blocks of the page
for (page_block = 0; page_block < blocks_per_page;
page_block++, block_in_file++) {
bh.b_state = 0;
if (block_in_file < last_block) {
//map the file block number to an on-disk block in bh
if (get_block(inode, block_in_file, &bh, 0))
//on error
goto confused;
}
//bh is not mapped: this may be a hole in the file
if (!buffer_mapped(&bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
continue;
}
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
* read it again. map_buffer_to_page copies the data
* we just collected from get_block into the page's buffers
* so readpage doesn't have to repeat the get_block call
*/
//if the buffer is already up to date, copy its data straight into the page
if (buffer_uptodate(&bh)) {
map_buffer_to_page(page, &bh, page_block);
goto confused;
}
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
//are the requested blocks contiguous on disk? If not, jump to confused
if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
goto confused;
blocks[page_block] = bh.b_blocknr;
bdev = bh.b_bdev;
}
if (first_hole != blocks_per_page) {
char *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + (first_hole << blkbits), 0,
PAGE_CACHE_SIZE - (first_hole << blkbits));
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
goto out;
}
} else if (fully_mapped) {
//set PG_mappedtodisk
SetPageMappedToDisk(page);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && (*last_block_in_bio != blocks[0] - 1))
bio = mpage_bio_submit(READ, bio);
alloc_new:
if (bio == NULL) {
//allocate a bio
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
GFP_KERNEL);
if (bio == NULL)
goto confused;
}
length = first_hole << blkbits;
//add the page, with the computed length, to the bio
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(READ, bio);
goto alloc_new;
}
if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
bio = mpage_bio_submit(READ, bio);
else
*last_block_in_bio = blocks[blocks_per_page - 1];
out:
return bio;
confused:
if (bio)
bio = mpage_bio_submit(READ, bio);
if (!PageUptodate(page))
block_read_full_page(page, get_block);
else
unlock_page(page);
goto out;
}
This code performs a small optimization: it checks whether the buffers to be read are contiguous on disk. If they are, they can all be placed in a single bio and submitted to the generic block layer together; if they are not, each buffer has to be submitted on its own.
The contiguous case is easy to follow; the interesting part is the handling of non-contiguous blocks.
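The contiguity test is simply blocks[page_block-1] + 1 == blocks[page_block]. The user-space sketch below models how a run of disk block numbers splits into bio submissions under that rule; it is a simplified model, since in the kernel a discontinuity actually falls through to block_read_full_page(), which submits one buffer at a time:
#include <stdio.h>
//count the bios a page would need if consecutive disk blocks share one bio
static int count_bios(const unsigned long *blocks, int n)
{
int i, bios = n ? 1 : 0;
for (i = 1; i < n; i++)
if (blocks[i] != blocks[i - 1] + 1)
bios++; /* discontinuity: submit and start a new bio */
return bios;
}
int main(void)
{
unsigned long contiguous[] = { 100, 101, 102, 103 };
unsigned long scattered[] = { 100, 102, 250, 251 };
printf("contiguous page: %d bio(s)\n", count_bios(contiguous, 4));
printf("scattered page: %d bio(s)\n", count_bios(scattered, 4));
return 0;
}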
As the code above shows, non-contiguous blocks are handled by block_read_full_page(). Its code is as follows:
int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize;
int nr, i;
int fully_mapped = 1;
//the page must be locked on entry
if (!PageLocked(page))
PAGE_BUG(page);
//block size
blocksize = 1 << inode->i_blkbits;
//if the page has no buffers yet, create empty ones
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
//head of the page's buffer list
head = page_buffers(page);
//first block number covered by this page
iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
//last block number of the file
lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;
do {
//already up to date: no I/O needed, continue with the next buffer
if (buffer_uptodate(bh))
continue;
//the buffer is not mapped yet
if (!buffer_mapped(bh)) {
fully_mapped = 0;
if (iblock < lblock) {
//map the file block number into bh
if (get_block(inode, iblock, bh, 0))
SetPageError(page);
}
//if the bh is still unmapped, it corresponds to a hole in the file:
//zero the matching region of the page
if (!buffer_mapped(bh)) {
void *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + i * blocksize, 0, blocksize);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh);
continue;
}
/*
* get_block() might have updated the buffer
* synchronously
*/
//get_block() may have brought the buffer up to date: nothing to submit
if (buffer_uptodate(bh))
continue;
}
//collect the buffers that need I/O into the arr array
arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head);
//set PG_mappedtodisk
if (fully_mapped)
SetPageMappedToDisk(page);
//nothing to submit
if (!nr) {
/*
* All buffers are uptodate - we can set the page uptodate
* as well. But not if get_block() returned an error.
*/
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
return 0;
}
/* Stage two: lock the buffers */
//lock every buffer that is about to be submitted
for (i = 0; i < nr; i++) {
bh = arr[i];
lock_buffer(bh);
mark_buffer_async_read(bh);
}
/*
* Stage 3: start the IO. Check for uptodateness
* inside the buffer lock in case another process reading
* the underlying blockdev brought it uptodate (the sct fix).
*/
//submit each buffer
for (i = 0; i < nr; i++) {
bh = arr[i];
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
submit_bh(READ, bh);
}
return 0;
}
As the code above shows, a non-contiguous read is completed by repeated calls to submit_bh().
8.2: Writing a File
In user space the write interface is write(); the corresponding system call entry is sys_write().
The code is as follows:
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
//look up the file object for fd.
//fget_light() is an optimized fget(): if the file table is not shared, the lookup needs no locking
file = fget_light(fd, &fput_needed);
if (file) {
//current file position
loff_t pos = file_pos_read(file);
ret = vfs_write(file, buf, count, &pos);
//update the file position
file_pos_write(file, pos);
//drop the reference taken by fget_light()
fput_light(file, fput_needed);
}
return ret;
}
The code mirrors the read path: fetch the file object and the current position, perform the operation, then update the file position.
The code of vfs_write() is as follows:
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
struct inode *inode = file->f_dentry->d_inode;
ssize_t ret;
//is the file open for writing?
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
//no operations table, or neither write nor aio_write: error
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
//check mandatory locks on the region to be written
ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count);
if (!ret) {
ret = security_file_permission (file, MAY_WRITE);
if (!ret) {
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0)
dnotify_parent(file->f_dentry, DN_MODIFY);
}
}
return ret;
}
In most cases the write is carried out by file->f_op->write. In the ext2 filesystem this interface corresponds to generic_file_write():
ssize_t generic_file_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
struct iovec local_iov = { .iov_base = (void __user *)buf,
.iov_len = count };
down(&inode->i_sem);
//returns the number of bytes actually written
ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
up(&inode->i_sem);
//if the file was opened with O_SYNC, or the inode is synchronous (MS_SYNCHRONOUS)
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
//write the cached data back to the device
err = sync_page_range(inode, mapping, *ppos - ret, ret);
if (err < 0)
ret = err;
}
return ret;
}
If the file was opened with the O_SYNC flag, or the filesystem is mounted synchronously, the cached data is written through to the filesystem right away.
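From user space this means that with O_SYNC every successful write() implies the data has reached the device before the call returns. A minimal sketch (the file path is hypothetical):
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
int main(void)
{
const char msg[] = "synchronous write\n";
int fd = open("/tmp/osync_demo", O_WRONLY | O_CREAT | O_SYNC, 0644);
if (fd < 0)
return 1;
//write() returns only after sync_page_range() has pushed the data out
if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg))
return 1;
close(fd);
return 0;
}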
The work continues in generic_file_write_nolock(), a wrapper that ends up in generic_file_aio_write_nolock():
ssize_t
generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
unsigned long seg;
loff_t pos;
ssize_t written;
ssize_t err;
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
ocount += iv->iov_len;
if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
return -EINVAL;
//verify that the user-supplied region is valid
if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
ocount -= iv->iov_len; /* This segment is no good */
break;
}
//count: total number of bytes to write
count = ocount;
//ppos: the current position
pos = *ppos;
/* We can write back this queue in page reclaim */
//backing_dev_info: information about the backing device
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
//detailed checks on the write (file size limits, append mode, etc.)
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
if (count == 0)
goto out;
err = remove_suid(file->f_dentry);
if (err)
goto out;
//update the inode's timestamps
inode_update_time(inode, 1);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
written = generic_file_direct_write(iocb, iov,
&nr_segs, pos, ppos, count, ocount);
if (written < 0 || written == count)
goto out;
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
}
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
If the file was opened with the O_DIRECT flag, the data skips the page cache and is written directly to the filesystem; O_DIRECT is summarized later. In the ordinary case, control passes to generic_file_buffered_write():
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
struct page *page;
struct page *cached_page = NULL;
size_t bytes;
struct pagevec lru_pvec;
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
char __user *buf;
pagevec_init(&lru_pvec, 0);
buf = iov->iov_base + written; /* handle partial DIO write */
do {
unsigned long index;
unsigned long offset;
size_t copied;
//offset: offset within the page
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
//index: page index
index = pos >> PAGE_CACHE_SHIFT;
//bytes remaining in this page
bytes = PAGE_CACHE_SIZE - offset;
//do not copy more than the remaining data
if (bytes > count)
bytes = count;
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
fault_in_pages_readable(buf, bytes);
//look up the page for index in the page cache; allocate a new one if it is absent
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
break;
}
//call prepare_write; this is where buffer heads enter the picture
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
unlock_page(page);
page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
}
//copy the user data into the page
if (likely(nr_segs == 1))
copied = filemap_copy_from_user(page, offset,
buf, bytes);
else
copied = filemap_copy_from_user_iovec(page, offset,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
//call commit_write, which marks the buffers for write-back to the device
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (likely(copied > 0)) {
if (!status)
status = copied;
if (status >= 0) {
written += status;
count -= status;
pos += status;
buf += status;
if (unlikely(nr_segs > 1))
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
}
}
if (unlikely(copied != bytes))
if (status >= 0)
status = -EFAULT;
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
if (status < 0)
break;
balance_dirty_pages_ratelimited(mapping);
cond_resched();
} while (count);
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
if (likely(status >= 0)) {
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (!a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
}
/*
* If we get here for O_DIRECT writes then we must have fallen through
* to buffered writes (block instantiation inside i_size). So we sync
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait(mapping);
pagevec_lru_add(&lru_pvec);
return written ? written : status;
}
As the code shows, a buffered write first obtains the matching page from the page cache, calls a_ops->prepare_write(), copies the user data into the page, and then calls a_ops->commit_write(). We now analyze these two operations in turn.
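The per-page chunking of the do-while loop above can be modelled in user space. The sketch below assumes 4 KB pages and shows how an arbitrary (pos, count) write splits into prepare_write/copy/commit_write steps, one page at a time:
#include <stdio.h>
#define PAGE_SIZE_DEMO 4096UL
static void show_chunks(unsigned long long pos, unsigned long count)
{
while (count) {
unsigned long offset = pos & (PAGE_SIZE_DEMO - 1); /* within page */
unsigned long long index = pos >> 12; /* page index */
unsigned long bytes = PAGE_SIZE_DEMO - offset; /* room left in page */
if (bytes > count)
bytes = count;
printf("page %llu: offset %lu, %lu bytes (prepare/copy/commit)\n",
index, offset, bytes);
pos += bytes;
count -= bytes;
}
}
int main(void)
{
show_chunks(4090, 5000); /* straddles two page boundaries */
return 0;
}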
8.2.1: The page cache prepare_write() operation
The ext2 entry point is:
static int
ext2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
return block_prepare_write(page,from,to,ext2_get_block);
}
This is another wrapper; for a block-based filesystem only the function pointer differs, a pattern we already saw on the read side. ext2_get_block() converts a file-relative block number into a logical block number on the filesystem.
Moving on to block_prepare_write():
int block_prepare_write(struct page *page, unsigned from, unsigned to,
get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
int err = __block_prepare_write(inode, page, from, to, get_block);
//on failure, clear the page's uptodate flag
if (err)
ClearPageUptodate(page);
return err;
}
__block_prepare_write() does the real work:
static int __block_prepare_write(struct inode *inode, struct page *page,
unsigned from, unsigned to, get_block_t *get_block)
{
unsigned block_start, block_end;
sector_t block;
int err = 0;
unsigned blocksize, bbits;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to);
//block size
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
bbits = inode->i_blkbits;
//first block number covered by this page
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
//handle the buffers that do not fall inside the [from, to) range.
//Note: this relies on the block size dividing the page size (a power of two)
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
//this may allocate new blocks, i.e. extend the file on disk
err = get_block(inode, block, bh, 1);
if (err)
goto out;
//a freshly allocated buffer that has never been accessed is BH_New:
//it was usually just mapped by get_block() and holds no valid data yet
if (buffer_new(bh)) {
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
//if the page is uptodate, mark the buffer uptodate as well
if (PageUptodate(page)) {
set_buffer_uptodate(bh);
continue;
}
//if only part of this buffer will be written, zero the untouched parts
if (block_end > to || block_start < from) {
void *kaddr;
kaddr = kmap_atomic(page, KM_USER0);
if (block_end > to)
memset(kaddr+to, 0,
block_end-to);
if (block_start < from)
memset(kaddr+block_start,
0, from-block_start);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
}
continue;
}
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
//if the buffer is not uptodate (and only partially covered), read it in from the filesystem first
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);
*wait_bh++=bh;
}
}
/*
* If we issued read requests - let them complete.
*/
//if any reads were issued, wait for their I/O to complete
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
return -EIO;
}
return 0;
out:
/*
* Zero out any newly allocated blocks to avoid exposing stale
* data. If BH_New is set, we know that the block was newly
* allocated in the above loop.
*/
bh = head;
block_start = 0;
do {
block_end = block_start+blocksize;
if (block_end <= from)
goto next_bh;
if (block_start >= to)
break;
if (buffer_new(bh)) {
void *kaddr;
clear_buffer_new(bh);
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr+block_start, 0, bh->b_size);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
next_bh:
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
return err;
}
Compared with reads, writes can be more complex because a write may have to grow the file; that size adjustment happens inside the ext2_get_block() callback.
The prepare_write step thus performs the necessary initialization of the buffer page and any extension of the file.
The second half of the write is performed by commit_write():
int generic_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
__block_commit_write(inode,page,from,to);
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold i_sem.
*/
//if the file grew, update inode->i_size and mark the inode dirty
if (pos > inode->i_size) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
return 0;
}
From the analysis above we know that by the time commit_write() is called, the data to be written has already been copied into the page.
The code of __block_commit_write() is as follows:
static int __block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
{
unsigned block_start, block_end;
int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;
blocksize = 1 << inode->i_blkbits;
//mark the modified buffers dirty
for(bh = head = page_buffers(page), block_start = 0;
bh != head || !block_start;
block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
partial = 1;
} else {
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
}
/*
* If this is a partial write which happened to make all buffers
* uptodate then we can optimize away a bogus readpage() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
//if every buffer in the page is now uptodate, set the page's PG_uptodate flag
if (!partial)
SetPageUptodate(page);
return 0;
}
Notice that the code above only marks the buffers dirty; it does not actually write the data to the filesystem. So when does the real write happen?
Recall the section on the pdflush threads, where we described the write-back of aged dirty pages: that is where these pages get written back to the filesystem.
That section left two operations unexamined, mapping->a_ops->writepages and mapping->a_ops->writepage. We analyze them in detail here.
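The deferred write-back is easy to confirm from user space: a plain write() only dirties page-cache buffers and returns; fsync() (or pdflush, later) triggers the actual writepage path analyzed below. A sketch with a hypothetical file name:
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
int main(void)
{
const char msg[] = "dirtied in the page cache\n";
int fd = open("/tmp/writeback_demo", O_WRONLY | O_CREAT, 0644);
if (fd < 0)
return 1;
write(fd, msg, strlen(msg)); /* returns once the copy is cached */
fsync(fd); /* forces the dirty buffers to the device */
close(fd);
return 0;
}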
8.2.2: The mapping->a_ops->writepages() operation
For ext2 the mapping operations are assigned as follows:
struct address_space_operations ext2_aops = {
……
.writepage = ext2_writepage,
.writepages = ext2_writepages,
……
};
Accordingly, the writepages entry point is ext2_writepages():
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, ext2_get_block);
}
mpage_writepages() is the helper we already analyzed in the pdflush section, so it is not repeated here.
8.2.3: The mapping->a_ops->writepage() operation
The corresponding entry point is ext2_writepage():
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, ext2_get_block, wbc);
}
block_write_full_page() in turn calls __block_write_full_page(), which does the real work:
static int __block_write_full_page(struct inode *inode, struct page *page,
get_block_t *get_block, struct writeback_control *wbc)
{
int err;
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
int nr_underway = 0;
BUG_ON(!PageLocked(page));
//last block number of the file
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
//if the page has no buffers, create them
if (!page_has_buffers(page)) {
create_empty_buffers(page, 1 << inode->i_blkbits,
(1 << BH_Dirty)|(1 << BH_Uptodate));
}
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
* handle that here by just cleaning them.
*/
//first block number covered by this page
block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
//head of the page's buffer list
head = page_buffers(page);
bh = head;
/*
* Get all the dirty buffers mapped to disk addresses and
* handle any aliases from the underlying blockdev's mapping.
*/
do {
//the block lies beyond the last block of the file
if (block > last_block) {
/*
* mapped buffers outside i_size will occur, because
* this page can be outside i_size when there is a
* truncate in progress.
*/
/*
* The buffer was zeroed by block_write_full_page()
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
//map the file-relative block number to a disk block in bh
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
}
}
bh = bh->b_this_page;
block++;
} while (bh != head);
do {
get_bh(bh);
//skip buffers that are not mapped
if (!buffer_mapped(bh))
continue;
/*
* If it's a fully non-blocking write attempt and we cannot
* lock the buffer then redirty the page. Note that this can
* potentially cause a busy-wait loop from pdflush and kswapd
* activity, but those code paths have their own higher-level
* throttling.
*/
//lock the buffer before operating on it
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
} else if (test_set_buffer_locked(bh)) {
//in WB_SYNC_NONE, non-blocking mode: if the buffer is already locked,
//redirty the page and skip this buffer
redirty_page_for_writepage(wbc, page);
continue;
}
//if the buffer was dirty, mark it for asynchronous write (BH_Async_Write)
if (test_clear_buffer_dirty(bh)) {
mark_buffer_async_write(bh);
} else {
unlock_buffer(bh);
}
} while ((bh = bh->b_this_page) != head);
/*
* The page and its buffers are protected by PageWriteback(), so we can
* drop the bh refcounts early.
*/
BUG_ON(PageWriteback(page));
//mark the page as under writeback
set_page_writeback(page);
unlock_page(page);
//walk the page's buffers and submit those marked BH_Async_Write to the filesystem
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
submit_bh(WRITE, bh);
nr_underway++;
}
put_bh(bh);
bh = next;
} while (bh != head);
err = 0;
done:
if (nr_underway == 0) {
/*
* The page was marked dirty, but the buffers were
* clean. Someone wrote them back by hand with
* ll_rw_block/submit_bh. A rare case.
*/
int uptodate = 1;
do {
if (!buffer_uptodate(bh)) {
uptodate = 0;
break;
}
bh = bh->b_this_page;
} while (bh != head);
if (uptodate)
SetPageUptodate(page);
end_page_writeback(page);
/*
* The page and buffer_heads can be released at any time from
* here on.
*/
wbc->pages_skipped++; /* We didn't write this page */
}
return err;
recover:
/*
* ENOSPC, or some other error. We may already have added some
* blocks to the file, so we need to write these out to avoid
* exposing stale data.
* The page is currently locked and not marked for writeback
*/
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
get_bh(bh);
if (buffer_mapped(bh) && buffer_dirty(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
/*
* The buffer may have been set dirty during
* attachment to a dirty page.
*/
clear_buffer_dirty(bh);
}
} while ((bh = bh->b_this_page) != head);
SetPageError(page);
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
submit_bh(WRITE, bh);
nr_underway++;
}
put_bh(bh);
bh = next;
} while (bh != head);
goto done;
}
This function walks the buffers of the page and writes the dirty ones back to the filesystem.