Kernel 3.10 source code analysis -- the file write flow
1. Basic principles
A file write is normally initiated from user space, and the overall flow falls into two stages:
1) From user space to the page cache
The user-space process first open()s the target file and gets back an fd.
The process then calls the write() library function, passing the fd as an argument.
The write() library function ultimately invokes the corresponding system call, sys_write, and the flow enters the kernel (a minimal sketch of this user-space half follows below).
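For reference, the user-space half of the flow is only a few calls. A minimal sketch, assuming an example path ./testfile:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* open() returns a file descriptor */
        int fd = open("./testfile", O_WRONLY | O_CREAT, 0644); /* example path */
        if (fd < 0)
            return 1;

        /* the write() library call enters the kernel via sys_write */
        ssize_t n = write(fd, "hello\n", 6);

        close(fd);
        return (n == 6) ? 0 : 1;
    }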
-------- user space / kernel space boundary ------------------------
The kernel then calls the write interface defined by the specific filesystem; for ext2 the path is ext2_file_operations->aio_write->generic_file_aio_write.
Because the caller usually does not need the data on disk immediately, and disk operations are slow, writes are normally deferred to improve performance -- the classic "delayed write" scheme.
The basic idea: data is first written into the page cache and the pages are marked dirty; the flusher kernel threads (pdflush in older kernels) later write the dirty pages back, either periodically or on demand. The write can return as soon as the data is in the page cache, without waiting for it to actually reach the disk. Since writing to the page cache (memory) is far faster than writing to disk, this greatly improves write performance. The trade-off is that the moment at which data actually reaches the disk is indeterminate, so abnormal events such as a power failure can cause data-consistency problems.
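Because write() returns once the data is in the page cache, an application that needs durability must force writeback itself. A minimal sketch using fsync(), assuming an example path ./testfile:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("./testfile", O_WRONLY | O_CREAT, 0644); /* example path */
        if (fd < 0) { perror("open"); return 1; }

        const char buf[] = "delayed write demo";
        /* write() returns as soon as the data is in the page cache... */
        if (write(fd, buf, strlen(buf)) < 0) { perror("write"); return 1; }

        /* ...so a crash here could still lose it. fsync() blocks until the
         * dirty pages (and the file's metadata) have reached the disk. */
        if (fsync(fd) < 0) { perror("fsync"); return 1; }

        close(fd);
        return 0;
    }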
2) From the page cache to the physical disk
The flusher kernel threads, or on-demand paths (e.g. memory reclaim when memory is tight), write the dirty pages in the page cache back to disk.
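The cadence and thresholds of this background writeback are governed by the vm.dirty_* sysctls. A minimal sketch that prints the main knobs:

    #include <stdio.h>

    /* print one /proc/sys/vm knob (the value is a single decimal line) */
    static void show(const char *path)
    {
        FILE *f = fopen(path, "r");
        char line[64];
        if (f && fgets(line, sizeof(line), f))
            printf("%-45s %s", path, line);
        if (f)
            fclose(f);
    }

    int main(void)
    {
        show("/proc/sys/vm/dirty_background_ratio");    /* flushers start here */
        show("/proc/sys/vm/dirty_ratio");               /* writers get throttled here */
        show("/proc/sys/vm/dirty_writeback_centisecs"); /* periodic wakeup interval */
        show("/proc/sys/vm/dirty_expire_centisecs");    /* age at which pages are written */
        return 0;
    }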
Writeback starts in the mapping layer; after the corresponding processing, a bio is built and submitted, entering the generic block layer.
-------- mapping layer / generic block layer boundary ----------------
The generic block layer builds requests and submits them to the I/O scheduler layer.
-------- generic block layer / I/O scheduler layer boundary ----------------
The I/O scheduler layer merges and sorts the requests according to the configured I/O scheduling algorithm, then places them on the request queue of the corresponding block device, entering the block device driver layer. A common block device driver stack is SCSI, i.e. the request enters the SCSI layer.
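The scheduling algorithm is configured per block device and can be inspected from user space via sysfs. A minimal sketch, assuming a device named sda (the active scheduler is printed in brackets, e.g. "noop [deadline] cfq"):

    #include <stdio.h>

    int main(void)
    {
        /* assumes the device is sda; adjust the path for other devices */
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");
        if (!f) { perror("fopen"); return 1; }

        char line[256];
        if (fgets(line, sizeof(line), f))
            printf("%s", line);

        fclose(f);
        return 0;
    }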
-------- I/O scheduler layer / block device driver layer boundary -------------
The SCSI layer itself is split into three levels: the SCSI upper-level drivers, the SCSI mid layer, and the SCSI low-level drivers.
After a request enters the SCSI layer, it is first processed by the upper-level drivers and the mid layer, whose main job is to build SCSI commands according to the SCSI protocol; the request is finally handed to a low-level driver.
The SCSI low-level driver, e.g. mptsas, processes the requests on the request queue one by one: based on the hardware's characteristics it builds the final commands and handles the interaction with the hardware. The commands are eventually submitted to the disk hardware (strictly speaking, the firmware).
-------- SCSI layer / hardware (firmware) boundary --------------
After the hardware finishes an I/O request, it raises an interrupt to notify the SCSI low-level driver (the corresponding interrupt handler was registered during initialization).
The ISR calls the SCSI layer's scsi_done interface for the corresponding processing.
scsi_done calls blk_complete_request, defined by the upper (block) layer, which raises the corresponding softirq.
The softirq performs the remaining work: error handling, cleaning up the request, cancelling timers, waking up waiting processes, and so on.
2. Writing data into the page cache
From entering the kernel to the data landing in the page cache, the rough flow is:
ext2_file_operations->aio_write->generic_file_aio_write
  __generic_file_aio_write
    generic_file_buffered_write
      generic_perform_write
        address_space->a_ops->write_begin // prepare for the write, e.g. use get_block to map file logical block numbers to device logical block numbers
          block_write_begin
            grab_cache_page_write_begin // look the page up in the page cache; if it is not found, allocate a new page in the page cache
            __block_write_begin
              create_page_buffers // if the page has no buffer_heads attached to it yet, create them
              ext2_get_block // map file logical block numbers to device logical block numbers; the result is stored in the buffer_heads (page->buffers). When the I/O is submitted later, this mapping decides whether the whole page or only some of its blocks are read/written. (A user-space view of this mapping is sketched right after this call chain.)
              ll_rw_block // if a bh is already mapped, i.e. a disk block is associated with it, but the block is not uptodate, re-read it from disk
        iov_iter_copy_from_user_atomic // copy the data from user space into the kernel
        address_space->a_ops->write_end
          generic_write_end
            block_write_end // "submit" the I/O: nothing is actually submitted here, only flags (e.g. dirty) and state are set; the real submission is done periodically or on demand by the flusher threads
              __block_commit_write
                mark_inode_dirty
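The file-logical-block to device-logical-block mapping that ext2_get_block resolves can be observed from user space with the FIBMAP ioctl. A minimal sketch, assuming an example path ./testfile (FIBMAP normally requires root):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FIBMAP, FIGETBSZ */

    int main(int argc, char *argv[])
    {
        const char *path = argc > 1 ? argv[1] : "./testfile"; /* example path */
        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        int blocksize = 0;
        if (ioctl(fd, FIGETBSZ, &blocksize) < 0) { perror("FIGETBSZ"); return 1; }

        /* input: file logical block number; output: device block number.
         * This is the same mapping that get_block() resolves in the kernel. */
        int block = 0;
        if (ioctl(fd, FIBMAP, &block) < 0) { perror("FIBMAP (root needed?)"); return 1; }

        printf("block size %d, file block 0 -> device block %d\n", blocksize, block);
        close(fd);
        return 0;
    }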
3. Page writeback
The rough flow of writing data back from the page cache is shown below. The mechanisms that can trigger writeback include fsync, sync, the flusher threads, and balance_dirty_pages:
address_space->ext2_aops->ext2_writepages
  mpage_writepages
    write_cache_pages
      __mpage_writepage
        mpage_alloc
        mpage_bio_submit // submit by bio, the standard path
      mapping->a_ops->writepage(page, wbc) // write buffer by buffer, one block at a time; a full-page write is not required
        ext2_writepage
          block_write_full_page
            block_write_full_page_endio
              __block_write_full_page
                create_page_buffers // if the page has no buffer_heads attached to it yet, create them
                ext2_get_block // map file logical block numbers to device logical block numbers
                submit_bh // submit the I/O buffer by buffer
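Writeback of a specific file range can also be kicked explicitly from user space. A minimal sketch using the Linux-specific sync_file_range() call, assuming an example path ./testfile (note that SYNC_FILE_RANGE_WRITE only starts writeback; unlike fsync() it gives no durability guarantee):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("./testfile", O_WRONLY | O_CREAT, 0644); /* example path */
        if (fd < 0) { perror("open"); return 1; }

        const char buf[] = "hello, page cache";
        if (write(fd, buf, strlen(buf)) < 0) { perror("write"); return 1; }

        /* start writeback of the whole file (offset 0, nbytes 0 == to EOF)
         * without waiting for it to complete */
        if (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) < 0)
            perror("sync_file_range");

        close(fd);
        return 0;
    }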
4. Code walkthrough
The key functions in the write path are analyzed below:
generic_file_aio_write->__generic_file_aio_write
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
    struct file *file = iocb->ki_filp;
    struct address_space *mapping = file->f_mapping;
    size_t ocount;          /* original count */
    size_t count;           /* after file limit checks */
    struct inode *inode = mapping->host;
    loff_t pos;
    ssize_t written;
    ssize_t err;

    ocount = 0;
    err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
    if (err)
        return err;

    count = ocount;
    pos = *ppos;

    /* We can write back this queue in page reclaim */
    current->backing_dev_info = mapping->backing_dev_info;
    written = 0;

    err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
    if (err)
        goto out;

    if (count == 0)
        goto out;

    err = file_remove_suid(file);
    if (err)
        goto out;

    err = file_update_time(file);
    if (err)
        goto out;

    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    /*
     * If O_DIRECT is set, take the direct I/O path, bypassing the page
     * cache and buffers. Fixme: does it block waiting for the I/O to
     * finish? Is it guaranteed to succeed?
     */
    if (unlikely(file->f_flags & O_DIRECT)) {
        loff_t endbyte;
        ssize_t written_buffered;

        /* enter the direct I/O path */
        written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                            ppos, count, ocount);
        if (written < 0 || written == count)
            goto out;
        /*
         * direct-io write to a hole: fall through to buffered I/O
         * for completing the rest of the request.
         */
        /*
         * When the direct I/O fails, or in other exceptional cases, the
         * buffered path below is taken for the remainder, so direct I/O
         * is not guaranteed to complete fully.
         */
        pos += written;
        count -= written;
        written_buffered = generic_file_buffered_write(iocb, iov,
                                        nr_segs, pos, ppos, count,
                                        written);
        /*
         * If generic_file_buffered_write() returned a synchronous error
         * then we want to return the number of bytes which were
         * direct-written, or the error code if that was zero. Note
         * that this differs from normal direct-io semantics, which
         * will return -EFOO even if some bytes were written.
         */
        if (written_buffered < 0) {
            err = written_buffered;
            goto out;
        }

        /*
         * We need to ensure that the page cache pages are written to
         * disk and invalidated to preserve the expected O_DIRECT
         * semantics.
         */
        /* first write the dirty data back */
        endbyte = pos + written_buffered - written - 1;
        err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
        if (err == 0) {
            written = written_buffered;
            /* invalidate the mapping */
            invalidate_mapping_pages(mapping,
                                     pos >> PAGE_CACHE_SHIFT,
                                     endbyte >> PAGE_CACHE_SHIFT);
        } else {
            /*
             * We don't know how much we wrote, so just return
             * the number of bytes which were direct-written
             */
        }
    } else {
        /* buffered write path: the data is written through the page cache */
        written = generic_file_buffered_write(iocb, iov, nr_segs,
                        pos, ppos, count, written);
    }
out:
    current->backing_dev_info = NULL;
    return written ? written : err;
}
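The O_DIRECT branch above bypasses the page cache and imposes strict alignment requirements on the buffer address, file offset, and length (typically the device's logical block size). A minimal user-space sketch, assuming 4096-byte alignment is sufficient and an example path ./testfile:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* O_DIRECT makes the kernel take the generic_file_direct_write()
         * branch seen above instead of the buffered path */
        int fd = open("./testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (fd < 0) { perror("open(O_DIRECT)"); return 1; }

        /* buffer, offset and length must be aligned; 4096 covers most disks */
        void *buf;
        if (posix_memalign(&buf, 4096, 4096) != 0) {
            fprintf(stderr, "posix_memalign failed\n");
            return 1;
        }
        memset(buf, 'A', 4096);

        ssize_t n = write(fd, buf, 4096);
        if (n < 0)
            perror("write");    /* e.g. EINVAL on misalignment */
        else
            printf("wrote %zd bytes directly\n", n);

        free(buf);
        close(fd);
        return 0;
    }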
generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write
static ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
{
    struct address_space *mapping = file->f_mapping;
    const struct address_space_operations *a_ops = mapping->a_ops;
    long status = 0;
    ssize_t written = 0;
    unsigned int flags = 0;

    /*
     * Copies from kernel address space cannot fail (NFSD is a big user).
     */
    if (segment_eq(get_fs(), KERNEL_DS))
        flags |= AOP_FLAG_UNINTERRUPTIBLE;

    do {
        struct page *page;
        unsigned long offset;   /* Offset into pagecache page */
        unsigned long bytes;    /* Bytes to write to page */
        size_t copied;          /* Bytes copied from user */
        void *fsdata;

        offset = (pos & (PAGE_CACHE_SIZE - 1));
        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                     iov_iter_count(i));

again:
        /*
         * Bring in the user page that we will copy from _first_.
         * Otherwise there's a nasty deadlock on copying from the
         * same page as we're writing to, without it being marked
         * up-to-date.
         *
         * Not only is this an optimisation, but it is also required
         * to check that the address is actually valid, when atomic
         * usercopies are used, below.
         */
        if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
            status = -EFAULT;
            break;
        }
        /*
         * Prepare for the write, e.g. use get_block to map file logical
         * block numbers to device logical block numbers. write_begin is
         * filesystem-specific; for ext2 it is ext2_write_begin.
         */
        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                    &page, &fsdata);
        if (unlikely(status))
            break;

        if (mapping_writably_mapped(mapping))
            flush_dcache_page(page);

        /* disable page faults; in practice this disables preemption */
        pagefault_disable();
        /* copy the data from user space into the page */
        copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
        /* re-enable page faults / preemption */
        pagefault_enable();
        flush_dcache_page(page);

        /* mark the page as accessed */
        mark_page_accessed(page);
        /*
         * "Commit" the I/O; the ext2 implementation is ext2_write_end,
         * which performs no actual submission, it only sets the dirty
         * flag and related state.
         */
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                  page, fsdata);
        if (unlikely(status < 0))
            break;
        copied = status;

        /* reschedule if needed */
        cond_resched();

        iov_iter_advance(i, copied);
        if (unlikely(copied == 0)) {
            /*
             * If we were unable to copy any data at all, we must
             * fall back to a single segment length write.
             *
             * If we didn't fallback here, we could livelock
             * because not all segments in the iov can be copied at
             * once without a pagefault.
             */
            bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                         iov_iter_single_seg_count(i));
            goto again;
        }
        pos += copied;
        written += copied;

        /* throttle the writer and start writeback if too many pages are dirty */
        balance_dirty_pages_ratelimited(mapping);
        /*
         * Check for a pending SIGKILL and bail out with an error if
         * there is one. This was added so that a process stuck in the
         * write path can be killed; before this, the write path did not
         * handle signals, and since the process is usually in D state
         * during I/O, it could not be killed.
         */
        if (fatal_signal_pending(current)) {
            status = -EINTR;
            break;
        }
    } while (iov_iter_count(i));

    return written ? written : status;
}
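The return convention at the end (return the bytes written so far, and the error only if nothing was written) mirrors how user space should treat write(): a short write is not an error. A minimal retry-loop sketch:

    #include <errno.h>
    #include <stddef.h>
    #include <unistd.h>

    /* write all of buf, retrying on short writes and EINTR;
     * returns 0 on success, -1 (with errno set) on failure */
    int write_all(int fd, const char *buf, size_t len)
    {
        size_t done = 0;
        while (done < len) {
            ssize_t n = write(fd, buf + done, len - done);
            if (n < 0) {
                if (errno == EINTR)
                    continue;   /* interrupted before any progress: retry */
                return -1;      /* real error */
            }
            done += (size_t)n;  /* short write: keep going */
        }
        return 0;
    }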
generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write->ext2_write_begin->block_write_begin
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                unsigned flags, struct page **pagep, get_block_t *get_block)
{
    pgoff_t index = pos >> PAGE_CACHE_SHIFT;
    struct page *page;
    int status;

    /*
     * Look the page up in the page cache; if it is not found, allocate
     * a new page in the page cache.
     */
    page = grab_cache_page_write_begin(mapping, index, flags);
    if (!page)
        return -ENOMEM;

    /* at this point the page was either found in the page cache or
     * freshly allocated */
    status = __block_write_begin(page, pos, len, get_block);
    if (unlikely(status)) {
        unlock_page(page);
        page_cache_release(page);
        page = NULL;
    }

    *pagep = page;
    return status;
}
generic_file_aio_write->__generic_file_aio_write->generic_file_buffered_write->generic_perform_write->ext2_write_begin->block_write_begin->__block_write_begin
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
    unsigned from = pos & (PAGE_CACHE_SIZE - 1);
    unsigned to = from + len;
    struct inode *inode = page->mapping->host;
    unsigned block_start, block_end;
    sector_t block;
    int err = 0;
    unsigned blocksize, bbits;
    struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;

    BUG_ON(!PageLocked(page));
    BUG_ON(from > PAGE_CACHE_SIZE);
    BUG_ON(to > PAGE_CACHE_SIZE);
    BUG_ON(from > to);

    /*
     * Attach buffer_heads to the page if it has none yet; they store the
     * mapping between file logical blocks and disk logical blocks.
     */
    head = create_page_buffers(page, inode, 0);
    blocksize = head->b_size;
    bbits = block_size_bits(blocksize);

    /* convert the file position into a block number */
    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

    for (bh = head, block_start = 0; bh != head || !block_start;
         block++, block_start = block_end, bh = bh->b_this_page) {
        block_end = block_start + blocksize;
        if (block_end <= from || block_start >= to) {
            if (PageUptodate(page)) {
                if (!buffer_uptodate(bh))
                    set_buffer_uptodate(bh);
            }
            continue;
        }
        /* clear the BH_New flag left over on a freshly created bh */
        if (buffer_new(bh))
            clear_buffer_new(bh);
        /*
         * The buffer is not mapped yet, i.e. no mapping between the file
         * logical block and a disk logical block has been established.
         */
        if (!buffer_mapped(bh)) {
            WARN_ON(bh->b_size != blocksize);
            /*
             * Call the get_block interface passed in (for ext2 this is
             * ext2_get_block) to look up the mapping between the file
             * logical block and the disk logical block. get_block does
             * not read any file data from disk; it only resolves the
             * mapping (via the direct or indirect block information).
             * If the mapping does not exist yet, it is created (e.g.
             * indirect blocks are allocated; the last argument being 1
             * means "create"). The resulting mapping is stored in the bh.
             */
            err = get_block(inode, block, bh, 1);
            if (err)
                break;
            /*
             * Fixme: BH_New was cleared above, so why test it again?
             * (Because get_block sets BH_New when it has just allocated
             * the block on disk.)
             */
            if (buffer_new(bh)) {
                unmap_underlying_metadata(bh->b_bdev,
                                          bh->b_blocknr);
                if (PageUptodate(page)) {
                    clear_buffer_new(bh);
                    set_buffer_uptodate(bh);
                    mark_buffer_dirty(bh);
                    continue;
                }
                if (block_end > to || block_start < from)
                    zero_user_segments(page,
                            to, block_end,
                            block_start, from);
                continue;
            }
        }
        if (PageUptodate(page)) {
            if (!buffer_uptodate(bh))
                set_buffer_uptodate(bh);
            continue;
        }
        /*
         * The bh is mapped (a disk block is associated with it) but not
         * uptodate, and the write only covers part of the block, so the
         * block must first be read in from disk.
         */
        if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
            !buffer_unwritten(bh) &&
            (block_start < from || block_end > to)) {
            ll_rw_block(READ, 1, &bh);
            *wait_bh++ = bh;
        }
    }
    /*
     * If we issued read requests - let them complete.
     */
    while (wait_bh > wait) {
        wait_on_buffer(*--wait_bh);
        if (!buffer_uptodate(*wait_bh))
            err = -EIO;
    }
    if (unlikely(err))
        page_zero_new_buffers(page, from, to);
    return err;
}