全部博文(183)
分类: LINUX
2010-02-04 13:07:09
buffer cache vs page cache(page cache的演化) 在2.2x时期,page cache和buffer cache是两套cache系统,之间有同步.但是linux不保证每个版本都如此. 0)buffer head 和buffer 的free 链 从图中以及代码各个角落可以知道,buffer_head 是buffer cache的一个handler,拿到bh就可以进行io操作了,但是buffer head 也需要从内存中分配和释放 free list是 是buffer 的另外一个重要部分,那些用完的buffer, 但是( buffer_head, struct page, data buffer)的关系已经建立完成了,暂时缓存在这个链表中,只是他们已经不存在于hash表和lru队列中了,下次使用就不用在费时初始化( buffer_head, struct page, data buffer)之间的关系了。 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); static struct bh_free_head free_list[NR_SIZES]; static void __remove_from_free_list(struct buffer_head * bh, int index) //纯粹的链表操作 static void put_last_free(struct buffer_head * bh)//纯粹的链表操作 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) //初始化buffer void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) //建立buffer的数据缓冲区 对比一下unused list 分配出来的是buffer head这个东西而已,而free_list中是一个完整的buffer。 1)既然是cache,就有一个hash结构 hash的索引是(dev,block),这个dev是kdev_t,不是那个blkdev,block_device。。。kdev_t 到block device的映射以后再谈吧。值得说明的是只有加入了这个hash表的buffer才能叫做进入了buffer cache。这个部分包括hash表的hash算法,hash链表的维护(add delete 。。。), static unsigned int bh_hash_mask; static unsigned int bh_hash_shift; static struct buffer_head **hash_table; static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; #define _hashfn(dev,block) 。。。 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) static __inline__ void __hash_unlink(struct buffer_head *bh) static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size) struct buffer_head * get_hash_table(kdev_t dev, int block, int size) 2)缓存就要考虑数据老化,缓存回收的问题,所以有个lur list static struct buffer_head *lru_list[NR_LIST];static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; static int nr_buffers_type[NR_LIST]; static unsigned long size_buffers_type[NR_LIST]; 呵呵,没有进行啥包装,整个struct 多好。列一下buffer的各种lru队列。 #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* 
Buffers scheduled for write */ #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ #define BUF_PROTECTED 3 /* Ramdisk persistent storage */ #define NR_LIST 4 static void __insert_into_lru_list(struct buffer_head * bh, int blist) static void __remove_from_lru_list(struct buffer_head * bh, int blist) static void __refile_buffer(struct buffer_head *bh) void refile_buffer(struct buffer_head *bh) static __inline__ void __mark_dirty(struct buffer_head *bh) void __mark_buffer_dirty(struct buffer_head *bh) static inline void __mark_buffer_clean(struct buffer_head *bh) static inline void mark_buffer_clean(struct buffer_head * bh) static inline void __mark_buffer_protected(struct buffer_head *bh) static inline void mark_buffer_protected(struct buffer_head * bh) 这些mark函数当然是标记buffer 的各种状态,然后通过 refile_buffer在各种类型的lru队列间移动。比较简单,就算是考虑的同步和互斥 啥的也不能算作是复杂吧? 有时需要一些打包函数,将buffer head 同时加入hash 和lru队列。 static void __remove_from_queues(struct buffer_head *bh) static void __insert_into_queues(struct buffer_head *bh) __refile_buffer 中有个 remove_inode_queue(bh) 的操作值得注意一下。 /* * A buffer may need to be moved from one buffer list to another * (e.g. in case it is not shared any more). Handle this. */ static void __refile_buffer(struct buffer_head *bh) { int dispose = BUF_CLEAN; if (buffer_locked(bh)) dispose = BUF_LOCKED; if (buffer_dirty(bh)) dispose = BUF_DIRTY; if (buffer_protected(bh)) dispose = BUF_PROTECTED; if (dispose != bh->b_list) { __remove_from_lru_list(bh, bh->b_list); bh->b_list = dispose; if (dispose == BUF_CLEAN) remove_inode_queue(bh); __insert_into_lru_list(bh, dispose); } } inode 中有个inode->i_dirty_buffers 记录了这个inode中所有dirty的数据。稍后我们再分析这个dirty的数据是什么:元数据还是文件 数据。 /* The caller must have the lru_list lock before calling the remove_inode_queue functions. 
*/ static void __remove_inode_queue(struct buffer_head *bh) { bh->b_inode = NULL; list_del(&bh->b_inode_buffers); } static inline void remove_inode_queue(struct buffer_head *bh) { if (bh->b_inode) //可以看出,并不是每个buffer 都和一个inode 相对应的,只有以部分才有. __remove_inode_queue(bh); } int inode_has_buffers(struct inode *inode);//这个简单。。 到底什么buffer才有inode与之对应,等分析万buffer cache的创建就会清楚了。 我们先来看看buffer cache 的创建,藉此研究buffer cache 中的内容以及buffer cache 和系统其他几个部分之间的关系: 3)buffer cache 的创建与buffer head 的回收 实际上,有两种类型的buffer_head 存在于系统中,一种存在于buffer cache, 存在于buffer cache 中的 buffer(head)必然存在于lur list。这中类型的buffer 其唯一的分配途径就是 getblk, 然后通过bread(kdev_t dev, int block, int size)被广泛用于读取文件的元数据: struct buffer_head * getblk(kdev_t dev, int block, int size) { .... repeat: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); bh = __get_hash_table(dev, block, size); //look up in hash first if (bh) goto out; //找到就简单了 isize = BUFSIZE_INDEX(size); spin_lock(&free_list[isize].lock); bh = free_list[isize].list; //尝试在free list 中分配一个 if (bh) { __remove_from_free_list(bh, isize); atomic_set(&bh->b_count, 1); } spin_unlock(&free_list[isize].lock); /* * OK, FINALLY we know that this buffer is the only one of * its kind, we hold a reference (b_count>0), it is unlocked, * and it is clean. */ if (bh) { init_buffer(bh, NULL, NULL); bh->b_dev = dev; bh->b_blocknr = block; bh->b_state = 1 << BH_Mapped; //mapped buffer 已经有设备上的sector和之相对应 /* Insert the buffer into the regular lists */ __insert_into_queues(bh); //进入hash 和 lru队列 out: write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); touch_buffer(bh); return bh; } /* * If we block while refilling the free list, somebody may * create the buffer first ... search the hashes again. */ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); refill_freelist(size); //分配失败的话,重新分配一批buffer 进来,再试 goto repeat; } /* * We used to try various strange things. Let's not. * We'll just try to balance dirty buffers, and possibly * launder some pages. 
*/ static void refill_freelist(int size) { balance_dirty(NODEV); // buffer 的回收策略,后面分析 if (free_shortage()) //看看空闲物理页面是否足够 (以前分析过这个函数...) page_launder(GFP_BUFFER, 0); //不够的话先释放一些buffer出来 grow_buffers(size); //然后再创建buffer 到free list中 } static int grow_buffers(int size) { .... page = alloc_page(GFP_BUFFER); //分配页面 if (!page) goto out; LockPage(page); bh = create_buffers(page, size, 0); //创建buffer,看看前面的概念,就是建立三元组(buffer_head, page, data buffer) insert_point = free_list[isize].list; ....... free_list[isize].list = bh; //insert 到free list spin_unlock(&free_list[isize].lock); page->buffers = bh; page->flags &= ~(1 << PG_referenced); lru_cache_add(page); //注意这里把page加入page cache的lru队列 ...... } 为啥把page加入page cache的lru队列?目的是让page cache 帮助进行buffer head的老化回收:你注意到__put_unused_buffer_head的话,会发现只有try_to_free_buffers才会调用这个函数(brw_kiovec也调用,但是不进入"主干",大部分buffer走不到那个分支上),秘密就在这里. 算是知道为啥page cache 和buffer cache纠缠不清了,真的是一个阴阳鱼啊. static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) //略过 另外一种buffer,并不存在于buffer cache 中,仅仅作为磁盘rw的中介,主要的创建接口函数: static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) create_empty_buffers为在给定的page上建立buffer,只是没有将buffer 映射到具体的磁盘块上,就是unmapped的buffer具体的映射操作交给具体的文件系统来处理。待会看个例子。 create_empty_buffers 为各种具体文件系统的文件读写(非元数据)提供一个和磁盘驱动交互的bh序列(把这个page切割成buffer), 是文件系统和磁盘驱动交换数据的具体形式。据传,2.6系统中取消了buffer cache,仅仅保留这种类型的buffer,蜕变成一个io entry。 create_empty_buffers的调用者不少: static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) static int __block_prepare_write(struct inode *inode, struct page *page, static int __block_commit_write(struct inode *inode, struct page *page, int block_prepare_write(struct page *page, unsigned from, unsigned to,...) int generic_commit_write(struct file *file, struct page *page,...) 
int block_read_full_page(struct page *page, get_block_t *get_block) int block_write_full_page(struct page *page, get_block_t *get_block) int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 这些函数为读写文件提供了buffer支持,作为一个io entry,使文件和磁盘驱动能够结合起来。手工 trace这些函数,就会知道这些buffer没有加入buffer cache,是“真正的文件”内容而非文件的元数据。典型的例子是文件的读写: do_generic_file_read -> mapping->a_ops->readpage(filp, page);->ext2_readpage->block_read_full_page generic_file_write ->mapping->a_ops->prepare_write(file, page, offset, offset+bytes);->ext2_prepare_write-> block_prepare_write generic_file_write -> mapping->a_ops->commit_write(file, page, offset, offset+bytes) -> generic_commit_write 这里给出一个图示说明page cache, filemap,buffer cache, buffer entry(仅作io entry的buffer)的关系(也许不是100%正确!!) 马上回顾一下buffer_head的回收,就会发现,这种类型的buffer 很自然的进入page cache继而通过try_to_free_buffers 进行回收. 实在没有必要把这些函数的实现都列到这里仔细讨论了,仅以其中一个为例吧,但是在讨论前还是说一下这些函数的用途吧: 这些函数值得注意的是写文件的方式,第一种提供给具体的文件系统使用,参考generic_file_write, int block_prepare_write(struct page *page, unsigned from, unsigned to,...) int generic_commit_write(struct file *file, struct page *page,...) 我们在讨论generic_file 的读写时也涉及到这些函数。 另外一中类型的是 block_write_full_page,像是上面两个函数的打包,其实其中有不同。 我们回顾一下generic_file_write的基本操作流程: ssize_t generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) { ............ //略过 while (count) { unsigned long bytes, index, offset; char *kaddr; int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, * allocate a free page. */ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 。。。。 page = __grab_cache_page(mapping, index, &cached_page); if (!page) break; /* We have exclusive IO access to the page.. 
*/ if (!PageLocked(page)) { PAGE_BUG(page); } /*对于ext2,就是从磁盘先将文件页面读入,如果需要还要为文件分配磁盘block*/ status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); if (status) goto unlock; kaddr = page_address(page); status = copy_from_user(kaddr+offset, buf, bytes); flush_dcache_page(page); if (status) goto fail_write; /*对于ext2,就是mark所有bh为dirt,mark 对应 inode为dirty. 见 ext2_aops */ status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); .............//略过 /* For now, when the user asks for O_SYNC, we'll actually * provide O_DSYNC. */ if ((status >= 0) && (file->f_flags & O_SYNC)) status = generic_osync_inode(inode, 1); /* 1 means datasync */ } //mapping->a_ops->prepare_write -> block_prepare_write -->__block_prepare_write static int __block_prepare_write(struct inode *inode, struct page *page, unsigned from, unsigned to, get_block_t *get_block) { if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); //为page 创建 bh io entry ........ for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { ........ if (!buffer_mapped(bh)) { err = get_block(inode, block, bh, 1);//如果没有对应到磁盘上就分配一个磁盘块,ext2,就是ext2_get_block,map bh到具体设备上的block if (buffer_new(bh)) { unmap_underlying_metadata(bh); //这次我们把这个东西讨论清楚...呵呵 ..... } } if (!buffer_uptodate(bh) && (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); //read in , make it uptodate *wait_bh++=bh; } } ...... } unmap_underlying_metadata 曾经是一个很困惑的问题,这次终于能够了断了 :-) 我们曾经在linuxforum上有一个讨论,但是基本上没有说道点子上,见这个帖子: linux forum上讨论unmpa_underlaying_metadata 的讨论 这次分析到这里,没有办法,经过刻苦的寻找,终于找到了1999年关于这个问题的一些线索,其实很简单,我终于受到了启发: 这个讨论启发了我: http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html 问题的根源在于buffer 的释放问题:真正从buffer cache中消除buffer的函数是 __bforget, 然而只有(少数文件系统系统直接调用__bforget)unmap_underlying_metadata, try_to_free_buffers (page_lunder)是进入这个过程的常见入口. 
设想这样一个流程: 1) 打开 foo/xxx , 修改xxx的内容 2)rm foo 3)把xxx元数据所占用的block分配给新的文件, 现在,因为rm foo的时候我们并没有及时调用__bforget, 所以buffer cache 中还有一个alias的buffer. 至于以前讨论的,我们认为通过dd这种操作raw设备的方式所拥有的alias, 并不在unmap_underlying_metadata 考虑的范围内.本来,2.4的时候已经不负责buffer cache和page cache之间的同步了.这里的必要性不在于这个alias在buffer cache中,而在于它是dirty的,如果不clear掉,就会引起data corrupt. 2.4以后仅仅是drop掉数据就够了. /* * bforget() is like brelse(), except it puts the buffer on the * free list if it can.. We can NOT free the buffer if: * - there are other users of it * - it is locked and thus can have active IO */ void __bforget(struct buffer_head * buf) { /* grab the lru lock here to block bdflush. */ spin_lock(&lru_list_lock); write_lock(&hash_table_lock); if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) goto in_use; __hash_unlink(buf); remove_inode_queue(buf); write_unlock(&hash_table_lock); __remove_from_lru_list(buf, buf->b_list); spin_unlock(&lru_list_lock); put_last_free(buf); return; in_use: write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); } /* * We are taking a block for data and we don't want any output from any * buffer-cache aliases starting from return from that function and * until the moment when something will explicitly mark the buffer * dirty (hopefully that will not happen until we will free that block ;-) * We don't even need to mark it not-uptodate - nobody can expect * anything from a newly allocated buffer anyway. We used to used * unmap_buffer() for such invalidation, but that was wrong. We definitely * don't want to mark the alias unmapped, for example - it would confuse * anyone who might pick it with bread() afterwards... */ static void unmap_underlying_metadata(struct buffer_head * bh) { struct buffer_head *old_bh; old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); if (old_bh) { mark_buffer_clean(old_bh); wait_on_buffer(old_bh); clear_bit(BH_Req, &old_bh->b_state); /* Here we could run brelse or bforget. 
We use bforget because it will try to put the buffer in the freelist. */ __bforget(old_bh); } } //mapping->a_ops->commit_write -> block_commit_write -->__block_commit_write static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) { for(bh = head = page->buffers, block_start = 0; bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { //遍历所有的bh block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; } else { set_bit(BH_Uptodate, &bh->b_state); if (!atomic_set_buffer_dirty(bh)) { __mark_dirty(bh);//bh加入了lru队列,不代表就是加入了buffer cache.加入hash才是加入buffer cache的标志 buffer_insert_inode_queue(bh, inode); //呵呵这里证明,文件数据的bh关联一个inode, need_balance_dirty = 1; } } } ............. if (!partial) SetPageUptodate(page); //给page标记uptodate就够了, 通过后备任务,写入到磁盘(inode->i_dirty_buffers) return 0; } 写文件的时候仅仅是标记dirty,连block dev的io都没有启动,除非要求了syn,见generic_file_write (if ((status >= 0) && (file->f_flags & O_SYNC))) 这样才能速度快. 然后看看写整个磁盘文件的函数:这个函数提供给filemap的sync和page_lunder使用,所以是启动了磁盘的io操作的.不具体分析 int block_write_full_page(struct page *page, get_block_t *get_block) //对ext2,就是ext2_get_block,map bh到具体设备上的block { struct inode *inode = page->mapping->host; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; unsigned offset; int err; /* easy case */ if (page->index < end_index) return __block_write_full_page(inode, page, get_block); //可以整个页面写入的 /* things got complicated... */ offset = inode->i_size & (PAGE_CACHE_SIZE-1); /* OK, are we completely out? */ if (page->index >= end_index+1 || !offset) { UnlockPage(page); return -EIO; } /* Sigh... will have to work, then... 
*/ err = __block_prepare_write(inode, page, 0, offset, get_block); //否则得拆分开写1部分 if (!err) { memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);//clear无效部分 flush_dcache_page(page); __block_commit_write(inode,page,0,offset); //分开写和写一页,出了写了不同数量的bh,其余都类似 done: kunmap(page); UnlockPage(page); return err; } ClearPageUptodate(page); goto done; } 另外一个prepare write就是为不准有空洞的文件系统准备的: int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) //bytes 是当前这个文件的最后一个byte的位置 { ..... while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { //如果请求页超过当前最后一个byte,就要将空洞部分全部分配并填上0 status = -ENOMEM; new_page = grab_cache_page(mapping, pgpos); //分配或者查找page cache ..... zerofrom = *bytes & ~PAGE_CACHE_MASK; if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } status = __block_prepare_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE, get_block); //将中间位置的页面填 0 并写入文件 if (status) goto out_unmap; kaddr = page_address(new_page);//将中间位置的页面填 0 并写入文件 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); flush_dcache_page(new_page); __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);//将中间位置的页面填 0 并写入文件 kunmap(new_page); UnlockPage(new_page); page_cache_release(new_page); } ...... //零头处理,略 return 0; return status; } 对比下read, read的操作都是启动了磁盘io的. brw_page: 提供给swap buffer 使用. brw_kiovec: raw.c使用,以后再说吧,逻辑不复杂. 4)Buffer cache 和 Inode 的关系总结 在分析__block_commit_write 的时候, 我们知道file的数据进入了inode->i_dirty_buffers, 并且加入了buffer的lru队列,但是这不代表文件数据加入了buffer cache. 另外一个加入inode->i_dirty_buffers的方式是 static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { mark_buffer_dirty(bh); buffer_insert_inode_queue(bh, inode); } 稍微搜索一下调用者就知道, 元数据也加入了inode->i_dirty_buffers. 好就是这样. 
5)buffer cache的老化回收:lru 队列 bdflash进程是主要负责将dirty的buffer 写入磁盘的任务, 通过上面的分析我们知道无论是元数据还是文件数据,都通过bh进入lru队列。 union bdflush_param { } bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0}; 作为buffer cache,必须有buffer_head, struct page,和数据区(物理内存页面),缺一不可,并且要同时(几乎都是同时的呵呵)加入lru list 和hash表,这个我们在分析page cache (filemap.c) 的时候就见过类似的概念了。 另外文件数据只进入lru 队列,并不加入buffer cache,要时刻记住了. 我们从buflash开始吧. sys_bdflush: 配置,略. 从 __init bdflush_init(void) 知道有两个内核线程专注于回收buffers: bdflush 和 kupdate. /* * This is the actual bdflush daemon itself. It used to be started from * the syscall above, but now we launch it ourselves internally with * kernel_thread(...) directly after the first thread in init/main.c */ int bdflush(void *sem) { struct task_struct *tsk = current; int flushed; ....// 初始化,略 ....//clear signal,略 for (;;) { //主要任务: CHECK_EMERGENCY_SYNC //这玩意以后再说吧 flushed = flush_dirty_buffers(0); //flush buffers:遍历所有lru,启动磁盘io操作,仅此而已. if (free_shortage()) //如果物理页面不够了 flushed += page_launder(GFP_KERNEL, 0); //试图回收一些页面,会有更多dirty page通过bh进入buffer lru /* * If there are still a lot of dirty buffers around, * skip the sleep and flush some more. Otherwise, we * go to sleep waiting a wakeup. */ set_current_state(TASK_INTERRUPTIBLE); if (!flushed || balance_dirty_state(NODEV) < 0) {//根据dirt buffer 的数量,以及是否短缺free 页面决定是做同步flsuh,异步flush 还是不做. run_task_queue(&tq_disk); //进到这里代表再用flash,那就让进入tq_disk的队列的bh开始进行真正的io吧() //见pre fs对这个tq_disk 的分析, bh有可能停留在这里(如果没有人调用这个函数进行驱动的话) schedule(); } /* Remember to mark us as running otherwise the next schedule will block. */ __set_current_state(TASK_RUNNING); } } /* * This is the kernel update daemon. It was used to live in userspace * but since it's need to run safely we want it unkillable by mistake. 
* You don't need to change your userspace configuration since * the userspace `update` will do_exit(0) at the first sys_bdflush(). */ int kupdate(void *sem) { ....// 初始化,略 ....//clear signal,略 for (;;) { /* update interval */ interval = bdf_prm.b_un.interval; if (interval) { tsk->state = TASK_INTERRUPTIBLE; schedule_timeout(interval); //以一定的间隔运行 } else { stop_kupdate: tsk->state = TASK_STOPPED; schedule(); /* wait for SIGCONT */ } /* check for sigstop */ if (signal_pending(tsk)) { int stopped = 0; spin_lock_irq(&tsk->sigmask_lock); if (sigismember(&tsk->pending.signal, SIGSTOP)) {//收到SIGSTOP就停止运行 sigdelset(&tsk->pending.signal, SIGSTOP); stopped = 1; } recalc_sigpending(tsk); spin_unlock_irq(&tsk->sigmask_lock); if (stopped) goto stop_kupdate; } #ifdef DEBUG printk("kupdate() activated...\n"); #endif sync_old_buffers(); //结果就是以一定的见个运行这个函数 } } /* * Here we attempt to write back old buffers. We also try to flush inodes * and supers as well, since this function is essentially "update", and * otherwise there would be no way of ensuring that these quantities ever * get written back. Ideally, we would have a timestamp on the inodes * and superblocks so that we could write back only the old ones as well */ static int sync_old_buffers(void) { lock_kernel(); sync_supers(0); //回写super sync_inodes(0); //回写inode本身和 filemap的那些页面 unlock_kernel(); //回写完了就有更多的bh在lru队列了!! flush_dirty_buffers(1); //检查时戳,老到一定程度再flush,和bdflush的工作一样:启动磁盘io /* must really sync all the active I/O request to disk here */ run_task_queue(&tq_disk);//不要让bh 在磁盘调度队列中永远沉睡下去(没有timer驱动的,只有byhand调用了) return 0; } 顺便去看看tq_disk: 这是一个task queue, 但是不是所有的task queue 都会得到自动执行的. 其实本系列所覆盖的代码(kernel fs(only ext2/proc/devfs and common fs surport ) mm driver/(ide pci )) 只有extern task_queue tq_timer, tq_immediate, tq_disk; 这三个task queue, 而其中tq_disk没有像另外两个一样挂接到bottom half的处理中去. 其他接口函数: int block_sync_page(struct page *page) void wakeup_bdflush(int block) 再说一下buffer head 的回收 try_to_free_buffers 是buffer 回收和buffer head 回收的主要入口. 
不论是buffer cache 中的buffer 以及bh还是作为io entry的buffer 以及bh, 绝大多数都是通过page cache的lru队列进行回收的. 我们看到buffer cache 中的page 页面也加入了page cache的lru队列(不过仅仅是加入lru队列而已,不会在page cache 的hash队列中看到的). 另外在flash 一个page 的时候也会试图释放buffer head 见block_flushpage(用于文件的truncate). 剩余部分:sync invalidate truncate Sync: 文件系统的dirty数据是以一定的策略,定时回写的,有时需要马上把dirty数据回写到硬盘上,这就需要sync的支持了. 这里边,sync_page_buffers(struct buffer_head *bh, int wait)就是为了try_to_free_buffers用用.不太关乎这里的文件sync操作. 不妨看看sync操作的几种情形: 1) fync 和 fdatasync(int fd):希望下面的man出来的信息已经足够理解这两个操作了 fdatasync() flushes all data buffers of a file to disk (before the sys-tem call returns). It resembles fsync() but is not required to update the metadata such as access time. asmlinkage long sys_fsync(unsigned int fd) { struct file * file; struct dentry * dentry; struct inode * inode; int err; err = -EBADF; file = fget(fd); if (!file) goto out; dentry = file->f_dentry; inode = dentry->d_inode; err = -EINVAL; if (!file->f_op || !file->f_op->fsync) goto out_putf; /* We need to protect against concurrent writers.. */ down(&inode->i_sem); filemap_fdatasync(inode->i_mapping); /* int (*writepage)(struct page *) = mapping->a_ops->writepage; ie,ext2_writepage->block_write_full_page->sumit all bh to driver */ err = file->f_op->fsync(file, dentry, 0); /* 基本就是调用file_fsync,ext2是fsync_inode_buffers*/ filemap_fdatawait(inode->i_mapping); /*等待io完成*/ up(&inode->i_sem); out_putf: fput(file); out: return err; } asmlinkage long sys_fdatasync(unsigned int fd) { ............................. filemap_fdatasync(inode->i_mapping); err = file->f_op->fsync(file, dentry, 1); /*和上面相比就这里不同*/ filemap_fdatawait(inode->i_mapping); ............... } 2) dev的sync : man sync: 将所有data写入磁盘,包括super block asmlinkage long sys_sync(void) { fsync_dev(0); /*0 代表sync所有设备*/ return 0; } 3)O_SYNC :open 一个文件的时候指定以同步方式写入文件. generic_file_write->generic_osync_inode -> osync_inode_buffers(inode);或者 fsync_inode_buffers(inode) int osync_inode_buffers(struct inode *inode): 就是等待inode上的dirty buffer io完成. 
int fsync_inode_buffers(struct inode *inode): 对当前的inode上的dirtybuffer 提交bh到块驱动程序, 然后等待这些buffer io完成,最后调用 osync_inode_buffers,等待在这个过程中其他提交了写操作的buffer. 然后看看在这些接口函数后面,真正干活的吧: /* * filp may be NULL if called via the msync of a vma. */ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) { struct inode * inode = dentry->d_inode; struct super_block * sb; kdev_t dev; int ret; lock_kernel(); /* sync the inode to buffers */ write_inode_now(inode, 0); /*又做了一遍 filemap的同步,然后写入inode本身*/ /* sync the superblock to buffers */ sb = inode->i_sb; lock_super(sb); if (sb->s_op && sb->s_op->write_super) sb->s_op->write_super(sb); /*写入 super block*/ unlock_super(sb); /* .. finally sync the buffers to disk */ dev = inode->i_dev; ret = sync_buffers(dev, 1); /*上面的操作提交写操作到block dev(或者只是mark dirt),这里最后进行写入和等待*/ unlock_kernel(); return ret; } void sync_dev(kdev_t dev)和fync_dev类似,只是不等待io操作完成: int fsync_dev(kdev_t dev) { sync_buffers(dev, 0); /*先写入dirt buffer*/ lock_kernel(); sync_supers(dev); /*mark 更多 dirty bh*/ sync_inodes(dev); /*mark 更多 dirty bh,包括file map的,呵呵*/ DQUOT_SYNC(dev); unlock_kernel(); return sync_buffers(dev, 1); /*写入新mark的bh,然后等待io操作完成.*/ } 最后static int sync_buffers(kdev_t dev, int wait) 虽然不短,但是也是比较好理解的.看看他分三趟写入bh的方式就可以了吧? /* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) write out all dirty buffers, waiting if locked; * 2) wait for completion by waiting for all buffers to unlock. */ invalidate:在unmount 文件系统,删除一个文件,或者发生disk change等 状况的时候,我们需要将文件或这设备上所有数据丢弃,这时需要的是invalidate. 对于文件,invalidate_inode_buffers 只是将 inode 的dirty buffer 和这个inode脱离关系,对dirty的buffer不做任何处理.(这些buffer 既含有meta数据又有文件数据),从这里看过去就知道 unmap_underlying_metadata 的重要之处了. 对于一个设备的invalidate操作分成两种,一种需要保留dity的buffer,一种干脆丢弃所有dirty的buffer:__invalidate_buffers #define invalidate_buffers(dev) __invalidate_buffers((dev), 0) #define destroy_buffers(dev) __invalidate_buffers((dev), 1) detroy的时候吧 dirty buffer统统从bufffer cache摘除,然后放到buffer 的free链表中去. 
而invalidate 则仅仅减少引用计数,当然clean buffer在两种操作之中都会放到free list中去. 一般进行invalidate的时候都先进行了sync操作.... truncate: 截断一个文件. 对截断的部分进行flush操作. 本模块提供flush的支持:最终的操作都要归结与buffer的操作,page cache以buffer作为io entry,而元数据则是直接用buffer cache了。有两个接口函数用于buffer 的flush操作: int block_flushpage(struct page *page, unsigned long offset) : 将page上的buffer 进行unmap并将bh标记为clean如果是整个页面都被flush,还尝试释放buffer(try_to_free_buffers: 把buffer彻底释放,包括buffer head也释放掉,这是buffer head的另一个释放途经,也是通过page来的。对这个函数要说明一点,看下面的图:含有offset的那个buffer是没有动过的。注意到这点有助 于理解block_truncate_page。 cur_offset / | +--------\--------+--------+--------+ | | | | | | | | | | +--------+---/----+--------+--------+ / | \ | bh | | | offset 如果你曾看过这个函数,就会发现一个奇怪的地方,ext2_truncate-> block_truncate_page,是从vmtruncate调用下来的: void vmtruncate(struct inode * inode, loff_t offset) { unsigned long partial, pgoff; struct address_space *mapping = inode->i_mapping; unsigned long limit; if (inode->i_size < offset) goto do_expand; /*对于文件扩展不影响对此文件的maping*/ inode->i_size = offset; /* 清除page cache中相关的缓冲数据 */ truncate_inode_pages(mapping, offset); //->truncate_list_pages -》truncate_partial(full)_page已经将 //每个page进行了block_flushpage spin_lock(&mapping->i_shared_lock); /*检查是否存在此文件的mapping*/ if (!mapping->i_mmap && !mapping->i_mmap_shared) goto out_unlock; pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1); /*truncate maping*/ if (mapping->i_mmap != NULL) vmtruncate_list(mapping->i_mmap, pgoff, partial); if (mapping->i_mmap_shared != NULL) vmtruncate_list(mapping->i_mmap_shared, pgoff, partial); out_unlock: spin_unlock(&mapping->i_shared_lock); /* this should go into ->truncate */ inode->i_size = offset; /*最后对文件进行truncate,ext2 参考ext2_truncate */ if (inode->i_op && inode->i_op->truncate) //这里的操作意义何在? 
inode->i_op->truncate(inode); return; do_expand: limit = current->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY) { if (inode->i_size >= limit) { send_sig(SIGXFSZ, current, 0); goto out; } if (offset > limit) { send_sig(SIGXFSZ, current, 0); offset = limit; } } inode->i_size = offset; if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); out: return; } 我们看看这个函数: int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { .....//略去 blocksize = inode->i_sb->s_blocksize; length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ if (!length) return 0; length = blocksize - length; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); page = grab_cache_page(mapping, index); err = PTR_ERR(page); if (IS_ERR(page)) goto out; if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); /* Find the buffer that contains "offset" */ /*这里是一个关键的操作,寻找包含offset的那个bh*/ /*通过对block_flushpage的分析知道,block flushpage是没有动过包含offset的那个bh的*/ bh = page->buffers; pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } /*对于包含offset的那个bh,使其uptodate*/ err = 0; if (!buffer_mapped(bh)) { /* Hole? Nothing to do */ if (buffer_uptodate(bh)) goto unlock; get_block(inode, iblock, bh, 0); /* Still unmapped? Nothing to do */ if (!buffer_mapped(bh)) goto unlock; } /* Ok, it's mapped. Make sure it's up-to-date */ if (Page_Uptodate(page)) set_bit(BH_Uptodate, &bh->b_state); if (!buffer_uptodate(bh)) { err = -EIO; ll_rw_block(READ, 1, &bh); /*为了使其uptodate,必要时需要读入那个bh*/ wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; } memset(kmap(page) + offset, 0, length); flush_dcache_page(page); kunmap(page); __mark_buffer_dirty(bh); err = 0; unlock: UnlockPage(page); page_cache_release(page); out: return err; } 原来也就是尽力保持truncate后那个bh能够uptodate,并且clear掉那半个bh而已. 
但是并不是每个文件系统都会在inode->truncate里进行这个操作的,也只有ext2和minix文件系统有这个操作 (linux2.6: sysv udf xfs也有此操作)(fixme). 想来其他文件系统也需要zero或者对这半个bh进行操作的,ext3文件系统中ext3_block_truncate_page就进行了类似操作. 而fat_truncate中则是根本没有考虑这个问题. 带着这些问题看看2.6的实现,列表于此: File.c (fs\affs): .truncate = affs_truncate, :采用prepare_write和commit_write来zero这包含offset的bh res = mapping->a_ops->prepare_write(NULL, page, size, size); if (!res) res = mapping->a_ops->commit_write(NULL, page, size, size); File.c (fs\ext2): .truncate = ext2_truncate, :采用 block_truncate_page File.c (fs\ext3): .truncate = ext3_truncate, :自己有自己的实现,但是zero,make uptodate mark dirt都有的 File.c (fs\fat): .truncate = fat_truncate, 看似没有zero mark dirty的操作,但是不知道fat_free里有没有处理 File.c (fs\hpfs): .truncate = hpfs_truncate, 和fat一样hpfs_truncate_btree里不知有没有做,想来是做了.???(fix me) File.c (fs\jfs): .truncate = jfs_truncate, -->nobh_truncate_page中做了. File.c (fs\minix): .truncate = minix_truncate, 采用 block_truncate_page File.c (fs\ntfs): .truncate = ntfs_truncate_vfs, 没看,但是ntfs_truncate中应该有???(fix me) File.c (fs\qnx4): .truncate = qnx4_truncate, 只有mark dirt操作,呵呵 File.c (fs\reiserfs): .truncate = reiserfs_vfs_truncate_file, 相当复杂,猜着他做了,太多了,不能一一研究了 File.c (fs\sysv): .truncate = sysv_truncate, 采用 block_truncate_page File.c (fs\udf): .truncate = udf_truncate, 采用 block_truncate_page 或者自己做 File.c (fs\ufs): .truncate = ufs_truncate, 看样子做了 Inode.c (fs\hfs): .truncate = hfs_file_truncate, :采用prepare_write和commit_write来zero这包含offset的bh Inode.c (fs\hfsplus): .truncate = hfsplus_file_truncate, :采用prepare_write和commit_write来zero这包含offset的bh Proc.c (fs\smbfs): .truncate = smb_proc_trunc32, 未看,猜吧,哪位出来说说. 
Proc.c (fs\smbfs): .truncate = smb_proc_trunc32, Proc.c (fs\smbfs): .truncate = smb_proc_trunc95, Proc.c (fs\smbfs): .truncate = smb_proc_trunc64, Proc.c (fs\smbfs): .truncate = smb_proc_trunc64, Shmem.c (mm): .truncate = shmem_truncate, 特殊的文件系统,必有特殊处理,没看,呵呵 Shmem.c (mm): .truncate = shmem_truncate, Xfs_iops.c (fs\xfs\linux-2.6): .truncate = linvfs_truncate, 采用 block_truncate_page 从以上系统看来,除了 qnx4,其余几乎都是做了类似的工作的.顺便说, .truncate操作主要作用应该是把inode mark为dirty,顺便更新访问时间啥的. 最后还是有些函数提一提 void __wait_on_buffer(struct buffer_head * bh) :就是写到这里,不做啥分析了. static void end_buffer_io_async(struct buffer_head * bh, int uptodate) void set_blocksize(kdev_t dev, int size) :呵呵怎么放到这个文件啊,倒是要clear所有的lru队列中的bh. int block_symlink(struct inode *inode, const char *symname, int len):创建符号链接的时候,需要将page剩余部分zero然后映射页面剩余部分.... int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block):就是调用get_block,绕了些. |