writeback机制源码分析-raykwok1150-ChinaUnix博客

raykwok1150的ChinaUnix博客

首页　| 　博文目录　| 　关于我

raykwok1150

博客访问： 279845
博文数量： 74
博客积分： 0
博客等级：民兵
技术积分： 668
用户组：普通用户
注册时间： 2013-04-09 17:34

文章分类

全部博文（74）

git（1）
study（4）
内存（2）
eMMC/SD（7）
工作环境（11）
crypt（2）
Android（28）
Linux（19）
未分配的博文（0）

文章存档

2017年（1）

2016年（13）

2015年（24）

2014年（9）

2013年（27）

我的朋友

sudo_mei

相关博文

writeback机制源码分析

分类： LINUX

2013-11-11 09:42:39

writeback相关数据结构

与writeback相关的数据结构主要有：

1，backing_dev_info，该数据结构描述了backing_dev的所有信息，通常块设备的request queue中会包含backing_dev对象。

2，bdi_writeback，该数据结构封装了writeback的内核线程以及需要操作的inode队列。

3，wb_writeback_work，该数据结构封装了writeback的工作任务。

各数据结构之间的关系如下图所示：

下面对各个数据结构做简要介绍。

bdi information

bdi对象在块设备添加的时候需要注册到系统的bdi队列中。对于ext3而言，在mount的时候需要将底层块设备的bdi对象联系到ext3 root_inode中。bdi对象数据结构定义如下：

	

	
		
		
			/*
			 * Describes one backing device (bdi): readahead and capability
			 * settings, write-bandwidth / dirty-ratelimit bookkeeping, the embedded
			 * writeback object, and the list of writeback work items queued for it.
			 */
			struct backing_dev_info {
			    /* linkage on the global bdi_list that bdi_forker_thread() walks */
			    struct list_head bdi_list;
			    unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
			    unsigned long state;    /* Always use atomic bitops on this */
			    unsigned int capabilities; /* Device capabilities */
			    congested_fn *congested_fn; /* Function pointer if device is md/dm */
			    void *congested_data;   /* Pointer to aux data for congested func */

			    /* printed in the "is not registered" warning of bdi_forker_thread() */
			    char *name;

			    struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

			    unsigned long bw_time_stamp;    /* last time write bw is updated */
			    /* NOTE(review): presumably pages dirtied at bw_time_stamp, by analogy
			     * with written_stamp below -- confirm against the bandwidth code. */
			    unsigned long dirtied_stamp;
			    unsigned long written_stamp;    /* pages written at bw_time_stamp */
			    unsigned long write_bandwidth;  /* the estimated write bandwidth */
			    unsigned long avg_write_bandwidth; /* further smoothed write bw */

			    /*
			     * The base dirty throttle rate, re-calculated on every 200ms.
			     * All the bdi tasks' dirty rate will be curbed under it.
			     * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
			     * in small steps and is much more smooth/stable than the latter.
			     */
			    unsigned long dirty_ratelimit;
			    unsigned long balanced_dirty_ratelimit;

			    struct prop_local_percpu completions;
			    int dirty_exceeded;

			    unsigned int min_ratio;
			    unsigned int max_ratio, max_prop_frac;

			    struct bdi_writeback wb;  /* default writeback info for this bdi (the writeback object) */
			    spinlock_t wb_lock;   /* protects work_list */

			    /* list of pending work items (struct wb_writeback_work, linked via ->list) */
			    struct list_head work_list;

			    /* dev_name(dev) supplies the suffix of the "flush-%s" thread name */
			    struct device *dev;
			    /* timer used in laptop mode */
			    struct timer_list laptop_mode_wb_timer;

			#ifdef CONFIG_DEBUG_FS
			    struct dentry *debug_dir;
			    struct dentry *debug_stats;
			#endif
			};

在bdi数据结构中定义了一个writeback对象，该对象是对writeback内核线程的描述，并且封装了需要处理的inode队列。在bdi数据结构中有一条work_list，该work队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理，那么writeback内核线程将会睡眠等待。

writeback

writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache中某个inode的dirty page（这些page挂在该inode对应的radix tree上）需要刷新时，可以将该inode挂载到writeback对象的b_dirty队列上，然后唤醒writeback线程。在处理过程中，inode会被移到b_io队列上进行处理。采用多条链表的方式可以减少多线程之间对共享资源的竞争。writeback数据结构具体定义如下：

	

	
		
		
			/*
			 * Writeback context of a bdi: the kernel thread that performs the
			 * writeback plus the inode lists that thread works through.
			 */
			struct bdi_writeback {
			    struct backing_dev_info *bdi;   /* our parent bdi */
			    unsigned int nr;

			    unsigned long last_old_flush;   /* last old data flush */
			    /* compared against jiffies by bdi_forker_thread() to reap idle threads */
			    unsigned long last_active;  /* last time bdi thread was active */

			    /*
			     * NULL while no thread exists; bdi_forker_thread() sets it after a
			     * successful kthread_create() and clears it before kthread_stop().
			     */
			    struct task_struct *task;   /* writeback thread */
			    struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
			    struct list_head b_dirty;   /* dirty inodes */
			    struct list_head b_io;      /* parked for writeback */
			    struct list_head b_more_io; /* parked for more writeback */
			    spinlock_t list_lock;       /* protects the b_* lists */
			};

writeback work

wb_writeback_work数据结构是对writeback任务的封装，不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是wb_writeback_work。如果bdi的work_list队列为空，那么内核线程就可以睡眠了。wb_writeback_work的数据结构定义如下：

	

	
		
		
			/*
			 * One unit of writeback work handed to a writeback thread. Items are
			 * queued on bdi->work_list and consumed by wb_do_writeback().
			 */
			struct wb_writeback_work {
			    long nr_pages;
			    struct super_block *sb; /* superblock object */
			    unsigned long *older_than_this;
			    /* wb_do_writeback() forces this to WB_SYNC_ALL when force_wait is set */
			    enum writeback_sync_modes sync_mode;
			    unsigned int tagged_writepages:1;
			    unsigned int for_kupdate:1;
			    unsigned int range_cyclic:1;
			    unsigned int for_background:1;
			    enum wb_reason reason;      /* why was writeback initiated? */

			    struct list_head list;      /* pending work list; linked into bdi->work_list */
			    /*
			     * When non-NULL, wb_do_writeback() calls complete() on it and does
			     * not free the item; when NULL the item is kfree()d instead.
			     */
			    struct completion *done;    /* set if the caller waits; signalled when the work finishes */
			};

writeback主要函数分析

writeback机制的主要函数包括如下两个方面：

1，管理bdi对象并且fork相应的writeback内核线程处理cache数据的刷新工作。

2，writeback内核线程处理函数，实现dirty page的刷新操作。

writeback线程管理

Linux中有一个内核守护线程，该线程用来管理系统bdi队列，并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候，bdi_forker_thread程序会为其分配线程资源；当一个writeback线程长时间处于空闲状态时，bdi_forker_thread程序会释放该线程资源。

writeback线程管理程序分析如下：

	

	
		
		
			/*
			 * bdi_forker_thread - thread body that creates and reaps per-bdi
			 * writeback threads.
			 * @ptr: the struct bdi_writeback this thread runs for (`me`); the
			 *       in-code comments refer to me->bdi as the default bdi.
			 *
			 * Every pass of the endless loop:
			 *   1. writes back anything pending on me itself;
			 *   2. walks bdi_list under bdi_lock and selects at most one bdi that
			 *      needs a thread forked or an idle thread killed;
			 *   3. after dropping bdi_lock, performs that action, or sleeps when
			 *      there is nothing to do.
			 *
			 * The for (;;) loop has no exit path, so the trailing "return 0" is
			 * never reached.
			 */
			static int bdi_forker_thread(void *ptr)
			{
			    struct bdi_writeback *me = ptr;

			    current->flags |= PF_SWAPWRITE;
			    set_freezable();

			    /*
			     * Our parent may run at a different priority, just set us to normal
			     */
			    set_user_nice(current, 0);

			    for (;;) {
			        /* thread to stop (KILL_THREAD) or the one just created (FORK_THREAD) */
			        struct task_struct *task = NULL;
			        /* after the scan loop breaks, this is the bdi the action applies to */
			        struct backing_dev_info *bdi;
			        enum {
			            NO_ACTION,   /* Nothing to do */
			            FORK_THREAD, /* Fork bdi thread */
			            KILL_THREAD, /* Kill inactive bdi thread */
			        } action = NO_ACTION;

			        /*
			         * Temporary measure, we want to make sure we don't see
			         * dirty data on the default backing_dev_info
			         */
			        if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
			            del_timer(&me->wakeup_timer);
			            wb_do_writeback(me, 0);
			        }

			        spin_lock_bh(&bdi_lock);
			        /*
			         * In the following loop we are going to check whether we have
			         * some work to do without any synchronization with tasks
			         * waking us up to do work for them. Set the task state here
			         * so that we don't miss wakeups after verifying conditions.
			         */
			        set_current_state(TASK_INTERRUPTIBLE);
			        /*
			         * Walk every bdi on bdi_list and check whether it has dirty
			         * data; a bdi with dirty data but no thread needs one forked
			         * so that writeback can be performed for it.
			         */
			        list_for_each_entry(bdi, &bdi_list, bdi_list) {
			            bool have_dirty_io;

			            /* skip bdis filtered out by these two capability checks */
			            if (!bdi_cap_writeback_dirty(bdi) ||
			                 bdi_cap_flush_forker(bdi))
			                continue;

			            WARN(!test_bit(BDI_registered, &bdi->state),
			                 "bdi %p/%s is not registered!\n", bdi, bdi->name);
			            /* is there queued work or dirty data on this bdi? */
			            have_dirty_io = !list_empty(&bdi->work_list) ||
			                    wb_has_dirty_io(&bdi->wb);

			            /*
			             * If the bdi has work to do, but the thread does not
			             * exist - create it.
			             */
			            if (!bdi->wb.task && have_dirty_io) {
			                /*
			                 * Set the pending bit - if someone will try to
			                 * unregister this bdi - it'll wait on this bit.
			                 */
			                /* dirty data and no thread: request a FORK, done after the loop */
			                set_bit(BDI_pending, &bdi->state);
			                action = FORK_THREAD;
			                break;
			            }

			            spin_lock(&bdi->wb_lock);

			            /*
			             * If there is no work to do and the bdi thread was
			             * inactive long enough - kill it. The wb_lock is taken
			             * to make sure no-one adds more work to this bdi and
			             * wakes the bdi thread up.
			             */
			            /*
			             * A bdi that has had no dirty data for a long time gets its
			             * writeback thread killed. wb.task is detached here, under
			             * wb_lock; the thread itself is stopped after the loop.
			             */
			            if (bdi->wb.task && !have_dirty_io &&
			                time_after(jiffies, bdi->wb.last_active +
			                        bdi_longest_inactive())) {
			                task = bdi->wb.task;
			                bdi->wb.task = NULL;
			                spin_unlock(&bdi->wb_lock);
			                set_bit(BDI_pending, &bdi->state);
			                action = KILL_THREAD;
			                break;
			            }
			            spin_unlock(&bdi->wb_lock);
			        }
			        spin_unlock_bh(&bdi_lock);

			        /* Keep working if default bdi still has things to do */
			        if (!list_empty(&me->bdi->work_list))
			            __set_current_state(TASK_RUNNING);
			        /* carry out the FORK or KILL selected above, outside bdi_lock */
			        switch (action) {
			        case FORK_THREAD:
			            /*
			             * Create a thread running bdi_writeback_thread, named "flush-"
			             * followed by dev_name(bdi->dev) (the article describes the
			             * result as flush-major:minor).
			             */
			            __set_current_state(TASK_RUNNING);
			            task = kthread_create(bdi_writeback_thread, &bdi->wb,
			                          "flush-%s", dev_name(bdi->dev));
			            if (IS_ERR(task)) {
			                /*
			                 * If thread creation fails, force writeout of
			                 * the bdi from the thread. Hopefully 1024 is
			                 * large enough for efficient IO.
			                 */
			                writeback_inodes_wb(&bdi->wb, 1024,
			                            WB_REASON_FORKER_THREAD);
			            } else {
			                /*
			                 * The spinlock makes sure we do not lose
			                 * wake-ups when racing with 'bdi_queue_work()'.
			                 * And as soon as the bdi thread is visible, we
			                 * can start it.
			                 */
			                spin_lock_bh(&bdi->wb_lock);
			                bdi->wb.task = task;
			                spin_unlock_bh(&bdi->wb_lock);
			                wake_up_process(task);
			            }
			            /* pairs with the set_bit(BDI_pending, ...) done during the scan */
			            bdi_clear_pending(bdi);
			            break;

			        case KILL_THREAD:
			            /* stop the idle thread that was detached from bdi->wb.task above */
			            __set_current_state(TASK_RUNNING);
			            kthread_stop(task);
			            bdi_clear_pending(bdi);
			            break;

			        case NO_ACTION:
			            /* nothing to act on: put this thread to sleep for a while */
			            if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
			                /*
			                 * There are no dirty data. The only thing we
			                 * should now care about is checking for
			                 * inactive bdi threads and killing them. Thus,
			                 * let's sleep for longer time, save energy and
			                 * be friendly for battery-driven devices.
			                 */
			                schedule_timeout(bdi_longest_inactive());
			            else
			                /* NOTE(review): the "* 10" suggests dirty_writeback_interval
			                 * is in centiseconds -- confirm against its definition. */
			                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
			            try_to_freeze();
			            break;
			        }
			    }

			    return 0;
			}

writeback线程

writeback线程是由bdi_forker_thread创建的，该线程的任务就是处理等待中的数据回刷任务。线程处理函数为bdi_writeback_thread，它会调用wb_do_writeback函数完成具体操作，该函数分析如下：

	

	
		
		
			long wb_do_writeback(struct bdi_writeback *wb, int force_wait)   
		

		
			{   
		

		
			    struct backing_dev_info *bdi = wb->bdi;   
		

		
			    struct wb_writeback_work *work;   
		

		
			    long wrote = 0;   
		

		
			  
		

		
			    set_bit(BDI_writeback_running, &wb->bdi->state);   
		

		
			    /* 处理等待的work，所有等待work pengding在bdi->work_list上 */   
		

		
			    while ((work = get_next_work_item(bdi)) != NULL) {   
		

		
			        /*   
		

		
			         * Override sync mode, in case we must wait for completion   
		

		
			         * because this thread is exiting now.   
		

		
			         */   
		

		
			        if (force_wait)   
		

		
			            work->sync_mode = WB_SYNC_ALL;   
		

		
			  
		

		
			        trace_writeback_exec(bdi, work);   
		

		
			        /* 调用wb_writeback函数处理相应的inode */   
		

		
			        wrote += wb_writeback(wb, work);   
		

		
			  
		

		
			        /*   
		

		
			         * Notify the caller of completion if this is a synchronous   
		

		
			         * work item, otherwise just free it.   
		

		
			         */   
		

		
			        /* 通知上层软件，相应的work已经完成 */   
		

		
			        if (work->done)   
		

		
			            complete(work->done);   
		

		
			        else   
		

		
			            kfree(work);   
		

		
			    }   
		

		
			  
		

		
			    /*   
		

		
			     * Check for periodic writeback, kupdated() style   
		

		
			     */   
		

		
			    /* 处理周期性的dirty page刷新作业，buffer cache就会走这条路径，在下面的函数中会创建work，并且调用wb_writeback函数进行处理 */   
		

		
			    wrote += wb_check_old_data_flush(wb);   
		

		
			    wrote += wb_check_background_flush(wb);   
		

		
			    clear_bit(BDI_writeback_running, &wb->bdi->state);   
		

		
			  
		

		
			    return wrote;   
		

		
			}

小结

本文在linux-3.2的基础上对writeback代码进行了浏览。整体上来讲，writeback机制是比较简单的，其核心是通过一个常驻内核线程为bdi对象分配writeback线程，实现对cache中dirty page的数据回刷。

本文出自 “存储之道” 博客，请务必保留此出处http://alanwu.blog.51cto.com/3652632/1110046

阅读(938) | 评论(0) | 转发(0) |

上一篇： linux下proc里关于磁盘性能的参数

下一篇：linux 文件写入流程（从vfs层，通用block层到 scsi磁盘驱动

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6