一、基本原理
Linux块设备层使用了plug/unplug(蓄流/泄流)的机制来提升IO吞吐量。基本原理为:当IO请求提交时,不是直接提交给底层驱动,而是先将其放入一个队列中(相当于水池),待一定时机或周期后再将该队列中的请求统一下发。将请求放入队列的过程即plug(蓄流)过程,统一下发请求的过程即为unplug(泄流)过程。每个请求在队列中等待的时间不会太长,通常在ms级别。
如此设计,可以增加IO合并和排序的机会,便于提升磁盘访问效率。
二、plug
1、基本流程
从mapping层提交到块设备层的io请求为bio,bio会在块设备层进行合并,生成新的request,并经过IO调度(排序和合并)之后下发到底层。下发request时,通过请求队列的make_request_fn接口,其实质为将请求放入per task的plug队列;当队列满或在进行调度时(schedule函数中),会根据当前进程的状态将该队列中的请求flush到派发队列中,并触发unplug(具体流程后面介绍)。
per task的plug队列:新内核版本中实现的机制。IO请求提交时先链入此队列,当该队列满时(>BLK_MAX_REQUEST_COUNT),会flush到相应设备的请求队列中(request_queue)。
优点:per task维护plug队列,可以避免频繁对设备的请求队列操作导致的锁竞争,能提升效率。
2、plug基本代码流程如下:
submit_bio->
generic_make_request->
make_request->
blk_queue_bio->
list_add_tail(&req->queuelist, &plug->list);//将请求加入plug队列
三、unplug
unplug分同步unplug和异步unplug两种方式。
同步unplug,即当即调用__blk_run_queue,直接下发请求队列中的请求。
异步unplug,通过唤醒kblockd工作队列来对请求队列中的请求进行下发。
1、kblockd工作队列的初始化:
1) 分配工作队列
主要代码流程:
blk_dev_init ->
alloc_workqueue //分配工作队列
2)
初始化工作队列
blk_alloc_queue_node():
-
/*在指定node上分配请求队列*/
-
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
-
{
-
struct request_queue *q;
-
int err;
-
/*分配请求队列需要的内存,从slab中分配,并初始化为0*/
-
q = kmem_cache_alloc_node(blk_requestq_cachep,
-
gfp_mask | __GFP_ZERO, node_id);
-
if (!q)
-
return NULL;
-
-
-
if (percpu_counter_init(&q->mq_usage_counter, 0))
-
goto fail_q;
-
-
-
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
-
if (q->id < 0)
-
goto fail_c;
-
-
-
q->backing_dev_info.ra_pages =
-
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-
q->backing_dev_info.state = 0;
-
q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
-
q->backing_dev_info.name = "block";
-
q->node = node_id;
-
-
-
err = bdi_init(&q->backing_dev_info);
-
if (err)
-
goto fail_id;
-
/*设置laptop模式下的定时器*/
-
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
-
laptop_mode_timer_fn, (unsigned long) q);
-
/*
-
* 关键点:设置请求队列的超时定时器,默认超时时间为30s,当30s内IO请求未完成时,定时器到期,
-
* 进行重试或错误处理。这是IO 错误处理架构中的关键点之一,在内核老版本中(2.6.38?),该定时器
-
* 是在scsi中间层定义的,新版本中将其上移至块设备层。Fixme:为何要这样?*/
-
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
-
/*初始化各个队列*/
-
INIT_LIST_HEAD(&q->queue_head);
-
INIT_LIST_HEAD(&q->timeout_list);
-
INIT_LIST_HEAD(&q->icq_list);
-
#ifdef CONFIG_BLK_CGROUP
-
INIT_LIST_HEAD(&q->blkg_list);
-
#endif
-
INIT_LIST_HEAD(&q->flush_queue[0]);
-
INIT_LIST_HEAD(&q->flush_queue[1]);
-
INIT_LIST_HEAD(&q->flush_data_in_flight);
-
/*初始化delay_work,用于在kblockd中异步unplug请求队列*/
-
INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
-
-
-
kobject_init(&q->kobj, &blk_queue_ktype);
-
-
-
mutex_init(&q->sysfs_lock);
-
spin_lock_init(&q->__queue_lock);
-
-
-
/*
-
* By default initialize queue_lock to internal lock and driver can
-
* override it later if need be.
-
*/
-
q->queue_lock = &q->__queue_lock;
-
-
-
/*
-
* A queue starts its life with bypass turned on to avoid
-
* unnecessary bypass on/off overhead and nasty surprises during
-
* init. The initial bypass will be finished when the queue is
-
* registered by blk_register_queue().
-
*/
-
q->bypass_depth = 1;
-
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
-
-
-
init_waitqueue_head(&q->mq_freeze_wq);
-
-
-
if (blkcg_init_queue(q))
-
goto fail_id;
-
-
-
return q;
-
-
-
fail_id:
-
ida_simple_remove(&blk_queue_ida, q->id);
-
fail_c:
-
percpu_counter_destroy(&q->mq_usage_counter);
-
fail_q:
-
kmem_cache_free(blk_requestq_cachep, q);
-
return NULL;
-
}
3) kblockd工作队列的工作内容
kblockd工作队列的工作内容由blk_delay_work()函数实现,主要就是调用__blk_run_queue对请求队列进行unplug。
-
/*IO请求队列的delay_work,用于在kblockd中异步unplug请求队列*/
-
static void blk_delay_work(struct work_struct *work)
-
{
-
struct request_queue *q;
-
/*获取delay_work所在的请求队列*/
-
q = container_of(work, struct request_queue, delay_work.work);
-
spin_lock_irq(q->queue_lock);
-
/*直接run queue,最终调用request_fn对队列中的请求逐一处理*/
-
__blk_run_queue(q);
-
spin_unlock_irq(q->queue_lock);
-
}
2、unplug机制
内核中设计了两种unplug机制:
1)调度时进行unplug(异步方式)
当发生内核调度时,当前进程sleep前,先将当前task的plug列表中的请求flush到派发队列中,并进行unplug。
主要代码流程如下:
schedule->
sched_submit_work ->
blk_schedule_flush_plug()->
blk_flush_plug_list(plug, true) ->注意:这里传入的from_schedule参数为true,表示将触发异步unplug,即唤醒kblockd工作队列来进行unplug操作。后续的kblockd的唤醒周期在块设备驱动中设置,比如scsi中设置为3ms。
queue_unplugged->
blk_run_queue_async
queue_unplugged():
-
/*unplug请求队列,plug相当于蓄水,将请求放入池子(请求队列)中,unplug相当于放水,即开始调用请求队列的request_fn(scsi_request_fn)来处理请求队列中的请求,将请求提交到scsi层(块设备驱动层)*/
-
static void queue_unplugged(struct request_queue *q, unsigned int depth,
-
bool from_schedule)
-
__releases(q->queue_lock)
-
{
-
trace_block_unplug(q, depth, !from_schedule);
-
/*调用块设备驱动层提供的request_fn接口处理请求队列中的请求,分异步和同步两种情况。*/
-
if (from_schedule)
-
/*异步unplug,即通过kblockd工作队列来处理,该工作队列定期唤醒(5s),通过这种方式可以控制流量,提高吞吐量*/
-
blk_run_queue_async(q);
-
else
-
/*同步unplug,即直接调用设备驱动层提供的request_fn接口处理请求队列中的请求*/
-
__blk_run_queue(q);
-
spin_unlock(q->queue_lock);
-
}
blk_run_queue_async():
-
/*异步unplug,即通过kblockd工作队列来处理,该工作队列定期唤醒(5s),通过这种方式可以控制流量,提高吞吐量*/
-
void blk_run_queue_async(struct request_queue *q)
-
{
-
if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
-
/*唤醒kblockd相关的工作队列,进行unplug处理,注意:这里的delay传入0表示立刻唤醒,kblockd对应的处理接口为:blk_delay_work*/
-
mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
-
}
scsi_request_fn()://scsi块设备驱动的request_fn()接口,其中当scsi命令下发失败时,会重设kblockd,延迟unplug请求队列。
-
static void scsi_request_fn(struct request_queue *q)
-
{
-
...
-
/*
-
* Dispatch the command to the low-level driver.
-
*/
-
/*将scsi命令下发到底层驱动,当返回非0时,表示命令下发失败,则当前的请求队列需要被plug*/
-
rtn = scsi_dispatch_cmd(cmd);
-
spin_lock_irq(q->queue_lock);
-
/*命令下发失败,需要plug请求队列*/
-
if (rtn)
-
goto out_delay
-
...
-
out_delay:
-
if (sdev->device_busy == 0)
-
/*命令下发失败,需要延迟处理,需plug请求队列,设置3ms定时启动kblockd工作队列,进行请求队列的unplug*/
-
blk_delay_queue(q, SCSI_QUEUE_DELAY);
-
blk_delay_queue
-
/*在指定msecs时间后启动kblockd工作队列*/
-
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
-
{
-
if (likely(!blk_queue_dead(q)))
-
queue_delayed_work(kblockd_workqueue, &q->delay_work,
-
msecs_to_jiffies(msecs));
-
}
2)提交IO请求时(make_request)进行unplug
提交IO请求时(make_request),先将请求链入per task的plug队列;当该队列满时(>=BLK_MAX_REQUEST_COUNT),会先将其中的请求flush到相应设备的请求队列中(request_queue),并进行同步unplug。
主要代码流程为:
submit_bio->
generic_make_request->
make_request->
blk_queue_bio->
blk_flush_plug_list(plug, false) ->注意:这里传入的from_schedule参数为false,表示将触发同步unplug,即当即下发请求。
queue_unplugged->
__blk_run_queue //from_schedule为false时,queue_unplugged直接调用__blk_run_queue,不经过blk_run_queue_async
普通块设备的make_request接口在3.10内核版本中被设置为blk_queue_bio,相应代码分析如下:
-
/*在submit_bio中被调用,用于合并bio,并提交请求(request),请求提交到per task的plug list中*/
-
void blk_queue_bio(struct request_queue *q, struct bio *bio)
-
{
-
const bool sync = !!(bio->bi_rw & REQ_SYNC);
-
struct blk_plug *plug;
-
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
-
struct request *req;
-
unsigned int request_count = 0;
-
-
/*
-
* low level driver can indicate that it wants pages above a
-
* certain limit bounced to low memory (ie for highmem, or even
-
* ISA dma in theory)
-
*/
-
/*bounce buffer(回弹缓冲区)使用*/
-
blk_queue_bounce(q, &bio);
-
-
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-
bio_endio(bio, -EIO);
-
return;
-
}
-
-
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-
spin_lock_irq(q->queue_lock);
-
where = ELEVATOR_INSERT_FLUSH;
-
goto get_rq;
-
}
-
-
/*
-
* Check if we can merge with the plugged list before grabbing
-
* any locks.
-
*/
-
/*尝试将bio合并到request中*/
-
if (blk_attempt_plug_merge(q, bio, &request_count))
-
return;
-
-
spin_lock_irq(q->queue_lock);
-
-
el_ret = elv_merge(q, &req, bio);
-
/*向后合并*/
-
if (el_ret == ELEVATOR_BACK_MERGE) {
-
if (bio_attempt_back_merge(q, req, bio)) {
-
elv_bio_merged(q, req, bio);
-
if (!attempt_back_merge(q, req))
-
elv_merged_request(q, req, el_ret);
-
goto out_unlock;
-
}
-
/*向前合并*/
-
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
-
if (bio_attempt_front_merge(q, req, bio)) {
-
elv_bio_merged(q, req, bio);
-
if (!attempt_front_merge(q, req))
-
elv_merged_request(q, req, el_ret);
-
goto out_unlock;
-
}
-
}
-
/*不能合并,需要新建request来处理bio*/
-
get_rq:
-
/*
-
* This sync check and mask will be re-done in init_request_from_bio(),
-
* but we need to set it earlier to expose the sync flag to the
-
* rq allocator and io schedulers.
-
*/
-
rw_flags = bio_data_dir(bio);
-
/*判断是否需要sync,即直接将IO请求unplug(提交到块设备驱动层),不用等待kblockd来定期plug*/
-
if (sync)
-
rw_flags |= REQ_SYNC;
-
-
/*
-
* Grab a free request. This is might sleep but can not fail.
-
* Returns with the queue unlocked.
-
*/
-
/*从请求队列中取一个request*/
-
req = get_request(q, rw_flags, bio, GFP_NOIO);
-
if (unlikely(!req)) {
-
bio_endio(bio, -ENODEV); /* @q is dead */
-
goto out_unlock;
-
}
-
-
/*
-
* After dropping the lock and possibly sleeping here, our request
-
* may now be mergeable after it had proven unmergeable (above).
-
* We don't worry about that case for efficiency. It won't happen
-
* often, and the elevators are able to handle it.
-
*/
-
/*将bio加入新的request中*/
-
init_request_from_bio(req, bio);
-
-
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
-
req->cpu = raw_smp_processor_id();
-
-
plug = current->plug;
-
/*如果有plug,则将请求加入到plug的list中,如果没有则直接调用__blk_run_queue提交请求*/
-
if (plug) {
-
/*
-
* If this is the first request added after a plug, fire
-
* of a plug trace. If others have been added before, check
-
* if we have multiple devices in this plug. If so, make a
-
* note to sort the list before dispatch.
-
*/
-
if (list_empty(&plug->list))
-
trace_block_plug(q);
-
else {/*如果请求队列中的请求数超过了限值,则先unplug?*/
-
if (request_count >= BLK_MAX_REQUEST_COUNT) {
-
blk_flush_plug_list(plug, false);
-
trace_block_plug(q);
-
}
-
}
-
/*把请求加入到plug的list中,当plug的list满了后(>BLK_MAX_REQUEST_COUNT),会flush到相应设备的请求队列中(request_queue)*/
-
list_add_tail(&req->queuelist, &plug->list);
-
blk_account_io_start(req, true);
-
} else {
-
spin_lock_irq(q->queue_lock);
-
add_acct_request(q, req, where);
-
/*如果没有plug控制,最终调用此接口处理队列中的请求,最终会调用请求队列的request_fn接口处理请求*/
-
__blk_run_queue(q);
-
out_unlock:
-
spin_unlock_irq(q->queue_lock);
-
}
-
}
阅读(4371) | 评论(0) | 转发(0) |