
1. Data structures
/*
* each queue has an elevator_queue associated with it
*/
struct elevator_queue
{
   struct elevator_ops *ops;
   void *elevator_data;
   struct kobject kobj;
   struct elevator_type *elevator_type;
   struct mutex sysfs_lock;
   struct hlist_head *hash;
};


The operations provided by an I/O scheduling algorithm:
struct elevator_ops
{
   elevator_merge_fn *elevator_merge_fn;
   elevator_merged_fn *elevator_merged_fn;
   elevator_merge_req_fn *elevator_merge_req_fn;
   elevator_allow_merge_fn *elevator_allow_merge_fn;

   elevator_dispatch_fn *elevator_dispatch_fn;
   elevator_add_req_fn *elevator_add_req_fn;
   elevator_activate_req_fn *elevator_activate_req_fn;
   elevator_deactivate_req_fn *elevator_deactivate_req_fn;

   elevator_queue_empty_fn *elevator_queue_empty_fn;
   elevator_completed_req_fn *elevator_completed_req_fn;

   elevator_request_list_fn *elevator_former_req_fn;
   elevator_request_list_fn *elevator_latter_req_fn;

   elevator_set_req_fn *elevator_set_req_fn;
   elevator_put_req_fn *elevator_put_req_fn;

   elevator_may_queue_fn *elevator_may_queue_fn;

   elevator_init_fn *elevator_init_fn;
   elevator_exit_fn *elevator_exit_fn;
   void (*trim)(struct io_context *);
};


//the deadline scheduler implements the following hooks
static struct elevator_type iosched_deadline = {
   .ops = {
       .elevator_merge_fn =        deadline_merge,
       .elevator_merged_fn =       deadline_merged_request,
       .elevator_merge_req_fn =   deadline_merged_requests,
       .elevator_dispatch_fn =       deadline_dispatch_requests,
       .elevator_add_req_fn =       deadline_add_request,
       .elevator_queue_empty_fn =   deadline_queue_empty,
       .elevator_former_req_fn =   elv_rb_former_request,
       .elevator_latter_req_fn =   elv_rb_latter_request,
       .elevator_init_fn =       deadline_init_queue,
       .elevator_exit_fn =       deadline_exit_queue,
   },

   .elevator_attrs = deadline_attrs,
   .elevator_name = "deadline",
   .elevator_owner = THIS_MODULE,
};
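For context, this elevator_type is hooked into the block layer when the scheduler module loads. A minimal sketch of the registration path (the exact return type of elv_register() varies between 2.6 kernels, so this is illustrative rather than a verbatim copy):

static int __init deadline_init(void)
{
    /* make "deadline" selectable via the elevator= boot parameter
     * or /sys/block/<dev>/queue/scheduler */
    elv_register(&iosched_deadline);
    return 0;
}

static void __exit deadline_exit(void)
{
    elv_unregister(&iosched_deadline);
}

module_init(deadline_init);
module_exit(deadline_exit);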


2. Overall flow: __make_request()
static int __make_request(request_queue_t *q, struct bio *bio)
{
   struct request *req;
   int el_ret, nr_sectors, barrier, err;
   const unsigned short prio = bio_prio(bio);
   const int sync = bio_sync(bio);
   int rw_flags;


   //#define bio_sectors(bio)   ((bio)->bi_size >> 9)
   nr_sectors = bio_sectors(bio); //number of sectors in this bio


   /*
   * low level driver can indicate that it wants pages above a
   * certain limit bounced to low memory (ie for highmem, or even
   * ISA dma in theory)
   */
   blk_queue_bounce(q, &bio);

   barrier = bio_barrier(bio);
   if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
       err = -EOPNOTSUPP;
       goto end_io;
   }

   spin_lock_irq(q->queue_lock);

   if (unlikely(barrier) || elv_queue_empty(q))
       goto get_rq;
  
   //can this bio be merged into an existing request?
   el_ret = elv_merge(q, &req, bio);

   switch (el_ret) { //two merge styles: back merge and front merge

       case ELEVATOR_BACK_MERGE: //back merge: append the bio at the tail of req
           BUG_ON(!rq_mergeable(req));

           if (!ll_back_merge_fn(q, req, bio))
               break;

           blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

           req->biotail->bi_next = bio;
           req->biotail = bio;
           req->nr_sectors = req->hard_nr_sectors += nr_sectors;
           req->ioprio = ioprio_best(req->ioprio, prio);
           drive_stat_acct(req, nr_sectors, 0);
           if (!attempt_back_merge(q, req))
               elv_merged_request(q, req, el_ret);
           goto out;

       case ELEVATOR_FRONT_MERGE: //front merge: link the bio at the head of req
           BUG_ON(!rq_mergeable(req));

           if (!ll_front_merge_fn(q, req, bio))
               break;

           blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

           bio->bi_next = req->bio;
           req->bio = bio;

           /*
           * may not be valid. if the low level driver said
           * it didn't need a bounce buffer then it better
           * not touch req->buffer either...
           */
           req->buffer = bio_data(bio);
           req->current_nr_sectors = bio_cur_sectors(bio);
           req->hard_cur_sectors = req->current_nr_sectors;
           req->sector = req->hard_sector = bio->bi_sector;
           req->nr_sectors = req->hard_nr_sectors += nr_sectors;
           req->ioprio = ioprio_best(req->ioprio, prio);
           drive_stat_acct(req, nr_sectors, 0);
           if (!attempt_front_merge(q, req))
               elv_merged_request(q, req, el_ret);
           goto out;

       /* ELV_NO_MERGE: elevator says don't/can't merge. */
       default:
           ;
   }

//no merge was possible: allocate a new request
get_rq:
   /*
   * This sync check and mask will be re-done in init_request_from_bio(),
   * but we need to set it earlier to expose the sync flag to the
   * rq allocator and io schedulers.
   */
   rw_flags = bio_data_dir(bio);
   if (sync)
       rw_flags |= REQ_RW_SYNC;

   /*
   * Grab a free request. This is might sleep but can not fail.
   * Returns with the queue unlocked.
   */
   req = get_request_wait(q, rw_flags, bio);

   /*
   * After dropping the lock and possibly sleeping here, our request
   * may now be mergeable after it had proven unmergeable (above).
   * We don't worry about that case for efficiency. It won't happen
   * often, and the elevators are able to handle it.
   */
   init_request_from_bio(req, bio);

   spin_lock_irq(q->queue_lock);
   if (elv_queue_empty(q))
       blk_plug_device(q);
   add_request(q, req);
out:
   if (sync)
       __generic_unplug_device(q);

   spin_unlock_irq(q->queue_lock);
   return 0;

end_io:
   bio_endio(bio, nr_sectors << 9, err);
   return 0;
}


3. Deciding whether a merge is possible
//decide whether a new bio can be merged into an existing request
int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
   elevator_t *e = q->elevator;
   struct request *__rq;
   int ret;

   /*
   * First try one-hit cache.
   */
   if (q->last_merge) { //check the most recently merged request first (one-hit cache)
       ret = elv_try_merge(q->last_merge, bio);
       if (ret != ELEVATOR_NO_MERGE) {
           *req = q->last_merge; //found
           return ret;
       }
   }

   /*
   * See if our hash lookup can find a potential backmerge.
   */
   //nothing found above: look in the hash table for a back-merge candidate
   __rq = elv_rqhash_find(q, bio->bi_sector);
   if (__rq && elv_rq_merge_ok(__rq, bio)) {
       *req = __rq; //found
       return ELEVATOR_BACK_MERGE;
   }

   //still nothing: let the specific I/O scheduler decide
   if (e->ops->elevator_merge_fn)
       return e->ops->elevator_merge_fn(q, req, bio);

   return ELEVATOR_NO_MERGE;
}

static inline int elv_try_merge(struct request *__rq, struct bio *bio)
{
   int ret = ELEVATOR_NO_MERGE;

   /*
   * we can merge and sequence is ok, check if it's possible
   */
   if (elv_rq_merge_ok(__rq, bio)) {
        if (__rq->sector + __rq->nr_sectors == bio->bi_sector) //bio starts right after the request: back merge
           ret = ELEVATOR_BACK_MERGE;
        else if (__rq->sector - bio_sectors(bio) == bio->bi_sector) //bio ends right before the request: front merge
           ret = ELEVATOR_FRONT_MERGE;
   }

   return ret;
}
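Both elv_merge() and elv_try_merge() rely on elv_rq_merge_ok() for the basic sanity checks. A simplified sketch of what it does (the exact code differs slightly between 2.6 kernel versions):

inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
{
    if (!rq_mergeable(rq))                  /* request no longer accepts merges */
        return 0;

    /* read/write direction must match */
    if (bio_data_dir(bio) != rq_data_dir(rq))
        return 0;

    /* must target the same disk and must not be a driver-special request */
    if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
        return 0;

    /* finally let the io scheduler veto the merge (elevator_allow_merge_fn) */
    if (!elv_iosched_allow_merge(rq, bio))
        return 0;

    return 1;
}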


Each request is also added to a hash table keyed by its end sector, as we will see below.

//the hash key is sector + nr_sectors, i.e. the request's end sector
#define rq_hash_key(rq)       ((rq)->sector + (rq)->nr_sectors)
static void elv_rqhash_add(request_queue_t *q, struct request *rq)
{
   elevator_t *e = q->elevator;

   BUG_ON(ELV_ON_HASH(rq));
   hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
}
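For reference, the hash helpers used here look roughly like this in block/elevator.c (illustrative sketch; the shift value and exact helpers may differ by kernel version):

static const int elv_hash_shift = 6;
#define ELV_HASH_BLOCK(sec)    ((sec) >> 3)
#define ELV_HASH_FN(sec)       (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
#define ELV_HASH_ENTRIES       (1 << elv_hash_shift)
#define ELV_ON_HASH(rq)        (!hlist_unhashed(&(rq)->hash))

static void __elv_rqhash_del(struct request *rq)
{
    hlist_del_init(&rq->hash);      /* unhash, leave the node reusable */
}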

//walk the hash chain to see whether a suitable request exists
static struct request *elv_rqhash_find(request_queue_t *q, sector_t offset)
{
   elevator_t *e = q->elevator;
   struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; //pick the bucket for this offset
   struct hlist_node *entry, *next;
   struct request *rq;

   hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
       BUG_ON(!ELV_ON_HASH(rq));

       if (unlikely(!rq_mergeable(rq))) {
           __elv_rqhash_del(rq);
           continue;
       }

       if (rq_hash_key(rq) == offset) //found: this request ends exactly at offset
           return rq;
   }

   return NULL;
}

If no suitable request has been found by this point, the decision is handed over to the specific I/O scheduler, because that scheduler may be holding back some requests in its own private structures.
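For deadline, elevator_merge_fn is deadline_merge(), which only looks for a front-merge candidate in its sector-sorted red-black tree (back merges are already covered by the generic hash lookup above). Roughly (a slightly simplified sketch of the real function):

static int
deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
    struct deadline_data *dd = q->elevator->elevator_data;
    struct request *__rq;

    /*
     * check for a front merge: find a queued request that starts exactly
     * where this bio ends
     */
    if (dd->front_merges) {
        sector_t sector = bio->bi_sector + bio_sectors(bio);

        __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
        if (__rq && elv_rq_merge_ok(__rq, bio)) {
            *req = __rq;
            return ELEVATOR_FRONT_MERGE;
        }
    }

    return ELEVATOR_NO_MERGE;
}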

4. Back merge handling

       case ELEVATOR_BACK_MERGE:
           BUG_ON(!rq_mergeable(req));

           if (!ll_back_merge_fn(q, req, bio))
               break;

           blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

           req->biotail->bi_next = bio;
           req->biotail = bio;
           req->nr_sectors = req->hard_nr_sectors += nr_sectors;
           req->ioprio = ioprio_best(req->ioprio, prio);
           drive_stat_acct(req, nr_sectors, 0);
           if (!attempt_back_merge(q, req))
               elv_merged_request(q, req, el_ret);
           goto out;


After a request absorbs a new bio, its ending sector changes, so it may now be contiguous with the next request and the two requests can be merged as well. This is the chain reaction a merge can trigger.

static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
{
   struct request *next = elv_latter_request(q, rq); //fetch the request that follows rq

   if (next)
       return attempt_merge(q, rq, next); //try to merge the two requests

   return 0;
}


struct request *elv_latter_request(request_queue_t *q, struct request *rq)
{
   elevator_t *e = q->elevator;

   if (e->ops->elevator_latter_req_fn)
       return e->ops->elevator_latter_req_fn(q, rq); /* ask the specific I/O scheduler */
   return NULL;
}
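For deadline, elevator_latter_req_fn is elv_rb_latter_request() (see the ops table in section 1), which simply steps to the next node in the sector-sorted rb-tree. Roughly:

struct request *elv_rb_latter_request(request_queue_t *q, struct request *rq)
{
    struct rb_node *rbnext = rb_next(&rq->rb_node);   /* next request by sector */

    if (rbnext)
        return rb_entry_rq(rbnext);

    return NULL;
}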


/*
* Has to be called with the request spinlock acquired
*/
static int attempt_merge(request_queue_t *q, struct request *req,
              struct request *next)
{
   //are both requests mergeable?
   if (!rq_mergeable(req) || !rq_mergeable(next))
       return 0;

   /*
   * not contiguous: the two requests cannot be merged, bail out
   */
   if (req->sector + req->nr_sectors != next->sector)
       return 0;

   if (rq_data_dir(req) != rq_data_dir(next)
        || req->rq_disk != next->rq_disk
        || next->special)
       return 0;

   /*
   * If we are allowed to merge, then append bio list
   * from next to rq and release next. merge_requests_fn
   * will have updated segment counts, update sector
   * counts here.
   */
   if (!ll_merge_requests_fn(q, req, next))
       return 0;

   /*
   * At this point we have either done a back merge
   * or front merge. We need the smaller start_time of
   * the merged requests to be the current request
   * for accounting purposes.
   */
   //keep the earlier start_time of the two requests for accounting
   if (time_after(req->start_time, next->start_time))
       req->start_time = next->start_time;

   //splice next's bio list onto the tail of req
   req->biotail->bi_next = next->bio;
   req->biotail = next->biotail;
   //and add up the sector counts
   req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;

   //generic elevator-layer merge bookkeeping
   elv_merge_requests(q, req, next);

   if (req->rq_disk) {
       disk_round_stats(req->rq_disk);
       req->rq_disk->in_flight--;
   }

   req->ioprio = ioprio_best(req->ioprio, next->ioprio);

   __blk_put_request(q, next);
   return 1;
}


void elv_merge_requests(request_queue_t *q, struct request *rq,
                 struct request *next)
{
   elevator_t *e = q->elevator;

   //the specific I/O scheduler must also remove the absorbed request from its own private queues
   if (e->ops->elevator_merge_req_fn)
       e->ops->elevator_merge_req_fn(q, rq, next);

   //the merged request's end sector changed, so its position in the hash table changed
   elv_rqhash_reposition(q, rq);

   //remove the absorbed request (next) from the hash table
   elv_rqhash_del(q, next);

   q->nr_sorted--;
   q->last_merge = rq;
}
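The hash maintenance helpers called above are tiny; a sketch (using the __elv_rqhash_del() shown earlier):

static void elv_rqhash_del(request_queue_t *q, struct request *rq)
{
    if (ELV_ON_HASH(rq))
        __elv_rqhash_del(rq);
}

static void elv_rqhash_reposition(request_queue_t *q, struct request *rq)
{
    /* the key (end sector) changed, so remove and re-insert */
    __elv_rqhash_del(rq);
    elv_rqhash_add(q, rq);
}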



If merging the two requests does not succeed, elv_merged_request(q, req, el_ret) is executed instead: the current request did absorb a new bio, so its position in the hash table may have changed and needs to be adjusted:

void elv_merged_request(request_queue_t *q, struct request *rq, int type)
{
   elevator_t *e = q->elevator;

   if (e->ops->elevator_merged_fn)
       e->ops->elevator_merged_fn(q, rq, type); //for deadline this is deadline_merged_request()

   //re-adjust the request's position in the hash table
   if (type == ELEVATOR_BACK_MERGE)
       elv_rqhash_reposition(q, rq);

   q->last_merge = rq;
}
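For deadline, elevator_merged_fn is deadline_merged_request(). A back merge does not change the request's start sector, so only a front merge forces the request to be re-sorted in deadline's rb-tree. Approximately (sketch):

static void deadline_merged_request(request_queue_t *q, struct request *req,
                                    int type)
{
    struct deadline_data *dd = q->elevator->elevator_data;

    /*
     * a front merge changed req->sector, so reposition req in the
     * sector-sorted rb-tree for its data direction
     */
    if (type == ELEVATOR_FRONT_MERGE) {
        elv_rb_del(&dd->sort_list[rq_data_dir(req)], req);
        deadline_add_rq_rb(dd, req);
    }
}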




5. Front merge handling
       case ELEVATOR_FRONT_MERGE:
           BUG_ON(!rq_mergeable(req));

           if (!ll_front_merge_fn(q, req, bio))
               break;

           blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

           bio->bi_next = req->bio;
           req->bio = bio;

           /*
           * may not be valid. if the low level driver said
           * it didn't need a bounce buffer then it better
           * not touch req->buffer either...
           */
           req->buffer = bio_data(bio);
           req->current_nr_sectors = bio_cur_sectors(bio);
           req->hard_cur_sectors = req->current_nr_sectors;
           req->sector = req->hard_sector = bio->bi_sector;
           req->nr_sectors = req->hard_nr_sectors += nr_sectors;
           req->ioprio = ioprio_best(req->ioprio, prio);
           drive_stat_acct(req, nr_sectors, 0);
           if (!attempt_front_merge(q, req))
               elv_merged_request(q, req, el_ret);
           goto out;

6. No merge: allocating a new request
   get_rq:
   /*
   * This sync check and mask will be re-done in init_request_from_bio(),
   * but we need to set it earlier to expose the sync flag to the
   * rq allocator and io schedulers.
   */
   rw_flags = bio_data_dir(bio); //data direction (read or write)
   if (sync)
       rw_flags |= REQ_RW_SYNC;

   /*
   * Grab a free request. This is might sleep but can not fail.
   * Returns with the queue unlocked.
   */
   req = get_request_wait(q, rw_flags, bio); //allocate a new request (may sleep)

   /*
   * After dropping the lock and possibly sleeping here, our request
   * may now be mergeable after it had proven unmergeable (above).
   * We don't worry about that case for efficiency. It won't happen
   * often, and the elevators are able to handle it.
   */
   init_request_from_bio(req, bio); //initialize the request from the bio

   spin_lock_irq(q->queue_lock);
   if (elv_queue_empty(q)) //if the queue is empty, plug the device
       blk_plug_device(q);
   add_request(q, req);   //add the request to the queue

===========================================

static void init_request_from_bio(struct request *req, struct bio *bio)
{
   req->cmd_type = REQ_TYPE_FS;

   /*
   * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
   */
   if (bio_rw_ahead(bio) || bio_failfast(bio))
       req->cmd_flags |= REQ_FAILFAST;

   /*
   * REQ_BARRIER implies no merging, but lets make it explicit
   */
   if (unlikely(bio_barrier(bio)))
       req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

   if (bio_sync(bio))
       req->cmd_flags |= REQ_RW_SYNC;
   if (bio_rw_meta(bio))
       req->cmd_flags |= REQ_RW_META;

   req->errors = 0;
   req->hard_sector = req->sector = bio->bi_sector;
   req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
   req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
   req->nr_phys_segments = bio_phys_segments(req->q, bio);
   req->nr_hw_segments = bio_hw_segments(req->q, bio);
   req->buffer = bio_data(bio);   /* see ->buffer comment above */
   req->bio = req->biotail = bio;
   req->ioprio = bio_prio(bio);
   req->rq_disk = bio->bi_bdev->bd_disk;
   req->start_time = jiffies;
}




/*
* "plug" the device if there are no outstanding requests: this will
* force the transfer to start only after we have put all the requests
* on the list.
*
* This is called with interrupts off and no requests on the queue and
* with the queue lock held.
*/
void blk_plug_device(request_queue_t *q)
{
   WARN_ON(!irqs_disabled());

   /*
   * don't plug a stopped queue, it must be paired with blk_start_queue()
   * which will restart the queueing
   */
   if (blk_queue_stopped(q)) //a queue explicitly stopped by the driver is left alone here
       return;

   //otherwise arm the unplug timer so the queue is unplugged later
   if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
       mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
       blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
   }
}



6.2 Adding the request to the queue
/*
* add-request adds a request to the linked list.
* queue lock is held and interrupts disabled, as we muck with the
* request queue list.
*/

/*
* Insertion selection
*/
#define ELEVATOR_INSERT_FRONT   1
#define ELEVATOR_INSERT_BACK   2
#define ELEVATOR_INSERT_SORT   3
#define ELEVATOR_INSERT_REQUEUE   4


static inline void add_request(request_queue_t * q, struct request * req)
{
   drive_stat_acct(req, req->nr_sectors, 1);

   /*
   * elevator indicated where it wants this request to be
   * inserted at elevator_merge time
   */
   //insertion position decided at elevator_merge time
   __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}


void __elv_add_request(request_queue_t *q, struct request *rq, int where,
               int plug)
{
   if (q->ordcolor)
       rq->cmd_flags |= REQ_ORDERED_COLOR;

   if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
       /*
       * toggle ordered color
       */
       if (blk_barrier_rq(rq))
           q->ordcolor ^= 1;

       /*
       * barriers implicitly indicate back insertion
       */
       if (where == ELEVATOR_INSERT_SORT)
           where = ELEVATOR_INSERT_BACK;

       /*
       * this request is scheduling boundary, update
       * end_sector
       */
       if (blk_fs_request(rq)) {
           q->end_sector = rq_end_sector(rq);
           q->boundary_rq = rq;
       }
   } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
       where = ELEVATOR_INSERT_BACK;

   if (plug)
       blk_plug_device(q);

   elv_insert(q, rq, where);
}



void elv_insert(request_queue_t *q, struct request *rq, int where)
{
   struct list_head *pos;
   unsigned ordseq;
   int unplug_it = 1;

   blk_add_trace_rq(q, rq, BLK_TA_INSERT);

   rq->q = q;

   switch (where) {
   case ELEVATOR_INSERT_FRONT:
       rq->cmd_flags |= REQ_SOFTBARRIER;

       list_add(&rq->queuelist, &q->queue_head);
       break;

   case ELEVATOR_INSERT_BACK:
       rq->cmd_flags |= REQ_SOFTBARRIER;
       //first force the elevator to dispatch every request it is still holding back
       elv_drain_elevator(q);
       list_add_tail(&rq->queuelist, &q->queue_head);
       /*
       * We kick the queue here for the following reasons.
       * - The elevator might have returned NULL previously
       *   to delay requests and returned them now. As the
       *   queue wasn't empty before this request, ll_rw_blk
       *   won't run the queue on return, resulting in hang.
       * - Usually, back inserted requests won't be merged
       *   with anything. There's no point in delaying queue
       *   processing.
       */
       blk_remove_plug(q);
       q->request_fn(q);   //run the queue now
       break;

   case ELEVATOR_INSERT_SORT:
       BUG_ON(!blk_fs_request(rq));
       rq->cmd_flags |= REQ_SORTED;
       q->nr_sorted++;
       if (rq_mergeable(rq)) {
           elv_rqhash_add(q, rq);
           if (!q->last_merge)
               q->last_merge = rq;
       }

       /*
       * Some ioscheds (cfq) run q->request_fn directly, so
       * rq cannot be accessed after calling
       * elevator_add_req_fn.
       */
       q->elevator->ops->elevator_add_req_fn(q, rq);
       break;

   case ELEVATOR_INSERT_REQUEUE:
       /*
       * If ordered flush isn't in progress, we do front
       * insertion; otherwise, requests should be requeued
       * in ordseq order.
       */
       rq->cmd_flags |= REQ_SOFTBARRIER;

       /*
       * Most requeues happen because of a busy condition,
       * don't force unplug of the queue for that case.
       */
       unplug_it = 0;

       if (q->ordseq == 0) {
           list_add(&rq->queuelist, &q->queue_head);
           break;
       }

       ordseq = blk_ordered_req_seq(rq);

       list_for_each(pos, &q->queue_head) {
           struct request *pos_rq = list_entry_rq(pos);
           if (ordseq <= blk_ordered_req_seq(pos_rq))
               break;
       }

       list_add_tail(&rq->queuelist, pos);
       break;

   default:
       printk(KERN_ERR "%s: bad insertion point %d\n",
               __FUNCTION__, where);
       BUG();
   }

   if (unplug_it && blk_queue_plugged(q)) {
       int nrq = q->rq.count[READ] + q->rq.count[WRITE]
           - q->in_flight;

        if (nrq >= q->unplug_thresh) //unplug immediately once the threshold is exceeded
           __generic_unplug_device(q);
   }
}
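In the common ELEVATOR_INSERT_SORT case the request ends up in the I/O scheduler's private structures via elevator_add_req_fn. For deadline this is deadline_add_request(), which sorts the request into the per-direction rb-tree and appends it to the per-direction FIFO with an expiry time. A sketch (details vary slightly by kernel version):

static void deadline_add_request(request_queue_t *q, struct request *rq)
{
    struct deadline_data *dd = q->elevator->elevator_data;
    const int data_dir = rq_data_dir(rq);

    /* sort by sector into the read or write rb-tree */
    deadline_add_rq_rb(dd, rq);

    /* set the deadline and append to the read or write FIFO */
    rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
    list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}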

static void elv_drain_elevator(request_queue_t *q)
{
   static int printed;
   while (q->elevator->ops->elevator_dispatch_fn(q, 1))
       ;
   if (q->nr_sorted == 0)
       return;
   if (printed++ < 10) {
       printk(KERN_ERR "%s: forced dispatching is broken "
               "(nr_sorted=%u), please report this\n",
               q->elevator->elevator_type->elevator_name, q->nr_sorted);
   }
}



7. Finishing up

out:
   if (sync) //for a sync request, unplug and run the queue right away
       __generic_unplug_device(q);

   spin_unlock_irq(q->queue_lock);
   return 0;

===========================

/*
* remove the plug and let it rip..
*/
void __generic_unplug_device(request_queue_t *q)
{
   if (unlikely(blk_queue_stopped(q)))
       return;

   if (!blk_remove_plug(q))
       return;

   q->request_fn(q); //the strategy routine registered by the driver
}


/*
* remove the queue from the plugged list, if present. called with
* queue lock held and interrupts disabled.
*/
int blk_remove_plug(request_queue_t *q)
{
   WARN_ON(!irqs_disabled());

   if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
       return 0;

   del_timer(&q->unplug_timer);
   return 1;
}
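generic_unplug_device(), which is installed as q->unplug_fn in section 8 below, is just the lock-taking wrapper around __generic_unplug_device():

void generic_unplug_device(request_queue_t *q)
{
    spin_lock_irq(q->queue_lock);
    __generic_unplug_device(q);     /* remove the plug and run request_fn */
    spin_unlock_irq(q->queue_lock);
}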



~~~~~~~~~~~~~~~ this is how plugging and unplugging are described in ULK3 ~~~~~~~~~~~~~~~~~~~~~~~~~

(14.3.3.) Activating the Block Device Driver

As we saw earlier, it's expedient to delay activation of the block device driver in order to increase the chances of clustering requests for adjacent blocks. The delay is accomplished through a technique known as device plugging and unplugging.[*] As long as a block device driver is plugged, the device driver is not activated even if there are requests to be processed in the driver's queues.

    [*] If you are confused by the terms "plugging" and "unplugging," you might consider them equivalent to "de-activating" and "activating," respectively.

The blk_plug_device( ) function plugs a block device, or more precisely, a request queue serviced by some block device driver. Essentially, the function receives as an argument the address q of a request queue descriptor. It sets the QUEUE_FLAG_PLUGGED bit in the q->queue_flags field; then, it restarts the dynamic timer embedded in the q->unplug_timer field.

The blk_remove_plug( ) function unplugs a request queue q: it clears the QUEUE_FLAG_PLUGGED flag and cancels the execution of the q->unplug_timer dynamic timer. This function can be explicitly invoked by the kernel when all mergeable requests "in sight" have been added to the queue. Moreover, the I/O scheduler unplugs a request queue if the number of pending requests in the queue exceeds the value stored in the unplug_thresh field of the request queue descriptor (by default, 4).

If a device remains plugged for a time interval of length q->unplug_delay (usually 3 milliseconds), the dynamic timer activated by blk_plug_device( ) elapses, thus the blk_unplug_timeout( ) function is executed. As a consequence, the kblockd kernel thread servicing the kblockd_workqueue work queue is awakened (see the section "Work Queues" in Chapter 4). This kernel thread executes the function whose address is stored in the q->unplug_work data structure, that is, the blk_unplug_work( ) function. In turn, this function invokes the q->unplug_fn method of the request queue, which is usually implemented by the generic_unplug_device( ) function. The generic_unplug_device( ) function takes care of unplugging the block device: first, it checks whether the queue is still active; then, it invokes blk_remove_plug( ); and finally, it executes the strategy routine (the request_fn method) to start processing the next request in the queue (see the section "Device Driver Registration and Initialization" later in this chapter).


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


8. Device registration
/*
* init request queue
*/
static int ide_init_queue(ide_drive_t *drive)
{
   ......

   q = blk_init_queue_node(do_ide_request, &ide_lock, hwif_to_node(hwif));

   ......
}

request_queue_t *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
   request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

   if (!q)
       return NULL;

   q->node = node_id;
   if (blk_init_free_list(q)) {
       kmem_cache_free(requestq_cachep, q);
       return NULL;
   }

   /*
   * if caller didn't supply a lock, they get per-queue locking with
   * our embedded lock
   */
   if (!lock) {
       spin_lock_init(&q->__queue_lock);
       lock = &q->__queue_lock;
   }

   q->request_fn       = rfn; //do_ide_request
   q->prep_rq_fn       = NULL;
   q->unplug_fn       = generic_unplug_device; //the unplug function
   q->queue_flags       = (1 << QUEUE_FLAG_CLUSTER);
   q->queue_lock       = lock;

   blk_queue_segment_boundary(q, 0xffffffff);

   blk_queue_make_request(q, __make_request); //make_request_fn is set to __make_request()
   blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);

   blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
   blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);

   q->sg_reserved_size = INT_MAX;

   /*
   * all done
   */
   if (!elevator_init(q, NULL)) {
       blk_queue_congestion_threshold(q); //congestion threshold setup
       return q;
   }

   blk_put_queue(q);
   return NULL;
}


/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* @mfn: the alternate make_request function
*
* Description:
*    The normal way for &struct bios to be passed to a device
*    driver is for them to be collected into requests on a request
*    queue, and then to allow the device driver to select requests
*    off that queue when it is ready. This works well for many block
*    devices. However some block devices (typically virtual devices
*    such as md or lvm) do not benefit from the processing on the
*    request queue, and are served best by having the requests passed
*    directly to them. This can be achieved by providing a function
*    to blk_queue_make_request().
*
* Caveat:
*    The driver that does this *must* be able to deal appropriately
*    with buffers in "highmemory". This can be accomplished by either calling
*    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
*    blk_queue_bounce() to create a buffer in normal memory.
**/


#define BLKDEV_MIN_RQ   4
#define BLKDEV_MAX_RQ   128   /* Default maximum */

/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME   (HZ/50UL)

/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ   32

void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
   /*
   * set defaults
   */
   q->nr_requests = BLKDEV_MAX_RQ; //128, maximum number of requests in the queue
   blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
   blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
   q->make_request_fn = mfn;
   q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
   q->backing_dev_info.state = 0;
   q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
   blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
   blk_queue_hardsect_size(q, 512);
   blk_queue_dma_alignment(q, 511);
   blk_queue_congestion_threshold(q);
   q->nr_batching = BLK_BATCH_REQ;

   //unplug threshold: once this many requests are queued, unplug immediately
   q->unplug_thresh = 4;       /* hmm */

   //unplug timeout
   q->unplug_delay = (3 * HZ) / 1000;   /* 3 milliseconds */
   if (q->unplug_delay == 0)
       q->unplug_delay = 1;

   //timer and work-queue setup: when the timer expires, blk_unplug_timeout() runs; it schedules
   //blk_unplug_work() via the kblockd workqueue, which finally calls q->unplug_fn (= generic_unplug_device),
   //which in turn ends up in __generic_unplug_device()

   INIT_WORK(&q->unplug_work, blk_unplug_work);

   q->unplug_timer.function = blk_unplug_timeout;
   q->unplug_timer.data = (unsigned long)q;

   /*
   * by default assume old behaviour and bounce for any highmem page
   */
   blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
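The timer and work-queue handlers wired up above are small; a sketch that matches the two-argument INIT_WORK() used here, with the blktrace calls of the real functions omitted:

static void blk_unplug_work(struct work_struct *work)
{
    request_queue_t *q = container_of(work, request_queue_t, unplug_work);

    q->unplug_fn(q);                /* normally generic_unplug_device() */
}

static void blk_unplug_timeout(unsigned long data)
{
    request_queue_t *q = (request_queue_t *)data;

    /* defer the actual unplug to the kblockd workqueue */
    kblockd_schedule_work(&q->unplug_work);
}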


So there are two conditions that unplug (activate) the queue: 1) the number of queued requests exceeds the threshold; 2) the unplug timer expires.