UBIFS文件系统分析（五）：wear-leveling-mournjust-ChinaUnix博客

mournjustmournjust.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

mournjust

博客访问： 762829
博文数量： 79
博客积分： 2671
博客等级：少校
技术积分： 1247
用户组：普通用户
注册时间： 2010-04-02 15:26

个人简介

宅男

文章分类

全部博文（79）

scheduler（7）
perl（3）
Android（10）
linux编译（3）
文件系统（23）
linux TCP/IP（0）
调试心得（4）
sep0718（3）
数据结构、编程（6）
杂谈，心情（6）
linux内核源码（14）
未分配的博文（0）

文章存档

2017年（11）

2016年（12）

2015年（6）

2012年（10）

2011年（33）

2010年（7）

我的朋友

相关博文

UBIFS文件系统分析（五）：wear-leveling

分类： LINUX

2011-03-08 14:34:56

在本文的开头，先借本章讲一下EBA。什么是EBA？EBA即Eraseblock Association（可擦除块关联）。

在上次提到struct ubi_volume结构体的成员变量eba_tbl的时候稍微提到了。每次文件系统需要对一个逻辑可擦除块（LEB）进行操作的时候，它就会到对应的volume的eba_tbl中去查找该逻辑可擦除块对应着哪一个物理可擦除块（PEB）。

EBA子系统的两个最重要的操作是map和unmap的过程。但是在UBI的内核源码中并没有关于map的专门函数，而是嵌套在ubi_eba_write_leb函数中，下面来看该函数的具体代码：

int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,

const void *buf, int offset, int len, int dtype)

{

int err, pnum, tries = 0, vol_id = vol->vol_id;

struct ubi_vid_hdr *vid_hdr;

if (ubi->ro_mode)

return -EROFS;

err = leb_write_lock(ubi, vol_id, lnum);

if (err)

return err;

到具体volume的eba_tbl表中去查找LEB与PEB之间的关系，如果pnum大于0就表示该LEB已经影射了

pnum = vol->eba_tbl[lnum];

if (pnum >= 0) {

dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d",

len, offset, vol_id, lnum, pnum);

err = ubi_io_write_data(ubi, buf, pnum, offset, len);

if (err) {

ubi_warn("failed to write data to PEB %d", pnum);

if (err == -EIO && ubi->bad_allowed)

err = recover_peb(ubi, pnum, vol_id, lnum, buf,

offset, len);

//事实上上面的ubi_io_write_data有可能会失败的，因为ubi_dbg_check_all_ff函数会检查被写入的地方是否全是0xff，对于overwrite显然不是，那么就通过recover_peb来进行数据的搬运工作。

if (err)

ubi_ro_mode(ubi);

}

leb_write_unlock(ubi, vol_id, lnum);

return err;

}

* The logical eraseblock is not mapped. We have to get a free physical

* eraseblock and write the volume identifier header there first.

vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

if (!vid_hdr) {

leb_write_unlock(ubi, vol_id, lnum);

return -ENOMEM;

}

vid_hdr->vol_type = UBI_VID_DYNAMIC;

vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));

vid_hdr->vol_id = cpu_to_be32(vol_id);

vid_hdr->lnum = cpu_to_be32(lnum);

vid_hdr->compat = ubi_get_compat(ubi, vol_id);

vid_hdr->data_pad = cpu_to_be32(vol->data_pad);

retry:

上面的代码比较简单，也不是本次关注的内容

通过函数ubi_wl_get_peb来从WL子系统中获得一块free的PEB，然后修改volume的eba_tbl，这样一个map过程就算完成了，so easy ,~。~！！

pnum = ubi_wl_get_peb(ubi, dtype);

if (pnum < 0) {

ubi_free_vid_hdr(ubi, vid_hdr);

leb_write_unlock(ubi, vol_id, lnum);

return pnum;

}

dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d",

len, offset, vol_id, lnum, pnum);

err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);

if (err) {

ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",

vol_id, lnum, pnum);

goto write_error;

}

if (len) {

err = ubi_io_write_data(ubi, buf, pnum, offset, len);

if (err) {

ubi_warn("failed to write %d bytes at offset %d of "

"LEB %d:%d, PEB %d", len, offset, vol_id,

lnum, pnum);

goto write_error;

}

vol->eba_tbl[lnum] = pnum;

leb_write_unlock(ubi, vol_id, lnum);

ubi_free_vid_hdr(ubi, vid_hdr);

return 0;

write_error:

if (err != -EIO || !ubi->bad_allowed) {

ubi_ro_mode(ubi);

leb_write_unlock(ubi, vol_id, lnum);

ubi_free_vid_hdr(ubi, vid_hdr);

return err;

}

* Fortunately, this is the first write operation to this physical

* eraseblock, so just put it and request a new one. We assume that if

* this physical eraseblock went bad, the erase code will handle that.

err = ubi_wl_put_peb(ubi, pnum, 1);

if (err || ++tries > UBI_IO_RETRIES) {

ubi_ro_mode(ubi);

leb_write_unlock(ubi, vol_id, lnum);

ubi_free_vid_hdr(ubi, vid_hdr);

return err;

}

vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));

ubi_msg("try another PEB");

goto retry;

}

接着看一个unmap的过程：

int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol,

int lnum)

{

int err, pnum, vol_id = vol->vol_id;

if (ubi->ro_mode)

return -EROFS;

err = leb_write_lock(ubi, vol_id, lnum);

if (err)

return err;

首先还是查询vol->eba_tbl表，如果对应的想为-1,说明我们要unmap的块根本就没有map，所以也就不需要做任何事情了

pnum = vol->eba_tbl[lnum];

if (pnum < 0)

/* This logical eraseblock is already unmapped */

goto out_unlock;

dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);

如果不是小于0，那么得到值肯定是一个PEB号，修改eba_tbl对应项为-1

vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;

我们上面提到了，在map的过程中需要从WL子系统中获得peb，现在unmap掉了，需要将PEB归还给WL子系统并需要擦除，这个是由ubi_wl_put_peb完成的。

err = ubi_wl_put_peb(ubi, pnum, 0);

out_unlock:

leb_write_unlock(ubi, vol_id, lnum);

return err;

}

从上面的这段例子中可以看出，在UBI中，获得每一个PEB都是从WL子系统中获得，释放掉的每一个PEB都要归还给WL子系统。可以说WL无处不在：每一个涉及可擦除块使用的操作，肯定都涉及到WL子系统。

下面介绍一下涉及的wl的主要的数据结构：

/*
 * Wear-leveling entry: describes one physical eraseblock (pnum) together with
 * its erase counter (ec), as tracked by the WL sub-system.
 */
struct ubi_wl_entry {

/* Linkage: the entry is linked either as an RB-tree node or as a list item. */
union {

struct rb_node rb;

struct list_head list;

} u;

/* Erase counter of the physical eraseblock. */
int ec;

/* Physical eraseblock number. */
int pnum;

};

从这个结构体中可以看出WL子系统操作的是实实在在的物理可擦除块，另外一个关注的就是EC头部的erase counter，这也是WL进行操作的依据。

从联合u中可以看出wl子系统中是采用红黑树来管理的。关于红黑树的一些操作，下面只稍微带过，并不以源码的形式详细阐述。

static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)该操作用于将e添加到以root为RB树根的树中

static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)用于判断e是否存在于以root为根的RB树中

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)用于在以root为根的RB树中查找erase counter小于max且最接近max（即从左侧无限接近max）的PEB。

WL的作用是什么呢？上面提到了一点，就是以EC值为依据来进行物理可擦除块的管理，以防对某一些可擦除块过多的操作导致其变为坏块。如果在操作的过程中发现，某一个可擦除块的EC值变得不正常了，也就是变得太大了（EC值是随着擦除的次数增加的）。既然EC值已经变得这么大了，那么这块可擦除块还能用吗？能。

在include/mtd/ubi-user.h中有这样一个枚举。

/*
 * Data type hints (from include/mtd/ubi-user.h): tell UBI whether the data
 * being written is expected to be kept for a long or a short time.
 */
enum {

/* Data is expected to be stored for a long time. */
UBI_LONGTERM = 1,

/* Data is expected to be stored for a short time. */
UBI_SHORTTERM = 2,

/* Lifetime of the data is not known. */
UBI_UNKNOWN = 3,

};

这里定义了三种用于指定数据类型的标志位，从名字中可以看出这个枚举的目的是用于说明数据是长期还是短期保存。

在ubi_wl_get_peb函数中有这样的一段代码：

case UBI_LONGTERM:

e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

break;

我们在获得一个PEB的时候，如果是用于长期保存的数据的话，那么就取一个EC值比较大（也就是已经擦除过很多次）的PEB。这样就物尽其用了。

根据ubidesign的说明：UBI select a long term storage block with a low erase count and copies the block contents to the block with the high erase count using the block moving function.但是在后面的源码中并没有看到long term这方面的考虑？（我哪儿没看到？）

函数ensure_wear_leveling就是用来判断是否存在上述的这种情况的。

static int ensure_wear_leveling(struct ubi_device *ubi)

{

int err = 0;

struct ubi_wl_entry *e1;

struct ubi_wl_entry *e2;

struct ubi_work *wrk;

spin_lock(&ubi->wl_lock);

//如果Wear-leveling已经在work工作队列了，那么这样的判断就没有必要了，因为不管你怎么判断，都是Wear-leveling必须的，而且会对pending的work造成影响，所以就什么事情也不做了。

if (ubi->wl_scheduled)

/* Wear-leveling is already in the work queue */

goto out_unlock;

* If the ubi->scrub tree is not empty, scrubbing is needed, and the

* the WL worker has to be scheduled anyway.

@情况一：如果没有已经使用的可擦除块，也就是说该UBI设备刚被attach上去，没有任何数据。

@情况二：没有可用的可擦除块。上面说到了WL是将一块的数据搬运到另外一块可擦除块中，现在没有可用的可擦除块了，工作进行不下去了

if (!ubi->scrub.rb_node) {//这个队列中的结点是从哪儿来的呢？也就是说在什么情况下添加进来的

if (!ubi->used.rb_node || !ubi->free.rb_node)

/* No physical eraseblocks - no deal */

goto out_unlock;

* We schedule wear-leveling only if the difference between the

* lowest erase counter of used physical eraseblocks and a high

* erase counter of free physical eraseblocks is greater than

* %UBI_WL_THRESHOLD.

上面说到了WL是将一块已用的可擦除块中的数据搬运到另外一块未用的可擦除块中去，所以就从used树中找一块EC值很小的（但是根据文档说，这儿应该是找一块UBI_LONGTERM类型的并且EC值比较小的），然后再从free树中找一块ec值很大的。

e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))

goto out_unlock;

dbg_wl("schedule wear-leveling");

} else

dbg_wl("schedule scrubbing");

ubi->wl_scheduled = 1;//注意这儿将wl_scheduled标志置位

spin_unlock(&ubi->wl_lock);

wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);

if (!wrk) {

err = -ENOMEM;

goto out_cancel;

}

//构造一个worker，并添加到队列中由后台进程来完成。具体工作是由wear_leveling_worker来完成的。

wrk->func = &wear_leveling_worker;

schedule_ubi_work(ubi, wrk);

return err;

out_cancel:

spin_lock(&ubi->wl_lock);

ubi->wl_scheduled = 0;

out_unlock:

spin_unlock(&ubi->wl_lock);

return err;

}

下面就看看wear_leveling_worker这个函数的具体的工作：

static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,

int cancel)

{

int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;

int vol_id = -1, uninitialized_var(lnum);

struct ubi_wl_entry *e1, *e2;

struct ubi_vid_hdr *vid_hdr;

kfree(wrk);

if (cancel)

return 0;

//分配一个VID头部，因为在拷贝数据的过程中，需要重新写入VID

vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

if (!vid_hdr)

return -ENOMEM;

mutex_lock(&ubi->move_mutex);

spin_lock(&ubi->wl_lock);

ubi_assert(!ubi->move_from && !ubi->move_to);

ubi_assert(!ubi->move_to_put);

@下面的英文注释已经说的很清楚了，如果没有free的PEB，没有关系，可以等待被pending的erase_worker完成。但是如果连scrub都没有，那么就没有办法了，取消本次WL操作

@没有used 的PEB？。在ubi_wl_get_peb函数中

rb_erase(&e->u.rb, &ubi->free)

prot_queue_add(ubi, e);

而在ubi_wl_put_peb中有：

prot_queue_del(ubi, e->pnum);

这样的操作，相信在别的地方如erase_wroker也有这样的操作。也就是说UBI会将暂时操作的PEB从相应的队列中暂时移除，把它放到ubi->pq中保护起来。

if (!ubi->free.rb_node ||

(!ubi->used.rb_node && !ubi->scrub.rb_node)) {

* No free physical eraseblocks? Well, they must be waiting in

* the queue to be erased. Cancel movement - it will be

* triggered again when a free physical eraseblock appears.

* No used physical eraseblocks? They must be temporarily

* protected from being moved. They will be moved to the

* @ubi->used tree later and the wear-leveling will be

* triggered again.

dbg_wl("cancel WL, a list is empty: free %d, used %d",

!ubi->free.rb_node, !ubi->used.rb_node);

goto out_cancel;

}

if (!ubi->scrub.rb_node) {

* Now pick the least worn-out used physical eraseblock and a

* highly worn-out free physical eraseblock. If the erase

* counters differ much enough, start wear-leveling.

e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

//如果scrub队列是空的，那么就从free队列中取一个目标PEB进行WL操作（EC无限左接近于WL_FREE_MAX_DIFF）

e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {

dbg_wl("no WL needed: min used EC %d, max free EC %d",

e1->ec, e2->ec);

goto out_cancel;

}

paranoid_check_in_wl_tree(e1, &ubi->used);

// rb_erase是一个红黑的基本删除操作，在lib/rbtree.c中。这儿e1中的数据被转移了，那么就需要将e1从ubi->used队列中删除掉

rb_erase(&e1->u.rb, &ubi->used);

dbg_wl("move PEB %d EC %d to PEB %d EC %d",

e1->pnum, e1->ec, e2->pnum, e2->ec);

} else {

/* Perform scrubbing */

scrubbing = 1;

//注意这儿从scrub中获得e2的时候，并没有像上面一样if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))比较e1和e2的EC值，为什么呢？因为scrub队列中的PEB都是在读的时候发生BIT_FILP的，所以必须进行WL

e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);

e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

paranoid_check_in_wl_tree(e1, &ubi->scrub);

rb_erase(&e1->u.rb, &ubi->scrub);

dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);

}

paranoid_check_in_wl_tree(e2, &ubi->free);

rb_erase(&e2->u.rb, &ubi->free);

//注意这儿，这两个指针在数据搬运完成之后会被清除掉的

ubi->move_from = e1;

ubi->move_to = e2;

spin_unlock(&ubi->wl_lock);

* Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum.

* We so far do not know which logical eraseblock our physical

* eraseblock (@e1) belongs to. We have to read the volume identifier

* header first.

* Note, we are protected from this PEB being unmapped and erased. The

* 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB

* which is being moved was unmapped.

err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);

if (err && err != UBI_IO_BITFLIPS) {

if (err == UBI_IO_PEB_FREE) {

* We are trying to move PEB without a VID header. UBI

* always write VID headers shortly after the PEB was

* given, so we have a situation when it has not yet

* had a chance to write it, because it was preempted.

* So add this PEB to the protection queue so far,

* because presumably more data will be written there

* (including the missing VID header), and then we'll

* move it.

//进一步检查VID头部，不能说因为它是从used队列中取出来的就直接将数据搬运过去了，可以以前某个地方出错了。这儿如果发现我们要搬运的PEB本身就是空，那么搬运也就没必要进行下去了。

dbg_wl("PEB %d has no VID header", e1->pnum);

protect = 1;

goto out_not_moved;

}

ubi_err("error %d while reading VID header from PEB %d",

err, e1->pnum);

goto out_error;

}

vol_id = be32_to_cpu(vid_hdr->vol_id);

lnum = be32_to_cpu(vid_hdr->lnum);

//具体搬运数据由ubi_eba_copy_leb函数完成，实现比较简单，不在赘述

err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

if (err) {

if (err == MOVE_CANCEL_RACE) {

* The LEB has not been moved because the volume is

* being deleted or the PEB has been put meanwhile. We

* should prevent this PEB from being selected for

* wear-leveling movement again, so put it to the

* protection queue.

protect = 1;

goto out_not_moved;

}

if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||

err == MOVE_TARGET_RD_ERR) {

* Target PEB had bit-flips or write error - torture it.

torture = 1;

goto out_not_moved;

}

if (err == MOVE_SOURCE_RD_ERR) {

* An error happened while reading the source PEB. Do

* not switch to R/O mode in this case, and give the

* upper layers a possibility to recover from this,

* e.g. by unmapping corresponding LEB. Instead, just

* put this PEB to the @ubi->erroneous list to prevent

* UBI from trying to move it over and over again.

if (ubi->erroneous_peb_count > ubi->max_erroneous) {

ubi_err("too many erroneous eraseblocks (%d)",

ubi->erroneous_peb_count);

goto out_error;

}

erroneous = 1;

goto out_not_moved;

}

if (err < 0)

goto out_error;

ubi_assert(0);

}

/* The PEB has been successfully moved */

if (scrubbing)

ubi_msg("scrubbed PEB %d (LEB %d:%d), data moved to PEB %d",

e1->pnum, vol_id, lnum, e2->pnum);

ubi_free_vid_hdr(ubi, vid_hdr);

spin_lock(&ubi->wl_lock);

if (!ubi->move_to_put) {

wl_tree_add(e2, &ubi->used);

e2 = NULL;

}

ubi->move_from = ubi->move_to = NULL;

ubi->move_to_put = ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

//这个通过后台进程来擦除e1，erase_worker

err = schedule_erase(ubi, e1, 0);

if (err) {

kmem_cache_free(ubi_wl_entry_slab, e1);

if (e2)

kmem_cache_free(ubi_wl_entry_slab, e2);

goto out_ro;

}

if (e2) {

* Well, the target PEB was put meanwhile, schedule it for

* erasure.

dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase",

e2->pnum, vol_id, lnum);

err = schedule_erase(ubi, e2, 0);

if (err) {

kmem_cache_free(ubi_wl_entry_slab, e2);

goto out_ro;

}

dbg_wl("done");

mutex_unlock(&ubi->move_mutex);

return 0;

* For some reasons the LEB was not moved, might be an error, might be

* something else. @e1 was not changed, so return it back. @e2 might

* have been changed, schedule it for erasure.

out_not_moved:

if (vol_id != -1)

dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)",

e1->pnum, vol_id, lnum, e2->pnum, err);

else

dbg_wl("cancel moving PEB %d to PEB %d (%d)",

e1->pnum, e2->pnum, err);

spin_lock(&ubi->wl_lock);

if (protect)

prot_queue_add(ubi, e1);

else if (erroneous) {

wl_tree_add(e1, &ubi->erroneous);

ubi->erroneous_peb_count += 1;

} else if (scrubbing)

wl_tree_add(e1, &ubi->scrub);

else

wl_tree_add(e1, &ubi->used);

ubi_assert(!ubi->move_to_put);

ubi->move_from = ubi->move_to = NULL;

ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

ubi_free_vid_hdr(ubi, vid_hdr);

err = schedule_erase(ubi, e2, torture);

if (err) {

kmem_cache_free(ubi_wl_entry_slab, e2);

goto out_ro;

}

mutex_unlock(&ubi->move_mutex);

return 0;

out_error:

if (vol_id != -1)

ubi_err("error %d while moving PEB %d to PEB %d",

err, e1->pnum, e2->pnum);

else

ubi_err("error %d while moving PEB %d (LEB %d:%d) to PEB %d",

err, e1->pnum, vol_id, lnum, e2->pnum);

spin_lock(&ubi->wl_lock);

ubi->move_from = ubi->move_to = NULL;

ubi->move_to_put = ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

ubi_free_vid_hdr(ubi, vid_hdr);

kmem_cache_free(ubi_wl_entry_slab, e1);

kmem_cache_free(ubi_wl_entry_slab, e2);

out_ro:

ubi_ro_mode(ubi);

mutex_unlock(&ubi->move_mutex);

ubi_assert(err != 0);

return err < 0 ? err : -EIO;

out_cancel:

ubi->wl_scheduled = 0;

spin_unlock(&ubi->wl_lock);

mutex_unlock(&ubi->move_mutex);

ubi_free_vid_hdr(ubi, vid_hdr);

return 0;

}

至此，WL基本完成。主要代码都在/drivers/mtd/ubi/wl.c文件中。

那么UBI中在什么情况下会调用ensure_wear_leveling来判断是否进行WL呢？

1. erase_worker

2. ubi_wl_scrub_peb

3. ubi_wl_init_scan

对于WL，有点需要详细说明一下:上面提到了ubi->scrub中的结点是从哪儿来的？

在ubi_eba_read_leb函数中，当发生bit-flip（位翻转）的时候，会调用ubi_wl_scrub_peb来进行WL。

同样在上面的ensure_wear_leveling中看到了，WL中是优先到ubi->scrub队列中查找的。

———————————————不足之处，请多多指教。

阅读(5527) | 评论(0) | 转发(3) |

上一篇：UBIFS文件系统分析（四）：重要数据结构

下一篇：数组指针的问题

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6