Btrfs —— __commit_transaction() 分析-lmnos-ChinaUnix博客

自主操作系统LMOSlmos.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

lmnos

博客访问： 1170738
博文数量： 53
博客积分： 1165
博客等级：下士
技术积分： 1811
用户组：普通用户
注册时间： 2012-09-19 14:56

个人简介

专注于操作系统内核的实现

文章分类

全部博文（53）

图书作品（13）
小文艺（1）
linux（1）
自主原创软件（3）
转载（5）
自由自主操作系统（20）
未分配的博文（10）

文章存档

2015年（2）

2014年（16）

2013年（18）

2012年（17）

我的朋友

最近访客

推荐博文

Btrfs —— __commit_transaction() 分析

分类：

2012-11-27 14:28:29

原文地址：Btrfs —— __commit_transaction() 分析作者：firocu

disk_io.c: __commit_transaction(trans, root);

Parameters:

trans: 调用之前创建的，在commit_tree_roots()函数中，有更新相关成员。

root： tree root

/*
 * Write out every extent buffer covered by an EXTENT_DIRTY range in the
 * fs_info extent cache.
 *
 * trans: transaction handle, passed through to write_tree_block().
 * root:  root whose fs_info->extent_cache is scanned.
 *
 * Returns 0. Any failure along the way trips a BUG_ON().
 */
static int __commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	u64 start;
	u64 end;
	struct extent_buffer *eb;
	/*
	 * Author's note: why the fs_info extent cache? grep shows two places
	 * that touch extent_cache; how it gets populated is not traced here.
	 * Its use is visible in the loop below.
	 */
	struct extent_io_tree *tree = &root->fs_info->extent_cache;
	int ret;

	while (1) {
		/*
		 * Author's note: commit_tree_roots() marked these extents dirty
		 * earlier. The search always restarts from offset 0; a non-zero
		 * return ends the loop.
		 */
		ret = find_first_extent_bit(tree, 0, &start, &end, EXTENT_DIRTY);
		if (ret)
			break;

		/* Author's question: is this condition always true on entry? */
		while (start <= end) {
			eb = find_first_extent_buffer(tree, start);
			BUG_ON(!eb || eb->start != start);

			ret = write_tree_block(trans, root, eb);
			BUG_ON(ret);

			/* Advance past this buffer before releasing it. */
			start += eb->len;
			/* presumably drops the dirty bit so the outer search makes progress */
			clear_extent_buffer_dirty(eb);
			free_extent_buffer(eb);
		}
	}

	return 0;
}

Parameters

root:tree root

eb: first dirty extent buffer.

/*
 * Write one tree block to every stripe that its logical address maps to.
 *
 * trans: transaction handle; eb must be up to date for trans->transid.
 * root:  root used for the block check, the checksum call and to reach
 *        fs_info->mapping_tree.
 * eb:    extent buffer to write; its fd and dev_bytenr fields are
 *        overwritten for each stripe.
 *
 * Returns 0. Every failure trips BUG() or BUG_ON().
 */
int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb)
{
	int ret;
	int dev_nr;
	u64 length;
	struct btrfs_multi_bio *multi = NULL;

	if (check_tree_block(root, eb))
		BUG();

	if (!btrfs_buffer_uptodate(eb, trans->transid))
		BUG();

	/* Flag this extent buffer as written. */
	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
	/* Called after the flag is set, so presumably the checksum covers it. */
	csum_tree_block(root, eb, 0);

	dev_nr = 0;
	length = eb->len;

	/*
	 * Author's note: btrfs_map_block() is the key call here. Note that the
	 * fs_info mapping tree serves a different purpose from the fs_info
	 * extent cache used by the caller: an extent_io_tree has more members,
	 * while the mapping tree is only a thin wrapper around a cache_tree
	 * (i.e. an rb_root).
	 */
	ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,eb->start, &length, &multi, 0);
	/*
	 * Check the mapping result before touching multi: on failure multi is
	 * still NULL and multi->num_stripes must not be dereferenced.
	 */
	BUG_ON(ret);

	while (dev_nr < multi->num_stripes) {
		/* Point the buffer at this stripe's device and physical offset. */
		eb->fd = multi->stripes[dev_nr].dev->fd;
		eb->dev_bytenr = multi->stripes[dev_nr].physical;
		multi->stripes[dev_nr].dev->total_ios++;
		dev_nr++;

		ret = write_extent_to_disk(eb);
		BUG_ON(ret);
	}

	kfree(multi);
	return 0;
}

//说实在的，这个函数很有意思：它涉及到了disk上数据的布局，以及raid方面的知识。

//首先是参数

//map tree 就是用来保存所有映射的rb tree。至于是什么映射，且看下文。

//rw： WRITE

//logical：eb->start也就是这个node/leaf在disk上地址。//那为什么说是logical的呢？

//我理解的是，这个地址应该是建立在raid层之上. 也就是说，没有经过raid相关的处理。个人理解

//length：eb->len 同时也做返回值。

//type：NULL

//multi_ret:彻头彻尾的返回值。

//mirror_num: 传过来的是0.

/*
 * Translate a logical address into the physical stripe(s) that back it.
 *
 * map_tree:   mapping tree whose cache_tree is searched for `logical`.
 * rw:         READ or WRITE. Writes to RAID1/DUP/RAID10 chunks return all
 *             mirrors; every other case returns a single stripe.
 * logical:    logical byte address to translate.
 * length:     out: bytes available from `logical` to the end of the mapping,
 *             capped to the rest of the current stripe for
 *             RAID0/RAID1/RAID10/DUP chunks.
 * type:       optional out: receives map->type. Only written when multi_ret
 *             is non-NULL.
 * multi_ret:  optional out: receives a kzalloc()ed btrfs_multi_bio describing
 *             the stripes; ownership passes to the caller. When NULL only
 *             *length is computed.
 * mirror_num: for reads of mirrored chunks, the 1-based mirror to use;
 *             0 lets this function choose.
 *
 * Returns 0 on success, -ENOENT when no cached mapping covers `logical`,
 * or -ENOMEM when the btrfs_multi_bio cannot be allocated.
 */
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, u64 *type,
		      struct btrfs_multi_bio **multi_ret, int mirror_num)
{
	struct cache_extent *ce;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	int stripes_allocated = 8;
	int stripes_required = 1;
	int stripe_index;
	int i;
	struct btrfs_multi_bio *multi = NULL;

	if (multi_ret && rw == READ) {
		/*
		 * The initial value of 8 is the number of stripe slots to
		 * allocate. Author's note: a read only ever fills one stripe
		 * (see below), so a single slot is enough; writes may need more.
		 */
		stripes_allocated = 1;
	}

again:
	/*
	 * Look up the mapping for this logical address in the map tree.
	 * Author's note: if memory serves, btrfs_alloc_data_chunk() inserted
	 * the map_lookup carrying the stripe information, and that stripe
	 * information is presumably the "physical" address.
	 */
	ce = find_first_cache_extent(&map_tree->cache_tree, logical);
	if (!ce) {
		if (multi)
			kfree(multi);
		return -ENOENT;
	}

	/*
	 * The cache extent that was found does not contain `logical`.
	 * NOTE(review): `<` accepts logical == ce->start + ce->size, which is
	 * one past the end of the extent; confirm whether `<=` was intended.
	 */
	if (ce->start > logical || ce->start + ce->size < logical) {
		if (multi)
			kfree(multi);
		return -ENOENT;
	}

	if (multi_ret) {
		/*
		 * Author's questions: how was the default of 8 chosen, and what
		 * about the 1 used for reads? (RAID10 needs 4 disks.)
		 */
		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
				GFP_NOFS);
		if (!multi)
			return -ENOMEM;
	}

	/* Recover the map_lookup that embeds this cache extent. */
	map = container_of(ce, struct map_lookup, ce);
	/* Offset of `logical` within the cache extent. */
	offset = logical - ce->start;

	if (rw == WRITE) {
		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) {
			/*
			 * Author's note: RAID1 (mirror) and DUP are both
			 * duplication schemes; DUP keeps the copies on one disk.
			 * num_stripes presumably counts the duplicate copies too
			 * (inferred from sub_stripes).
			 */
			stripes_required = map->num_stripes;
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			/*
			 * A RAID10 write needs one slot per mirror copy, i.e.
			 * sub_stripes. Author's open question: how is the RAID0
			 * part accounted for?
			 */
			stripes_required = map->sub_stripes;
		}
	}

	/* if our multi bio struct is too small, back off and try again */
	if (multi_ret && rw == WRITE &&stripes_allocated < stripes_required) {
		/*
		 * The initial 8 slots were not enough, so 8 is only a default
		 * that can grow. Author's note: from the RAID10 branch above one
		 * can infer num_stripes >= sub_stripes, which would explain why
		 * num_stripes (the total) is used here rather than
		 * stripes_required.
		 */
		stripes_allocated = map->num_stripes;
		kfree(multi);
		/*
		 * Reset the pointer so the -ENOENT exits after `again` cannot
		 * free the same allocation a second time.
		 */
		multi = NULL;
		goto again;
	}

	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	/* Author's example: stripe_len == 64 KB, node/leaf == 4 KB. */
	stripe_nr = stripe_nr / map->stripe_len;

	stripe_offset = stripe_nr * map->stripe_len;
	BUG_ON(offset < stripe_offset);

	/* stripe_offset is the offset of this block in its stripe */
	/*
	 * Author's note: with 64 KB stripes and 4 KB nodes/leaves, one stripe
	 * holds several nodes/leaves. `offset` is the offset within the whole
	 * cache extent.
	 */
	stripe_offset = offset - stripe_offset;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1
			 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) {
		/*
		 * A striping/mirroring profile is in use, so a single
		 * read/write has to stay within one stripe.
		 */
		/* we limit the length of each bio to what fits in a stripe */
		*length = min_t(u64, ce->size - offset, map->stripe_len - stripe_offset);
	} else {
		/* No profile bits set: everything up to the end of the mapping. */
		*length = ce->size - offset;
	}

	if (!multi_ret)
		goto out;

	multi->num_stripes = 1;
	stripe_index = 0;

	/*
	 * Author's note: the code below is the important part; it reveals a
	 * lot about the low-level layout.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		/* Mirrored chunk. */
		if (rw == WRITE)
			/* A write goes to every stripe, duplicates included. */
			multi->num_stripes = map->num_stripes;
		else if (mirror_num)
			/* Author's note: purpose unknown; selects mirror mirror_num - 1. */
			stripe_index = mirror_num - 1;
		else
			/* stripe_nr is the number of stripes strode over so far. */
			stripe_index = stripe_nr % map->num_stripes;
		/*
		 * Author's note: worth checking when num_stripes is larger or
		 * smaller than stripe_nr. A large write (say a 4 GB video file)
		 * spans a very large number of stripes.
		 */
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		/*
		 * RAID10: mirrored, then striped. `factor` is the number of
		 * RAID0 stripes, i.e. mirror groups.
		 */
		int factor = map->num_stripes / map->sub_stripes;

		/* First pick the RAID0 position ... */
		stripe_index = stripe_nr % factor;
		/* ... then scale to the first member of that mirror group. */
		stripe_index *= map->sub_stripes;

		if (rw == WRITE)
			multi->num_stripes = map->sub_stripes;
		else if (mirror_num)
			/* Author's note: purpose unknown. */
			stripe_index += mirror_num - 1;

		/*
		 * Author's note: one pass over the RAID0 set covers `factor`
		 * stripes, so this yields the stripe number per device.
		 */
		stripe_nr = stripe_nr / factor;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (rw == WRITE)
			multi->num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
	} else {
		/*
		 * after this do_div call, stripe_nr is the number of stripes
		 * on this device we have to walk to find the data, and
		 * stripe_index is the number of our device in the stripe array
		 *
		 * (The code shown here uses plain % and / rather than do_div.)
		 */
		/* stripe_index is our position within the whole stripe array. */
		stripe_index = stripe_nr % map->num_stripes;
		/*
		 * stripe_nr becomes per-device: the number of stripes to stride
		 * on that device before reaching the data. Author's reading:
		 * this folds away the RAID layer by treating all num_stripes as
		 * one unit.
		 */
		stripe_nr = stripe_nr / map->num_stripes;
	}

	/* Author's note: num_stripes may be large in some cases, e.g. 1 GB of data. */
	BUG_ON(stripe_index >= map->num_stripes);

	for (i = 0; i < multi->num_stripes; i++) {
		/*
		 * This ties the pieces together: stripe_offset is the offset
		 * inside one stripe, and stripe_nr * stripe_len is the distance
		 * strode over before reaching the data.
		 * map->stripes[].physical decides which device and which
		 * address each copy lands on; the copies may sit on different
		 * disks or on the same one. Author's note: where physical is
		 * computed is not covered here.
		 */
		multi->stripes[i].physical =map->stripes[stripe_index].physical + stripe_offset+stripe_nr * map->stripe_len;
		multi->stripes[i].dev = map->stripes[stripe_index].dev;
		stripe_index++;
	}

	*multi_ret = multi;

	/* `type` also acts as an output, although the caller shown passes NULL. */
	if (type)
		*type = map->type;

out:
	return 0;
}

阅读(3219) | 评论(0) | 转发(0) |

上一篇：硬盘的存储原理和内部架构

下一篇：关于LMOS开源

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6