Category: LINUX

2015-01-26 14:51:42

In the previous post I briefly described how kmalloc allocates memory under the slab allocator. While reading cache_alloc_refill(), though, part of the logic still puzzled me.


    static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
    {
        int batchcount;
        struct kmem_list3 *l3;
        struct array_cache *ac;
        int node;

        check_irq_off();
        node = numa_mem_id();
        if (unlikely(force_refill))
            goto force_grow;
    retry:
        ac = cpu_cache_get(cachep);
        batchcount = ac->batchcount;
        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
            /*
             * If there was little recent activity on this cache, then
             * perform only a partial refill. Otherwise we could generate
             * refill bouncing.
             */
            batchcount = BATCHREFILL_LIMIT;
        }
        l3 = cachep->nodelists[node];

        BUG_ON(ac->avail > 0 || !l3);
        spin_lock(&l3->list_lock);

        /* See if we can refill from the shared array */
        if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
            l3->shared->touched = 1;
            goto alloc_done;
        }

        while (batchcount > 0) {
            struct list_head *entry;
            struct slab *slabp;
            /* Get slab alloc is to come from. */
            entry = l3->slabs_partial.next;
            if (entry == &l3->slabs_partial) {
                l3->free_touched = 1;
                entry = l3->slabs_free.next;
                if (entry == &l3->slabs_free)
                    goto must_grow;
            }

            slabp = list_entry(entry, struct slab, list);
            check_slabp(cachep, slabp);
            check_spinlock_acquired(cachep);

            /*
             * The slab was either on partial or free list so
             * there must be at least one object available for
             * allocation.
             */
            BUG_ON(slabp->inuse >= cachep->num);

            while (slabp->inuse < cachep->num && batchcount--) {
                STATS_INC_ALLOCED(cachep);
                STATS_INC_ACTIVE(cachep);
                STATS_SET_HIGH(cachep);

                ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                        node));
            }
            check_slabp(cachep, slabp);

            /* move slabp to correct slabp list: */
            list_del(&slabp->list);
            if (slabp->free == BUFCTL_END)
                list_add(&slabp->list, &l3->slabs_full);
            else
                list_add(&slabp->list, &l3->slabs_partial);
        }

    must_grow:
        l3->free_objects -= ac->avail;
    alloc_done:
        spin_unlock(&l3->list_lock);

        if (unlikely(!ac->avail)) {
            int x;
    force_grow:
            x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

            /* cache_grow can reenable interrupts, then ac could change. */
            ac = cpu_cache_get(cachep);
            node = numa_mem_id();

            /* no objects in sight? abort */
            if (!x && (ac->avail == 0 || force_refill))
                return NULL;

            if (!ac->avail)        /* objects refilled by interrupt? */
                goto retry;
        }
        ac->touched = 1;

        return ac_get_obj(cachep, ac, flags, force_refill);
    }
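A side note on the refill loop: slabp = list_entry(entry, struct slab, list) is the kernel's container_of() trick, recovering the struct slab from the address of its embedded list_head. Here is a minimal user-space sketch of that idea (toy types of my own, not the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    /* Same idea as the kernel's container_of()/list_entry(). */
    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct list_head {
        struct list_head *next, *prev;
    };

    /* Toy stand-in for struct slab: the list_head is embedded, not first. */
    struct slab {
        int inuse;
        struct list_head list;
    };

    int main(void)
    {
        struct slab s = { .inuse = 3 };
        struct list_head *entry = &s.list;

        /* Recover the containing struct slab from the embedded node. */
        struct slab *slabp = list_entry(entry, struct slab, list);
        printf("inuse = %d\n", slabp->inuse);   /* prints: inuse = 3 */
        return 0;
    }

This is why the partial/free/full lists can link slabs without any extra allocation: the list node lives inside struct slab itself.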
My question was about the line batchcount = ac->batchcount;. During default initialization, i.e. in kmem_cache_init(), every system cache goes through __kmem_cache_create(), whose setup_cpu_cache() contains this code:


    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
So can I conclude that ac->batchcount is simply 1? In that case ac_put_obj() would put only a single object into the array per refill. If that happened on every allocation, then in __cache_alloc():


    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }
what would be the point of this fast path? With batchcount equal to 1, each refill puts a single object into the array, raising avail from 0 to 1, and the very next allocation drops avail back to 0. That would obviously be very inefficient.
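To see just how bad batchcount == 1 would be, here is a little user-space model (my own toy, not kernel code) of the per-cpu array: allocations pop from the array LIFO-style, and an empty array triggers a refill of batchcount fake objects:

    #include <stdio.h>

    #define LIMIT 120

    static void *stack[LIMIT];
    static int avail;          /* like ac->avail */
    static int refills;        /* how many times we had to "refill" */

    /* Model of cache_alloc_refill: move batchcount objects into the array. */
    static void refill(int batchcount)
    {
        refills++;
        while (avail < batchcount)
            stack[avail++] = (void *)0xdeadbeefUL;   /* fake objects */
    }

    /* Model of __cache_alloc: the fast path pops from the array (LIFO). */
    static void *alloc(int batchcount)
    {
        if (avail == 0)
            refill(batchcount);
        return stack[--avail];
    }

    int main(void)
    {
        int i;

        for (avail = refills = i = 0; i < 1000; i++)
            alloc(1);
        printf("batchcount=1  : %d refills for 1000 allocs\n", refills);

        for (avail = refills = i = 0; i < 1000; i++)
            alloc(60);
        printf("batchcount=60 : %d refills for 1000 allocs\n", refills);
        return 0;
    }

With batchcount 1 every single allocation pays for a refill (1000 refills); with a tuned value of 60 only one allocation in sixty does (17 refills).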
It turned out I simply hadn't read all of the code. Look at this function, which runs after kmem_cache_init() has finished:


    void __init kmem_cache_init_late(void)
    {
        struct kmem_cache *cachep;

        slab_state = UP;

        /* 6) resize the head arrays to their final sizes */
        mutex_lock(&slab_mutex);
        list_for_each_entry(cachep, &slab_caches, list)
            if (enable_cpucache(cachep, GFP_NOWAIT))
                BUG();
        mutex_unlock(&slab_mutex);

        /* Annotate slab for lockdep -- annotate the malloc caches */
        init_lock_keys();

        /* Done! */
        slab_state = FULL;

        /*
         * Register a cpu startup notifier callback that initializes
         * cpu_cache_get for all new cpus
         */
        register_cpu_notifier(&cpucache_notifier);

    #ifdef CONFIG_NUMA
        /*
         * Register a memory hotplug callback that initializes and frees
         * nodelists.
         */
        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
    #endif

        /*
         * The reap timers are started later, with a module init call: That part
         * of the kernel is not yet operational.
         */
    }
This function simply calls enable_cpucache(cachep, GFP_NOWAIT) on every cache on the slab_caches list!


    /* Called with slab_mutex held always */
    static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
    {
        int err;
        int limit = 0;
        int shared = 0;
        int batchcount = 0;

        if (!is_root_cache(cachep)) {
            struct kmem_cache *root = memcg_root_cache(cachep);
            limit = root->limit;
            shared = root->shared;
            batchcount = root->batchcount;
        }

        if (limit && shared && batchcount)
            goto skip_setup;
        /*
         * The head array serves three purposes:
         * - create a LIFO ordering, i.e. return objects that are cache-warm
         * - reduce the number of spinlock operations.
         * - reduce the number of linked list operations on the slab and
         * bufctl chains: array operations are cheaper.
         * The numbers are guessed, we should auto-tune as described by
         * Bonwick.
         */
        if (cachep->size > 131072)             /* objects larger than 128K get a limit of 1 */
            limit = 1;
        else if (cachep->size > PAGE_SIZE)
            limit = 8;
        else if (cachep->size > 1024)
            limit = 24;
        else if (cachep->size > 256)
            limit = 54;
        else
            limit = 120;

        /*
         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
         * allocation behaviour: Most allocs on one cpu, most free operations
         * on another cpu. For these cases, an efficient object passing between
         * cpus is necessary. This is provided by a shared array. The array
         * replaces Bonwick's magazine layer.
         * On uniprocessor, it's functionally equivalent (but less efficient)
         * to a larger limit. Thus disabled by default.
         */
        shared = 0;
        if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)   /* shared is 8 on SMP, 0 on UP */
            shared = 8;

    #if DEBUG
        /*
         * With debugging enabled, large batchcount lead to excessively long
         * periods with disabled local interrupts. Limit the batchcount
         */
        if (limit > 32)
            limit = 32;
    #endif
        batchcount = (limit + 1) / 2;
    skip_setup:
        err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);   /* install the tuned values into the cache */
        if (err)
            printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
                   cachep->name, -err);
        return err;
    }
So here we see how limit, shared and batchcount get their real values.
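Those thresholds are easy to tabulate. The sketch below (a user-space helper of my own, not a kernel function; PAGE_SIZE assumed to be 4096) reproduces the limit/batchcount/shared policy for a few object sizes. Compare the output with the tunables columns in the /proc/slabinfo dump further down:

    #include <stdio.h>

    #define PAGE_SIZE 4096   /* assumed; matches most configurations */

    /* Reproduce the sizing policy from enable_cpucache(). */
    static void tune(int size, int smp)
    {
        int limit, shared, batchcount;

        if (size > 131072)           limit = 1;
        else if (size > PAGE_SIZE)   limit = 8;
        else if (size > 1024)        limit = 24;
        else if (size > 256)         limit = 54;
        else                         limit = 120;

        shared = (size <= PAGE_SIZE && smp) ? 8 : 0;
        batchcount = (limit + 1) / 2;

        printf("size %6d -> limit %3d batchcount %2d shared %d\n",
               size, limit, batchcount, shared);
    }

    int main(void)
    {
        int sizes[] = { 32, 256, 1024, 2048, 8192, 200000 };
        for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
            tune(sizes[i], 1);   /* assume an SMP machine */
        return 0;
    }

For example, 2048-byte objects get limit 24 and batchcount 12, which is exactly what the size-2048 line in the dump below reports.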


    static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                    int batchcount, int shared, gfp_t gfp)
    {
        int ret;
        struct kmem_cache *c = NULL;
        int i = 0;

        ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);   /* tune the cache that was passed in */

        if (slab_state < FULL)
            return ret;

        if ((ret < 0) || !is_root_cache(cachep))
            return ret;

        VM_BUG_ON(!mutex_is_locked(&slab_mutex));
        for_each_memcg_cache_index(i) {
            c = cache_from_memcg(cachep, i);
            if (c)
                /* return value determined by the parent cache only */
                __do_tune_cpucache(c, limit, batchcount, shared, gfp);
        }

        return ret;
    }
The actual work happens in __do_tune_cpucache(). First, the small helper structure it uses:


    struct ccupdate_struct {
        struct kmem_cache *cachep;
        struct array_cache *new[0];
    };

    /* Always called with the slab_mutex held */
    static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                    int batchcount, int shared, gfp_t gfp)
    {
        struct ccupdate_struct *new;
        int i;

        new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),   /* new is freed before this function returns: it is only a staging area */
             gfp);
        if (!new)
            return -ENOMEM;

        for_each_online_cpu(i) {
            new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                            batchcount, gfp);
            if (!new->new[i]) {
                for (i--; i >= 0; i--)
                    kfree(new->new[i]);
                kfree(new);
                return -ENOMEM;
            }
        }
        new->cachep = cachep;

        on_each_cpu(do_ccupdate_local, (void *)new, 1);   /* key point: do_ccupdate_local runs on every cpu with new */

        check_irq_on();
        cachep->batchcount = batchcount;
        cachep->limit = limit;
        cachep->shared = shared;

        for_each_online_cpu(i) {
            struct array_cache *ccold = new->new[i];
            if (!ccold)
                continue;
            spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
            free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));   /* drain the old array back to the slab lists */
            spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
            kfree(ccold);
        }
        kfree(new);
        return alloc_kmemlist(cachep, gfp);
    }
Now let's see what do_ccupdate_local() does:


    static void do_ccupdate_local(void *info)
    {
        struct ccupdate_struct *new = info;
        struct array_cache *old;

        check_irq_off();
        old = cpu_cache_get(new->cachep);

        /*
         * new->cachep already points at our cache, so this redirects the
         * cache's per-cpu array pointer to the newly allocated array. The
         * new arrays were initialized when they were allocated, in
         * alloc_arraycache() below.
         */
        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
        new->new[smp_processor_id()] = old;   /* hand the old array back to the caller */
    }

And the per-cpu arrays were set up in alloc_arraycache(), with exactly the limit and batchcount that enable_cpucache() chose:

    static struct array_cache *alloc_arraycache(int node, int entries,
                         int batchcount, gfp_t gfp)
    {
        int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
        struct array_cache *nc = NULL;

        nc = kmalloc_node(memsize, gfp, node);
        /*
         * The array_cache structures contain pointers to free object.
         * However, when such objects are allocated or transferred to another
         * cache the pointers are not cleared and they could be counted as
         * valid references during a kmemleak scan. Therefore, kmemleak must
         * not scan such objects.
         */
        kmemleak_no_scan(nc);
        if (nc) {
            nc->avail = 0;
            nc->limit = entries;
            nc->batchcount = batchcount;
            nc->touched = 0;
            spin_lock_init(&nc->lock);
        }
        return nc;
    }
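The elegant part is that do_ccupdate_local() copies nothing: it swaps the per-cpu array pointer with the freshly allocated one staged in new->new[cpu], and the old array travels back to the caller to be drained and freed. A user-space sketch of that swap pattern (illustrative names of my own; in the kernel the swap runs on each cpu with interrupts off, here it is a plain loop):

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 2

    struct array_cache { int limit; };

    /* Per-"cpu" array pointers, like cachep->array[]. */
    static struct array_cache *cache_array[NR_CPUS];

    /* Like do_ccupdate_local(): swap in the new array, hand back the old. */
    static void ccupdate(struct array_cache *staged[NR_CPUS])
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            struct array_cache *old = cache_array[cpu];
            cache_array[cpu] = staged[cpu];
            staged[cpu] = old;            /* caller drains and frees these */
        }
    }

    int main(void)
    {
        struct array_cache *staged[NR_CPUS];

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            cache_array[cpu] = calloc(1, sizeof(struct array_cache));
            cache_array[cpu]->limit = 1;      /* boot-time default */
            staged[cpu] = calloc(1, sizeof(struct array_cache));
            staged[cpu]->limit = 120;         /* tuned value */
        }

        ccupdate(staged);
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            printf("cpu%d: limit %d (old array had limit %d)\n",
                   cpu, cache_array[cpu]->limit, staged[cpu]->limit);
            free(staged[cpu]);                /* free the old arrays */
            free(cache_array[cpu]);
        }
        return 0;
    }

After the swap, __cache_alloc() and cache_alloc_refill() on each cpu immediately see the tuned limit and batchcount.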
This connects back to cache_alloc_refill(): from then on, ac->batchcount is the tuned value, not 1. We can check the tuned values on a real kernel running the slab allocator:


    cat /proc/slabinfo
    slabinfo - version: 2.1
    # name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
    nf_conntrack_expect    0      0    152   26    1 : tunables  120   60    8 : slabdata      0      0      0
    nf_conntrack_8050c5f0  2     26    296   13    1 : tunables   54   27    8 : slabdata      2      2      0
    bridge_fdb_cache       4     78     48   78    1 : tunables  120   60    8 : slabdata      1      1      0
    fib6_nodes            12    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
    ip6_dst_cache         25     57    208   19    1 : tunables  120   60    8 : slabdata      3      3      0
    ip6_mrt_cache          0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
    RAWv6                  8     15    720    5    1 : tunables   54   27    8 : slabdata      3      3      0
    UDPLITEv6              0      0    688   11    2 : tunables   54   27    8 : slabdata      0      0      0
    UDPv6                  3     22    688   11    2 : tunables   54   27    8 : slabdata      2      2      0
    tw_sock_TCPv6          0      0    144   27    1 : tunables  120   60    8 : slabdata      0      0      0
    request_sock_TCPv6     0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
    TCPv6                  5      6   1328    3    1 : tunables   24   12    8 : slabdata      2      2      0
    ubi_wl_entry_slab    463    580     24  145    1 : tunables  120   60    8 : slabdata      4      4      0
    sd_ext_cdb             2    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
    fuse_request           0      0    384   10    1 : tunables   54   27    8 : slabdata      0      0      0
    fuse_inode             0      0    416    9    1 : tunables   54   27    8 : slabdata      0      0      0
    jffs2_inode_cache     15    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0
    jffs2_node_frag      130    290     24  145    1 : tunables  120   60    8 : slabdata      2      2      0
    uid_cache              0      0     48   78    1 : tunables  120   60    8 : slabdata      0      0      0
    UNIX                  24     32    480    8    1 : tunables   54   27    8 : slabdata      4      4      0
    ip_mrt_cache           0      0     96   40    1 : tunables  120   60    8 : slabdata      0      0      0
    UDP-Lite               0      0    560    7    1 : tunables   54   27    8 : slabdata      0      0      0
    tcp_bind_bucket        6    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
    inet_peer_cache        8     24    160   24    1 : tunables  120   60    8 : slabdata      1      1      0
    ip_fib_trie            7    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0
    ip_fib_alias           8    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0
    ip_dst_cache           6     27    144   27    1 : tunables  120   60    8 : slabdata      1      1      0
    PING                   0      0    528    7    1 : tunables   54   27    8 : slabdata      0      0      0
    RAW                    4      7    544    7    1 : tunables   54   27    8 : slabdata      1      1      0
    UDP                   13     14    560    7    1 : tunables   54   27    8 : slabdata      2      2      0
    tw_sock_TCP            0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0
    request_sock_TCP       0      0     80   48    1 : tunables  120   60    8 : slabdata      0      0      0
    TCP                    1      6   1184    6    2 : tunables   24   12    8 : slabdata      1      1      0
    ......
    size-2048(DMA)         0      0   2048    2    1 : tunables   24   12    8 : slabdata      0      0      0
    size-2048            192    192   2048    2    1 : tunables   24   12    8 : slabdata     96     96      0
    size-1024(DMA)         0      0   1024    4    1 : tunables   54   27    8 : slabdata      0      0      0
    size-1024            215    216   1024    4    1 : tunables   54   27    8 : slabdata     54     54      0
    size-512(DMA)          0      0    512    8    1 : tunables   54   27    8 : slabdata      0      0      0
    size-512             601    624    512    8    1 : tunables   54   27    8 : slabdata     78     78      0
    size-256(DMA)          0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0
    size-256            1234   1245    256   15    1 : tunables  120   60    8 : slabdata     83     83      0
    size-192(DMA)          0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0
    size-192             287    300    256   15    1 : tunables  120   60    8 : slabdata     20     20      0
    size-128(DMA)          0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
    size-128            1890   1890    128   30    1 : tunables  120   60    8 : slabdata     63     63      0
    size-96(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
    size-96              930    930    128   30    1 : tunables  120   60    8 : slabdata     31     31      0
    size-64(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
    size-32(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0
    size-64             1577   1650    128   30    1 : tunables  120   60    8 : slabdata     55     55      0
    size-32             6213   6300    128   30    1 : tunables  120   60    8 : slabdata    210    210      0
    kmem_cache           150    160     96   40    1 : tunables  120   60    8 : slabdata      4      4      0


Notice how the tunables columns line up with enable_cpucache(): objects up to 256 bytes get "120 60 8", objects up to 1024 bytes get "54 27 8", larger ones (up to a page) get "24 12 8", and in every case batchcount = (limit + 1) / 2. If you look at an Ubuntu system instead, you may find that limit and batchcount are 0; that is because it uses the SLUB allocator, where kmem_cache_init_late() in slub.c is empty:


    void __init kmem_cache_init_late(void)
    {
    }
Incidentally, here is a quick comparison of slab, slub and slob (for the details, see the kernel sources slab.c, slub.c and slob.c):
slab is the foundation that both slub and slob build on.
SLOB targets embedded systems, mainly those with very limited memory, say 32MB or less. It pays little attention to large SMP systems, although there have been some small improvements in that direction recently.
The SLUB allocator was written to replace the slab code. By eliminating the large number of queues and their associated overhead and simplifying the slab structure, SLUB promises better performance and better system scalability while keeping the existing slab allocator interface.
With all that said, here is a simple diagram of the slab mechanism:

[figure: overview of the slab mechanism]

