2011年(38)
分类: LINUX
2011-03-03 16:50:32
创建slab系统顶层的cache节点。创建完成后,cache里并没有任何slab以及对象,只有当分配对象,并且cache中没有空闲对象时,才会创建新的slab。
参数:
1) name:cache的名字。
2) size:对象的大小。
3) align:对齐方式。
4) flags:标志。
5) ctor:构造函数指针。
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
gfp_t gfp;
/*
* Sanity checks... these are all serious usage bugs.
*/
/* Reject NULL names, interrupt context and absurd object sizes. */
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
size > KMALLOC_MAX_SIZE) {
printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
name);
BUG();
}
/*
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well. Please see cpuup_callback
*/
/* Once the slab allocator is up, other CPUs may run concurrently, so
* pin cpu hotplug and take the cache-chain mutex. During early boot
* only the boot CPU is running and no locking is needed. */
if (slab_is_available()) {
get_online_cpus();
mutex_lock(&cache_chain_mutex);
}
/* Walk the existing caches for some validity checks. */
list_for_each_entry(pc, &cache_chain, next) {
char tmp;
int res;
/*
* This happens when the module gets unloaded and doesn't
* destroy its slab cache and no-one else reuses the vmalloc
* area of the module. Print a warning.
*/
/* Probe that the cache's name pointer is still readable. */
res = probe_kernel_address(pc->name, tmp);
if (res) {
printk(KERN_ERR
"SLAB: cache with size %d has lost its name\n",
pc->buffer_size);
continue;
}
/* Refuse to create two caches with the same name. */
if (!strcmp(pc->name, name)) {
printk(KERN_ERR
"kmem_cache_create: duplicate cache %s\n", name);
dump_stack();
goto oops;
}
}
……
/* Pick allocation flags depending on allocator readiness. */
if (slab_is_available())
gfp = GFP_KERNEL;
else
/* Before the allocator is fully up we must not sleep, and
* allocations come from low memory only. */
gfp = GFP_NOWAIT;
/* Get cache's description obj. */
/* Allocate the struct kmem_cache descriptor from cache_cache. */
cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;
……
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
/* Decide between on-slab and off-slab management. Objects of at
* least PAGE_SIZE/8 (512 bytes with 4K pages) normally get off-slab
* management; during early init on-slab is forced (slab_early_init,
* see kmem_cache_init). */
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
/* Round the object size up to the alignment. */
size = ALIGN(size, align);
/* Choose slab order and objects per slab; returns the leftover
* (fragment) bytes per slab. */
left_over = calculate_slab_order(cachep, size, align, flags);
/* cachep->num is the number of objects per slab; zero means no
* usable geometry was found and cache creation fails. */
if (!cachep->num) {
printk(KERN_ERR
"kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
cachep = NULL;
goto oops;
}
/* Size of the slab management area: one struct slab plus one
* kmem_bufctl_t per object. */
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
/* Off-slab management that actually fits in the leftover space is
* converted back to on-slab, reusing the fragment. */
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
/* Drop the off-slab flag. */
flags &= ~CFLGS_OFF_SLAB;
/* The management area now consumes part of the leftover. */
left_over -= slab_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
/* @align applies to the slab objects; externally stored
* management data cannot shift the objects, so no padding. */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
/* Colour offset granularity is one cache line. */
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < align)
cachep->colour_off = align;
/* Number of colour blocks the leftover space can provide. */
cachep->colour = left_over / cachep->colour_off;
/* Size of the slab management area. */
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->gfpflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->gfpflags |= GFP_DMA;
/* Final per-object size. */
cachep->buffer_size = size;
/* Reciprocal used for fast object-index math, see obj_to_index. */
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
/* Look up the general (kmalloc) cache the off-slab management
* objects will be allocated from; stays NULL for on-slab. */
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
/* Object constructor. */
cachep->ctor = ctor;
/* Cache name. */
cachep->name = name;
/* Set up the per-cpu local caches (and node lists). */
if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
}
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
if (slab_is_available()) {
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
}
return cachep;
}
calculate_slab_order
计算slab由几个页面组成,同时计算每个slab中有多少对象。
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
/* Try increasing page orders until an acceptable geometry is found. */
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
/* How many objects fit at this order, and how much is wasted? */
cache_estimate(gfporder, size, align, flags, &remainder, &num);
/* Not even one object fits at this order: try the next. */
if (!num)
continue;
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
/* Growing an off-slab cache allocates its management object
* (struct slab + kmem_bufctl_t array) through the normal object
* allocation path:
* kmem_cache_alloc -> __cache_alloc -> __do_cache_alloc ->
* ____cache_alloc -> cache_alloc_refill -> cache_grow ->
* alloc_slabmgmt -> kmem_cache_alloc_node -> kmem_cache_alloc
* If the management object itself also needed off-slab
* management, alloc_slabmgmt would recurse forever. That can
* only happen when the kmem_bufctl_t array (i.e. the object
* count) is large, so cap the object count. The bound below is
* deliberately rough: assume a management object of size 'size'
* would certainly be off-slab, and compute how many
* kmem_bufctl_t entries fit in it. Staying under this bound
* does not strictly guarantee on-slab management, but
* slab_break_gfp_order (usually 1, i.e. at most two pages per
* slab) keeps off-slab caches - which hold only objects larger
* than 512 bytes - small enough that the rough check suffices.
*/
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
/* Too many objects: stop trying larger orders. The geometry
* saved on the previous iteration remains valid. */
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
/* Objects per slab. */
cachep->num = num;
/* Order: the slab spans 2^gfporder pages. */
cachep->gfporder = gfporder;
/* Leftover (fragment) bytes in the slab. */
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
/* SLAB_RECLAIM_ACCOUNT pages are accounted as reclaimable (they
* can be given back via kmem_freepages), so internal
* fragmentation does not matter: accept this order as-is. */
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
/* slab_break_gfp_order caps pages per slab: past it, stop
* regardless of fragmentation. */
if (gfporder >= slab_break_gfp_order)
break;
/*
* Acceptable internal fragmentation?
*/
/* Waste is at most 1/8 of the slab: utilisation is good enough. */
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
/* Return the fragment size for the chosen order. */
return left_over;
}
cache_estimate
计算每个slab中对象的数目。
参数:
1) gfporder:slab由2^gfporder个页面组成。
2) buffer_size:对象的大小。
3) align:对象的对齐方式。
4) flags:内置式slab还是外置式slab。
5) left_over:slab中浪费空间的大小。
6) num:slab中的对象数目。
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
{
	/* Total bytes in a slab of 2^gfporder pages. */
	size_t total = PAGE_SIZE << gfporder;
	size_t mgmt = 0;
	int objs;

	/*
	 * On-slab caches carve the management data (one struct slab plus
	 * one kmem_bufctl_t per object, padded to @align) out of the slab
	 * itself. Off-slab caches keep it elsewhere, and since slabs are
	 * page aligned no extra alignment padding is needed - the whole
	 * slab is available for objects.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		objs = total / buffer_size;
	} else {
		/*
		 * First guess ignores the alignment padding. The padding is
		 * at most @align-1 bytes while @buffer_size is at least
		 * @align, so the guess can be at most one too high.
		 */
		objs = (total - sizeof(struct slab)) /
			(buffer_size + sizeof(kmem_bufctl_t));
		/* Correct the guess if everything doesn't actually fit. */
		if (slab_mgmt_size(objs, align) + objs * buffer_size > total)
			objs--;
	}
	/* Hard cap on objects per slab. */
	if (objs > SLAB_LIMIT)
		objs = SLAB_LIMIT;
	/* Aligned size of the on-slab management area (0 for off-slab). */
	if (!(flags & CFLGS_OFF_SLAB))
		mgmt = slab_mgmt_size(objs, align);

	/* Report object count and the wasted (fragment) bytes. */
	*num = objs;
	*left_over = total - objs * buffer_size - mgmt;
}
kmem_find_general_cachep
根据给定的大小,在general cache(即malloc_sizes表对应的kmalloc cache)中查找合适的cache,并非分配一个struct kmem_cache对象。直接调用__find_general_cachep。
static inline struct kmem_cache *__find_general_cachep(size_t size,
gfp_t gfpflags)
{
	struct cache_sizes *sizes = malloc_sizes;
#if DEBUG
	/* Catch callers that use kmem_cache_create() or __kmalloc()
	 * before the generic caches exist. */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	/* Zero-byte requests get the distinguished ZERO_SIZE_PTR. */
	if (!size)
		return ZERO_SIZE_PTR;

	/* Walk the ascending size table to the first entry that fits. */
	while (size > sizes->cs_size)
		sizes++;

	/*
	 * Really subtle: the sentinel entry with cs_size == ULONG_MAX has
	 * cs_cachep/cs_dmacachep == NULL, so oversized requests fall out
	 * as NULL with no special casing.
	 */
#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return sizes->cs_dmacachep;
#endif
	/* General cache for this size class. */
	return sizes->cs_cachep;
}
setup_cpu_cache
配置local cache和slab三链。
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
/* General caches fully up: do the normal per-cpu configuration. */
if (g_cpucache_up == FULL)
return enable_cpucache(cachep, gfp);
/* Bootstrap path. g_cpucache_up tracks how far general-cache init has
* progressed: PARTIAL_AC means the cache holding struct array_cache
* exists, PARTIAL_L3 means the cache holding struct kmem_list3 exists
* as well (the array_cache cache is always created first). During
* bootstrap only the boot cpu's local cache and the node lists need
* to be set up. */
if (g_cpucache_up == NONE) {
/*
* Note: the first kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
/* We are creating the cache that will itself hold struct
* array_cache objects, so that cache does not exist yet: fall
* back to the statically allocated initarray_generic. */
cachep->array[smp_processor_id()] = &initarray_generic.cache;
/*
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
* the first cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/
/* The kmem_list3 cache is created after the array_cache one,
* so it cannot exist yet either: use the static node lists. */
set_up_list3s(cachep, SIZE_AC);
/* The array_cache cache now exists. If kmem_list3 shares the
* same general cache, it will not be created separately, so the
* progress marker can jump straight to PARTIAL_L3. */
if (INDEX_AC == INDEX_L3)
g_cpucache_up = PARTIAL_L3;
else
g_cpucache_up = PARTIAL_AC;
} else {
/* At least PARTIAL_AC: struct array_cache can be kmalloc'ed
* from its general cache now. */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
if (g_cpucache_up == PARTIAL_AC) {
/* The kmem_list3 cache is still missing: keep using the
* static node lists for this cache. */
set_up_list3s(cachep, SIZE_L3);
/* Only the creation of the kmem_list3 cache itself reaches
* this point (see kmem_cache_init); once done, kmem_list3
* objects can be kmalloc'ed too - advance the marker. */
g_cpucache_up = PARTIAL_L3;
} else {
/* Both bootstrap caches exist: kmalloc per-node lists. */
int node;
for_each_online_node(node) {
/* Allocate a struct kmem_list3 on each node. */
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
gfp, node);
BUG_ON(!cachep->nodelists[node]);
/* Initialise the three slab lists (all empty). */
kmem_list3_init(cachep->nodelists[node]);
}
}
}
/* Stagger reap timers between caches via the pointer value. */
cachep->nodelists[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
cpu_cache_get(cachep)->batchcount = 1;
cpu_cache_get(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
return 0;
}
enable_cpucache
使能local cache。
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	size_t obj_size = cachep->buffer_size;
	int limit, shared;
	int err;

	/*
	 * The per-cpu head array gives LIFO (cache-warm) object reuse and
	 * cuts down on spinlock and freelist traffic. The depths below are
	 * Bonwick-style guesses keyed on object size: big objects get a
	 * shallow array, small objects a deep one.
	 */
	if (obj_size > 131072)
		limit = 1;
	else if (obj_size > PAGE_SIZE)
		limit = 8;
	else if (obj_size > 1024)
		limit = 24;
	else if (obj_size > 256)
		limit = 54;
	else
		limit = 120;

	/*
	 * On SMP, a shared array passes objects efficiently between cpus
	 * that allocate and free on different processors (it replaces
	 * Bonwick's magazine layer). On UP it would just act as a larger
	 * limit, so it stays disabled.
	 */
	shared = 0;
	if (obj_size <= PAGE_SIZE && num_possible_cpus() > 1)
		shared = 8;

#if DEBUG
	/* Debug builds keep interrupts off longer per batch; cap it. */
	if (limit > 32)
		limit = 32;
#endif
	/* Apply the tuning; batchcount is roughly half the limit. */
	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}
do_tune_cpucache
配置local cache、shared local cache和slab三链。
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
int i;
new = kzalloc(sizeof(*new), gfp);
if (!new)
return -ENOMEM;
/* Allocate a fresh struct array_cache for every online cpu; unwind
* on failure. */
for_each_online_cpu(i) {
new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
kfree(new->new[i]);
kfree(new);
return -ENOMEM;
}
}
new->cachep = cachep;
/* Swap in the new array_cache on each cpu. This must replace even
* stale local caches left over from cpu hotplug: an offline cpu's
* local cache is not freed (see __kmem_cache_destroy), and updating
* caches on cpu-up does not help either. Scenario: cpus A and B; B
* goes down; cache X is destroyed, but B's local cache for X
* survives because B is offline. B comes back up and every cache on
* cache_chain gets refreshed - but X's descriptor has already been
* returned to cache_cache, so its B-slot is not touched. When that
* descriptor is later reused for a new cache, the new cache inherits
* B's stale local cache, which is replaced right here.
*/
on_each_cpu(do_ccupdate_local, (void *)new, 1);
check_irq_on();
cachep->batchcount = batchcount;
cachep->limit = limit;
cachep->shared = shared;
/* Tear down the old local caches returned by the swap. */
for_each_online_cpu(i) {
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
/* Give the objects held in the old local cache back to the
* node lists. */
free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
/* Free the old struct array_cache itself. */
kfree(ccold);
}
kfree(new);
/* Set up the shared local caches and the per-node slab lists. */
return alloc_kmemlist(cachep, gfp);
}
alloc_arraycache
分配struct array_cache对象。
static struct array_cache *alloc_arraycache(int node, int entries,
int batchcount, gfp_t gfp)
{
	/* One allocation covers the header plus the trailing entry[]
	 * pointer array. */
	int memsize = sizeof(struct array_cache) + entries * sizeof(void *);
	struct array_cache *ac;

	/* kmalloc_node draws the object from a general cache on @node. */
	ac = kmalloc_node(memsize, gfp, node);
	/*
	 * entry[] holds pointers to free objects, and those slots are not
	 * cleared when objects are handed out or moved; kmemleak must not
	 * scan them or it would count stale pointers as live references.
	 */
	kmemleak_no_scan(ac);
	if (ac) {
		/* Fresh local cache: empty, with the requested tuning. */
		ac->avail = 0;
		ac->limit = entries;
		ac->batchcount = batchcount;
		ac->touched = 0;
		spin_lock_init(&ac->lock);
	}
	return ac;
}
do_ccupdate_local
更新每个cpu的struct array_cache对象。
static void do_ccupdate_local(void *info)
{
	struct ccupdate_struct *cc = info;
	struct array_cache *prev;

	/* Runs via on_each_cpu with interrupts disabled. */
	check_irq_off();
	prev = cpu_cache_get(cc->cachep);
	/* Install this cpu's new array_cache... */
	cc->cachep->array[smp_processor_id()] = cc->new[smp_processor_id()];
	/* ...and hand the old one back through cc->new[] so the caller
	 * can drain and free it. */
	cc->new[smp_processor_id()] = prev;
}
alloc_kmemlist
初始化shared local cache和slab三链,初始化完成后,slab三链中没有任何slab。
static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_list3 *l3;
struct array_cache *new_shared;
struct array_cache **new_alien = NULL;
for_each_online_node(node) {
/* NUMA: alien caches hold objects freed on this node that
* belong to remote nodes. */
if (use_alien_caches) {
new_alien = alloc_alien_cache(node, cachep->limit, gfp);
if (!new_alien)
goto fail;
}
new_shared = NULL;
if (cachep->shared) {
/* Allocate this node's shared local cache. 0xbaadf00d is a
* poison batchcount - it is never used for a shared array. */
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
0xbaadf00d, gfp);
if (!new_shared) {
free_alien_cache(new_alien);
goto fail;
}
}
/* Does this node already have its three slab lists? */
l3 = cachep->nodelists[node];
if (l3) {
/* Yes: replace the old resources in place. */
struct array_cache *shared = l3->shared;
spin_lock_irq(&l3->list_lock);
/* Drain objects held in the old shared local cache. */
if (shared)
free_block(cachep, shared->entry,
shared->avail, node);
/* Install the new shared local cache. */
l3->shared = new_shared;
if (!l3->alien) {
l3->alien = new_alien;
new_alien = NULL;
}
/* Upper bound on free objects kept on this node. */
l3->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
spin_unlock_irq(&l3->list_lock);
/* Free the old shared struct array_cache itself. */
kfree(shared);
/* NUMA: drop the alien caches if they were not installed. */
free_alien_cache(new_alien);
continue;
}
/* No lists yet: allocate fresh kmem_list3 on this node. */
l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
if (!l3) {
free_alien_cache(new_alien);
kfree(new_shared);
goto fail;
}
/* Initialise the three slab lists (all empty). */
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
l3->shared = new_shared;
l3->alien = new_alien;
l3->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
cachep->nodelists[node] = l3;
}
return 0;
fail:
if (!cachep->next.next) {
/* Cache is not active yet. Roll back what we did */
node--;
while (node >= 0) {
if (cachep->nodelists[node]) {
l3 = cachep->nodelists[node];
kfree(l3->shared);
free_alien_cache(l3->alien);
kfree(l3);
cachep->nodelists[node] = NULL;
}
node--;
}
}
return -ENOMEM;
}
set_up_list3s
设置cache的slab三链指向静态分配的全局变量。
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
	int node;

	/* On UMA there is exactly one node. */
	for_each_online_node(node) {
		/* Point the node at its statically allocated bootstrap
		 * kmem_list3 (initkmem_list3 is only used during init). */
		struct kmem_list3 *l3 = &initkmem_list3[index + node];

		cachep->nodelists[node] = l3;
		/* Stagger reap timers between caches via the pointer. */
		l3->next_reap = jiffies +
			REAPTIMEOUT_LIST3 +
			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}