Kernel提供了kmem_cache_create函数用于创建Cache,下面我们直接从API入手。
函数有点长,逐行分析一下。
-
/**
-
* kmem_cache_create - Create a cache.
-
* @name: A string which is used in /proc/slabinfo to identify this cache.
-
* @size: The size of objects to be created in this cache.
-
* @align: The required alignment for the objects.
-
* @flags: SLAB flags
-
* @ctor: A constructor for the objects.
-
*
-
* Returns a ptr to the cache on success, NULL on failure.
-
* Cannot be called within a int, but can be interrupted.
-
* The @ctor is run when new pages are allocated by the cache.
-
*
-
* @name must be valid until the cache is destroyed. This implies that
-
* the module calling this has to destroy the cache before getting unloaded.
-
* Note that kmem_cache_name() is not guaranteed to return the same pointer,
-
* therefore applications must manage it themselves.
-
*
-
* The flags are
-
*
-
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
-
* to catch references to uninitialised memory.
-
*
-
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
-
* for buffer overruns.
-
*
-
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
-
* cacheline. This can be beneficial if you're counting cycles as closely
-
* as davem.
-
*/
-
/× 创建成功后,cache中没有任何slab及对象,当分配对象时才会创建新的slab ×/
-
struct kmem_cache *
-
kmem_cache_create (const char *name, size_t size, size_t align,
-
unsigned long flags, void (*ctor)(void *))
-
{
-
size_t left_over, slab_size, ralign;
-
struct kmem_cache *cachep = NULL, *pc;
-
gfp_t gfp;
-
-
/*
-
* Sanity checks... these are all serious usage bugs.
-
*/
-
/× cache未指定名字,在中断上下文,对象大小小于sizeof(void ×),对象大小大于KMALLOC_MAX_SIZE,则报错 ×/
-
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
-
size > KMALLOC_MAX_SIZE) {
-
printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
-
name);
-
BUG();
-
}
-
-
/*
-
* We use cache_chain_mutex to ensure a consistent view of
-
* cpu_online_mask as well. Please see cpuup_callback
-
*/
-
/× 判断slab释放已经初始化好,g_cpucache_up >= EARLY,见前文的初始化分析
-
如果是内核启动阶段,因为只有一个cpu在执行初始化的操作,所以不需要加锁 ×/
-
if (slab_is_available()) {
-
get_online_cpus();
-
mutex_lock(&cache_chain_mutex);
-
}
-
/× 所有创建的cache都连接在cache_chain链表上,遍历链表检查是否有重名的cache ×/
-
list_for_each_entry(pc, &cache_chain, next) {
-
char tmp;
-
int res;
-
-
/*
-
* This happens when the module gets unloaded and doesn't
-
* destroy its slab cache and no-one else reuses the vmalloc
-
* area of the module. Print a warning.
-
*/
-
/* 检查cache是否都有名字,没有名字则告警,并跳过 */
-
res = probe_kernel_address(pc->name, tmp);
-
if (res) {
-
printk(KERN_ERR
-
"SLAB: cache with size %d has lost its name\n",
-
pc->buffer_size);
-
continue;
-
}
-
/× 检查是否存在名字冲突的cache ×/
-
if (!strcmp(pc->name, name)) {
-
printk(KERN_ERR
-
"kmem_cache_create: duplicate cache %s\n", name);
-
dump_stack();
-
goto oops;
-
}
-
}
-
-
#if DEBUG // 调试,跳过
-
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
-
#if FORCED_DEBUG // 调试,跳过
-
/*
-
* Enable redzoning and last user accounting, except for caches with
-
* large objects, if the increased size would increase the object size
-
* above the next power of two: caches with object sizes just above a
-
* power of two have a significant amount of internal fragmentation.
-
*/
-
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
-
2 * sizeof(unsigned long long)))
-
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
-
if (!(flags & SLAB_DESTROY_BY_RCU))
-
flags |= SLAB_POISON;
-
#endif
-
if (flags & SLAB_DESTROY_BY_RCU)
-
BUG_ON(flags & SLAB_POISON);
-
#endif
-
/*
-
* Always checks flags, a caller might be expecting debug support which
-
* isn't available.
-
*/
-
BUG_ON(flags & ~CREATE_MASK);
-
-
/*
-
* Check that size is in terms of words. This is needed to avoid
-
* unaligned accesses for some archs when redzoning is used, and makes
-
* sure any on-slab bufctl's are also correctly aligned.
-
*/
-
/* size 按照BYTES_PER_WORD对齐 */
-
if (size & (BYTES_PER_WORD - 1)) {
-
size += (BYTES_PER_WORD - 1);
-
size &= ~(BYTES_PER_WORD - 1);
-
}
-
-
/* calculate the final buffer alignment: */
-
-
/* 1) arch recommendation: can be overridden for debug */
-
/× 与硬件高速缓存行的cache_line_size对齐,根据size的大小决定对齐的单位 ×/
-
if (flags & SLAB_HWCACHE_ALIGN) {
-
/*
-
* Default alignment: as specified by the arch code. Except if
-
* an object is really small, then squeeze multiple objects into
-
* one cacheline.
-
*/
-
ralign = cache_line_size();
-
while (size <= ralign / 2)
-
ralign /= 2;
-
} else {
-
ralign = BYTES_PER_WORD;
-
}
-
-
/*
-
* Redzoning and user store require word alignment or possibly larger.
-
* Note this will be overridden by architecture or caller mandated
-
* alignment if either is greater than BYTES_PER_WORD.
-
*/
-
if (flags & SLAB_STORE_USER)
-
ralign = BYTES_PER_WORD;
-
-
if (flags & SLAB_RED_ZONE) {
-
ralign = REDZONE_ALIGN;
-
/* If redzoning, ensure that the second redzone is suitably
-
* aligned, by adjusting the object size accordingly. */
-
size += REDZONE_ALIGN - 1;
-
size &= ~(REDZONE_ALIGN - 1);
-
}
-
-
/* 2) arch mandated alignment */
-
if (ralign < ARCH_SLAB_MINALIGN) {
-
ralign = ARCH_SLAB_MINALIGN;
-
}
-
/* 3) caller mandated alignment */
-
if (ralign < align) {
-
ralign = align;
-
}
-
/* disable debug if necessary */
-
if (ralign > __alignof__(unsigned long long))
-
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-
/*
-
* 4) Store it.
-
*/
-
/× 存储对齐但 ×/
-
align = ralign;
-
-
/× 确定slab是否可以使用,GFP_KERNEL允许申请时睡眠 ×/
-
if (slab_is_available())
-
gfp = GFP_KERNEL;
-
else
-
/× GFP_NOWAIT,在slab初始化完成前使用,不能阻塞,只能在低端内存区分配 ×/
-
gfp = GFP_NOWAIT;
-
-
/* Get cache's description obj. */
-
/× 申请kmem_cache结构,并初始化,cache_cache的对象正是struct kmem_cache结构 ×/
-
cachep = kmem_cache_zalloc(&cache_cache, gfp);
-
if (!cachep)
-
goto oops;
-
-
#if DEBUG
-
cachep->obj_size = size;
-
-
/*
-
* Both debugging options require word-alignment which is calculated
-
* into align above.
-
*/
-
if (flags & SLAB_RED_ZONE) {
-
/* add space for red zone words */
-
cachep->obj_offset += sizeof(unsigned long long);
-
size += 2 * sizeof(unsigned long long);
-
}
-
if (flags & SLAB_STORE_USER) {
-
/* user store requires one word storage behind the end of
-
* the real object. But if the second red zone needs to be
-
* aligned to 64 bits, we must allow that much space.
-
*/
-
if (flags & SLAB_RED_ZONE)
-
size += REDZONE_ALIGN;
-
else
-
size += BYTES_PER_WORD;
-
}
-
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
-
&& cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
-
cachep->obj_offset += PAGE_SIZE - size;
-
size = PAGE_SIZE;
-
}
-
#endif
-
#endif
-
-
/*
-
* Determine if the slab management is 'on' or 'off' slab.
-
* (bootstrapping cannot cope with offslab caches so don't do
-
* it too early on. Always use on-slab management when
-
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
-
*/
-
/× 确定slab管理对象时采用内置还是外置的方式,当对象大小超过512时,采用外置方式;初始化阶段使用内置方式 ×/
-
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
-
!(flags & SLAB_NOLEAKTRACE))
-
/*
-
* Size is large, assume best to place the slab management obj
-
* off-slab (should allow better packing of objs).
-
*/
-
flags |= CFLGS_OFF_SLAB;
-
/× 按照之前计算的对齐单元,调整size的大小 ×/
-
size = ALIGN(size, align);
-
/× 计算slab中碎片的大小 ×/
-
left_over = calculate_slab_order(cachep, size, align, flags);
-
/× num代表了当前cache允许每个slab中存在的对象数,正常不应该为0 ×/
-
if (!cachep->num) {
-
printk(KERN_ERR
-
"kmem_cache_create: couldn't create cache %s.\n", name);
-
kmem_cache_free(&cache_cache, cachep);
-
cachep = NULL;
-
goto oops;
-
}
-
/× 计算slab管理对象的大小,包括slab和kmem_bufctl_t ×/
-
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-
+ sizeof(struct slab), align);
-
-
/*
-
* If the slab has been placed off-slab, and we have enough space then
-
* move it on-slab. This is at the expense of any extra colouring.
-
*/
-
/× 如果碎片大小已经超过了管理对象的大小,并且是slab管理对象外置的话,可以直接移进slab中 ×/
-
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
-
/× 取消外置的标签,此时是内置的 ×/
-
flags &= ~CFLGS_OFF_SLAB;
-
/× 碎片的大小可以减去管理对象的大小了 ×/
-
left_over -= slab_size;
-
}
-
-
/× 如果是外置的,则slab_size按照不对齐的方式重新计算一下大小 ×/
-
if (flags & CFLGS_OFF_SLAB) {
-
/* really off slab. No need for manual alignment */
-
slab_size =
-
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
-
-
#ifdef CONFIG_PAGE_POISONING
-
/* If we're going to use the generic kernel_map_pages()
-
* poisoning, then it's going to smash the contents of
-
* the redzone and userword anyhow, so switch them off.
-
*/
-
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
-
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-
#endif
-
}
-
/× 记录着色块的大小,cache_line_size ×/
-
cachep->colour_off = cache_line_size();
-
/* Offset must be a multiple of the alignment. */
-
if (cachep->colour_off < align)
-
cachep->colour_off = align;
-
/* 计算碎片区需要多少着色块 */
-
cachep->colour = left_over / cachep->colour_off;
-
/* 记录slab管理对象的大小 */
-
cachep->slab_size = slab_size;
-
cachep->flags = flags;
-
cachep->gfpflags = 0;
-
/× 如果当前kernel配置了DMA,并且函数指定了DMA参数,则在cache上打上DMA的标签 ×/
-
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
-
cachep->gfpflags |= GFP_DMA;
-
/× 记录每个slab对象的大小 ×/
-
cachep->buffer_size = size;
-
/× 下面成员用于后续计算对象在slab中的索引 ×/
-
cachep->reciprocal_buffer_size = reciprocal_value(size);
-
-
if (flags & CFLGS_OFF_SLAB) {
-
/× 分配一个slab管理区对象,保存在cachep->slabp_cache中 ×/
-
/× 函数传入的slab_size是管理区对象的大小,如果是slab管理区是外置的,则从slab_size大小的普通cache中申请对象 ×/
-
/* 这里找到对应的kmem_cache并记录下来,如果是内置的,则slabp_cache为NULL */
-
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
-
/*
-
* This is a possibility for one of the malloc_sizes caches.
-
* But since we go off slab only for object size greater than
-
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
-
* this should not happen at all.
-
* But leave a BUG_ON for some lucky dude.
-
*/
-
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
-
}
-
/* 设置构造函数 */
-
cachep->ctor = ctor;
-
/* 记录cache的名字 */
-
cachep->name = name;
-
/* 设置每个cpu上的local cache */
-
if (setup_cpu_cache(cachep, gfp)) {
-
__kmem_cache_destroy(cachep);
-
cachep = NULL;
-
goto oops;
-
}
-
-
/* cache setup completed, link it into the list */
-
/* cache创建完毕,将其加入全局的cache_chain上 */
-
list_add(&cachep->next, &cache_chain);
-
oops:
-
if (!cachep && (flags & SLAB_PANIC))
-
panic("kmem_cache_create(): failed to create slab `%s'\n",
-
name);
-
/* 如果不是初始化阶段,前面曾经加了锁,此处去掉,另,释放cpu热插拔相关计数 */
-
if (slab_is_available()) {
-
mutex_unlock(&cache_chain_mutex);
-
put_online_cpus();
-
}
-
return cachep;
-
}
下面的函数计算每个slab由几个页面组成,以及每个slab中可以容纳多少个对象。
-
/**
-
* calculate_slab_order - calculate size (page order) of slabs
-
* @cachep: pointer to the cache that is being created
-
* @size: size of objects to be created in this cache.
-
* @align: required alignment for the objects.
-
* @flags: slab allocation flags
-
*
-
* Also calculates the number of objects per slab.
-
*
-
* This could be made much more intelligent. For now, try to avoid using
-
* high order pages for slabs. When the gfp() functions are more friendly
-
* towards high-order requests, this should be changed.
-
*/
-
static size_t calculate_slab_order(struct kmem_cache *cachep,
-
size_t size, size_t align, unsigned long flags)
-
{
-
unsigned long offslab_limit;
-
size_t left_over = 0;
-
int gfporder;
-
-
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
-
unsigned int num;
-
size_t remainder;
-
/* 计算slab中存在的对象数量和slab浪费的空间大小 */
-
cache_estimate(gfporder, size, align, flags, &remainder, &num);
-
/* 如果num为0,则代表当前order的页面数连一个对象都无放入,需要扩大页面数 */
-
if (!num)
-
continue;
-
/*摘抄一段网友的注释:http://blog.csdn.net/bullbat/article/details/7192845
-
-
-
-
-
-
-
-
-
-
-
-
*/
-
if (flags & CFLGS_OFF_SLAB) {
-
/*
-
* Max number of objs-per-slab for caches which
-
* use off-slab slabs. Needed to avoid a possible
-
* looping condition in cache_grow().
-
*/
-
offslab_limit = size - sizeof(struct slab);
-
offslab_limit /= sizeof(kmem_bufctl_t);
-
/* 当前计算得到的对象数量,大于计算得到的限制时,就可以跳出循环了 */
-
if (num > offslab_limit)
-
break;
-
}
-
-
/* Found something acceptable - save it away */
-
/* slab中的对象数量 */
-
cachep->num = num;
-
/* slab由几个页面组成,见cache_estimate的计算过程 */
-
cachep->gfporder = gfporder;
-
/* slab中存在的碎片的大小,同样在cache_estimate中计算出来 */
-
left_over = remainder;
-
-
/*
-
* A VFS-reclaimable slab tends to have most allocations
-
* as GFP_NOFS and we really don't want to have to be allocating
-
* higher-order pages when we are unable to shrink dcache.
-
*/
-
/* 该标签代表slab中的页面可以回收,直接跳出 */
-
/* 可回收意味着当前slab占用的内存被当做可用内存看待,通过kmem_freepages可以将slab占用的页释放 */
-
if (flags & SLAB_RECLAIM_ACCOUNT)
-
break;
-
-
/*
-
* Large number of objects is good, but very large slabs are
-
* currently bad for the gfp()s.
-
*/
-
/* 一旦超过slab页框允许的上限,则不再继续循环,直接使用当前的gfporder */
-
if (gfporder >= slab_break_gfp_order)
-
break;
-
-
/*
-
* Acceptable internal fragmentation?
-
*/
-
/* 判断一下,当前页面的利用率,当利用率满足下方条件时,不再继续循环 */
-
if (left_over * 8 <= (PAGE_SIZE << gfporder))
-
break;
-
}
-
/* 当前slab引入的碎片的大小 */
-
return left_over;
-
}
下面单独分析一下per-CPU本地cache的设置过程,见下方函数。
-
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
-
{
-
/* enable_cpucache前面文章已经分析过,FULL在kmem_cache_init_late中赋值,
-
此时普通cache已经初始化完成了,直接配置每个cpu的local cache */
-
if (g_cpucache_up == FULL)
-
return enable_cpucache(cachep, gfp);
-
/* g_cpucache_up代表了通用cache的初始化的进度,取值NONE/EARLY/FULL/PARTIAL_AC/PARTIAL_L3 */
-
/* chicken and egg problem: delay the per-cpu array allocation until the general caches are up.
static enum {
NONE, // 系统初始化阶段
PARTIAL_AC, // struct array_cache所在的cache已经创建
PARTIAL_L3, // struct kmem_list3所在的cache已经创建
EARLY, // kmem_cache_init阶段完成
FULL // kmem_cache_init_late,resize head arrays完成
} g_cpucache_up;
*/
-
if (g_cpucache_up == NONE) {
-
/*
-
* Note: the first kmem_cache_create must create the cache
-
* that's used by kmalloc(24), otherwise the creation of
-
* further caches will BUG().
-
*/
-
/* 初始化阶段创建struct array_cache时走进这里,此时general cache尚未创建,只能使用静态的cache */
-
cachep->array[smp_processor_id()] = &initarray_generic.cache;
-
-
/*
-
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
-
* the first cache, then we need to set up all its list3s,
-
* otherwise the creation of further caches will BUG().
-
*/
-
/* kmem_list3的cache也未创建,使用全局变量 */
-
set_up_list3s(cachep, SIZE_AC);
-
/* 更新进度 */
-
if (INDEX_AC == INDEX_L3)
-
g_cpucache_up = PARTIAL_L3;
-
else
-
g_cpucache_up = PARTIAL_AC;
-
} else {
-
/* general cache已经创建,使用kmalloc申请 */
-
cachep->array[smp_processor_id()] =
-
kmalloc(sizeof(struct arraycache_init), gfp);
-
-
if (g_cpucache_up == PARTIAL_AC) {
-
/* kmem_list3所在cache尚未创建完成,仍使用静态全局的slab三链 */
-
set_up_list3s(cachep, SIZE_L3);
-
/* 只有创建kmem_list3 cache时才会走进该流程,set_up_list3创建了kmem_list3的cache,更新进度 */
-
g_cpucache_up = PARTIAL_L3;
-
} else {
-
int node;
-
for_each_online_node(node) {
-
/* 通过kmalloc直接申请 */
-
cachep->nodelists[node] =
-
kmalloc_node(sizeof(struct kmem_list3),
-
gfp, node);
-
BUG_ON(!cachep->nodelists[node]);
-
/* 初始化链表 */
-
kmem_list3_init(cachep->nodelists[node]);
-
}
-
}
-
}
-
cachep->nodelists[numa_node_id()]->next_reap =
-
jiffies + REAPTIMEOUT_LIST3 +
-
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-
cpu_cache_get(cachep)->avail = 0;
-
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-
cpu_cache_get(cachep)->batchcount = 1;
-
cpu_cache_get(cachep)->touched = 0;
-
cachep->batchcount = 1;
-
cachep->limit = BOOT_CPUCACHE_ENTRIES;
-
return 0;
-
}
阅读(2717) | 评论(0) | 转发(2) |