slab分配器初始化的调用过程:start_kernel() -> mm_init() -> kmem_cache_init()。
下面分析一下kmem_cache_init()的具体代码。
-
/*
-
* Initialisation. Called after the page allocator have been initialised and
-
* before smp_init().
-
*/
-
void __init kmem_cache_init(void)
-
{
-
size_t left_over;
-
struct cache_sizes *sizes;
-
struct cache_names *names;
-
int i;
-
int order;
-
int node;
-
-
/× 在非NUMA平台上,将use_alien_cache设置为0,此时cache_free_alien将禁止调用 ×/
-
if (num_possible_nodes() == 1)
-
use_alien_caches = 0;
-
-
/* initkmem_list3为全局变量,此时slab尚未完成初始化,kmalloc无法使用 */
-
/* #define NODES_SHIFT CONFIG_NODES_SHIFT */
-
/* #define MAX_NUMNODES (1 << NODES_SHIFT) */
-
/* #define NUM_INIT_LISTS (3 * MAX_NUMNODES) */
-
/* CONFIG_NODES_SHIFT是当前系统配置的可支持的NUMA节点的最大个数,针对每个内存节点,包含:
-
struct kmem_cache/struct arraycache_init/struct kmem_list3的slab(full/free/partial)所以是3倍的NUMA节点
-
*/
-
-
for (i = 0; i < NUM_INIT_LISTS; i++) {
-
/× 针对链表、锁和成员的初始化,比较简单 ×/
-
kmem_list3_init(&initkmem_list3[i]);
-
/× cache_cache是全局变量,是内核中第一个cache(struct kmem_cache),遍历当前所有节点,初始化为NULL ×/
-
if (i < MAX_NUMNODES)
-
cache_cache.nodelists[i] = NULL;
-
}
-
/× 将cache_cache中的nodelists指向initkmem_list3数组中对应成员,按照NUMA节点进行对应 ×/
-
/× CACHE_CACHES是cache_cache在内核cache链表中的索引,因为这里是第一个cache,所以为0 ×/
-
set_up_list3s(&cache_cache, CACHE_CACHE);
-
-
/*
-
* Fragmentation resistance on low memory - only use bigger
-
* page orders on machines with more than 32MB of memory.
-
*/
-
/* 当内存大于32M时,slab_break_gfp_order为1(即每个slab最多占用2个页面),否则为0,其用于指定每个slab最多占用的页面数量,用于抑制碎片 ×/
-
/× 有一种可能是,当对象很大导致slab中一个对象都无法放入时,可以超过该值的限制 ×/
-
if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
-
-
/* Bootstrap is tricky, because several objects are allocated
-
* from caches that do not exist yet:
-
* 1) initialize the cache_cache cache: it contains the struct
-
* kmem_cache structures of all caches, except cache_cache itself:
-
* cache_cache is statically allocated.
-
* Initially an __init data area is used for the head array and the
-
* kmem_list3 structures, it's replaced with a kmalloc allocated
-
* array at the end of the bootstrap.
-
* 2) Create the first kmalloc cache.
-
* The struct kmem_cache for the new cache is allocated normally.
-
* An __init data area is used for the head array.
-
* 3) Create the remaining kmalloc caches, with minimally sized
-
* head arrays.
-
* 4) Replace the __init data head arrays for cache_cache and the first
-
* kmalloc cache with kmalloc allocated arrays.
-
* 5) Replace the __init data for kmem_list3 for cache_cache and
-
* the other cache's with kmalloc allocated memory.
-
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
-
*/
-
/× 根据当前CPU,获取对应的NUMA节点的ID ×/
-
node = numa_node_id();
-
-
/* 1) create the cache_cache */
-
/× cache_chain 是内核slab cache链表的链表头 ×/
-
INIT_LIST_HEAD(&cache_chain);
-
/× cache_cache是kernel的第一个slab cache,链接到cache_chain上 ×/
-
list_add(&cache_cache.next, &cache_chain);
-
/* 设置cache的着色偏移为cache_line_size的大小 */
-
cache_cache.colour_off = cache_line_size();
-
/× 设置cache_cache的local_cache直接指向全局变量的cache ×/
-
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-
/× 给当前的NUMA内存节点的slab赋值,指向全局变量的slab的几个链表 ×/
-
/* 从目前的代码看,此处应该与set_up_list3s重复了,list3s中遍历了所有的NUMA节点进行了赋值,包括了当前的NUMA节点 */
-
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
-
-
/*
-
* struct kmem_cache size depends on nr_node_ids, which
-
* can be less than MAX_NUMNODES.
-
*/
-
/× buffer_size保存的是slab中的对象大小,看注释已经很清楚,以nr_node_ids为准,所以对对象大小进行了重新计算 ×/
-
cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
-
nr_node_ids * sizeof(struct kmem_list3 *);
-
#if DEBUG
-
cache_cache.obj_size = cache_cache.buffer_size;
-
#endif
-
/× 将对象大小按照cache_line_size进行对齐 ×/
-
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
-
cache_line_size());
-
/× 计算对象大小的倒数,用于计算对象在slab中的索引 ×/
-
cache_cache.reciprocal_buffer_size =
-
reciprocal_value(cache_cache.buffer_size);
-
-
for (order = 0; order < MAX_ORDER; order++) {
-
/× 获取cache_cache中的对象的最大数目 ×/
-
cache_estimate(order, cache_cache.buffer_size,
-
cache_line_size(), 0, &left_over, &cache_cache.num);
-
if (cache_cache.num)
-
break;
-
}
-
BUG_ON(!cache_cache.num);
-
/× slab包含的页面个数,2^gfporder个 ×/
-
cache_cache.gfporder = order;
-
/× slab着色区的大小,以colour_off为单位 ×/
-
cache_cache.colour = left_over / cache_cache.colour_off;
-
/* slab管理区大小 */
-
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
-
sizeof(struct slab), cache_line_size());
-
-
/* 2+3) create the kmalloc caches */
-
/* 创建kmalloc所用的general_cache,即普通高速缓存,普通高速缓存分为(2^0)/(2^1)...区域的个数以及大小与系统内存配置
-
以及PAGE_SIZE/L1_CACHE_BYTES/KMALLOC_MAX_SIZE相关,具体在linux/kmalloc_sizes.h中定义,每个对应两个高速缓存,
-
一个是DMA高速缓存,一个是常规高速缓存,存放在struct cache_sizes malloc_sizes[]中
-
×/
-
sizes = malloc_sizes;
-
names = cache_names;
-
-
/*
-
* Initialize the caches that provide memory for the array cache and the
-
* kmem_list3 structures first. Without this, further allocations will
-
* bug.
-
*/
-
/* 创建struct arraycache_init对应的普通cache,后续初始化会使用 */
-
/× INDEX_AC是计算local cache所用的struct arraycache_init对象在kmalloc size中的索引,即属于哪一级大小的索引,看一下INDEX_AC的定义一切了然 ×/
-
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-
sizes[INDEX_AC].cs_size,
-
ARCH_KMALLOC_MINALIGN,
-
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-
NULL);
-
/× 如果struct kmem_list3和struct arraycache_init对应的kmalloc size索引不同,则为kmem_list3创建自己的cache,否则共用一个 ×/
-
if (INDEX_AC != INDEX_L3) {
-
sizes[INDEX_L3].cs_cachep =
-
kmem_cache_create(names[INDEX_L3].name,
-
sizes[INDEX_L3].cs_size,
-
ARCH_KMALLOC_MINALIGN,
-
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-
NULL);
-
}
-
/× 创建结束以上两个通用cache之后,slab_early_init阶段结束 ×/
-
slab_early_init = 0;
-
/* 下面开始循环创建kmalloc各个级别的cache,各级别的定义参见linux/kmalloc_sizes.h文件 */
-
while (sizes->cs_size != ULONG_MAX) {
-
/*
-
* For performance, all the general caches are L1 aligned.
-
* This should be particularly beneficial on SMP boxes, as it
-
* eliminates "false sharing".
-
* Note for systems short on memory removing the alignment will
-
* allow tighter packing of the smaller caches.
-
*/
-
/× 对应大小的kmalloc的cache还未创建,所以下面需要进行创建 ×/
-
if (!sizes->cs_cachep) {
-
sizes->cs_cachep = kmem_cache_create(names->name,
-
sizes->cs_size,
-
ARCH_KMALLOC_MINALIGN,
-
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-
NULL);
-
}
-
#ifdef CONFIG_ZONE_DMA
-
/× 对于kmalloc的cache,每个级别都对应一个普通的cache和一个dma的cache,如果支持dma则创建之 ×/
-
sizes->cs_dmacachep = kmem_cache_create(
-
names->name_dma,
-
sizes->cs_size,
-
ARCH_KMALLOC_MINALIGN,
-
ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
-
SLAB_PANIC,
-
NULL);
-
#endif
-
/× 向下循环 ×/
-
sizes++;
-
names++;
-
}
-
/* 4) Replace the bootstrap head arrays */
-
/× 下面开始使用kmalloc申请的动态内存替换掉之前的静态变量 ×/
-
/× 从代码可以看出需要替换的是initarray_cache.cache和initarray_generic.cache ×/
-
{
-
struct array_cache *ptr;
-
/* 申请cache_cache所用的local cache的空间 */
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
/× 复制原initarray_cache.cache到新的位置 ×/
-
BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
-
memcpy(ptr, cpu_cache_get(&cache_cache),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
/× 更新,指向动态申请的内存区 ×/
-
cache_cache.array[smp_processor_id()] = ptr;
-
/* 申请空间,用于替换initarray_generic.cache */
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
-
!= &initarray_generic.cache);
-
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
/× 更新,指向新申请的内存 ×/
-
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-
ptr;
-
}
-
/* 5) Replace the bootstrap kmem_list3's */
-
/× 同4一样,使用动态申请的内存,替换静态分配的slab的几个链表 ×/
-
{
-
int nid;
-
-
for_each_online_node(nid) {
-
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
-
-
init_list(malloc_sizes[INDEX_AC].cs_cachep,
-
&initkmem_list3[SIZE_AC + nid], nid);
-
-
if (INDEX_AC != INDEX_L3) {
-
init_list(malloc_sizes[INDEX_L3].cs_cachep,
-
&initkmem_list3[SIZE_L3 + nid], nid);
-
}
-
}
-
}
-
/× 更新slab系统的初始化的进度 ×/
-
g_cpucache_up = EARLY;
-
}
下面继续分析上面用到的几个子函数的代码:kmem_list3_init()、set_up_list3s()和cache_estimate()。
-
static void kmem_list3_init(struct kmem_list3 *parent)
-
{
-
/* 全被占用的slab链表 */
-
INIT_LIST_HEAD(&parent->slabs_full);
-
/* 部分空闲的slab链表 */
-
INIT_LIST_HEAD(&parent->slabs_partial);
-
/* 全部空闲的slab链表 */
-
INIT_LIST_HEAD(&parent->slabs_free);
-
parent->shared = NULL;
-
parent->alien = NULL;
-
parent->colour_next = 0;
-
spin_lock_init(&parent->list_lock);
-
parent->free_objects = 0;
-
parent->free_touched = 0;
-
}
-
/*
-
* For setting up all the kmem_list3s for cache whose buffer_size is same as
-
* size of kmem_list3.
-
*/
-
/× set_up_list3s(&cache_cache, CACHE_CACHE),其中CACHE_CACHE为0 ×/
-
/* 设置cache_cache的nodeliste指向静态分配的全局变量,即slab的三个链表都使用静态全局的定义 */
-
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
-
{
-
int node;
-
/× 遍历NUMA内存节点 ×/
-
for_each_online_node(node) {
-
/× 指向静态全局定义的slab list ×/
-
cachep->nodelists[node] = &initkmem_list3[index + node];
-
/× 设置回收时间,next_reap是两次缓存回收之间必须经历的时间间隔 ×/
-
cachep->nodelists[node]->next_reap = jiffies +
-
REAPTIMEOUT_LIST3 +
-
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
}
-
}
-
/*
-
* Calculate the number of objects and left-over bytes for a given buffer size.
-
*/
-
/* gfporder: 取值0~11遍历直到计算出cache的对象数量跳出循环,slab由2^gfporder个页面组成
-
buffer_size: 为当前cache中对象经过cache_line_size对齐后的大小
-
align: 是cache_line_size,按照该大小对齐
-
flags: 此处为0,用于标识内置slab还是外置slab
-
left_over: 输出值,记录slab中浪费空间的大小
-
num:输出值,用于记录当前cache中允许存在的对象数目
-
*/
-
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
-
size_t align, int flags, size_t *left_over,
-
unsigned int *num)
-
{
-
int nr_objs;
-
size_t mgmt_size;
-
/× PAGE_SIZE代表一个页面,slab_size记录需要多少个页面 ×/
-
size_t slab_size = PAGE_SIZE << gfporder;
-
-
/*
-
* The slab management structure can be either off the slab or
-
* on it. For the latter case, the memory allocated for a
-
* slab is used for:
-
*
-
* - The struct slab
-
* - One kmem_bufctl_t for each object
-
* - Padding to respect alignment of @align
-
* - @buffer_size bytes for each object
-
*
-
* If the slab management structure is off the slab, then the
-
* alignment will already be calculated into the size. Because
-
* the slabs are all pages aligned, the objects will be at the
-
* correct alignment when allocated.
-
*/
-
/× 外置slab ×/
-
if (flags & CFLGS_OFF_SLAB) {
-
mgmt_size = 0;
-
/* slab中不含管理对象,全部用于存储slab对象,计算当前的对象数量 */
-
nr_objs = slab_size / buffer_size;
-
/* 如果超过阀值,则取上限 */
-
if (nr_objs > SLAB_LIMIT)
-
nr_objs = SLAB_LIMIT;
-
} else {
-
/*
-
* Ignore padding for the initial guess. The padding
-
* is at most @align-1 bytes, and @buffer_size is at
-
* least @align. In the worst case, this result will
-
* be one greater than the number of objects that fit
-
* into the memory allocation when taking the padding
-
* into account.
-
*/
-
/× 内置的slab管理对象,slab管理对象与slab对象在一起。
-
此时slab页面中包含struct slab管理对象,kmem_bufctl_t数组和slab对象,其中kmem_bufctl_t数组个数与slab对象数量一致 ×/
-
nr_objs = (slab_size - sizeof(struct slab)) /
-
(buffer_size + sizeof(kmem_bufctl_t));
-
-
/*
-
* This calculated number will be either the right
-
* amount, or one greater than what we want.
-
*/
-
/× 计算cache_line对齐后的大小,如果超出slab总的大小,则对象数减1 ×/
-
if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size > slab_size)
-
nr_objs--;
-
/× 判断有无超过阀值,最大取阀值 ×/
-
if (nr_objs > SLAB_LIMIT)
-
nr_objs = SLAB_LIMIT;
-
/* 计算cache_line对齐后,管理对象的大小 */
-
mgmt_size = slab_mgmt_size(nr_objs, align);
-
}
-
/× 计算得到的slab对象的数目,通过num输出 ×/
-
*num = nr_objs;
-
/× 计算当前slab中浪费的空间的大小 ×/
-
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
-
}
上面用到了cache_names和malloc_sizes两个数组,它们用于表示普通cache的名字和对应的大小,并且一一对应。可以看一下代码,比较简单。
阅读(1283) | 评论(0) | 转发(0) |