之前多多少少接触过cache之类的东西,总觉得很神秘,当然cache就是为了读写内存更高效。比如查看meminfo或者slabinfo的时候,你是否真的对内存机制理解得很清晰?
参考内核linux 3.8.13
我们看看调用它的函数接口:
-
/*
-
* Set up kernel memory allocators
-
*/
-
static void __init mm_init(void)
-
{
-
/*
-
* page_cgroup requires contiguous pages,
-
* bigger than MAX_ORDER unless SPARSEMEM.
-
*/
-
page_cgroup_init_flatmem();
-
mem_init();
-
kmem_cache_init();
-
percpu_init_late();
-
pgtable_cache_init();
-
vmalloc_init();
-
}
这个函数在start_kernel里调用. 下面我们就看看
kmem_cache_init(); //默认slab分配器
-
/*
-
* Initialisation. Called after the page allocator have been initialised and
-
* before smp_init().
-
*/
-
void __init kmem_cache_init(void)
-
{
-
struct cache_sizes *sizes;
-
struct cache_names *names;
-
int i;
-
-
kmem_cache = &kmem_cache_boot;
-
setup_nodelists_pointer(kmem_cache); // 关于为什么要设置这个玩意,我找到一个patch说明
-
-
From 3c58346525d82625e68e24f071804c2dc057b6f4 Mon Sep 17 00:00:00 2001
-
From: Christoph Lameter <cl@linux.com>
-
Date: Wed, 28 Nov 2012 16:23:01 +0000
-
Subject: [PATCH] slab: Simplify bootstrap
-
-
The nodelists field in kmem_cache is pointing to the first unused
-
object in the array field when bootstrap is complete.
-
-
A problem with the current approach is that the statically sized
-
kmem_cache structure use on boot can only contain NR_CPUS entries.
-
If the number of nodes plus the number of cpus is greater then we
-
would overwrite memory following the kmem_cache_boot definition.
-
-
Increase the size of the array field to ensure that also the node
-
pointers fit into the array field.
-
-
Once we do that we no longer need the kmem_cache_nodelists
-
array and we can then also use that structure elsewhere.
-
-
Acked-by: Glauber Costa <glommer@parallels.com>
-
Signed-off-by: Christoph Lameter <cl@linux.com>
-
Signed-off-by: Pekka Enberg <penberg@kernel.org>
-
-
if (num_possible_nodes() == 1)
-
use_alien_caches = 0;
-
-
for (i = 0; i < NUM_INIT_LISTS; i++)
-
kmem_list3_init(&initkmem_list3[i]);
-
-
set_up_list3s(kmem_cache, CACHE_CACHE);
-
-
/*
-
* Fragmentation resistance on low memory - only use bigger
-
* page orders on machines with more than 32MB of memory if
-
* not overridden on the command line.
-
*/
-
if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
-
slab_max_order = SLAB_MAX_ORDER_HI;
-
-
/* Bootstrap is tricky, because several objects are allocated
-
* from caches that do not exist yet:
-
* 1) initialize the kmem_cache cache: it contains the struct
-
* kmem_cache structures of all caches, except kmem_cache itself:
-
* kmem_cache is statically allocated.
-
* Initially an __init data area is used for the head array and the
-
* kmem_list3 structures, it's replaced with a kmalloc allocated
-
* array at the end of the bootstrap.
-
* 2) Create the first kmalloc cache.
-
* The struct kmem_cache for the new cache is allocated normally.
-
* An __init data area is used for the head array.
-
* 3) Create the remaining kmalloc caches, with minimally sized
-
* head arrays.
-
* 4) Replace the __init data head arrays for kmem_cache and the first
-
* kmalloc cache with kmalloc allocated arrays.
-
* 5) Replace the __init data for kmem_list3 for kmem_cache and
-
* the other cache's with kmalloc allocated memory.
-
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
-
*/
-
-
/* 1) create the kmem_cache */
-
-
/*
-
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
-
*/
-
create_boot_cache(kmem_cache, "kmem_cache",
-
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
-
nr_node_ids * sizeof(struct kmem_list3 *),
-
SLAB_HWCACHE_ALIGN);
-
list_add(&kmem_cache->list, &slab_caches); // create kmem_cache后把它添加到slab_caches全局链表.
-
-
/* 2+3) create the kmalloc caches */
-
sizes = malloc_sizes;
-
names = cache_names;
-
-
/*
-
* Initialize the caches that provide memory for the array cache and the
-
* kmem_list3 structures first. Without this, further allocations will
-
* bug.
-
*/
-
-
sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
-
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
-
-
if (INDEX_AC != INDEX_L3)
-
sizes[INDEX_L3].cs_cachep =
-
create_kmalloc_cache(names[INDEX_L3].name,
-
sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
-
-
slab_early_init = 0;
-
-
while (sizes->cs_size != ULONG_MAX) {
-
/*
-
* For performance, all the general caches are L1 aligned.
-
* This should be particularly beneficial on SMP boxes, as it
-
* eliminates "false sharing".
-
* Note for systems short on memory removing the alignment will
-
* allow tighter packing of the smaller caches.
-
*/
-
if (!sizes->cs_cachep)
-
sizes->cs_cachep = create_kmalloc_cache(names->name,
-
sizes->cs_size, ARCH_KMALLOC_FLAGS);
-
-
#ifdef CONFIG_ZONE_DMA
-
sizes->cs_dmacachep = create_kmalloc_cache(
-
names->name_dma, sizes->cs_size,
-
SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
-
#endif
-
sizes++;
-
names++;
-
}
-
/* 4) Replace the bootstrap head arrays */
-
{
-
struct array_cache *ptr;
-
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-
memcpy(ptr, cpu_cache_get(kmem_cache),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
-
kmem_cache->array[smp_processor_id()] = ptr;
-
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
-
!= &initarray_generic.cache);
-
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
-
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-
ptr;
-
}
-
/* 5) Replace the bootstrap kmem_list3's */
-
{
-
int nid;
-
-
for_each_online_node(nid) {
-
init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
-
-
init_list(malloc_sizes[INDEX_AC].cs_cachep,
-
&initkmem_list3[SIZE_AC + nid], nid);
-
-
if (INDEX_AC != INDEX_L3) {
-
init_list(malloc_sizes[INDEX_L3].cs_cachep,
-
&initkmem_list3[SIZE_L3 + nid], nid);
-
}
-
}
-
}
-
-
slab_state = UP;
-
}
函数的第一行给一个全局指针变量kmem_cache赋值,让它指向静态定义的kmem_cache_boot,也就是系统创建的第一个cache(kmem_cache)。
在mm/slab_common.c中
struct kmem_cache *kmem_cache;
创建的所有cache都会挂在LIST_HEAD(slab_caches)这个全局链表上,可以通过cat /proc/slabinfo查看。
这里可以看看
struct kmem_cache:在slab_def.h中
-
struct kmem_cache {
-
/* 1) Cache tunables. Protected by cache_chain_mutex */
-
unsigned int batchcount;
-
unsigned int limit;
-
unsigned int shared;
-
-
unsigned int size;
-
u32 reciprocal_buffer_size;
-
/* 2) touched by every alloc & free from the backend */
-
-
unsigned int flags; /* constant flags */
-
unsigned int num; /* # of objs per slab */
-
-
/* 3) cache_grow/shrink */
-
/* order of pgs per slab (2^n) */
-
unsigned int gfporder;
-
-
/* force GFP flags, e.g. GFP_DMA */
-
gfp_t allocflags;
-
-
size_t colour; /* cache colouring range */
-
unsigned int colour_off; /* colour offset */
-
struct kmem_cache *slabp_cache;
-
unsigned int slab_size;
-
-
/* constructor func */
-
void (*ctor)(void *obj);
-
-
/* 4) cache creation/removal */
-
const char *name;
-
struct list_head list;
-
int refcount;
-
int object_size;
-
int align;
-
-
/* 5) statistics */
-
#ifdef CONFIG_DEBUG_SLAB
-
unsigned long num_active;
-
unsigned long num_allocations;
-
unsigned long high_mark;
-
unsigned long grown;
-
unsigned long reaped;
-
unsigned long errors;
-
unsigned long max_freeable;
-
unsigned long node_allocs;
-
unsigned long node_frees;
-
unsigned long node_overflow;
-
atomic_t allochit;
-
atomic_t allocmiss;
-
atomic_t freehit;
-
atomic_t freemiss;
-
-
/*
-
* If debugging is enabled, then the allocator can add additional
-
* fields and/or padding to every object. size contains the total
-
* object size including these internal fields, the following two
-
* variables contain the offset to the user object and its size.
-
*/
-
int obj_offset;
-
#endif /* CONFIG_DEBUG_SLAB */
-
#ifdef CONFIG_MEMCG_KMEM
-
struct memcg_cache_params *memcg_params;
-
#endif
-
-
/* 6) per-cpu/per-node data, touched during every alloc/free */
-
/*
-
* We put array[] at the end of kmem_cache, because we want to size
-
* this array to nr_cpu_ids slots instead of NR_CPUS
-
* (see kmem_cache_init())
-
* We still use [NR_CPUS] and not [1] or [0] because cache_cache
-
* is statically defined, so we reserve the max number of cpus.
-
*
-
* We also need to guarantee that the list is able to accomodate a
-
* pointer for each node since "nodelists" uses the remainder of
-
* available pointers.
-
*/
-
struct kmem_list3 **nodelists;
-
struct array_cache *array[NR_CPUS + MAX_NUMNODES];
-
/*
-
* Do not add fields after array[]
-
*/
-
}
这个结构体里面几个关键的元素之前在kmalloc里已经说到了。
而kmem_cache_boot则是:
-
/* internal cache of cache description objs */
-
static struct kmem_cache kmem_cache_boot = {
-
.batchcount = 1,
-
.limit = BOOT_CPUCACHE_ENTRIES, // 默认为 1
-
.shared = 1,
-
.size = sizeof(struct kmem_cache),
-
.name = "kmem_cache",
-
};
注释解释的已经很清晰了.
而setup_nodelists_pointer的作用,就是让nodelists指向struct kmem_cache里array数组中第一个未被使用的元素(参见上面patch的说明),这样array数组剩余的指针空间就可以用来存放各个node的kmem_list3指针,便于通过nodelists来操作。
对于一致性内存访问(UMA),node只有一个.
-
static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
它是slab.c中静态全局变量
-
/*
-
* Need this for bootstrapping a per node allocator.
-
*/
kmem_list3_init初始化slab的三个链表slabs_full、slabs_partial、slabs_free.为什么初始化这个和cache组成结构有关系,可以看个图:
这里CACHE_CACHE在文件的开头部分被定义为0.
-
/*
-
* For setting up all the kmem_list3s for cache whose buffer_size is same as
-
* size of kmem_list3.
-
*/
-
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
-
{
-
int node;
-
-
for_each_online_node(node) {
-
cachep->nodelists[node] = &initkmem_list3[index + node];
-
cachep->nodelists[node]->next_reap = jiffies +
-
REAPTIMEOUT_LIST3 +
-
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
}
-
}
接着就要开始了真正的创建cache的工作,并且给出了初始化步骤和说明:
-
/* Bootstrap is tricky, because several objects are allocated
-
* from caches that do not exist yet:
-
* 1) initialize the kmem_cache cache: it contains the struct
-
* kmem_cache structures of all caches, except kmem_cache itself:
-
* kmem_cache is statically allocated.
-
* Initially an __init data area is used for the head array and the
-
* kmem_list3 structures, it's replaced with a kmalloc allocated
-
* array at the end of the bootstrap.
-
* 2) Create the first kmalloc cache.
-
* The struct kmem_cache for the new cache is allocated normally.
-
* An __init data area is used for the head array.
-
* 3) Create the remaining kmalloc caches, with minimally sized
-
* head arrays.
-
* 4) Replace the __init data head arrays for kmem_cache and the first
-
* kmalloc cache with kmalloc allocated arrays.
-
* 5) Replace the __init data for kmem_list3 for kmem_cache and
-
* the other cache's with kmalloc allocated memory.
-
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
-
*/
-
-
/* 1) create the kmem_cache */
-
-
/*
-
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
-
*/
-
create_boot_cache(kmem_cache, "kmem_cache",
-
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
-
nr_node_ids * sizeof(struct kmem_list3 *),
-
SLAB_HWCACHE_ALIGN);
-
list_add(&kmem_cache->list, &slab_caches);
首先创建第一个cache它名为kmem_cache,并且kmem_cache指针变量指向了kmem_cache_boot.
下面我们看看create_boot_cache函数
-
#ifndef CONFIG_SLOB
-
/* Create a cache during boot when no slab services are available yet */
-
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
-
unsigned long flags)
-
{
-
int err;
-
-
s->name = name;
-
s->size = s->object_size = size;
-
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
-
err = __kmem_cache_create(s, flags);
-
-
if (err)
-
panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
-
name, size, err);
-
-
s->refcount = -1; /* Exempt from merging for now */
-
}
-
-
/*
 * create_kmalloc_cache - allocate and set up one kmalloc cache at boot.
 * @name:  cache name
 * @size:  object size
 * @flags: SLAB flags
 *
 * The descriptor is zero-allocated from kmem_cache, initialised through
 * create_boot_cache(), added to slab_caches and given a refcount of 1.
 * Panics if the descriptor cannot be allocated.
 *
 * Returns the new cache descriptor.
 */
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
				unsigned long flags)
{
	struct kmem_cache *cachep;

	cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	if (cachep == NULL)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(cachep, name, size, flags);
	list_add(&cachep->list, &slab_caches);
	cachep->refcount = 1;

	return cachep;
}
-
-
#endif /* !CONFIG_SLOB */
而它接着调用了__kmem_cache_create:这是最关键的函数
-
/**
-
* __kmem_cache_create - Create a cache.
-
* @cachep: cache management descriptor
-
* @flags: SLAB flags
-
*
-
* Returns a ptr to the cache on success, NULL on failure.
-
* Cannot be called within a int, but can be interrupted.
-
* The @ctor is run when new pages are allocated by the cache.
-
*
-
* The flags are
-
*
-
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
-
* to catch references to uninitialised memory.
-
*
-
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
-
* for buffer overruns.
-
*
-
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
-
* cacheline. This can be beneficial if you're counting cycles as closely
-
* as davem.
-
*/
-
int
-
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
-
{
-
size_t left_over, slab_size, ralign;
-
gfp_t gfp;
-
int err;
-
size_t size = cachep->size;
-
-
#if DEBUG
-
#if FORCED_DEBUG
-
/*
-
* Enable redzoning and last user accounting, except for caches with
-
* large objects, if the increased size would increase the object size
-
* above the next power of two: caches with object sizes just above a
-
* power of two have a significant amount of internal fragmentation.
-
*/
-
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
-
2 * sizeof(unsigned long long)))
-
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
-
if (!(flags & SLAB_DESTROY_BY_RCU))
-
flags |= SLAB_POISON;
-
#endif
-
if (flags & SLAB_DESTROY_BY_RCU)
-
BUG_ON(flags & SLAB_POISON);
-
#endif
-
-
/*
-
* Check that size is in terms of words. This is needed to avoid
-
* unaligned accesses for some archs when redzoning is used, and makes
-
* sure any on-slab bufctl's are also correctly aligned.
-
*/
-
if (size & (BYTES_PER_WORD - 1)) {
-
size += (BYTES_PER_WORD - 1);
-
size &= ~(BYTES_PER_WORD - 1);
-
} //4//四字节对齐
-
-
/*
-
* Redzoning and user store require word alignment or possibly larger.
-
* Note this will be overridden by architecture or caller mandated
-
* alignment if either is greater than BYTES_PER_WORD.
-
*/
-
if (flags & SLAB_STORE_USER)
-
ralign = BYTES_PER_WORD;
-
-
if (flags & SLAB_RED_ZONE) {
-
ralign = REDZONE_ALIGN;
-
/* If redzoning, ensure that the second redzone is suitably
-
* aligned, by adjusting the object size accordingly. */
-
size += REDZONE_ALIGN - 1;
-
size &= ~(REDZONE_ALIGN - 1);
-
}
-
-
/* 3) caller mandated alignment */
-
if (ralign < cachep->align) {
-
ralign = cachep->align;
-
}
-
/* disable debug if necessary */
-
if (ralign > __alignof__(unsigned long long))
-
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-
/*
-
* 4) Store it.
-
*/
-
cachep->align = ralign;
-
-
if (slab_is_available()) // 为什么要插入这一段注释,因为它就是判断slab_state的值,默认它的值没人初始化即为DOWN.
-
/*
-
* State of the slab allocator.
-
*
-
* This is used to describe the states of the allocator during bootup.
-
* Allocators use this to gradually bootstrap themselves. Most allocators
-
* have the problem that the structures used for managing slab caches are
-
* allocated from slab caches themselves.
-
*/
-
enum slab_state {
-
DOWN, /* No slab functionality yet */
-
PARTIAL, /* SLUB: kmem_cache_node available */
-
PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
-
PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
-
UP, /* Slab caches usable but not all extras yet */
-
FULL /* Everything is working */
-
};
-
gfp = GFP_KERNEL;
-
else
-
gfp = GFP_NOWAIT;
-
//点击(此处)折叠或打开
-
#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
-
-
setup_nodelists_pointer(cachep);
-
#if DEBUG
-
-
/*
-
* Both debugging options require word-alignment which is calculated
-
* into align above.
-
*/
-
if (flags & SLAB_RED_ZONE) {
-
/* add space for red zone words */
-
cachep->obj_offset += sizeof(unsigned long long);
-
size += 2 * sizeof(unsigned long long);
-
}
-
if (flags & SLAB_STORE_USER) {
-
/* user store requires one word storage behind the end of
-
* the real object. But if the second red zone needs to be
-
* aligned to 64 bits, we must allow that much space.
-
*/
-
if (flags & SLAB_RED_ZONE)
-
size += REDZONE_ALIGN;
-
else
-
size += BYTES_PER_WORD;
-
}
-
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
-
&& cachep->object_size > cache_line_size()
-
&& ALIGN(size, cachep->align) < PAGE_SIZE) {
-
cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
-
size = PAGE_SIZE;
-
}
-
#endif
-
#endif
-
-
/*
-
* Determine if the slab management is 'on' or 'off' slab.
-
* (bootstrapping cannot cope with offslab caches so don't do // 判断slab管理信息是否在slab分配的内存页上,判断条件见下面:
-
* it too early on. Always use on-slab management when // size >= (默认page =4k/8k) 512/1024 ; slab_early_init在创建kmem_cache的时候为1;当创建通用cache
-
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) //的时才会把它初始化为0 . 而第一传递的flags为 SLAB_HWCACHE_ALIGN
-
*/
-
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
-
!(flags & SLAB_NOLEAKTRACE))
-
/*
-
* Size is large, assume best to place the slab management obj
-
* off-slab (should allow better packing of objs).
-
*/
-
flags |= CFLGS_OFF_SLAB;
-
-
size = ALIGN(size, cachep->align);
-
-
left_over = calculate_slab_order(cachep, size, cachep->align, flags); // 根据obj size 计算申请page的个数即一个slab包含多少个pages,
-
if (!cachep->num) // 也包含了多少个obj,除去管理信息等 剩余的空间。很简单易懂.
-
return -E2BIG;
-
-
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-
+ sizeof(struct slab), cachep->align);
-
-
/*
-
* If the slab has been placed off-slab, and we have enough space then
-
* move it on-slab. This is at the expense of any extra colouring.
-
*/
-
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
-
flags &= ~CFLGS_OFF_SLAB;
-
left_over -= slab_size;
-
}
-
-
if (flags & CFLGS_OFF_SLAB) {
-
/* really off slab. No need for manual alignment */
-
slab_size =
-
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
-
-
#ifdef CONFIG_PAGE_POISONING
-
/* If we're going to use the generic kernel_map_pages()
-
* poisoning, then it's going to smash the contents of
-
* the redzone and userword anyhow, so switch them off.
-
*/
-
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
-
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-
#endif
-
}
-
-
cachep->colour_off = cache_line_size(); //32B
-
/* Offset must be a multiple of the alignment. */
-
if (cachep->colour_off < cachep->align)
-
cachep->colour_off = cachep->align;
-
cachep->colour = left_over / cachep->colour_off; // slab 着色的初始化工作.
-
cachep->slab_size = slab_size;
-
cachep->flags = flags;
-
cachep->allocflags = 0;
-
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
-
cachep->allocflags |= GFP_DMA;
-
cachep->size = size;
-
cachep->reciprocal_buffer_size = reciprocal_value(size);
-
-
if (flags & CFLGS_OFF_SLAB) {
-
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
-
/*
-
* This is a possibility for one of the malloc_sizes caches.
-
* But since we go off slab only for object size greater than
-
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
-
* this should not happen at all.
-
* But leave a BUG_ON for some lucky dude.
-
*/
-
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
-
}
-
-
err = setup_cpu_cache(cachep, gfp);
-
if (err) {
-
__kmem_cache_shutdown(cachep);
-
return err;
-
}
-
-
if (flags & SLAB_DEBUG_OBJECTS) {
-
/*
-
* Would deadlock through slab_destroy()->call_rcu()->
-
* debug_object_activate()->kmem_cache_alloc().
-
*/
-
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
-
-
slab_set_debugobj_lock_classes(cachep);
-
} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
-
on_slab_lock_classes(cachep);
-
-
return 0;
-
}
它里面调用了一个很有趣也很关键的函数calculate_slab_order:它揭示了slab具体管理obj的布局和方法.
-
/**
-
* calculate_slab_order - calculate size (page order) of slabs
-
* @cachep: pointer to the cache that is being created
-
* @size: size of objects to be created in this cache.
-
* @align: required alignment for the objects.
-
* @flags: slab allocation flags
-
*
-
* Also calculates the number of objects per slab.
-
*
-
* This could be made much more intelligent. For now, try to avoid using
-
* high order pages for slabs. When the gfp() functions are more friendly
-
* towards high-order requests, this should be changed.
-
*/
-
static size_t calculate_slab_order(struct kmem_cache *cachep,
-
size_t size, size_t align, unsigned long flags)
-
{
-
unsigned long offslab_limit;
-
size_t left_over = 0;
-
int gfporder;
-
-
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
-
unsigned int num;
-
size_t remainder;
-
-
cache_estimate(gfporder, size, align, flags, &remainder, &num); // 根据是off-slab 还是on-slab除去管理信息后多少个页面才能存下一个obj.以及其他信息,值得仔细看看.
-
if (!num) // 必须保证slab至少能装下一个obj
-
continue;
-
-
if (flags & CFLGS_OFF_SLAB) {
-
/*
-
* Max number of objs-per-slab for caches which
-
* use off-slab slabs. Needed to avoid a possible
-
* looping condition in cache_grow().
-
*/
-
offslab_limit = size - sizeof(struct slab);
-
offslab_limit /= sizeof(kmem_bufctl_t);
-
-
if (num > offslab_limit)
-
break;
-
}
-
-
/* Found something acceptable - save it away */
-
cachep->num = num;
-
cachep->gfporder = gfporder;
-
left_over = remainder;
-
-
/*
-
* A VFS-reclaimable slab tends to have most allocations
-
* as GFP_NOFS and we really don't want to have to be allocating
-
* higher-order pages when we are unable to shrink dcache.
-
*/
-
if (flags & SLAB_RECLAIM_ACCOUNT)
-
break;
-
-
/*
-
* Large number of objects is good, but very large slabs are
-
* currently bad for the gfp()s.
-
*/
-
if (gfporder >= slab_max_order)
-
break;
-
-
/*
-
* Acceptable internal fragmentation?
-
*/
-
if (left_over * 8 <= (PAGE_SIZE << gfporder))
-
break;
-
}
-
return left_over;
-
}
经过上面的初始化和设置,最后调用setup_cpu_cache就完成了一个创建cache的工作.接着进行第2、3步的工作:
-
/* 2+3) create the kmalloc caches */
-
sizes = malloc_sizes;
-
names = cache_names;
-
-
/*
-
* Initialize the caches that provide memory for the array cache and the
-
* kmem_list3 structures first. Without this, further allocations will
-
* bug.
-
*/
-
-
sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, // create obj size 为sizeof(struct arraycache_init) 的cache
-
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
-
-
if (INDEX_AC != INDEX_L3)
-
sizes[INDEX_L3].cs_cachep =
-
create_kmalloc_cache(names[INDEX_L3].name, //// create obj size 为sizeof(struct kmem_list3) 的cache
-
sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
-
-
slab_early_init = 0;
-
-
while (sizes->cs_size != ULONG_MAX) { //创建通用cache 根据 malloc_sizes ,cache_names
-
/*
-
* For performance, all the general caches are L1 aligned.
-
* This should be particularly beneficial on SMP boxes, as it
-
* eliminates "false sharing".
-
* Note for systems short on memory removing the alignment will
-
* allow tighter packing of the smaller caches.
-
*/
-
if (!sizes->cs_cachep)
-
sizes->cs_cachep = create_kmalloc_cache(names->name,
-
sizes->cs_size, ARCH_KMALLOC_FLAGS);
-
-
#ifdef CONFIG_ZONE_DMA
-
sizes->cs_dmacachep = create_kmalloc_cache(
-
names->name_dma, sizes->cs_size,
-
SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
-
#endif
-
sizes++;
-
names++;
-
}
这里再说一下cache_names和malloc_sizes:
-
/*
-
* These are the default caches for kmalloc. Custom caches can have other sizes.
-
*/
-
struct cache_sizes malloc_sizes[] = {
-
#define CACHE(x) { .cs_size = (x) },
-
#include <linux/kmalloc_sizes.h>
-
CACHE(ULONG_MAX)
-
#undef CACHE
-
};
这里就不扩展开了.
-
/* Must match cache_sizes above. Out of line to keep cache footprint low. */
-
struct cache_names {
-
char *name;
-
char *name_dma;
-
};
-
-
static struct cache_names __initdata cache_names[] = {
-
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
-
#include <linux/kmalloc_sizes.h>
-
{NULL,}
-
#undef CACHE
-
};
create_kmalloc_cache实际上是调用create_boot_cache. 把kernel预定义的通用cache创建一遍.之后我们进入第四步、第5步:
-
/* 4) Replace the bootstrap head arrays */
-
{
-
struct array_cache *ptr;
-
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-
memcpy(ptr, cpu_cache_get(kmem_cache),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
-
kmem_cache->array[smp_processor_id()] = ptr;
-
-
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
-
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
-
!= &initarray_generic.cache);
-
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
-
sizeof(struct arraycache_init));
-
/*
-
* Do not assume that spinlocks can be initialized via memcpy:
-
*/
-
spin_lock_init(&ptr->lock);
-
-
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-
ptr;
-
}
-
/* 5) Replace the bootstrap kmem_list3's */
-
{
-
int nid;
-
-
for_each_online_node(nid) {
-
init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
-
-
init_list(malloc_sizes[INDEX_AC].cs_cachep,
-
&initkmem_list3[SIZE_AC + nid], nid);
-
-
if (INDEX_AC != INDEX_L3) {
-
init_list(malloc_sizes[INDEX_L3].cs_cachep,
-
&initkmem_list3[SIZE_L3 + nid], nid);
-
}
-
}
-
}
-
-
slab_state = UP;
最后把slab_state状态设置为up 即已经可以正常使用了。虽然上面大部分是代码,具体申请内存的流程前面kmalloc已经讲过了。仅仅是为了弄明白cache到底是个什么玩意,以及如何初始化的。
在kmem_cache_init后,还有一个kmem_cache_init_late函数.
它主要是调用了enable_cpucache,并注册了一个cpu通知链:
-
/*
-
* Register a cpu startup notifier callback that initializes
-
* cpu_cache_get for all new cpus
-
*/
-
register_cpu_notifier(&cpucache_notifier);
还记不记得之前我们分析batchcount的时候的矛盾点?
-
/* Called with slab_mutex held always */
-
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
-
{
-
int err;
-
int limit = 0;
-
int shared = 0;
-
int batchcount = 0;
-
-
if (!is_root_cache(cachep)) {
-
struct kmem_cache *root = memcg_root_cache(cachep);
-
limit = root->limit;
-
shared = root->shared;
-
batchcount = root->batchcount;
-
}
-
-
if (limit && shared && batchcount)
-
goto skip_setup;
-
/*
-
* The head array serves three purposes:
-
* - create a LIFO ordering, i.e. return objects that are cache-warm
-
* - reduce the number of spinlock operations.
-
* - reduce the number of linked list operations on the slab and
-
* bufctl chains: array operations are cheaper.
-
* The numbers are guessed, we should auto-tune as described by
-
* Bonwick.
-
*/
-
if (cachep->size > 131072)
-
limit = 1;
-
else if (cachep->size > PAGE_SIZE)
-
limit = 8;
-
else if (cachep->size > 1024)
-
limit = 24;
-
else if (cachep->size > 256)
-
limit = 54;
-
else
-
limit = 120;
-
-
/*
-
* CPU bound tasks (e.g. network routing) can exhibit cpu bound
-
* allocation behaviour: Most allocs on one cpu, most free operations
-
* on another cpu. For these cases, an efficient object passing between
-
* cpus is necessary. This is provided by a shared array. The array
-
* replaces Bonwick's magazine layer.
-
* On uniprocessor, it's functionally equivalent (but less efficient)
-
* to a larger limit. Thus disabled by default.
-
*/
-
shared = 0;
-
if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
-
shared = 8;
-
-
#if DEBUG
-
/*
-
* With debugging enabled, large batchcount lead to excessively long
-
* periods with disabled local interrupts. Limit the batchcount
-
*/
-
if (limit > 32)
-
limit = 32;
-
#endif
-
batchcount = (limit + 1) / 2;
-
skip_setup:
-
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-
if (err)
-
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
-
cachep->name, -err);
-
return err;
-
}
它会根据obj size 计算limit值 ,再去计算batchcount的值.
这个只是一个小小的开始吧,内存管理本来就博大精深,只有遇到具体问题具体分析,来加深理解了.
阅读(5857) | 评论(0) | 转发(3) |