Category: LINUX
2013-08-04 18:40:40
Q: Why does SLAB need to call cache_reap() periodically to reclaim pages?
A: The objects held in a per-CPU array_cache may come from several different slabs. After a long run of allocations and frees an extreme situation can arise: the array_cache has borrowed objects from many slabs, and apart from those borrowed objects every other object in those slabs is free, so none of the slabs can be destroyed and a large number of pages stay pinned. A periodic cache_reap() returns the objects cached in the array_cache to their slabs; any slab that becomes completely free can then be destroyed, later allocations concentrate on one or a few slabs, and the problem of pages being held for nothing goes away.
Q: To avoid frequently allocating pages from and returning pages to the buddy system, each kmem_cache keeps up to kmem_list3->free_limit free objects cached (that is, at least kmem_list3->free_limit / kmem_cache->num slabs), and only considers destroying slabs once the number of free objects on the kmem_list3 exceeds this threshold. When memory is extremely tight, will these cached slabs be destroyed so their pages can be released?
A: In the current kernel implementation, memory-pressure reclaim does not automatically destroy the completely free slabs on kmem_list3 to release their pages, but the kmem_cache_shrink() interface is provided to do exactly that.
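As a concrete illustration of that interface, here is a minimal, hypothetical module sketch: the cache name "shrink_demo", the 256-byte object size, NOBJS and the churn loop are all made up, and error handling is kept to a bare minimum. It only shows where kmem_cache_shrink() fits in; it is not taken from any real driver.

/*
 * Hypothetical illustration only: churn objects on a private cache so that
 * completely free slabs pile up on kmem_list3->slabs_free, then call
 * kmem_cache_shrink() to hand those pages back to the buddy allocator.
 * Note that the kmem_cache_create() prototype differs slightly between
 * kernel versions (very old ones also take a destructor argument).
 */
#include <linux/module.h>
#include <linux/slab.h>

#define NOBJS 512                       /* arbitrary churn size */

static struct kmem_cache *demo_cachep;  /* hypothetical cache */
static void *objs[NOBJS];

static int __init shrink_demo_init(void)
{
        int i;

        demo_cachep = kmem_cache_create("shrink_demo", 256, 0, 0, NULL);
        if (!demo_cachep)
                return -ENOMEM;

        /* allocate and free a burst of objects: after the frees, several
         * slabs typically sit completely idle on the slabs_free list */
        for (i = 0; i < NOBJS; i++)
                objs[i] = kmem_cache_alloc(demo_cachep, GFP_KERNEL);
        for (i = 0; i < NOBJS; i++)
                if (objs[i])
                        kmem_cache_free(demo_cachep, objs[i]);

        /* explicitly release the cached free slabs back to the page allocator */
        printk(KERN_INFO "kmem_cache_shrink returned %d\n",
               kmem_cache_shrink(demo_cachep));
        return 0;
}

static void __exit shrink_demo_exit(void)
{
        kmem_cache_destroy(demo_cachep);
}

module_init(shrink_demo_init);
module_exit(shrink_demo_exit);
MODULE_LICENSE("GPL");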
Allocating an object:
kmem_cache_alloc()
  __cache_alloc()
    objp = ____cache_alloc(cachep, flags);
      // get the current CPU's array_cache
      struct array_cache *ac = cpu_cache_get(cachep);
      if (likely(ac->avail)) { // the ac still holds cached objects: hand one out immediately
          ac->touched = 1;
          objp = ac->entry[--ac->avail];
      } else {
          // move ac->batchcount objects from usable slabs into the current CPU's ac,
          // then take one of them (cache_alloc_refill() returns the object itself)
          objp = cache_alloc_refill(cachep, flags);
      }
      return objp;
Freeing an object:
kmem_cache_free()
  __cache_free(cachep, objp);
    struct array_cache *ac = cpu_cache_get(cachep);
    if (likely(ac->avail < ac->limit)) { // the ac is not yet full: just stash the object in it
        STATS_INC_FREEHIT(cachep);
        ac->entry[ac->avail++] = objp;
        return;
    } else {
        // the ac is full: move ac->batchcount objects back to their slabs
        // to make room for the object being freed
        cache_flusharray(cachep, ac);
        ac->entry[ac->avail++] = objp;
    }
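The two fast paths above can be modelled in a few lines of userspace C. The toy_* names below are invented for illustration; the real refill and flush steps walk the slab lists and are only stubbed out here, so this sketch shows nothing more than the hit/miss decision and the avail/limit/batchcount bookkeeping.

#include <stdio.h>
#include <stdlib.h>

struct toy_array_cache {
        unsigned int avail;      /* cached object pointers currently held */
        unsigned int limit;      /* capacity of entry[] */
        unsigned int batchcount; /* objects moved per refill or flush */
        void *entry[8];
};

/* stand-in for cache_alloc_refill(): pretend batchcount objects arrived from slabs */
static void toy_refill(struct toy_array_cache *ac)
{
        while (ac->avail < ac->batchcount)
                ac->entry[ac->avail++] = malloc(32);
}

/* stand-in for cache_flusharray(): pretend batchcount objects went back to slabs */
static void toy_flush(struct toy_array_cache *ac)
{
        unsigned int i;

        for (i = 0; i < ac->batchcount; i++)
                free(ac->entry[--ac->avail]);    /* LIFO order is fine for a toy */
}

static void *toy_alloc(struct toy_array_cache *ac)
{
        if (!ac->avail)                          /* miss: refill from the "slabs" first */
                toy_refill(ac);
        return ac->entry[--ac->avail];           /* hit: pop the most recently cached pointer */
}

static void toy_free(struct toy_array_cache *ac, void *objp)
{
        if (ac->avail >= ac->limit)              /* full: flush a batch back to the "slabs" */
                toy_flush(ac);
        ac->entry[ac->avail++] = objp;           /* then park the freed object in the ac */
}

int main(void)
{
        struct toy_array_cache ac = { .avail = 0, .limit = 8, .batchcount = 4 };
        void *p = toy_alloc(&ac);

        toy_free(&ac, p);
        printf("avail after one alloc/free cycle: %u\n", ac.avail);

        while (ac.avail)                         /* drop what the toy still caches */
                free(ac.entry[--ac.avail]);
        return 0;
}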
Several important functions:
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
* @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
*
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
*/
// compute how many page frames one slab needs for the given object size and alignment; returns the size of the left-over (colouring) area
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
// try increasing orders until the most suitable one is found
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)
continue;
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
cachep->num = num;
cachep->gfporder = gfporder;
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
// filesystems usually set SLAB_RECLAIM_ACCOUNT when creating their inode caches,
// meaning the objects allocated from them are reclaimable;
// if this flag is set there is no point in trying an even higher page order
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_break_gfp_order)
break;
/*
* Acceptable internal fragmentation?
*/
// accept this order once the left-over (wasted/colouring) area is at most 1/8 of the total slab size
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
// compute how many objects fit in 2^gfporder page frames and how many bytes are left over
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
{
int nr_objs;
size_t mgmt_size;
size_t slab_size = PAGE_SIZE << gfporder;
/*
* The slab management structure can be either off the slab or
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
* - The struct slab
* - One kmem_bufctl_t for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
if (flags & CFLGS_OFF_SLAB) {// the slab management structure lives outside the slab
mgmt_size = 0;
nr_objs = slab_size / buffer_size;
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
} else {// the slab management structure lives inside the slab, alongside the objects
/*
* Ignore padding for the initial guess. The padding
* is at most @align-1 bytes, and @buffer_size is at
* least @align. In the worst case, this result will
* be one greater than the number of objects that fit
* into the memory allocation when taking the padding
* into account.
*/
// each object also needs one kmem_bufctl_t (int-sized) management index
nr_objs = (slab_size - sizeof(struct slab)) /
(buffer_size + sizeof(kmem_bufctl_t));
/*
* This calculated number will be either the right
* amount, or one greater than what we want.
*/
// if rounding the management area up to the alignment overflows the slab, drop one object
// why is dropping just one enough? from size = ALIGN(size, align) we know that
// obj_size >= align, so removing a single object is guaranteed to remove the overflow
if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
> slab_size)
nr_objs--;
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
mgmt_size = slab_mgmt_size(nr_objs, align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
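To make the arithmetic above easy to check by hand, here is a userspace sketch that re-implements the on-slab case of cache_estimate() and then walks the orders the way calculate_slab_order() does, stopping at the 1/8 fragmentation rule. The toy_* structures are stand-ins; the real sizes of struct slab and kmem_bufctl_t depend on kernel version and architecture, so the printed numbers are only indicative.

#include <stdio.h>
#include <stddef.h>

#define TOY_PAGE_SIZE 4096UL
#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct toy_slab {                     /* stand-in for struct slab */
        void *list[2];
        unsigned long colouroff;
        void *s_mem;
        unsigned int inuse;
        unsigned int free;
};

typedef unsigned int toy_bufctl_t;    /* stand-in for kmem_bufctl_t */

static size_t toy_mgmt_size(size_t nr_objs, size_t align)
{
        return TOY_ALIGN(sizeof(struct toy_slab) +
                         nr_objs * sizeof(toy_bufctl_t), align);
}

/* same shape as cache_estimate() for the on-slab case */
static void toy_estimate(int gfporder, size_t buffer_size, size_t align,
                         size_t *left_over, unsigned int *num)
{
        size_t slab_size = TOY_PAGE_SIZE << gfporder;
        size_t nr_objs = (slab_size - sizeof(struct toy_slab)) /
                         (buffer_size + sizeof(toy_bufctl_t));

        /* the initial guess may be one too high once padding is added */
        if (toy_mgmt_size(nr_objs, align) + nr_objs * buffer_size > slab_size)
                nr_objs--;

        *num = nr_objs;
        *left_over = slab_size - nr_objs * buffer_size -
                     toy_mgmt_size(nr_objs, align);
}

int main(void)
{
        size_t left_over;
        unsigned int num;
        int order;

        /* walk the orders the way calculate_slab_order() does, stopping as
         * soon as the wasted space is at most 1/8 of the slab */
        for (order = 0; order <= 3; order++) {
                toy_estimate(order, 600, 8, &left_over, &num);
                printf("order %d: %u objs, %zu bytes left over\n",
                       order, num, left_over);
                if (left_over * 8 <= (TOY_PAGE_SIZE << order))
                        break;
        }
        return 0;
}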
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
// grow the cache by one new slab
static int cache_grow(struct kmem_cache *cachep,
gfp_t flags, int nodeid, void *objp)
{
struct slab *slabp;
size_t offset;
gfp_t local_flags;
struct kmem_list3 *l3;
/*
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
BUG_ON(flags & GFP_SLAB_BUG_MASK);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
/* Take the l3 list lock to change the colour_next on this node */
check_irq_off();
l3 = cachep->nodelists[nodeid];
spin_lock(&l3->list_lock);
/* Get colour for the slab, and cal the next value. */
offset = l3->colour_next;
l3->colour_next++;
if (l3->colour_next >= cachep->colour)
l3->colour_next = 0;
spin_unlock(&l3->list_lock);
offset *= cachep->colour_off;
if (local_flags & __GFP_WAIT)
local_irq_enable();
/*
* The test for missing atomic flag is performed here, rather than
* the more obvious place, simply to reduce the critical path length
* in kmem_cache_alloc(). If a caller is seriously mis-behaving they
* will eventually be caught here (where it matters).
*/
kmem_flagcheck(cachep, flags);
/*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
if (!objp)// if no memory was supplied for the slab, allocate pages from the buddy system
objp = kmem_getpages(cachep, local_flags, nodeid);
if (!objp)
goto failed;
/* Get slab management. */
slabp = alloc_slabmgmt(cachep, objp, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
if (!slabp)
goto opps1;
// record the kmem_cache in page->lru.next and the slab in page->lru.prev of the slab's pages,
// so that kfree() can find the kmem_cache from nothing but a virtual address
slab_map_pages(cachep, slabp, objp);
// link the free objects into a list via their kmem_bufctl_t entries
cache_init_objs(cachep, slabp);
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
spin_lock(&l3->list_lock);
/* Make slab active. */
list_add_tail(&slabp->list, &(l3->slabs_free));
STATS_INC_GROWN(cachep);
l3->free_objects += cachep->num;
spin_unlock(&l3->list_lock);
return 1;
opps1:
kmem_freepages(cachep, objp);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
return 0;
}
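The colouring logic at the top of cache_grow() is easy to miss, so the little userspace loop below replays it with made-up values for cachep->colour and cachep->colour_off, showing how consecutive slabs start their objects at different cache-line-sized offsets.

/* Toy illustration of slab colouring: each new slab begins its objects at a
 * different offset so that equal-index objects in different slabs do not all
 * land on the same cache lines. colour and colour_off are invented values. */
#include <stdio.h>

int main(void)
{
        unsigned int colour = 4;       /* cachep->colour: number of distinct offsets */
        unsigned int colour_off = 64;  /* cachep->colour_off: e.g. L1 cache line size */
        unsigned int colour_next = 0;  /* l3->colour_next */
        int i;

        for (i = 0; i < 6; i++) {
                unsigned int offset = colour_next;

                /* advance and wrap, exactly like cache_grow() */
                colour_next++;
                if (colour_next >= colour)
                        colour_next = 0;

                offset *= colour_off;
                printf("slab %d: first object at byte offset %u\n", i, offset);
        }
        return 0;
}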
// refill the current CPU's array_cache with ac->batchcount objects taken from usable slabs
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
int node;
retry:
check_irq_off();
node = numa_node_id();
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
// no allocation has hit this ac since the last cache_reap(), so it is not very active; cap how many objects are pulled from the slabs in one go
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
l3 = cachep->nodelists[node];
// interrupts are off here, so the ac cannot change under us; ac->avail must be 0
BUG_ON(ac->avail > 0 || !l3);
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
// if this node has a shared array_cache, try to move batchcount objects from it into the current CPU's array_cache
if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
goto alloc_done;
// no shared array_cache, or it holds no free objects: pull objects straight from the slab lists
while (batchcount > 0) {
struct list_head *entry;
struct slab *slabp;
/* Get slab alloc is to come from. */
entry = l3->slabs_partial.next;// look at the partially used slabs first
if (entry == &l3->slabs_partial) {// the partial list is empty
l3->free_touched = 1;
entry = l3->slabs_free.next;// fall back to the completely free slabs
if (entry == &l3->slabs_free)// the free list is empty too: a new slab must be grown
goto must_grow;
}
slabp = list_entry(entry, struct slab, list);
check_slabp(cachep, slabp);
check_spinlock_acquired(cachep);
/*
* The slab was either on partial or free list so
* there must be at least one object available for
* allocation.
*/
BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
// take free objects from this slab and stash them in the current CPU's array_cache
while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
node);
}
check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
list_del(&slabp->list);
if (slabp->free == BUFCTL_END)
list_add(&slabp->list, &l3->slabs_full);
else
list_add(&slabp->list, &l3->slabs_partial);
}
must_grow:
// ac->avail was 0 on entry, so every object now in the ac was taken from this node; subtract them from the node's free-object count
l3->free_objects -= ac->avail;
alloc_done:
spin_unlock(&l3->list_lock);
if (unlikely(!ac->avail)) {
int x;
// grow the cache by one new slab
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
#if 0 /* original kernel code */
if (!x && ac->avail == 0) /* no objects in sight? abort */
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
#else /* an equivalent way to write it */
if (ac->avail == 0) {
/* cache_grow() failed */
if (!x)
return NULL;
goto retry; /* cache_grow() succeeded, the free list is no longer empty: retry the allocation */
}
else {
/* ac->avail became non-zero while cache_grow() had interrupts enabled, because either
 * 1. some other kernel path refilled the ac, or
 * 2. the task migrated to another CPU */
}
#endif
}
ac->touched = 1;
return ac->entry[--ac->avail];
}
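The refill-from-shared-array step above relies on transfer_objects(); the userspace sketch below mimics its behaviour (take up to max of the most recently freed pointers from the tail of the source array, limited by the room left in the destination). The structures and function names are toys, not the kernel's.

#include <stdio.h>
#include <string.h>

struct toy_ac {
        unsigned int avail;     /* number of cached pointers in entry[] */
        unsigned int limit;     /* capacity of entry[] */
        void *entry[16];
};

/* rough analogue of transfer_objects(to, from, max) */
static int toy_transfer(struct toy_ac *to, struct toy_ac *from, unsigned int max)
{
        unsigned int room = to->limit - to->avail;
        unsigned int nr = from->avail;

        if (nr > max)
                nr = max;
        if (nr > room)
                nr = room;
        if (!nr)
                return 0;       /* source empty or destination full */

        /* take the most recently freed (cache-hot) pointers from the tail */
        memcpy(to->entry + to->avail, from->entry + from->avail - nr,
               sizeof(void *) * nr);
        from->avail -= nr;
        to->avail += nr;
        return nr;
}

int main(void)
{
        struct toy_ac shared = { .avail = 5, .limit = 16 };
        struct toy_ac local = { .avail = 0, .limit = 16 };
        int i;

        for (i = 0; i < 5; i++)
                shared.entry[i] = &shared.entry[i];     /* dummy objects */

        printf("moved %d objects, shared array left with %u\n",
               toy_transfer(&local, &shared, 3), shared.avail);
        return 0;
}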
// too many free objects have piled up in the array_cache: move batchcount of them back to their slabs
// the batchcount objects may belong to several different slabs
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_list3 *l3;
int node = numa_node_id();
batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
// the array_cache is per-CPU, so its fields may only be modified with interrupts disabled
check_irq_off();
l3 = cachep->nodelists[node];
spin_lock(&l3->list_lock);
// if this node has a shared array_cache, hand over up to shared_array->limit - shared_array->avail objects (capped at batchcount) to it
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
memcpy(&(shared_array->entry[shared_array->avail]),
ac->entry, sizeof(void *) * batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}
// return batchcount objects to their slabs
free_block(cachep, ac->entry, batchcount, node);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;
p = l3->slabs_free.next;
while (p != &(l3->slabs_free)) {
struct slab *slabp;
slabp = list_entry(p, struct slab, list);
BUG_ON(slabp->inuse);
i++;
p = p->next;
}
STATS_SET_FREEABLE(cachep, i);
}
#endif
spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
// the first batchcount entries have been handed back; slide the remaining ones to the front
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
/*
* Caller needs to acquire correct kmem_list's list_lock
*/
// give nr_objects objects back to their slabs; they may belong to several different slabs
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
int node)
{
int i;
struct kmem_list3 *l3;
for (i = 0; i < nr_objects; i++) {
void *objp = objpp[i];
struct slab *slabp;
// first find the head page of the object's page group; its page->lru.prev is the struct slab
slabp = virt_to_slab(objp);
l3 = cachep->nodelists[node];
list_del(&slabp->list);
check_spinlock_acquired_node(cachep, node);
check_slabp(cachep, slabp);
// put the object back into its slab
slab_put_obj(cachep, slabp, objp, node);
STATS_DEC_ACTIVE(cachep);
l3->free_objects++;
check_slabp(cachep, slabp);
/* fixup slab chains */
// all of this slab's objects have come back; consider destroying it
if (slabp->inuse == 0) {
if (l3->free_objects > l3->free_limit) {
l3->free_objects -= cachep->num;
/* No need to drop any previously held
* lock here, even if we have a off-slab slab
* descriptor it is guaranteed to come from
* a different cache, refer to comments before
* alloc_slabmgmt.
*/
slab_destroy(cachep, slabp);
} else {
// keep the slab cached: put it on the completely-free list
list_add(&slabp->list, &l3->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
list_add_tail(&slabp->list, &l3->slabs_partial);
}
}
}
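The virt_to_slab() call above depends on the page->lru trick noted in cache_grow(): the pages backing a slab record which kmem_cache and which struct slab they belong to. The userspace model below mimics that bookkeeping with toy structures (in the kernel, the page is first located from the object's virtual address via virt_to_page()).

/* Userspace model of the mapping described in the comments: each "page" of a
 * slab records which cache and which slab it belongs to, so freeing by bare
 * pointer can recover both. The struct layouts are toys, not kernel ones. */
#include <stdio.h>

struct toy_cache { const char *name; };
struct toy_slab { int inuse; };

struct toy_page {
        struct { void *next, *prev; } lru;  /* reused as cache/slab links */
};

/* slab_map_pages() analogue: tag a page of a newly grown slab */
static void toy_map_page(struct toy_page *page,
                         struct toy_cache *cache, struct toy_slab *slab)
{
        page->lru.next = cache;
        page->lru.prev = slab;
}

int main(void)
{
        struct toy_cache cache = { "toy_cache" };
        struct toy_slab slab = { 0 };
        struct toy_page page;

        toy_map_page(&page, &cache, &slab);

        /* virt_to_cache()/virt_to_slab() analogue: read the links back */
        printf("object belongs to cache %s, slab inuse=%d\n",
               ((struct toy_cache *)page.lru.next)->name,
               ((struct toy_slab *)page.lru.prev)->inuse);
        return 0;
}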
/**
* cache_reap - Reclaim memory from caches.
* @w: work descriptor
*
* Called from workqueue/eventd every few seconds.
* Purpose:
* - clear the per-cpu caches for this CPU.
* - return freeable pages to the main free memory pool.
*
* If we cannot acquire the cache chain mutex then just give up - we'll try
* again on the next iteration.
*/
// try to destroy the free slabs of each kmem_cache and give their pages back
static void cache_reap(struct work_struct *w)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
int node = numa_node_id();
struct delayed_work *work =
container_of(w, struct delayed_work, work);
if (!mutex_trylock(&cache_chain_mutex))
/* Give up. Setup the next iteration. */
goto out;
list_for_each_entry(searchp, &cache_chain, next) {
check_irq_on();
/*
* We only take the l3 lock if absolutely necessary and we
* have established with reasonable certainty that
* we can do some work if the lock was obtained.
*/
l3 = searchp->nodelists[node];
reap_alien(searchp, l3);
// if ac->avail > ac->limit/5 the ac is considered well stocked, and about ac->limit/5 objects are freed back to the slabs
// if ac->avail < ac->limit/5 the ac is considered low, and only about ac->avail/2 objects are freed
drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
/*
* These are racy checks but it does not matter
* if we skip one check or scan twice.
*/
if (time_after(l3->next_reap, jiffies))
goto next;
l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
drain_array(searchp, l3, l3->shared, 0, node);
// a completely free slab was just consumed by an allocation, so this kmem_cache is allocating actively; do not destroy slabs for now
if (l3->free_touched)
l3->free_touched = 0;
else {
int freed;
// destroy at most about (free_limit / num) / 5 slabs per pass, i.e. one fifth of the free-object limit expressed in slabs
freed = drain_freelist(searchp, l3, (l3->free_limit +
5 * searchp->num - 1) / (5 * searchp->num));
STATS_ADD_REAPED(searchp, freed);
}
next:
cond_resched();
}
check_irq_on();
mutex_unlock(&cache_chain_mutex);
next_reap_node();
out:
/* Set up the next iteration */
schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
}