Category: LINUX
2013-08-04 18:49:55
Q: What is PG_slub_frozen for?
A: SLUB has no dedicated slab-management structure; the struct page itself plays that role, and PG_slub_frozen marks a slab that is attached to a CPU. Its main use is in __slab_free(): if the object being returned is the last in-use object of the slab, the slab could in principle be destroyed, but if PG_slub_frozen is set the slab is still attached to some CPU and must not be destroyed yet. In that sense PG_slub_frozen acts like a reference count: while it is set, a CPU is still using the slab and its pages cannot be handed back to the buddy system.
Q: Can a slab with inuse == 0 appear on a kmem_cache_node's partial list?
A: Yes. When a CPU has to be detached from its slab, flush_slab() is called, and at that moment the slab attached to the CPU may have inuse == 0. In flush_slab() -> deactivate_slab() -> unfreeze_slab(), if kmem_cache_node->nr_partial < kmem_cache_node->min_partial, the completely free slab is put on the partial list instead of being returned to the buddy system.
Q: SLAB vs. SLUB?
a. SLAB manages slabs with three lists per memory node (free, partially free, full); SLUB keeps only a single partial list per node, holding the partially free slabs, which greatly cuts down list manipulation.
b. SLAB uses a separate slab descriptor to manage the objects in a slab and has to work out where to store that descriptor and its management area; SLUB cleverly lets struct page stand in for the slab descriptor, so there is no separate management area and the memory overhead is lower, at the cost of making struct page's fields a little more complicated.
c. SLAB uses coloring to mitigate the CPU L1 cache-line conflicts that come with physically tagged, virtually indexed caches (although the real benefit is hard to measure); SLUB does not bother with coloring.
d. SLAB implements the slab-internal singly linked free list with kmem_bufctl_t entries; SLUB instead stores, inside each free object, a pointer to the next free object, chaining the objects into a singly linked list (see the sketch after this list).
e. SLAB implements, for each memory node, an array_cache shared by all CPUs; SLUB has nothing comparable, its kmem_cache_cpu is strictly per CPU.
f. Debugging SLAB requires rebuilding the kernel; debugging SLUB only requires adding the slub_debug option on the kernel command line at boot.
g. SLAB's array_cache keeps an array of object pointers and can therefore hold objects from several slabs; to reduce lock contention and list operations, objects are borrowed from and returned to the slabs in batches. SLUB's kmem_cache_cpu is attached to exactly one slab, so no batching is needed, but on NUMA platforms this also brings the overhead of attaching and detaching slabs.
h. SLAB has to call cache_reap() periodically to reclaim pages that may be held but unused.
i. SLAB's code is arguably more readable; but after reading the SLUB code there is only one word for its author: niubility!
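To make item d concrete: in the mm/slub.c of roughly the same vintage as the code quoted below, the free pointer lives at byte offset s->offset inside each free object and is read and written through two small helpers. A minimal sketch, not a verbatim quote:
/* the next-free pointer is stored at byte offset s->offset inside each free object */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
        return *(void **)(object + s->offset);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
        *(void **)(object + s->offset) = fp;
}
kmem_cache_cpu carries the same offset in units of void * (c->offset), which is why the fast paths below can simply index object[c->offset] instead of calling these helpers.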
Allocating memory:
kmem_cache_alloc()
slab_alloc()
kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
objsize = c->objsize;
if (unlikely(!c->freelist || !node_match(c, node)))//the current CPU's freelist is empty or its slab is on the wrong node (node_match() is sketched further down)
//attach a slab that still has free objects to the CPU and allocate an object from it
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
//fast path: just pop the first element off the per-CPU freelist
object = c->freelist;
c->freelist = object[c->offset];
stat(c, ALLOC_FASTPATH);
}
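node_match(), tested on the fast path above, only matters on NUMA; a rough sketch based on the same kernel version, not a verbatim quote:
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
        /* node == -1 means the caller does not care which node the object comes from */
        if (node != -1 && c->node != node)
                return 0;
#endif
        return 1;
}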
Freeing memory:
kmem_cache_free()
page = virt_to_head_page(x);
struct page *page = virt_to_page(x);
return compound_head(page);//for a compound page this returns its head (first) page
slab_free();
//the object belongs to the slab currently attached to this CPU: fast free
if (likely(page == c->page && c->node >= 0)) {
object[c->offset] = c->freelist;
c->freelist = object;
stat(c, FREE_FASTPATH);
} else
//handle the case where the freed object's slab is not the slab attached to the current CPU
__slab_free(s, page, x, addr, c->offset);
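For reference, this is what the public API feeding these fast and slow paths looks like from a caller's point of view. The cache name, struct foo and the functions below are made up for illustration:
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct foo {
        int id;
        struct list_head node;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
        /* one cache per object type; SLUB sizes the slabs and picks the
         * free-pointer offset (s->offset) on its own */
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, NULL);
        return foo_cachep ? 0 : -ENOMEM;
}

static void foo_example(void)
{
        /* fast path of slab_alloc(): pops an object off c->freelist when it is not empty */
        struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

        if (!f)
                return;

        /* fast path of slab_free(): pushes the object back onto c->freelist when it
         * belongs to this CPU's slab, otherwise __slab_free() below runs */
        kmem_cache_free(foo_cachep, f);
}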
A few important functions:
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
*
* Interrupts are disabled.
*
* Processing is still very fast if new objects have been freed to the
* regular freelist. In that case we simply take over the regular freelist
* as the lockless freelist and zap the regular freelist.
*
* If that is not working then we fall back to the partial lists. We take the
* first element of the freelist as the object to allocate now and move the
* rest of the freelist to the lockless freelist.
*
* And if we were unable to get a new slab from the partial slab lists then
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*/
//handle the case where the current CPU's freelist has no object available
static void *__slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
{
void **object;
struct page *new;
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
//the current CPU has no slab attached yet; this happens:
//1. on the very first call to __slab_alloc()
//2. after flush_slab() has detached the CPU from its slab
if (!c->page)
goto new_slab;
//kmem_cache_alloc() and kmem_cache_free() may run concurrently on two CPUs and touch the
//same slab; since both allocation and free modify page->freelist and page->inuse, PG_locked
//is borrowed for mutual exclusion (see the slab_lock() sketch after this function)
slab_lock(c->page);
if (unlikely(!node_match(c, node)))
goto another_slab;
stat(c, ALLOC_REFILL);
load_freelist:
//if objects have been freed back to the slab attached to this CPU, c->page->freelist is not
//empty and the CPU simply takes over all of the slab's free objects again; in that case
//allocation is very fast
//if all objects of the current slab are in use, c->page->freelist is NULL and a new slab
//has to be attached
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
goto debug;
//attach the slab to the current CPU: the CPU takes over the slab's freelist, and later allocations simply pop elements off that list
c->freelist = object[c->offset];
//all of the slab's objects are now owned by the current CPU, so:
//1. mark every object in the slab as in use
//2. clear the slab's own freelist pointer
c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
slab_unlock(c->page);
stat(c, ALLOC_SLOWPATH);
return object;
another_slab:
deactivate_slab(s, c);//detach the slab from the CPU
new_slab:
//the slab returned by get_partial() is attached to the CPU right away, so PG_slub_frozen is set as soon as it is picked
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
stat(c, ALLOC_FROM_PARTIAL);
goto load_freelist;
}
if (gfpflags & __GFP_WAIT)
local_irq_enable();
new = new_slab(s, gfpflags, node);
if (gfpflags & __GFP_WAIT)
local_irq_disable();
if (new) {
c = get_cpu_slab(s, smp_processor_id());
stat(c, ALLOC_SLAB);
//when __GFP_WAIT is set in gfpflags, interrupts are enabled around new_slab(), and two things may happen:
//1. another kernel path runs __slab_alloc() and attaches some other slab to this CPU
//2. the thread migrates, so this code may no longer be running on the CPU it started on
//in both cases c->page may be non-NULL, so the current CPU must be detached from that slab before being attached to the new one
if (c->page)
flush_slab(s, c);
slab_lock(new);
__SetPageSlubFrozen(new);
c->page = new;
goto load_freelist;
}
return NULL;
debug:
if (!alloc_debug_processing(s, c->page, object, addr))
goto another_slab;
c->page->inuse++;
c->page->freelist = object[c->offset];
c->node = -1;
goto unlock_out;
}
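The slab_lock()/slab_unlock() pair used above is just a bit spinlock on the PG_locked bit of the page, which is what the comment about borrowing PG_locked refers to. A rough sketch based on the same kernel version:
static __always_inline void slab_lock(struct page *page)
{
        /* per-slab mutual exclusion between the allocation and free paths */
        bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
        __bit_spin_unlock(PG_locked, &page->flags);
}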
/*
* Remove the cpu slab
*/
//detach the slab from the CPU
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
struct page *page = c->page;
int tail = 1;
if (page->freelist)
stat(c, DEACTIVATE_REMOTE_FREES);
/*
* Merge cpu freelist into slab freelist. Typically we get here
* because both freelists are empty. So this is unlikely
* to occur.
*/
//give the free objects on the CPU freelist back to the slab's own freelist; only then:
//1. page->inuse has the correct value
//2. the slab's freelist is complete
while (unlikely(c->freelist)) {
void **object;
tail = 0; /* Hot objects. Put the slab first */
/* Retrieve object from cpu_freelist */
object = c->freelist;
c->freelist = c->freelist[c->offset];
/* And put onto the regular freelist */
object[c->offset] = page->freelist;
page->freelist = object;
page->inuse--;
}
c->page = NULL;//detach the slab from the CPU
unfreeze_slab(s, page, tail);
}
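flush_slab(), mentioned several times above, is only a thin wrapper around deactivate_slab(); a rough sketch based on the same kernel version:
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
        stat(c, CPUSLAB_FLUSH);
        slab_lock(c->page);
        /* hand the CPU freelist back to the slab and drop the association */
        deactivate_slab(s, c);
}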
/*
* Move a page back to the lists.
*
* Must be called with the slab lock held.
*
* On exit the slab lock will have been dropped.
*/
//clear the flag marking the slab as attached to a CPU; if some of its objects are still in use,
//put the slab on the partial list; if every object has been freed, consider destroying it
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
__ClearPageSlubFrozen(page);//clear the "attached to a CPU" flag
if (page->inuse) {//not all of the slab's objects have come back yet
if (page->freelist) {//the slab still has free objects: put it on the partial list
add_partial(n, page, tail);
stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {//the slab has no free objects; with CONFIG_SLUB_DEBUG enabled it goes on the full list
stat(c, DEACTIVATE_FULL);
if (SLABDEBUG && PageSlubDebug(page) &&
(s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
} else {//all of the slab's objects have come back: it may be destroyed
stat(c, DEACTIVATE_EMPTY);
if (n->nr_partial < n->min_partial) {
/*
* Adding an empty slab to the partial slabs in order
* to avoid page allocator overhead. This slab needs
* to come after the other slabs with objects in
* so that the others get filled first. That way the
* size of the partial list stays small.
*
* kmem_cache_shrink can reclaim any empty slabs from
* the partial list.
*/
add_partial(n, page, 1);
slab_unlock(page);
} else {
slab_unlock(page);
stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
//destroy the slab and return its pages to the buddy system
discard_slab(s, page);
}
}
}
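add_partial(), called above, puts the slab on the node's partial list; the tail argument chooses between head and tail insertion so that slabs with cache-hot objects are reused first and nearly empty slabs sink to the back. A rough sketch based on the same kernel version:
static void add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
        spin_lock(&n->list_lock);
        n->nr_partial++;
        if (tail)
                /* empty or cold slab: let the others be filled first */
                list_add_tail(&page->lru, &n->partial);
        else
                /* slab with hot objects: hand it out again as soon as possible */
                list_add(&page->lru, &n->partial);
        spin_unlock(&n->list_lock);
}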
//request 2^order pages from the buddy system and turn them into a slab
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
void *start;
void *last;
void *p;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
if (!page)
goto out;
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_TRACE))
__SetPageSlubDebug(page);
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
//1. split the PAGE_SIZE * 2^order region starting at start into page->objects equal-sized objects
//2. store the address of the next object at the start of each object, chaining all objects into a singly linked list
last = start;
for_each_object(p, s, start, page->objects) {
setup_object(s, page, last);
set_freepointer(s, last, p);
last = p;
}
setup_object(s, page, last);
set_freepointer(s, last, NULL);
//page->freelist points to the first object
page->freelist = start;
page->inuse = 0;
out:
return page;
}
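for_each_object() and setup_object(), used in the loop above, are small helpers that step through the slab one object (s->size bytes) at a time and run the optional constructor; a rough sketch based on the same kernel version:
/* walk the slab memory in steps of s->size, i.e. one object at a time */
#define for_each_object(__p, __s, __addr, __objects) \
        for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size; \
                        __p += (__s)->size)
static void setup_object(struct kmem_cache *s, struct page *page, void *object)
{
        setup_object_debug(s, page, object);
        /* run the constructor, if one was passed to kmem_cache_create() */
        if (unlikely(s->ctor))
                s->ctor(object);
}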
/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
* lock and free the item. If there is no additional partial page
* handling required then we can return immediately.
*/
//handle the case where the freed object's slab is not the slab attached to the current CPU
static void __slab_free(struct kmem_cache *s, struct page *page,
void *x, void *addr, unsigned int offset)
{
void *prior;
void **object = (void *)x;
struct kmem_cache_cpu *c;
c = get_cpu_slab(s, raw_smp_processor_id());
stat(c, FREE_SLOWPATH);
slab_lock(page);
if (unlikely(SLABDEBUG && PageSlubDebug(page)))
goto debug;
checks_ok:
//prior records the old page->freelist; prior != NULL means the slab already had free objects, i.e. it is already on the partial list
prior = object[offset] = page->freelist;
page->freelist = object;
page->inuse--;
//if PG_slub_frozen is set, the slab is still attached to a CPU (the CPU running
//kmem_cache_free() is not the CPU the slab is attached to), so the slab must not
//be destroyed even if page->inuse drops to 0
if (unlikely(PageSlubFrozen(page))) {
stat(c, FREE_FROZEN);
goto out_unlock;
}
//the object being freed was the only in-use object in the slab and no CPU has the slab
//attached, so now that it is back the slab can be destroyed
if (unlikely(!page->inuse))
goto slab_empty;
/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
//page->freelist == NULL means the slab's objects are either cached in a kmem_cache_cpu or
//handed out by kmem_cache_alloc(); the slab itself has no free object, so it is not on the partial list
if (unlikely(!prior)) {
//the slab was not on the partial list before: add it
add_partial(get_node(s, page_to_nid(page)), page, 1);
stat(c, FREE_ADD_PARTIAL);
}
out_unlock:
slab_unlock(page);
return;
slab_empty:
//page->freelist was not NULL, so the slab must already be on the partial list
if (prior) {
/*
* Slab still on the partial list.
*/
//remove the slab from the partial list before returning it to the buddy system
remove_partial(s, page);
stat(c, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
stat(c, FREE_SLAB);
//destroy the slab
discard_slab(s, page);
return;
debug:
if (!free_debug_processing(s, page, x, addr))
goto out_unlock;
goto checks_ok;
}