Category: LINUX

2013-08-04 18:40:40

Q: Why does the SLAB allocator need to call cache_reap() periodically to reclaim pages?
A: The objects in SLAB's per-CPU array_cache may come from several different slabs. After long stretches of allocating and freeing, a fairly extreme situation can arise: the array_cache has borrowed objects from many slabs, and apart from those borrowed objects every other object in those slabs is free, so none of the slabs can be destroyed and a large number of pages stay occupied. After a periodic cache_reap(), the objects borrowed by the array_cache are returned to their slabs; any slab that is then completely free can be destroyed, and subsequent allocations concentrate on one or a few slabs, which solves the problem of pages being held but barely used.

Q: To avoid allocating pages from and returning pages to the buddy system too frequently, each kmem_cache keeps up to kmem_list3->free_limit free objects cached (i.e. at least kmem_list3->free_limit / kmem_cache->num slabs); only when the number of free objects on a kmem_list3 exceeds this threshold does it consider destroying slabs. Under severe memory pressure, will these cached slabs be destroyed to release their pages?
A: In the current kernel implementation, memory reclaim under pressure does not automatically destroy the completely free slabs on a kmem_list3 to release pages, but the kmem_cache_shrink() interface is provided for exactly that purpose.
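
For reference, a minimal sketch (not from this post) of how a module that owns its own kmem_cache could trigger that reclaim itself; my_cache and my_cache_shrink() are hypothetical names:

#include <linux/slab.h>

static struct kmem_cache *my_cache;  /* assumed to be created elsewhere with kmem_cache_create() */

/* Drain my_cache's per-CPU array_caches and free every completely free slab
 * back to the buddy system; kmem_cache_shrink() returns 0 when all slabs
 * could be released. */
static void my_cache_shrink(void)
{
    if (my_cache)
        kmem_cache_shrink(my_cache);
}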


Allocating an object:
kmem_cache_alloc()
 __cache_alloc()
  objp = ____cache_alloc(cachep, flags);
   //get the current CPU's array_cache
   struct array_cache *ac = cpu_cache_get(cachep);
   if (likely(ac->avail)) {//the ac already has objects available; return one immediately
    ac->touched = 1;
    objp = ac->entry[--ac->avail];
   } else {
    //transfer ac->batchcount objects from the available slabs to refill the current CPU's ac
    cache_alloc_refill(cachep, flags);
    objp = ac->entry[--ac->avail];
   }
   return objp;

Freeing an object:
kmem_cache_free()
 __cache_free(cachep, objp);
  struct array_cache *ac = cpu_cache_get(cachep);
  if (likely(ac->avail < ac->limit)) {//the ac is not full; put the object straight back into it
   STATS_INC_FREEHIT(cachep);
   ac->entry[ac->avail++] = objp;
   return;
  } else {
   //the ac is full; move ac->batchcount objects back to the slabs to make room for the object being freed
   cache_flusharray(cachep, ac);
   ac->entry[ac->avail++] = objp;
  }
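
Both fast paths above touch only the per-CPU array_cache; refilling it, flushing it, and growing or reaping slabs are handled by the functions analysed below. As a reminder of how this API is consumed, here is a minimal life-cycle sketch (not from this post); struct foo and foo_cache are hypothetical names:

#include <linux/module.h>
#include <linux/slab.h>

struct foo { int a, b; };

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
    struct foo *obj;

    /* one kmem_cache per object type; objects are carved out of slabs */
    foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                  0, SLAB_HWCACHE_ALIGN, NULL);
    if (!foo_cache)
        return -ENOMEM;

    obj = kmem_cache_alloc(foo_cache, GFP_KERNEL); /* fast path: per-CPU ac */
    if (obj)
        kmem_cache_free(foo_cache, obj);           /* goes back into the ac */
    return 0;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");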

A few important functions:

/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
//based on the object size and alignment, compute how many page frames (which page order) one slab needs; returns the left-over bytes available for colouring
static size_t calculate_slab_order(struct kmem_cache *cachep,
   size_t size, size_t align, unsigned long flags)
{
 unsigned long offslab_limit;
 size_t left_over = 0;
 int gfporder;
 //try page orders in increasing sequence until a suitable one is found
 for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
  unsigned int num;
  size_t remainder;

  cache_estimate(gfporder, size, align, flags, &remainder, &num);
  if (!num)
   continue;

  if (flags & CFLGS_OFF_SLAB) {
   /*
    * Max number of objs-per-slab for caches which
    * use off-slab slabs. Needed to avoid a possible
    * looping condition in cache_grow().
    */
   offslab_limit = size - sizeof(struct slab);
   offslab_limit /= sizeof(kmem_bufctl_t);

    if (num > offslab_limit)
    break;
  }

  /* Found something acceptable - save it away */
  cachep->num = num;
  cachep->gfporder = gfporder;
  left_over = remainder;

  /*
   * A VFS-reclaimable slab tends to have most allocations
   * as GFP_NOFS and we really don't want to have to be allocating
   * higher-order pages when we are unable to shrink dcache.
   */
  //filesystems usually set SLAB_RECLAIM_ACCOUNT when creating their inode caches,
  //indicating that the allocated objects can be reclaimed;
  //when this flag is set there is no point in trying higher-order pages
  if (flags & SLAB_RECLAIM_ACCOUNT)
   break;

  /*
   * Large number of objects is good, but very large slabs are
   * currently bad for the gfp()s.
   */
  if (gfporder >= slab_break_gfp_order)
   break;

  /*
   * Acceptable internal fragmentation?
   */
  //accept this order once the left-over (colouring, i.e. wasted) area is no more than 1/8 of the total slab size
  if (left_over * 8 <= (PAGE_SIZE << gfporder))
   break;
 }
 return left_over;
}

/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
//for a slab of 2^gfporder pages, compute how many objects fit and how many bytes are left over
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
      size_t align, int flags, size_t *left_over,
      unsigned int *num)
{
 int nr_objs;
 size_t mgmt_size;
 size_t slab_size = PAGE_SIZE << gfporder;

 /*
  * The slab management structure can be either off the slab or
  * on it. For the latter case, the memory allocated for a
  * slab is used for:
  *
  * - The struct slab
  * - One kmem_bufctl_t for each object
  * - Padding to respect alignment of @align
  * - @buffer_size bytes for each object
  *
  * If the slab management structure is off the slab, then the
  * alignment will already be calculated into the size. Because
  * the slabs are all pages aligned, the objects will be at the
  * correct alignment when allocated.
  */
 if (flags & CFLGS_OFF_SLAB) {//the slab management area is kept outside the slab itself
  mgmt_size = 0;
  nr_objs = slab_size / buffer_size;

  if (nr_objs > SLAB_LIMIT)
   nr_objs = SLAB_LIMIT;
 } else {//the slab management area shares the slab with the objects
  /*
   * Ignore padding for the initial guess. The padding
   * is at most @align-1 bytes, and @buffer_size is at
   * least @align. In the worst case, this result will
   * be one greater than the number of objects that fit
   * into the memory allocation when taking the padding
   * into account.
   */
  //each object also needs a kmem_bufctl_t (int-sized) management index
  nr_objs = (slab_size - sizeof(struct slab)) /
     (buffer_size + sizeof(kmem_bufctl_t));

  /*
   * This calculated number will be either the right
   * amount, or one greater than what we want.
   */
  //if rounding the management area up to the alignment overflows the slab, drop one object
  //why is dropping just one enough? since size = ALIGN(size, align),
  //obj_size is at least align, so removing one object frees at least the padding that caused the overflow
  if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
         > slab_size)
   nr_objs--;

  if (nr_objs > SLAB_LIMIT)
   nr_objs = SLAB_LIMIT;

  mgmt_size = slab_mgmt_size(nr_objs, align);
 }
 *num = nr_objs;
 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
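
To make the arithmetic above concrete, here is a small user-space approximation (not kernel code) of the on-slab branch of cache_estimate() combined with the acceptance checks of calculate_slab_order(); the sizes assumed for struct slab and kmem_bufctl_t are illustrative only:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define SLAB_DESC_SIZE 32UL                 /* assumed sizeof(struct slab) */
#define BUFCTL_SIZE    sizeof(unsigned int) /* assumed sizeof(kmem_bufctl_t) */

static unsigned long align_up(unsigned long x, unsigned long a)
{
    return (x + a - 1) & ~(a - 1);
}

/* on-slab variant: the management area and the objects share the pages */
static void estimate(unsigned long order, unsigned long size, unsigned long align,
                     unsigned long *left_over, unsigned long *num)
{
    unsigned long slab_size = PAGE_SIZE << order;
    unsigned long nr = (slab_size - SLAB_DESC_SIZE) / (size + BUFCTL_SIZE);
    unsigned long mgmt = align_up(SLAB_DESC_SIZE + nr * BUFCTL_SIZE, align);

    if (mgmt + nr * size > slab_size)  /* alignment pushed us over: drop one object */
        nr--;
    mgmt = align_up(SLAB_DESC_SIZE + nr * BUFCTL_SIZE, align);
    *num = nr;
    *left_over = slab_size - nr * size - mgmt;
}

int main(void)
{
    unsigned long size = 256, align = 64;  /* example object size and alignment */
    unsigned long order, left, num;

    for (order = 0; order < 4; order++) {
        estimate(order, size, align, &left, &num);
        printf("order %lu: %lu objects, %lu bytes left over for colouring\n",
               order, num, left);
        if (num && left * 8 <= (PAGE_SIZE << order)) /* waste <= 1/8: accept */
            break;
    }
    return 0;   /* for 256-byte objects: order 0, 15 objects, 128 bytes left over */
}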

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
//grow the cache by one slab
static int cache_grow(struct kmem_cache *cachep,
  gfp_t flags, int nodeid, void *objp)
{
 struct slab *slabp;
 size_t offset;
 gfp_t local_flags;
 struct kmem_list3 *l3;

 /*
  * Be lazy and only check for valid flags here,  keeping it out of the
  * critical path in kmem_cache_alloc().
  */
 BUG_ON(flags & GFP_SLAB_BUG_MASK);
 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

 /* Take the l3 list lock to change the colour_next on this node */
 check_irq_off();
 l3 = cachep->nodelists[nodeid];
 spin_lock(&l3->list_lock);

 /* Get colour for the slab, and cal the next value. */
 offset = l3->colour_next;
 l3->colour_next++;
 if (l3->colour_next >= cachep->colour)
  l3->colour_next = 0;
 spin_unlock(&l3->list_lock);

 offset *= cachep->colour_off;

 if (local_flags & __GFP_WAIT)
  local_irq_enable();

 /*
  * The test for missing atomic flag is performed here, rather than
  * the more obvious place, simply to reduce the critical path length
  * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
  * will eventually be caught here (where it matters).
  */
 kmem_flagcheck(cachep, flags);

 /*
  * Get mem for the objs.  Attempt to allocate a physical page from
  * 'nodeid'.
  */
 if (!objp)//if no memory was supplied for the slab, allocate pages from the buddy system
  objp = kmem_getpages(cachep, local_flags, nodeid);
 if (!objp)
  goto failed;

 /* Get slab management. */
 slabp = alloc_slabmgmt(cachep, objp, offset,
   local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
 if (!slabp)
  goto opps1;
 //lru.next and lru.prev of each page backing the slab are set to the kmem_cache and the slab respectively,
 //so kfree() can find the kmem_cache from nothing but a virtual address
 slab_map_pages(cachep, slabp, objp);
 //link the objects into a free list via their kmem_bufctl_t entries
 cache_init_objs(cachep, slabp);

 if (local_flags & __GFP_WAIT)
  local_irq_disable();
 check_irq_off();
 spin_lock(&l3->list_lock);

 /* Make slab active. */
 list_add_tail(&slabp->list, &(l3->slabs_free));
 STATS_INC_GROWN(cachep);
 l3->free_objects += cachep->num;
 spin_unlock(&l3->list_lock);
 return 1;
opps1:
 kmem_freepages(cachep, objp);
failed:
 if (local_flags & __GFP_WAIT)
  local_irq_disable();
 return 0;
}
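
A small worked example (not kernel code) of the colour handling at the top of cache_grow(): assuming two colours (e.g. 128 left-over bytes, as in the example above, with a 64-byte colour_off), successive slabs place their objects at alternating offsets, spreading them across different cache lines:

#include <stdio.h>

int main(void)
{
    unsigned int colour = 2, colour_off = 64, colour_next = 0;
    int i;

    for (i = 0; i < 5; i++) {           /* five successive cache_grow() calls */
        unsigned int offset = colour_next * colour_off;

        colour_next++;                  /* same wrap-around as l3->colour_next */
        if (colour_next >= colour)
            colour_next = 0;
        printf("slab %d starts its objects at offset %u\n", i, offset);
        /* prints 0, 64, 0, 64, 0 */
    }
    return 0;
}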

//transfer ac->batchcount objects from the available slabs into the current CPU's array_cache
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
 int batchcount;
 struct kmem_list3 *l3;
 struct array_cache *ac;
 int node;

retry:
 check_irq_off();
 node = numa_node_id();
 ac = cpu_cache_get(cachep);
 batchcount = ac->batchcount;
 //no object has been allocated from this ac since the last cache_reap(), so allocation on this ac is not very active; limit how many objects are pulled from the slabs in one go
 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
  /*
   * If there was little recent activity on this cache, then
   * perform only a partial refill.  Otherwise we could generate
   * refill bouncing.
   */
  batchcount = BATCHREFILL_LIMIT;
 }
 l3 = cachep->nodelists[node];
 //interrupts are disabled here, so the ac cannot change under us; ac->avail must be 0 at this point
 BUG_ON(ac->avail > 0 || !l3);
 spin_lock(&l3->list_lock);

 /* See if we can refill from the shared array */
 //if this node has a shared array_cache, try to move batchcount objects from it into the current CPU's array_cache
 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
  goto alloc_done;
 //there is no shared array_cache, or it holds no free objects
 while (batchcount > 0) {
  struct list_head *entry;
  struct slab *slabp;
  /* Get slab alloc is to come from. */
  entry = l3->slabs_partial.next;//look on the partially-used slab list first
  if (entry == &l3->slabs_partial) {//the partial list is empty
   l3->free_touched = 1;
   entry = l3->slabs_free.next;//fall back to the completely-free slab list
   if (entry == &l3->slabs_free)//the free list is empty too; a new slab must be grown
    goto must_grow;
  }

  slabp = list_entry(entry, struct slab, list);
  check_slabp(cachep, slabp);
  check_spinlock_acquired(cachep);

  /*
   * The slab was either on partial or free list so
   * there must be at least one object available for
   * allocation.
   */
  BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
  //take free objects from the slab and stash them in the current CPU's array_cache
  while (slabp->inuse < cachep->num && batchcount--) {
   STATS_INC_ALLOCED(cachep);
   STATS_INC_ACTIVE(cachep);
   STATS_SET_HIGH(cachep);

   ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
           node);
  }
  check_slabp(cachep, slabp);

  /* move slabp to correct slabp list: */
  list_del(&slabp->list);
  if (slabp->free == BUFCTL_END)
   list_add(&slabp->list, &l3->slabs_full);
  else
   list_add(&slabp->list, &l3->slabs_partial);
 }

must_grow:
 //ac->avail was 0 on entry, so it now equals the number of objects just taken from the slabs; subtract that from the node's free-object count
 l3->free_objects -= ac->avail;
alloc_done:
 spin_unlock(&l3->list_lock);

 if (unlikely(!ac->avail)) {
  int x;
  //grow the cache by one slab
  x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

  /* cache_grow can reenable interrupts, then ac could change. */
  ac = cpu_cache_get(cachep);
#if 0
  if (!x && ac->avail == 0) /* no objects in sight? abort */
   return NULL;

  if (!ac->avail)  /* objects refilled by interrupt? */
   goto retry;
#else //equivalent to:
  if(ac->avail == 0){
   //cache_grow() failed
   if(!x)
    return NULL; 
    goto retry; //cache_grow() succeeded and the free list is no longer empty; retry the allocation
  }
  else{ 
   //1. another kernel path refilled the ac while cache_grow() had interrupts enabled
   //2. the task was moved to another CPU while cache_grow() had interrupts enabled
  }
#endif
 }
 ac->touched = 1;
 return ac->entry[--ac->avail];
}

//the array_cache holds too many free objects; move batchcount of them back to the slabs
//the batchcount objects may belong to several different slabs
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
 int batchcount;
 struct kmem_list3 *l3;
 int node = numa_node_id();

 batchcount = ac->batchcount;
#if DEBUG
 BUG_ON(!batchcount || batchcount > ac->avail);
#endif
 //the array_cache is per-CPU, so its fields may only be modified with interrupts disabled
 check_irq_off();
 l3 = cachep->nodelists[node];
 spin_lock(&l3->list_lock);
 //if this node has a shared array_cache, put at most (shared_array->limit - shared_array->avail) of the objects into it
 if (l3->shared) {
  struct array_cache *shared_array = l3->shared;
  int max = shared_array->limit - shared_array->avail;
  if (max) {
   if (batchcount > max)
    batchcount = max;
   memcpy(&(shared_array->entry[shared_array->avail]),
          ac->entry, sizeof(void *) * batchcount);
   shared_array->avail += batchcount;
   goto free_done;
  }
 }
 //return batchcount objects to their slabs
 free_block(cachep, ac->entry, batchcount, node);
free_done:
#if STATS
 {
  int i = 0;
  struct list_head *p;

  p = l3->slabs_free.next;
  while (p != &(l3->slabs_free)) {
   struct slab *slabp;

   slabp = list_entry(p, struct slab, list);
   BUG_ON(slabp->inuse);

   i++;
   p = p->next;
  }
  STATS_SET_FREEABLE(cachep, i);
 }
#endif
 spin_unlock(&l3->list_lock);
 ac->avail -= batchcount;
 //the first batchcount objects have been released; slide the remaining ones to the front of the array
 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}

/*
 * Caller needs to acquire correct kmem_list's list_lock
 */
//return nr_objects objects to their slabs; they may belong to several different slabs
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
         int node)
{
 int i;
 struct kmem_list3 *l3;

 for (i = 0; i < nr_objects; i++) {
  void *objp = objpp[i];
  struct slab *slabp;
  //first get the head page of the page group; page->lru.prev holds the struct slab
  slabp = virt_to_slab(objp);
  l3 = cachep->nodelists[node];
  list_del(&slabp->list);
  check_spinlock_acquired_node(cachep, node);
  check_slabp(cachep, slabp);
  //give the object back to its slab
  slab_put_obj(cachep, slabp, objp, node);
  STATS_DEC_ACTIVE(cachep);
  l3->free_objects++;
  check_slabp(cachep, slabp);

  /* fixup slab chains */
  //all of this slab's objects have come back; consider destroying it
  if (slabp->inuse == 0) {
   if (l3->free_objects > l3->free_limit) {
    l3->free_objects -= cachep->num;
    /* No need to drop any previously held
     * lock here, even if we have a off-slab slab
     * descriptor it is guaranteed to come from
     * a different cache, refer to comments before
     * alloc_slabmgmt.
     */
    slab_destroy(cachep, slabp);
   } else {
    //keep the slab cached: add it to the completely-free list
    list_add(&slabp->list, &l3->slabs_free);
   }
  } else {
   /* Unconditionally move a slab to the end of the
    * partial list on free - maximum time for the
    * other objects to be freed, too.
    */
   list_add_tail(&slabp->list, &l3->slabs_partial);
  }
 }
}

/**
 * cache_reap - Reclaim memory from caches.
 * @w: work descriptor
 *
 * Called from workqueue/eventd every few seconds.
 * Purpose:
 * - clear the per-cpu caches for this CPU.
 * - return freeable pages to the main free memory pool.
 *
 * If we cannot acquire the cache chain mutex then just give up - we'll try
 * again on the next iteration.
 */
//try to destroy the free slabs of each kmem_cache and reclaim their pages
static void cache_reap(struct work_struct *w)
{
 struct kmem_cache *searchp;
 struct kmem_list3 *l3;
 int node = numa_node_id();
 struct delayed_work *work =
  container_of(w, struct delayed_work, work);

 if (!mutex_trylock(&cache_chain_mutex))
  /* Give up. Setup the next iteration. */
  goto out;

 list_for_each_entry(searchp, &cache_chain, next) {
  check_irq_on();

  /*
   * We only take the l3 lock if absolutely necessary and we
   * have established with reasonable certainty that
   * we can do some work if the lock was obtained.
   */
  l3 = searchp->nodelists[node];

  reap_alien(searchp, l3);
  //if ac->avail > ac->limit/5, the ac holds quite a few objects: release about ac->limit/5 of them back to the slabs
  //if ac->avail < ac->limit/5, the ac holds few objects: release about ac->avail/2 (a small worked example of these thresholds follows cache_reap() below)
  drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);

  /*
   * These are racy checks but it does not matter
   * if we skip one check or scan twice.
   */
  if (time_after(l3->next_reap, jiffies))
   goto next;

  l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

  drain_array(searchp, l3, l3->shared, 0, node);

  //a completely free slab was consumed by a recent allocation, so this kmem_cache is allocating actively; skip destroying slabs this time
  if (l3->free_touched)
   l3->free_touched = 0;
  else {
   int freed;
    //destroy at most about one fifth of the free_limit's worth of slabs, i.e. free_limit / (5 * num) rounded up
   freed = drain_freelist(searchp, l3, (l3->free_limit +
    5 * searchp->num - 1) / (5 * searchp->num));
   STATS_ADD_REAPED(searchp, freed);
  }
next:
  cond_resched();
 }
 check_irq_on();
 mutex_unlock(&cache_chain_mutex);
 next_reap_node();
out:
 /* Set up the next iteration */
 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
}
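
A small worked example (not kernel code) approximating the drain_array() thresholds described in the comments above: with ac->limit = 120 the threshold is about limit/5 = 24 objects, so an ac holding 80 objects gives 24 back to the slabs, while an ac holding only 10 gives back 5:

#include <stdio.h>

static unsigned int objs_to_drain(unsigned int avail, unsigned int limit)
{
    unsigned int tofree = (limit + 4) / 5;  /* roughly limit/5 */

    if (tofree > avail)
        tofree = (avail + 1) / 2;           /* few objects: drain about half */
    return tofree;
}

int main(void)
{
    printf("avail=80,  limit=120 -> drain %u objects\n", objs_to_drain(80, 120));  /* 24 */
    printf("avail=10,  limit=120 -> drain %u objects\n", objs_to_drain(10, 120));  /* 5  */
    return 0;
}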
