Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1212420
  • 博文数量: 56
  • 博客积分: 400
  • 博客等级: 一等列兵
  • 技术积分: 2800
  • 用 户 组: 普通用户
  • 注册时间: 2010-03-30 13:08
个人简介

一个人的差异在于业余时间

文章分类

全部博文(56)

文章存档

2023年(1)

2019年(1)

2018年(1)

2017年(1)

2016年(2)

2015年(20)

2014年(10)

2013年(7)

2012年(12)

2011年(1)

分类: LINUX

2015-01-26 14:51:42

  上一篇文章中简单说了下slab分配器下kmalloc是如何分配内存的。在看cache_alloc_refill这个函数的时候逻辑上还有一些困惑。

点击(此处)折叠或打开

  1. static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
  2.                             bool force_refill)
  3. {
  4.     int batchcount;
  5.     struct kmem_list3 *l3;
  6.     struct array_cache *ac;
  7.     int node;

  8.     check_irq_off();
  9.     node = numa_mem_id();
  10.     if (unlikely(force_refill))
  11.         goto force_grow;
  12. retry:
  13.     ac = cpu_cache_get(cachep);
  14.     batchcount = ac->batchcount;  
  15.     if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
  16.         /*
  17.          * If there was little recent activity on this cache, then
  18.          * perform only a partial refill. Otherwise we could generate
  19.          * refill bouncing.
  20.          */
  21.         batchcount = BATCHREFILL_LIMIT;
  22.     }
  23.     l3 = cachep->nodelists[node];

  1.     BUG_ON(ac->avail > 0 || !l3);
  2.     spin_lock(&l3->list_lock);

  3.     /* See if we can refill from the shared array */
  4.     if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
  5.         l3->shared->touched = 1;
  6.         goto alloc_done;
  7.     }

  8.     while (batchcount > 0) {
  9.         struct list_head *entry;
  10.         struct slab *slabp;
  11.         /* Get slab alloc is to come from. */
  12.         entry = l3->slabs_partial.next;
  13.         if (entry == &l3->slabs_partial) {
  14.             l3->free_touched = 1;
  15.             entry = l3->slabs_free.next;
  16.             if (entry == &l3->slabs_free)
  17.                 goto must_grow;
  18.         }

  19.         slabp = list_entry(entry, struct slab, list);
  20.         check_slabp(cachep, slabp);
  21.         check_spinlock_acquired(cachep);

  22.         /*
  23.          * The slab was either on partial or free list so
  24.          * there must be at least one object available for
  25.          * allocation.
  26.          */
  27.         BUG_ON(slabp->inuse >= cachep->num);

  28.         while (slabp->inuse < cachep->num && batchcount--) {
  29.             STATS_INC_ALLOCED(cachep);
  30.             STATS_INC_ACTIVE(cachep);
  31.             STATS_SET_HIGH(cachep);

  32.             ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
  33.                                     node));
  34.         }
  35.         check_slabp(cachep, slabp);

  36.         /* move slabp to correct slabp list: */
  37.         list_del(&slabp->list);
  38.         if (slabp->free == BUFCTL_END)
  39.             list_add(&slabp->list, &l3->slabs_full);
  40.         else
  41.             list_add(&slabp->list, &l3->slabs_partial);
  42.     }

  43. must_grow:
  44.     l3->free_objects -= ac->avail;
  45. alloc_done:
  46.     spin_unlock(&l3->list_lock);

  47.     if (unlikely(!ac->avail)) {
  48.         int x;
  49. force_grow:
  50.         x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

  51.         /* cache_grow can reenable interrupts, then ac could change. */
  52.         ac = cpu_cache_get(cachep);
  53.         node = numa_mem_id();

  54.         /* no objects in sight? abort */
  55.         if (!x && (ac->avail == 0 || force_refill))
  56.             return NULL;

  57.         if (!ac->avail)        /* objects refilled by interrupt? */
  58.             goto retry;
  59.     }
  60.     ac->touched = 1;

  61.     return ac_get_obj(cachep, ac, flags, force_refill);
  62. }
困惑主要集中在 batchcount = ac->batchcount; 这一行。在默认初始化阶段(即 kmem_cache_init 中),系统的 cache 都会经由 __kmem_cache_create 调用到 setup_cpu_cache,其中有这样一段代码:

点击(此处)折叠或打开

  1. cpu_cache_get(cachep)->avail = 0;
  2.     cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
  3.     cpu_cache_get(cachep)->batchcount = 1;
  4.     cpu_cache_get(cachep)->touched = 0;
  5.     cachep->batchcount = 1;
  6.     cachep->limit = BOOT_CPUCACHE_ENTRIES;
  7.     return 0;
那么我是不是就可以认为 ac->batchcount 的值就是 1 了呢?这样 cache_alloc_refill 每次只会通过 ac_put_obj 往 array 中放入一个 obj。如果每次都这样,那么 ____cache_alloc 中的这段代码

点击(此处)折叠或打开

  1. ac = cpu_cache_get(cachep);
  2.     if (likely(ac->avail)) {
  3.         ac->touched = 1;
  4.         objp = ac_get_obj(cachep, ac, flags, false);

  5.         /*
  6.          * Allow for the possibility all avail objects are not allowed
  7.          * by the current flags
  8.          */
  9.         if (objp) {
  10.             STATS_INC_ALLOCHIT(cachep);
  11.             goto out;
  12.         }
  13.         force_refill = true;
  14.     }
它的意义又何在呢?因为 batchcount 为 1 的话,每次 refill 只放入一个 obj 到 array,avail 从 0 变为 1;而 get 走这个 obj 之后,avail 又变回 0。于是 likely(ac->avail) 这条快速路径几乎走不到,每次分配都要重新 refill,效率显然很低。
后来才发现是自己代码没看全- -。我们来看下面这段代码,它是在 kmem_cache_init 初始化完成之后被调用的:

点击(此处)折叠或打开

  1. void __init kmem_cache_init_late(void)
  2. {
  3.     struct kmem_cache *cachep;

  4.     slab_state = UP;

  5.     /* 6) resize the head arrays to their final sizes */
  6.     mutex_lock(&slab_mutex);
  7.     list_for_each_entry(cachep, &slab_caches, list)
  8.         if (enable_cpucache(cachep, GFP_NOWAIT))
  9.             BUG();
  10.     mutex_unlock(&slab_mutex);

  11.     /* Annotate slab for lockdep -- annotate the malloc caches */
  12.     init_lock_keys();

  13.     /* */
  14.     slab_state = FULL;

  15.     /*
  16.      * Register a cpu startup notifier callback that initializes
  17.      * cpu_cache_get for all new cpus
  18.      */
  19.     register_cpu_notifier(&cpucache_notifier);

  20. #ifdef CONFIG_NUMA
  21.     /*
  22.      * Register a memory hotplug callback that initializes and frees
  23.      * nodelists.
  24.      */
  25.     hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
  26. #endif

  27.     /*
  28.      * The reap timers are started later, with a module init call: That part
  29.      * of the kernel is not yet operational.
  30.      */
  31. }
这个函数就是对 slab_caches 链表上的所有 cache 都调用一遍 enable_cpucache(cachep, GFP_NOWAIT)!

点击(此处)折叠或打开

  1. /* Called with slab_mutex held always */
  2. static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
  3. {
  4.     int err;
  5.     int limit = 0;
  6.     int shared = 0;
  7.     int batchcount = 0;

  8.     if (!is_root_cache(cachep)) {
  9.         struct kmem_cache *root = memcg_root_cache(cachep);
  10.         limit = root->limit;
  11.         shared = root->shared;
  12.         batchcount = root->batchcount;
  13.     }

  14.     if (limit && shared && batchcount)
  15.         goto skip_setup;
  16.     /*
  17.      * The head array serves three purposes:
  18.      * - create a LIFO ordering, i.e. return objects that are cache-warm
  19.      * - reduce the number of spinlock operations.
  20.      * - reduce the number of linked list operations on the slab and
  21.      * bufctl chains: array operations are cheaper.
  22.      * The numbers are guessed, we should auto-tune as described by
  23.      * Bonwick.
  24.      */
  25.     if (cachep->size > 131072)             // size 大于128K 时limit为1,size 越小limit越大
  26.         limit = 1;
  27.     else if (cachep->size > PAGE_SIZE)
  28.         limit = 8;
  29.     else if (cachep->size > 1024)
  30.         limit = 24;
  31.     else if (cachep->size > 256)
  32.         limit = 54;
  33.     else
  34.         limit = 120;

  35.     /*
  36.      * CPU bound tasks (e.g. network routing) can exhibit cpu bound
  37.      * allocation behaviour: Most allocs on one cpu, most free operations
  38.      * on another cpu. For these cases, an efficient object passing between
  39.      * cpus is necessary. This is provided by a shared array. The array
  40.      * replaces Bonwick's magazine layer.
  41.      * On uniprocessor, it's functionally equivalent (but less efficient)
  42.      * to a larger limit. Thus disabled by default.
  43.      */
  44.     shared = 0;
  45.     if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)   //  SMP 且 size 不超过 PAGE_SIZE 时 shared为8,否则(如单核)为0
  46.         shared = 8;

  47. #if DEBUG
  48.     /*
  49.      * With debugging enabled, large batchcount lead to excessively long
  50.      * periods with disabled local interrupts. Limit the batchcount
  51.      */
  52.     if (limit > 32)
  53.         limit = 32;
  54. #endif
  55.     batchcount = (limit + 1) / 2;
  56. skip_setup:
  57.     err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);     //设置 参数值到cache里 
  58.     if (err)
  59.         printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
  60.          cachep->name, -err);
  61.     return err;
  62. }
对,在这里我们看到了 limit、shared、batchcount 被重新初始化。接着看 do_tune_cpucache:

点击(此处)折叠或打开

  1. static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
  2.                 int batchcount, int shared, gfp_t gfp)
  3. {
  4.     int ret;
  5.     struct kmem_cache *c = NULL;
  6.     int i = 0;

  7.     ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);   // 设置传递进来的cache的东西

  8.     if (slab_state < FULL)
  9.         return ret;

  10.     if ((ret < 0) || !is_root_cache(cachep))
  11.         return ret;

  12.     VM_BUG_ON(!mutex_is_locked(&slab_mutex));
  13.     for_each_memcg_cache_index(i) {
  14.         c = cache_from_memcg(cachep, i);
  15.         if (c)
  16.             /* return value determined by the parent cache only */
  17.             __do_tune_cpucache(c, limit, batchcount, shared, gfp);
  18.     }

  19.     return ret;
  20. }
而具体实现在

点击(此处)折叠或打开

  1. /* Always called with the slab_mutex held */
  2. static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
  3.                 int batchcount, int shared, gfp_t gfp)
  4. {
  5.     struct ccupdate_struct *new;
  6.    // 说明一下上面的结构体
  7.    

    点击(此处)折叠或打开

    1. struct ccupdate_struct {
    2.     struct kmem_cache *cachep;
    3.     struct array_cache *new[0];
    4. };

  8.     int i;

  9.     new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),   //  这个函数用完new就释放了。说明它只是起到一个中转的作用.
  10.          gfp);
  11.     if (!new)
  12.         return -ENOMEM;

  13.     for_each_online_cpu(i) {
  14.         new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
  15.                         batchcount, gfp);
  16.         if (!new->new[i]) {
  17.             for (i--; i >= 0; i--)
  18.                 kfree(new->new[i]);
  19.             kfree(new);
  20.             return -ENOMEM;
  21.         }
  22.     }
  23.     new->cachep = cachep;

  24.     on_each_cpu(do_ccupdate_local, (void *)new, 1);   // 关键点: 每个cpu上都调用do_ccupdate_local处理new。

  25.     check_irq_on();
  26.     cachep->batchcount = batchcount;
  27.     cachep->limit = limit;
  28.     cachep->shared = shared;

  29.     for_each_online_cpu(i) {
  30.         struct array_cache *ccold = new->new[i];
  31.         if (!ccold)
  32.             continue;
  33.         spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
  34.         free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));         //
  35.         spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
  36.         kfree(ccold);
  37.     }
  38.     kfree(new);
  39.     return alloc_kmemlist(cachep, gfp);
  40. }
我们就看看do_ccupdate_local做了什么

点击(此处)折叠或打开

  1. static void do_ccupdate_local(void *info)
  2. {
  3.     struct ccupdate_struct *new = info;
  4.     struct array_cache *old;

  5.     check_irq_off();
  6.     old = cpu_cache_get(new->cachep);

  7.     new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];//  由于之前 new->cachep已经指向了我们的cache,所以这里是让我们cache的array指向新的array_cache.
  8.                        // 而new->new这个array的初始化是在申请它的时候 见上个函数里的alloc_arraycache
  9.         

    点击(此处)折叠或打开

    1. static struct array_cache *alloc_arraycache(int node, int entries,
    2.                      int batchcount, gfp_t gfp)
    3. {
    4.     int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    5.     struct array_cache *nc = NULL;

    6.     nc = kmalloc_node(memsize, gfp, node);
    7.     /*
    8.      * The array_cache structures contain pointers to free object.
    9.      * However, when such objects are allocated or transferred to another
    10.      * cache the pointers are not cleared and they could be counted as
    11.      * valid references during a kmemleak scan. Therefore, kmemleak must
    12.      * not scan such objects.
    13.      */
    14.     kmemleak_no_scan(nc);
    15.     if (nc) {
    16.         nc->avail = 0;
    17.         nc->limit = entries;
    18.         nc->batchcount = batchcount;
    19.         nc->touched = 0;
    20.         spin_lock_init(&nc->lock);
    21.     }
    22.     return nc;
    23. }


  10.     new->new[smp_processor_id()] = old;               
  11. }
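这样就和函数 cache_alloc_refill 接起来了:此时 ac->batchcount 已经不再是 1,而是 enable_cpucache 中算出的 (limit + 1) / 2。
我们可以看看实际内核(启用 slab 分配器)的 slabinfo 信息,其中 tunables 后面依次是 limit、batchcount、shared: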

点击(此处)折叠或打开

  1. cat /proc/slabinfo
  2. slabinfo - version: 2.1
  3. # name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
  4. nf_conntrack_expect 0 0 152 26 1 : tunables 120 60 8 : slabdata 0 0 0
  5. nf_conntrack_8050c5f0 2 26 296 13 1 : tunables 54 27 8 : slabdata 2 2 0

    bridge_fdb_cache       4     78     48   78    1 : tunables  120   60    8 : slabdata      1      1      0

    fib6_nodes            12    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0

    ip6_dst_cache         25     57    208   19    1 : tunables  120   60    8 : slabdata      3      3      0

    ip6_mrt_cache          0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0

    RAWv6                  8     15    720    5    1 : tunables   54   27    8 : slabdata      3      3      0

    UDPLITEv6              0      0    688   11    2 : tunables   54   27    8 : slabdata      0      0      0

    UDPv6                  3     22    688   11    2 : tunables   54   27    8 : slabdata      2      2      0

    tw_sock_TCPv6          0      0    144   27    1 : tunables  120   60    8 : slabdata      0      0      0

    request_sock_TCPv6      0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0

    TCPv6                  5      6   1328    3    1 : tunables   24   12    8 : slabdata      2      2      0

    ubi_wl_entry_slab    463    580     24  145    1 : tunables  120   60    8 : slabdata      4      4      0

    sd_ext_cdb             2    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0

    fuse_request           0      0    384   10    1 : tunables   54   27    8 : slabdata      0      0      0

    fuse_inode             0      0    416    9    1 : tunables   54   27    8 : slabdata      0      0      0

    jffs2_inode_cache     15    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0

    jffs2_node_frag      130    290     24  145    1 : tunables  120   60    8 : slabdata      2      2      0

    uid_cache              0      0     48   78    1 : tunables  120   60    8 : slabdata      0      0      0

    UNIX                  24     32    480    8    1 : tunables   54   27    8 : slabdata      4      4      0

    ip_mrt_cache           0      0     96   40    1 : tunables  120   60    8 : slabdata      0      0      0

    UDP-Lite               0      0    560    7    1 : tunables   54   27    8 : slabdata      0      0      0

    tcp_bind_bucket        6    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0

    inet_peer_cache        8     24    160   24    1 : tunables  120   60    8 : slabdata      1      1      0

    ip_fib_trie            7    113     32  113    1 : tunables  120   60    8 : slabdata      1      1      0

    ip_fib_alias           8    145     24  145    1 : tunables  120   60    8 : slabdata      1      1      0

    ip_dst_cache           6     27    144   27    1 : tunables  120   60    8 : slabdata      1      1      0

    PING                   0      0    528    7    1 : tunables   54   27    8 : slabdata      0      0      0

    RAW                    4      7    544    7    1 : tunables   54   27    8 : slabdata      1      1      0

    UDP                   13     14    560    7    1 : tunables   54   27    8 : slabdata      2      2      0

    tw_sock_TCP            0      0    112   35    1 : tunables  120   60    8 : slabdata      0      0      0

    request_sock_TCP       0      0     80   48    1 : tunables  120   60    8 : slabdata      0      0      0

    TCP                    1      6   1184    6    2 : tunables   24   12    8 : slabdata      1      1      0

  6. ......
  7. size-2048(DMA)         0      0   2048    2    1 : tunables   24   12    8 : slabdata      0      0      0

    size-2048            192    192   2048    2    1 : tunables   24   12    8 : slabdata     96     96      0

    size-1024(DMA)         0      0   1024    4    1 : tunables   54   27    8 : slabdata      0      0      0

    size-1024            215    216   1024    4    1 : tunables   54   27    8 : slabdata     54     54      0

    size-512(DMA)          0      0    512    8    1 : tunables   54   27    8 : slabdata      0      0      0

    size-512             601    624    512    8    1 : tunables   54   27    8 : slabdata     78     78      0

    size-256(DMA)          0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0

    size-256            1234   1245    256   15    1 : tunables  120   60    8 : slabdata     83     83      0

    size-192(DMA)          0      0    256   15    1 : tunables  120   60    8 : slabdata      0      0      0

    size-192             287    300    256   15    1 : tunables  120   60    8 : slabdata     20     20      0

    size-128(DMA)          0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0

    size-128            1890   1890    128   30    1 : tunables  120   60    8 : slabdata     63     63      0

    size-96(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0

    size-96              930    930    128   30    1 : tunables  120   60    8 : slabdata     31     31      0

    size-64(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0

    size-32(DMA)           0      0    128   30    1 : tunables  120   60    8 : slabdata      0      0      0

    size-64             1577   1650    128   30    1 : tunables  120   60    8 : slabdata     55     55      0

    size-32             6213   6300    128   30    1 : tunables  120   60    8 : slabdata    210    210      0

    kmem_cache           150    160     96   40    1 : tunables  120   60    8 : slabdata      4      4      0


或许你在 Ubuntu 系统上会发现 limit、batchcount 的值都为 0,那是因为它使用的是 slub 分配器。在 slub.c 中,kmem_cache_init_late 是个空函数:

点击(此处)折叠或打开

  1. void __init kmem_cache_init_late(void)
  2. {
  3. }
这里顺便说明一下 slab、slub、slob 的简单区别(具体如何实现请参考内核代码 slab.c / slub.c / slob.c):
slab 是 slub 和 slob 的基础。
SLOB 的目标是嵌入式系统,主要适用于内存非常有限的系统(比如内存在 32MB 以下)。它不太注重 large SMP 系统,虽然最近在这方面有一些小的改进。
SLUB allocator 用于替代 slab 代码。通过取消大量的队列和相关开销、简化 slab 的结构,SLUB 承诺提供更好的性能和更好的系统可伸缩性,同时保持现有的 slab 分配器接口。
说了这么多,我们用个图来简单描述下slab机制:


阅读(2879) | 评论(0) | 转发(1) |
给主人留下些什么吧!~~