Category: LINUX

2015-03-16 00:33:48

The previous article analyzed the free path of the buddy allocator; this one looks at how the buddy allocator handles memory allocation.

As with the free path, there is no sharply defined boundary marking one function as "the" entry point of the allocation path, so as usual the analysis starts a little higher up the call chain. The alloc_pages() macro is chosen as the starting point:

【file:/include/linux/gfp.h】
#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
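
Before diving deeper, a minimal usage sketch of this entry point may help (hypothetical driver-style code, not taken from the kernel sources analyzed here): allocate an order-2 block, i.e. four physically contiguous pages, and free it again.

#include <linux/gfp.h>
#include <linux/mm.h>       /* page_address() */
#include <linux/errno.h>

static struct page *pg;
static void *buf;

static int example_alloc(void)
{
    pg = alloc_pages(GFP_KERNEL, 2);   /* order 2: 2^2 = 4 contiguous pages */
    if (!pg)
        return -ENOMEM;
    buf = page_address(pg);            /* kernel virtual address (lowmem pages) */
    return 0;
}

static void example_free(void)
{
    if (pg)
        __free_pages(pg, 2);           /* order must match the allocation */
}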

And the implementation of alloc_pages_node():

【file:/include/linux/gfp.h】
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    /* Unknown node is current node */
    if (nid < 0)
        nid = numa_node_id();

    return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

When no NUMA node is explicitly specified for the allocation, the current node is used by default. The call then goes down into __alloc_pages() to perform the actual allocation, with node_zonelist() supplying the zone list of the chosen node. Next, the implementation of __alloc_pages():

【file:/include/linux/gfp.h】
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist)
{
    return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

It is simply a wrapper around __alloc_pages_nodemask(). The implementation of __alloc_pages_nodemask():

【file:/mm/page_alloc.c】
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zone *preferred_zone;
    struct page *page = NULL;
    int migratetype = allocflags_to_migratetype(gfp_mask);
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
    struct mem_cgroup *memcg = NULL;

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    /*
     * Will only have any effect when __GFP_KMEMCG is set. This is
     * verified in the (always inline) callee
     */
    if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
        return NULL;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx,
                nodemask ? : &cpuset_current_mems_allowed,
                &preferred_zone);
    if (!preferred_zone)
        goto out;

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
retry:
    /* First allocation attempt */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * The first pass makes sure allocations are spread
         * fairly within the local node. However, the local
         * node might have free pages left after the fairness
         * batches are exhausted, and remote zones haven't
         * even been considered yet. Try once more without
         * fairness, and include remote zones now, before
         * entering the slowpath and waking kswapd: prefer
         * spilling to a remote zone over swapping locally.
         */
        if (alloc_flags & ALLOC_FAIR) {
            reset_alloc_batches(zonelist, high_zoneidx,
                        preferred_zone);
            alloc_flags &= ~ALLOC_FAIR;
            goto retry;
        }
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);

    return page;
}

This is the heart of the zoned buddy allocator; after much winding through wrappers, we have finally arrived.

A few notes on the helpers used here. lockdep_trace_alloc() only does real work when both CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are enabled; otherwise it is an empty function. If the gfp_mask passed in carries __GFP_WAIT, meaning the allocation is allowed to sleep, might_sleep_if() checks whether the caller should sleep and be rescheduled. Since CONFIG_FAIL_PAGE_ALLOC is not set here, should_fail_alloc_page() always returns false. The check if (unlikely(!zonelist->_zonerefs->zone)) verifies that the zonelist for this allocation contains at least one valid zone. memcg_kmem_newpage_charge() and memcg_kmem_commit_charge() are related to control groups (cgroups). get_mems_allowed() wraps read_seqcount_begin() to obtain the sequence number for the current read access to the seqcount-protected shared data, so that failures caused by concurrent updates can be detected and retried; its paired operation is put_mems_allowed(). first_zones_zonelist() uses the nodemask to find a suitable preferred_zone whose zone index is no greater than high_zoneidx. Finally, allocflags_to_migratetype() converts the GFP flags into the corresponding migrate type.
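
As an aside, get_mems_allowed()/put_mems_allowed() are built on the generic seqcount read/retry pattern. Below is a minimal sketch of that pattern (the seqcount variable and helper name are made up for illustration; only read_seqcount_begin()/read_seqcount_retry() are real kernel APIs):

#include <linux/seqlock.h>
#include <linux/nodemask.h>

static seqcount_t mems_seq;                     /* hypothetical seqcount protecting the mask */

static nodemask_t read_mems_snapshot(const nodemask_t *shared)
{
    nodemask_t snapshot;
    unsigned int seq;

    do {
        seq = read_seqcount_begin(&mems_seq);        /* begin read side, take the cookie */
        snapshot = *shared;                          /* copy the protected data */
    } while (read_seqcount_retry(&mems_seq, seq));   /* a writer ran concurrently: read again */

    return snapshot;
}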

The key functions that __alloc_pages_nodemask() uses to actually allocate pages are get_page_from_freelist() and __alloc_pages_slowpath(). get_page_from_freelist() is tried first; only if that attempt fails is __alloc_pages_slowpath() called. __alloc_pages_slowpath() is the slow allocation path, which is allowed to wait and to trigger memory reclaim. Because it involves other memory-management mechanisms, it will not be analyzed in depth here.

So, to finish, the implementation of get_page_from_freelist():

【file:/mm/page_alloc.c】
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0; /* set if using zonelist_cache */
    int did_zlc_setup = 0; /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        unsigned long mark;

        if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;
        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))
                continue;
        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
            goto try_this_zone;
        /*
         * Distribute pages in proportion to the individual
         * zone size to ensure fair page aging. The zone a
         * page was allocated in should have no effect on the
         * time the page has in memory before being reclaimed.
         */
        if (alloc_flags & ALLOC_FAIR) {
            if (!zone_local(preferred_zone, zone))
                continue;
            if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                continue;
        }
        /*
         * When allocating a page cache page for writing, we
         * want to get it from a zone that is within its dirty
         * limit, such that no single zone holds more than its
         * proportional share of globally allowed dirty pages.
         * The dirty limits take into account the zone's
         * lowmem reserves and high watermark so that kswapd
         * should be able to balance it without having to
         * write pages from its LRU list.
         *
         * This may look like it could increase pressure on
         * lower zones by failing allocations in higher zones
         * before they are full. But the pages that do spill
         * over are limited as the lower zones are protected
         * by this very same mechanism. It should not become
         * a practical burden to them.
         *
         * XXX: For now, allow allocations to potentially
         * exceed the per-zone dirty limit in the slowpath
         * (ALLOC_WMARK_LOW unset) before going into reclaim,
         * which is important when on a NUMA setup the allowed
         * zones are together not big enough to reach the
         * global limit. The proper fix for these situations
         * will require awareness of zones in the
         * dirty-throttling and the flusher threads.
         */
        if ((alloc_flags & ALLOC_WMARK_LOW) &&
            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
            goto this_zone_full;

        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        if (!zone_watermark_ok(zone, order, mark,
                       classzone_idx, alloc_flags)) {
            int ret;

            if (IS_ENABLED(CONFIG_NUMA) &&
                    !did_zlc_setup && nr_online_nodes > 1) {
                /*
                 * we do zlc_setup if there are multiple nodes
                 * and before considering the first zone allowed
                 * by the cpuset.
                 */
                allowednodes = zlc_setup(zonelist, alloc_flags);
                zlc_active = 1;
                did_zlc_setup = 1;
            }

            if (zone_reclaim_mode == 0 ||
                !zone_allows_reclaim(preferred_zone, zone))
                goto this_zone_full;

            /*
             * As we may have just activated ZLC, check if the first
             * eligible zone has failed zone_reclaim recently.
             */
            if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

try_this_zone:
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA))
            zlc_mark_zone_full(zonelist, z);
    }

    if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }

    if (page)
        /*
         * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
         * necessary to allocate the page. The expectation is
         * that the caller is taking steps that will free more
         * memory. The caller should avoid the page being used
         * for !PFMEMALLOC purposes.
         */
        page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

    return page;
}

This function walks the zonelist and tries each zone in turn for the allocation. for_each_zone_zonelist_nodemask() drives the iteration. Before attempting to allocate from a zone, it checks whether the zone has allocatable memory, whether alloc_flags (ALLOC_CPUSET) permits the current task's cpuset to allocate from that zone, and whether the zone passes the watermark check, i.e. still has enough free memory. These checks will be analyzed in detail later; for now we stay focused on the buddy allocator itself.
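
To make the watermark check more concrete, here is a simplified sketch of the idea behind zone_watermark_ok() (not the kernel's exact code, which also adjusts for ALLOC_HIGH/ALLOC_HARDER and CMA pages): the zone must keep 'mark' pages plus its lowmem reserve free, and enough of the remaining free memory must still sit in blocks of at least the requested order.

#include <linux/mmzone.h>
#include <linux/vmstat.h>

static bool watermark_ok_sketch(struct zone *z, unsigned int order,
                                unsigned long mark, int classzone_idx)
{
    long free = zone_page_state(z, NR_FREE_PAGES);
    long min  = mark + z->lowmem_reserve[classzone_idx];
    unsigned int o;

    if (free - (1 << order) < min)
        return false;                         /* not enough free pages overall */

    for (o = 0; o < order; o++) {
        free -= z->free_area[o].nr_free << o; /* blocks too small to satisfy the request */
        min >>= 1;                            /* relax the bar for higher orders */
        if (free <= min)
            return false;
    }
    return true;
}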

It is not hard to see that the function which actually allocates the page is buffered_rmqueue(). Its implementation:

【file:/mm/page_alloc.c】
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);

again:
    if (likely(order == 0)) {
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];
        if (list_empty(list)) {
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_pageblock_migratetype(page));
    }

    __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON_PAGE(bad_range(zone, page), page);
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}

The branch if (likely(order == 0)) handles order-0 requests, i.e. a single page: the allocator first tries the per-CPU hot/cold page lists; if the relevant list is empty, rmqueue_bulk() is called to refill it from the buddy system, and the page is then taken from that list. Requests for more than one page go through __rmqueue(), which allocates directly from the buddy system.
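
For reference, the per-CPU cache that the order-0 path draws from is struct per_cpu_pages; in this kernel version it looks roughly like the following (paraphrased from include/linux/mmzone.h, comments mine). Hot pages are taken from the head of the list (list->next) and cold pages from the tail (list->prev), which is exactly what the cold test above selects.

struct per_cpu_pages {
    int count;      /* number of pages currently on the lists */
    int high;       /* high watermark: drain back to the buddy system above this */
    int batch;      /* chunk size when refilling from / returning to the buddy system */

    /* one list of cached pages per migrate type */
    struct list_head lists[MIGRATE_PCPTYPES];
};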

The implementation of __rmqueue():

【file:/mm/page_alloc.c】
/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    page = __rmqueue_smallest(zone, order, migratetype);

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
        page = __rmqueue_fallback(zone, order, migratetype);

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */
        if (!page) {
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve;
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}

Two key helpers are called inside it: __rmqueue_smallest() and __rmqueue_fallback().

First, a look at __rmqueue_smallest():

【file:/mm/page_alloc.c】
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype]))
            continue;

        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page);
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);
        return page;
    }

    return NULL;
}

This function implements the core of the allocation algorithm. The for() loop starts at the requested order of the buddy free lists: if the free list at that order is not empty, a free block is taken directly off it with list_del() to satisfy the request. If the list is empty, the search moves up one order at a time until a non-empty list is found; if even the highest order is empty, the allocation fails. Once a block is found at a higher order, it is removed from its list with list_del() and then split in half by expand(): each halving hangs the unused half on the free list one order lower, and the splitting continues until the block is exactly the requested order. The resulting block is then returned, and at this point the page has been allocated.
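
Below is a simplified sketch of what expand() does, based on the behaviour just described (the real function in mm/page_alloc.c additionally handles debug guard pages under CONFIG_DEBUG_PAGEALLOC):

/*
 * Split a block of order 'high' down to order 'low'. Each halving puts
 * the upper half back on the free list one order below, so only the
 * exact size requested leaves the buddy system.
 */
static void expand_sketch(struct zone *zone, struct page *page,
                          int low, int high, struct free_area *area,
                          int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;                 /* free_area of the next lower order */
        high--;
        size >>= 1;             /* the block is halved */

        /* hang the second half on the lower order's free list */
        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;

        /* set_page_order() is page_alloc.c's internal helper that records
         * the order in page->private and marks the page as a buddy block */
        set_page_order(&page[size], high);
    }
}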

As for __rmqueue_fallback():

【file:/mm/page_alloc.c】
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area *area;
    int current_order;
    struct page *page;
    int migratetype, new_type, i;

    /* Find the largest possible block of pages in the other list */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) {
        for (i = 0;; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE)
                break;

            area = &(zone->free_area[current_order]);
            if (list_empty(&area->free_list[migratetype]))
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru);
            area->nr_free--;

            new_type = try_to_steal_freepages(zone, page,
                              start_migratetype,
                              migratetype);

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page);

            expand(zone, page, order, current_order, area,
                   new_type);

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype, new_type);

            return page;
        }
    }

    return NULL;
}

Its job is to steal memory from other migrate types. Unlike the normal buddy search, when falling back to another migrate type the search starts from the highest order and works downwards: taking from a large block helps reduce fragmentation. If all fallback attempts fail to produce a page, the allocation finally falls back to the MIGRATE_RESERVE list. This part is not explored in depth here and will be analyzed in detail later.
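
For context, the fallbacks[] table consulted above defines, for each migrate type, which other types it is allowed to steal from and in what order. In this kernel version it looks roughly like this (non-CMA configuration, entries slightly abridged, so the exact contents may differ):

static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },   /* never used as a fallback source */
};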

With that, the allocation side of the buddy allocator is, for now, fully analyzed.

