一、Linux伙伴系统分配器
伙伴系统分配器大体上分为两类。__get_free_pages()类函数返回分配的第一个页面的线性地址;alloc_pages()类函数返回页面描述符地址。不管以哪种函数进行分配,最终会调用alloc_pages()进行分配页面。
为清楚了解其分配制度,先给个伙伴系统数据的存储框图
也就是每个order对应一个free_area结构,free_area以不同的类型以链表的方式存储这些内存块。
二、主分配函数
下面我们来看这个函数(在UMA模式下)
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order) /* forwards to alloc_pages_node() using the node id returned by numa_node_id() */
- /* alloc_pages_node - look up the zonelist of node nid for gfp_mask and pass the request to __alloc_pages(). */
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- {
- /* A negative nid is replaced by the value of numa_node_id(). */
- if (nid < 0)
- nid = numa_node_id();
-
- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
- }
- static inline struct page *
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
- {
- return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); /* same request, passed on with a NULL nodemask */
- }
上层分配函数__alloc_pages_nodemask()
- /* __alloc_pages_nodemask - common entry point reached from alloc_pages() in this listing. */
- /* Makes one attempt with get_page_from_freelist() and, if that yields no page, */
- /* hands the request to __alloc_pages_slowpath(). */
- /* Returns NULL when an early check fails; otherwise the page from the fast path or the slow path's result. */
- struct page *
- __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
- {
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
- struct page *page;
-
- /* The migrate type is derived from the gfp flags before they are masked below. */
- int migratetype = allocflags_to_migratetype(gfp_mask);
- /* Keep only the gfp bits that are also set in gfp_allowed_mask. */
- gfp_mask &= gfp_allowed_mask;
-
- lockdep_trace_alloc(gfp_mask);
-
- might_sleep_if(gfp_mask & __GFP_WAIT);
-
- if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
-
-
-
-
-
- /* Nothing to allocate from if the zonelist's first zoneref has no zone. */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
-
-
- /* first_zones_zonelist() fills in preferred_zone; give up if it found none. */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
- return NULL;
-
-
- /* First attempt: __GFP_HARDWALL added to the mask, ALLOC_WMARK_LOW and ALLOC_CPUSET as alloc flags. */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
- if (unlikely(!page))
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
-
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
- return page;
- }
三、从pcp和伙伴系统中正常的分配内存空间
函数get_page_from_freelist()
-
- /* get_page_from_freelist - walk the zonelist and try to take a page from each candidate zone. */
- /* A zone is tried when it passes zone_watermark_ok() (directly or after zone_reclaim()), */
- /* or unconditionally when ALLOC_NO_WATERMARKS is set; buffered_rmqueue() does the removal. */
- /* Returns the page, or NULL if no zone produced one. */
- static struct page *
- get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
- {
- struct zoneref *z;
- struct page *page = NULL;
- int classzone_idx;
- struct zone *zone;
- nodemask_t *allowednodes = NULL;
- int zlc_active = 0;
- int did_zlc_setup = 0;
-
- classzone_idx = zone_idx(preferred_zone);
- zonelist_scan:
-
-
-
-
- /* Scan the zones selected by high_zoneidx and nodemask. */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
-
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- /* The watermark checks are skipped entirely when ALLOC_NO_WATERMARKS is set. */
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- int ret;
- /* alloc_flags & ALLOC_WMARK_MASK selects which zone->watermark[] entry to test against. */
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
-
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
- /* Watermark failed and zone_reclaim_mode is 0: treat the zone as full. */
- if (zone_reclaim_mode == 0)
- goto this_zone_full;
- /* Otherwise try zone_reclaim() on this zone and act on its result. */
- ret = zone_reclaim(zone, gfp_mask, order);
- switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
- /* NOSCAN: move on without marking the zone full. */
- goto try_next_zone;
- case ZONE_RECLAIM_FULL:
- /* FULL: mark the zone full and move on. */
- goto this_zone_full;
- default:
- /* Any other result: re-check the watermark before using the zone. */
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto this_zone_full;
- }
- }
-
- try_this_zone:
- /* Take the page(s) from this zone; stop scanning on success. */
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
- this_zone_full:
- if (NUMA_BUILD)
- zlc_mark_zone_full(zonelist, z);
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-
- /* NUMA builds with more than one online node: call zlc_setup() once per invocation */
- /* (guarded by did_zlc_setup) so that later iterations can skip zones through */
- /* zlc_zone_worth_trying(). */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- }
-
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
- /* Nothing found while zlc filtering was active: rescan once with it switched off. */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;
- }
主分配函数
- /* buffered_rmqueue - take one block of 2^order pages from a zone. */
- /* order == 0 is served from the current CPU's per-cpu page list (refilled through */
- /* rmqueue_bulk() when it is empty); larger orders go to __rmqueue() under zone->lock. */
- /* preferred_zone is only used for zone_statistics(). */
- /* Returns the page once prep_new_page() accepts it, or NULL when the zone cannot supply one. */
- static inline
- struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
- {
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
-
- again:
- cpu = get_cpu();
- if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
- struct list_head *list;
-
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];
-
- /* Interrupts stay disabled until local_irq_restore() on the exit paths below. */
- local_irq_save(flags);
- if (list_empty(list)) {
- /* Per-cpu list is empty: refill it with up to pcp->batch pages from the buddy lists. */
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
-
- if (unlikely(list_empty(list)))
- goto failed;
- }
-
- /* __GFP_COLD requests take from the tail of the list, all others from the head. */
- if (cold)
- page = list_entry(list->prev, struct page, lru);
- else
- page = list_entry(list->next, struct page, lru);
-
- list_del(&page->lru);
- pcp->count--;
- }
- else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /* __GFP_NOFAIL combined with order > 1 triggers a one-time warning. */
- WARN_ON_ONCE(order > 1);
- }
-
- spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
- spin_unlock(&zone->lock);
- if (!page)
- goto failed;
- /* Adjust NR_FREE_PAGES only after a successful __rmqueue(): decrementing it */
- /* before the NULL check made the counter drop on failed allocations too. */
- /* Interrupts are still disabled here (restored below), as on the order-0 path. */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
- }
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
- put_cpu();
-
- VM_BUG_ON(bad_range(zone, page));
-
- /* A non-zero return from prep_new_page() rejects this page: start over. */
- if (prep_new_page(page, order, gfp_flags))
- goto again;
- return page;
-
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }
3.1 pcp缓存补充
rmqueue_bulk()函数从伙伴系统中一次取出batch个页面补充到pcp缓存中,batch为每次补充的页面数。
-
-
-
-
-
- /* rmqueue_bulk - take up to count blocks of the given order from the buddy lists */
- /* and link them onto list, all under a single hold of zone->lock. */
- /* Each page's private field is set to migratetype. */
- /* Returns the number of blocks actually obtained (may be fewer than count). */
- static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list,
- int migratetype, int cold)
- {
- int i;
-
- spin_lock(&zone->lock);
- for (i = 0; i < count; ++i) {
- /* Stop early as soon as __rmqueue() has nothing left to give. */
- struct page *page = __rmqueue(zone, order, migratetype);
- if (unlikely(page == NULL))
- break;
-
-
-
-
-
-
-
-
- /* list is advanced to the page just inserted (see below), so with list_add() */
- /* the next page lands directly after the previous one. */
- if (likely(cold == 0))
- list_add(&page->lru, list);
- else
- list_add_tail(&page->lru, list);
- set_page_private(page, migratetype);
- list = &page->lru;
- }
- /* One counter update for everything taken: i blocks of 2^order pages each. */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock(&zone->lock);
- return i;
- }
3.2 从伙伴系统中取出页面
__rmqueue()函数
-
- /* __rmqueue - remove one block of the given order from the zone's buddy lists. */
- /* Order of attempts: the requested migratetype, then the fallback types, */
- /* and finally MIGRATE_RESERVE. */
- /* Returns the first page of the block, or NULL if every attempt failed. */
- static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
- {
- struct page *page;
-
- retry_reserve:
- /* Try the free lists of the current migratetype first. */
- page = __rmqueue_smallest(zone, order, migratetype);
-
-
-
-
-
-
- /* Nothing found and we are not already on the MIGRATE_RESERVE retry. */
- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- /* Borrow from another migrate type's lists. */
- page = __rmqueue_fallback(zone, order, migratetype);
-
-
-
-
-
- /* Fallback failed too: retry once from the top with MIGRATE_RESERVE. */
- if (!page) {
- migratetype = MIGRATE_RESERVE;
- goto retry_reserve;
- }
- }
-
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- return page;
- }
3.2.1 从指定的迁移类型链表中分配页面
从指定order开始从小到大遍历,优先从指定的迁移类型链表中分配页面:__rmqueue_smallest(zone, order, migratetype);
-
-
-
- /* __rmqueue_smallest - scan zone->free_area[] from order upwards and take the first */
- /* block found on the migratetype list; a larger block is split back down with expand(). */
- /* Returns the first page of the block, or NULL if all lists up to MAX_ORDER-1 are empty. */
- static inline
- struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
- int migratetype)
- {
- unsigned int current_order;
- struct free_area * area;
- struct page *page;
-
- /* Smallest sufficient order first. */
- for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = &(zone->free_area[current_order]);
-
- if (list_empty(&area->free_list[migratetype]))
- continue;
- /* Detach the first block of this list and clear its order marker. */
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
- /* Give back whatever exceeds the requested order. */
- expand(zone, page, order, current_order, area, migratetype);
- return page;
- }
-
- return NULL;
- }
伙伴系统内存块拆分和合并
看一个辅助函数,用于伙伴系统中内存块的拆分、合并
-
-
-
-
-
-
-
-
-
-
-
-
- /* expand - split a block of order high down to order low. */
- /* page is the first page of the block and area must be the free_area entry for order high. */
- /* Each step halves the block and puts the upper half (starting at &page[size]) on the */
- /* next lower order's migratetype list; the first 2^low pages stay with the caller. */
- /* Does nothing when high == low. */
- static inline void expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area,
- int migratetype)
- {
- unsigned long size = 1 << high;
-
- while (high > low) {
- /* Step down one order: previous free_area entry, half the size. */
- area--;
- high--;
- size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
- /* The upper half becomes a free block of the new (lower) order. */
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
- set_page_order(&page[size], high);
- }
- }
3.2.2 从备用链表中分配页面
- /* __rmqueue_fallback - take a block from another migrate type's free lists, searching from the largest order down; returns NULL if none is found. */
- static inline struct page *
- __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
- {
- struct free_area * area;
- int current_order;
- struct page *page;
- int migratetype, i;
-
-
-
-
- /* Largest order first, down to the requested order. */
- for (current_order = MAX_ORDER-1; current_order >= order;
- --current_order) {
- for (i = 0; i < MIGRATE_TYPES - 1; i++) {
- /* i-th fallback type for the requested migrate type (see fallbacks[]). */
- migratetype = fallbacks[start_migratetype][i];
-
-
-
-
- /* MIGRATE_RESERVE is skipped here; __rmqueue() retries with it separately. */
- if (migratetype == MIGRATE_RESERVE)
- continue;
-
- area = &(zone->free_area[current_order]);
-
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
-
-
-
-
- /* If the block is at least pageblock_order/2 in order, the request is MIGRATE_RECLAIMABLE, */
- /* or grouping by mobility is disabled, move the pageblock's free pages to the requested type. */
- if (unlikely(current_order >= (pageblock_order >> 1)) ||
-
- start_migratetype == MIGRATE_RECLAIMABLE ||
-
- page_group_by_mobility_disabled) {
-
- unsigned long pages;
-
- pages = move_freepages_block(zone, page,
- start_migratetype);
-
-
-
-
- /* Retag the pageblock when enough pages moved (1 << (pageblock_order-1), presumably half a pageblock) or grouping is disabled. */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
-
- set_pageblock_migratetype(page,
- start_migratetype);
- /* From here on the split-off remainders go onto the requested type's lists. */
- migratetype = start_migratetype;
- }
-
- /* Detach the chosen block from its free list and clear its order marker. */
- list_del(&page->lru);
- rmv_page_order(page);
-
- /* Block is at least pageblock_order in size: also call change_pageblock_range() for it. */
- if (current_order >= pageblock_order)
-
- change_pageblock_range(page, current_order,
- start_migratetype);
-
- expand(zone, page, order, current_order, area, migratetype);
-
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
-
- return page;
- }
- }
-
- return NULL;
- }
备用链表
-
-
- /* fallbacks - fallback order per migrate type. */
- /* fallbacks[t][i] is the i-th migrate type __rmqueue_fallback() tries when the */
- /* lists of type t cannot satisfy a request. MIGRATE_RESERVE entries are skipped */
- /* by __rmqueue_fallback(), so they act as padding / end markers here. */
- static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE },
- };
移动到指定类型的伙伴系统中
-
-
- /* move_freepages_block - move the free pages of the pageblock containing page */
- /* onto the free lists of migratetype, via move_freepages(). */
- /* Returns the number of pages moved, or 0 when the block's last pfn lies at or */
- /* beyond the end of the zone. */
- static int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
- {
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
-
- /* Round the pfn down to a pageblock_nr_pages boundary (the mask assumes a power of two). */
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
- end_pfn = start_pfn + pageblock_nr_pages - 1;
-
- /* Block starts before the zone: begin at the given page instead. */
- if (start_pfn < zone->zone_start_pfn)
- start_page = page;
- /* Block ends at or past the zone's end: move nothing. */
- if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
- return 0;
-
- return move_freepages(zone, start_page, end_page, migratetype);
- }
-
-
-
- /* move_freepages - walk the pages from start_page to end_page (inclusive) and re-link */
- /* every PageBuddy block found onto the migratetype free list of the same order. */
- /* Only list membership changes; nr_free is not touched here. */
- /* Returns the number of pages moved. */
- static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
- int migratetype)
- {
- struct page *page;
- unsigned long order;
- int pages_moved = 0;
-
- #ifndef CONFIG_HOLES_IN_ZONE
-
-
-
-
-
-
- /* Without CONFIG_HOLES_IN_ZONE, insist that both ends of the range are in the same zone. */
- BUG_ON(page_zone(start_page) != page_zone(end_page));
- #endif
-
- for (page = start_page; page <= end_page;) {
- /* Every page visited must belong to this zone's node. */
- VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
- /* Skip page frames that pfn_valid_within() rejects. */
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
- continue;
- }
- /* Pages without PageBuddy set are stepped over one at a time. */
- if (!PageBuddy(page)) {
- page++;
- continue;
- }
-
- order = page_order(page);
- list_del(&page->lru);
-
- /* Re-link the block onto the target type's list, then jump past the whole block. */
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- page += 1 << order;
- pages_moved += 1 << order;
- }
-
- return pages_moved;
- }
四、慢速分配,允许等待和回收
- /* __alloc_pages_slowpath - fallback used when the first get_page_from_freelist() attempt fails. */
- /* Wakes kswapd, retries with the flags from gfp_to_alloc_flags(), then (if the caller set */
- /* __GFP_WAIT) tries direct reclaim and the OOM path, looping while should_alloc_retry() allows. */
- /* Returns the page, or NULL after optionally printing an allocation-failure warning. */
- /* Fixed relative to the listing: two "if (page))" typos and a stray ")" line that made */
- /* the function unparseable; the control flow itself is unchanged. */
- static inline struct page *
- __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
- {
- const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct page *page = NULL;
- int alloc_flags;
- unsigned long pages_reclaimed = 0;
- unsigned long did_some_progress;
- struct task_struct *p = current;
-
- /* Orders of MAX_ORDER and above can never be satisfied: fail at once, */
- /* warning once unless the caller passed __GFP_NOWARN. */
- if (order >= MAX_ORDER) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
- return NULL;
- }
-
- /* On NUMA builds a request carrying all GFP_THISNODE bits skips the slow path entirely. */
- if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
- goto nopage;
-
- restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
-
- /* Allocation flags for the retries are derived from the gfp mask. */
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
-
- /* Retry with the new flags, but still honouring watermarks on this attempt. */
- page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
- if (page)
- goto got_pg;
-
- rebalance:
- /* Callers entitled to ignore watermarks get a dedicated attempt. */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- if (page)
- goto got_pg;
- }
-
- /* Callers without __GFP_WAIT stop here. */
- if (!wait)
- goto nopage;
-
- /* Tasks flagged PF_MEMALLOC do not go on to direct reclaim. */
- if (p->flags & PF_MEMALLOC)
- goto nopage;
-
- /* Neither do threads flagged TIF_MEMDIE, unless the request is __GFP_NOFAIL. */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
-
- /* Direct reclaim; did_some_progress is filled in by the callee. */
- page = __alloc_pages_direct_reclaim(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
- if (page)
- goto got_pg;
-
- /* Reclaim reported no progress at all. */
- if (!did_some_progress) {
- /* Only requests with __GFP_FS and without __GFP_NORETRY take the OOM path. */
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- if (oom_killer_disabled)
- goto nopage;
-
- page = __alloc_pages_may_oom(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask, preferred_zone,
- migratetype);
- if (page)
- goto got_pg;
-
- /* Orders above PAGE_ALLOC_COSTLY_ORDER without __GFP_NOFAIL give up */
- /* here instead of restarting. */
- if (order > PAGE_ALLOC_COSTLY_ORDER &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
-
- goto restart;
- }
- }
-
- /* Accumulate progress and let should_alloc_retry() decide whether to loop. */
- pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
- /* Wait (up to HZ/50) before rebalancing again. */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto rebalance;
- }
-
- nopage:
- /* Rate-limited failure report, suppressed by __GFP_NOWARN. */
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- printk(KERN_WARNING "%s: page allocation failure."
- " order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
- dump_stack();
- show_mem();
- }
- return page;
- got_pg:
- /* Notify kmemcheck about the new pages when it is enabled. */
- if (kmemcheck_enabled)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
-
- }
总结:Linux伙伴系统主要分配流程为
正常分配(或叫快速分配)流程:
1,如果分配的是单个页面,考虑从per CPU缓存中分配空间,如果缓存中没有页面,从伙伴系统中提取页面做补充。
2,分配多个页面时,从指定类型中分配,如果指定类型中没有足够的页面,从备用类型链表中分配。最后会试探保留类型链表。
慢速(允许等待和页面回收)分配:
3,当上面两种分配方案都不能满足要求时,考虑页面回收、杀死进程等操作后再试。
阅读(5275) | 评论(0) | 转发(0) |