关于伙伴系统算法的原理还是比较好理解的,这里不作赘述。直接看下关键数据结构。
-
struct zone {
~~snip
	/* One free-list bucket per allocation order; MAX_ORDER is 11 on this platform. */
	struct free_area free_area[MAX_ORDER];
~~snip
};
可以看到每个zone都有它自己的free_area,效果如下图:
相关信息可以从/proc/buddyinfo读取到,各个值含义依次为节点号, zone类型,后面的值表示从0阶开始各个阶空闲的页数:
-
#cat/proc/buddyinfo
-
Node0, zone Normal 70 27
-
33 148 47 13 7 2
-
2 1 1
-
Node0, zone HighMem 0 1 1 2 2 0 0 0
-
0 0 0
-
struct free_area {
	/* Free blocks of this order, one list per migrate type. */
	struct list_head free_list[MIGRATE_TYPES];
	/* Number of free blocks currently on the lists of this order. */
	unsigned long nr_free;
};
效果如下图:
MIGRATE_TYPES是作为反碎片的一种机制,专有名叫迁移类型。大概的原理就是将伙伴系统的内存页分为几种类型,有可移动,不可移动,可回收等,同一类型的页放在一个区域,如不可回收的页不能放在可移动类型区域,这样对可以移动区域,伙伴系统就可以回收了。
-
enum {
	MIGRATE_UNMOVABLE,	/* fixed in memory; most core kernel allocations */
	MIGRATE_RECLAIMABLE,	/* cannot move, but can be reclaimed (kswapd) */
	MIGRATE_MOVABLE,	/* movable via page tables; user-space pages */
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_RESERVE = MIGRATE_PCPTYPES,	/* emergency reserve for low-memory situations */
#ifdef CONFIG_CMA
	MIGRATE_CMA,		/* contiguous memory allocator pages */
#endif
	MIGRATE_ISOLATE,	/* can't allocate from here */
	MIGRATE_TYPES
};
MIGRATE_UNMOVABLE:不可移动页,在内存中有固定位置,不能移动。核心内核分配的大部分内存属于此类。
MIGRATE_RECLAIMABLE:可回收页,不能移动,但能删除。Kswapd内核线程会操作此区域。
MIGRATE_MOVABLE:可移动又可回收页,用户空间程序使用此类,通过页表映射实现,如果应用程序虚拟地址空间有变化,只要变化页表就可以了。
MIGRATE_RESERVE: 当系统内存相当少而且比较紧急时,才用到此区域。
MIGRATE_CMA:这个是为了避免预留大块内存实现的,当需要大块内存的时候如audio/camera等,它可以被使用;当小内存申请需要时,它也可以被使用,避免了pmem/ion的弊端,不过似乎要基于DMA。后面打算用一篇文章来分析cma.
MIGRATE_ISOLATE: 用于隔离页面块(如内存热插拔、页面迁移等场景),不能从该类型分配页面,一般情况下不用管它。
当某个迁移类型的内存不足时,会向另外一个迁移类型去要内存。这个跟zone的申请机制很像!下面结构规定了当前迁移类型不够时下一个使用的类型,如MIGRATE_UNMOVABLE的使用顺序是: MIGRATE_RECLAIMABLE -> MIGRATE_MOVABLE -> MIGRATE_RESERVE.
-
/*
 * Fallback order tried when a migrate type runs out of free pages,
 * e.g. UNMOVABLE falls back to RECLAIMABLE -> MOVABLE -> RESERVE.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
	[MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
#else
	[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
	[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
	[MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
};
迁移类型的信息可以从/proc/pagetypeinfo读到:
-
#cat /proc/pagetypeinfo
-
Page block order: 10
-
Pages per block: 1024
-
-
Free pages count per migrate type at order 0 1 2 3 4
-
5 6 7 8 9 10
-
Node 0, zone Normal, type Unmovable 1 1 2 0 1
-
0 1 1 1 0 0
-
Node 0, zone Normal, type Reclaimable 8 18 3 0 0
-
0 0 0 0 0 0
-
Node 0, zone Normal, type Movable 1 1 0 115 46
-
12 5 0 0 0 0
-
Node 0, zone Normal, type Reserve 0 0 0 1 0
-
1 1 1 1 1 1
-
Node 0, zone Normal, type Isolate 0 0 0 0 0
-
0 0 0 0 0 0
-
Node 0, zone HighMem, type Unmovable 0 0 0 0 0
-
0 0 0 0 0 0
-
Node 0, zone HighMem, type Reclaimable 0 0 0 0 0
-
0 0 0 0 0 0
-
Node 0, zone HighMem, type Movable 0 1 0 0 0
-
0 0 0 0 0 0
-
Node 0, zone HighMem, type Reserve 0 0 1 2 2
-
0 0 0 0 0 0
-
Node 0, zone HighMem, type Isolate 0 0 0 0 0
-
0 0 0 0 0 0
-
-
Number of blocks type Unmovable Reclaimable Movable Reserve
-
Isolate
-
Node 0, zone Normal 13 8 178 2
-
0
-
Node 0, zone HighMem 1 0 14 1
-
0
初始化:
首先对伙伴系统相关数据结构初始化,有如下调用流程:
start_kernel -> setup_arch ->paging_init -> bootmem_init -> arm_bootmem_free ->
free_area_init_node ->init_currently_empty_zone -> zone_init_free_lists
-
/*
 * Reset every buddy free list of @zone: for each (order, migrate type)
 * pair the list becomes empty and the per-order free block count is zero.
 */
static void __meminit zone_init_free_lists(struct zone *zone)
{
	int ord, mt;

	for (ord = 0; ord < MAX_ORDER; ord++) {
		zone->free_area[ord].nr_free = 0;
		for (mt = 0; mt < MIGRATE_TYPES; mt++)
			INIT_LIST_HEAD(&zone->free_area[ord].free_list[mt]);
	}
}
-
-
/* Iterate over every (order, migrate type) pair of a zone's free lists. */
#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)
-
-
初始化好数据结构之后,先要得到系统当前空闲的可供伙伴系统分配的页,由于在伙伴系统初始化之前使用的是bootmem分配器,所以现在是该释放bootmem分配器所管理的内存部分了。调用流程如下:
-
-
start_kernel -> mm_init -> mem_init.
-
/*
-
* mem_init() marks the free areas in the mem_map and tells us how much
-
* memory is free. This is done after various parts of the system have
-
* claimed their memory after the kernel image.
-
*/
-
void __init mem_init(void)
{
	unsigned long reserved_pages, free_pages;
	struct memblock_region *reg;
	int i;

	max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;

	/* this will put all unused low memory onto the freelists */
	/* Mark the memmap pages that are actually usable. */
	free_unused_memmap(&meminfo);
	/*
	 * Release every free page still held by the bootmem allocator.
	 * The release path ends in __free_pages(), which (as shown later)
	 * puts the pages onto the buddy free lists. The bootmem allocator's
	 * life ends here.
	 */
	totalram_pages += free_all_bootmem();
	/* Highmem free pages are also released onto the free lists. */
	free_highpages();

	reserved_pages = free_pages = 0;

	/* Count how many physical pages are free and how many are reserved. */
	for_each_bank(i, &meminfo) {
		struct membank *bank = &meminfo.bank[i];
		unsigned int pfn1, pfn2;
		struct page *page, *end;

		pfn1 = bank_pfn_start(bank);
		pfn2 = bank_pfn_end(bank);

		page = pfn_to_page(pfn1);
		end = pfn_to_page(pfn2 - 1) + 1;

		do {
			if (PageReserved(page))
				reserved_pages++;
			else if (!page_count(page))
				free_pages++;
			page++;
#ifdef CONFIG_SPARSEMEM
			pfn1++;
			if (!(pfn1 % PAGES_PER_SECTION))
				page = pfn_to_page(pfn1);
		} while (pfn1 < pfn2);
#else
		} while (page < end);
#endif
	}

	/*
	 * Since our memory may not be contiguous, calculate the
	 * real number of pages we have in this system
	 */
	printk(KERN_INFO "Memory:");
	num_physpages = 0;
	for_each_memblock(memory, reg) {
		unsigned long pages = memblock_region_memory_end_pfn(reg) -
			memblock_region_memory_base_pfn(reg);
		num_physpages += pages;
		printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
	}
	printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));

	printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		free_pages << (PAGE_SHIFT-10),
		reserved_pages << (PAGE_SHIFT-10),
		totalhigh_pages << (PAGE_SHIFT-10));

#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

	/* Print the start and end address of each virtual memory region. */
	printk(KERN_NOTICE "Virtual kernel memory layout:\n"
			" vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
			" DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
			" ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
			" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
			" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
			" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
			" pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
			" modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
			" .text : 0x%p" " - 0x%p" " (%4d kB)\n"
			" .init : 0x%p" " - 0x%p" " (%4d kB)\n"
			" .data : 0x%p" " - 0x%p" " (%4d kB)\n"
			" .bss : 0x%p" " - 0x%p" " (%4d kB)\n",

			MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
				(PAGE_SIZE)),
			MLK(FIXADDR_START, FIXADDR_TOP),
			MLM(VMALLOC_START, VMALLOC_END),
			MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
			MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
				(PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
			MLM(MODULES_VADDR, MODULES_END),
#endif

			MLK_ROUNDUP(_text, _etext),
			MLK_ROUNDUP(__init_begin, __init_end),
			MLK_ROUNDUP(_sdata, _edata),
			MLK_ROUNDUP(__bss_start, __bss_stop));
~~snip
}
到此,系统空闲的内存都交由伙伴系统管理了!
内存分配:
调用的接口有如下几个:
-
/* Allocate 2^order contiguous pages on the current node. */
#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)
/* VMA-aware variant; without NUMA policy it degenerates to alloc_pages(). */
#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
	alloc_pages(gfp_mask, order)
/* Single-page (order-0) convenience wrappers. */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr)			\
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
#define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
不过最终调用的都是__alloc_pages_nodemask()
__alloc_pages_nodemask()
-
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	/* Highest zone type the caller allows, derived from the gfp flags. */
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page = NULL;

	/* Translate the caller's gfp flags into the migrate type to use. */
	int migratetype = allocflags_to_migratetype(gfp_mask);
	unsigned int cpuset_mems_cookie;

	gfp_mask &= gfp_allowed_mask;

~~snip
	/*
	 * Fast path first: may be satisfied from the pcp per-cpu page cache
	 * or straight from the free lists. This is the simplest case.
	 */
	/* First allocation attempt */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
			preferred_zone, migratetype);
	/*
	 * Fast path failed: take the slow path, which may have to wait for
	 * page reclaim etc. before pages become available.
	 */
	if (unlikely(!page))
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
~snip

	return page;
}
get_page_from_freelist():
-
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 *
	 * On UMA there is only one zonelist, but the loop still moves on
	 * to the next zone when the current one has no free pages.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		/* Check whether allocation on this node is permitted. */
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				continue;
~~snip
		/*
		 * The buddy system rations memory via watermarks: being
		 * above or below a mark triggers the matching action.
		 */
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			/* Is the zone still above the selected watermark? */
			if (zone_watermark_ok(zone, order, mark,
					classzone_idx, alloc_flags))
				goto try_this_zone;

~~snip
			/*
			 * Watermark breached. zone_reclaim_mode == 0 means no
			 * zone-reclaim policy is configured, so give up on
			 * this zone.
			 */
			if (zone_reclaim_mode == 0)
				goto this_zone_full;
			/*
			 * Try to reclaim pages from this zone. Only effective
			 * on NUMA; on UMA it simply returns 0.
			 */
			ret = zone_reclaim(zone, gfp_mask, order);
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				/* Re-check the watermark after reclaim. */
				if (!zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					/* Still below the mark: zone is full. */
					goto this_zone_full;
			}
		}
		/* Reaching here means this zone can satisfy the request. */
try_this_zone:
		/* order decides whether pcp or the free lists are used. */
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
~~snip
	}

~~snip
	return page;
}
buffered_rmqueue():
-
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	/*
	 * Cold or hot pcp page? A hot page is likely still in the CPU
	 * hardware cache, a cold one is not; most requests want hot pages.
	 */
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	/*
	 * Single-page (order-0) requests are served from the per-cpu
	 * pageset (pcp) cache for efficiency.
	 */
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		/* One pcp per cpu -- "per cpu pageset". */
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		/* The pcp keeps one free list per migrate type. */
		list = &pcp->lists[migratetype];
		/* pcp pages ultimately come from the buddy free lists. */
		if (list_empty(list)) {
			/*
			 * Refill the pcp with 'batch' pages from the free
			 * lists; this ends up in the standard __rmqueue()
			 * path, analysed below.
			 */
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}
		/*
		 * Cold pages are taken from the tail, hot ones from the
		 * head, as determined by the insertion order.
		 */
		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);
		/* Unlink the page now that it is allocated. */
		list_del(&page->lru);
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		/* Allocate straight from the buddy free lists. */
		page = __rmqueue(zone, order, migratetype);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}
	/* Update the zone's statistics. */
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);

	VM_BUG_ON(bad_range(zone, page));
	/* Final preparation: set page flags, handle zero page, etc. */
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
__rmqueue():
-
rmqueue_bulk()只是pcp路径批量调用__rmqueue(),然后设置一些和pcp相关的参数,比较简单,这里不介绍了,直接看__rmqueue().
-
static struct page *__rmqueue(struct zone *zone, unsigned int order,
-
int migratetype)
-
{
-
struct page *page;
-
-
retry_reserve:
-
/*使用伙伴系统算法分配内存*/
-
page = __rmqueue_smallest(zone, order, migratetype);
-
/*如果失败了,而且当前迁移类型不是RESERVE,
-
那么尝试从下个迁移类型分配。分配次序前面有说明过了,
-
按照fallbacks 定义的顺序,MIGRATE_RESERVE表示很紧急的时候分配
-
如果它还是失败那没戏了。*/
-
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
-
page = __rmqueue_fallback(zone, order, migratetype);
-
if (!page) {
-
/*还是失败的话那么只能用MIGRATE_RESERVE 类型的
-
去申请了。*/
-
migratetype = MIGRATE_RESERVE;
-
goto retry_reserve;
-
}
-
}
-
-
trace_mm_page_alloc_zone_locked(page, order, migratetype);
-
return page;
-
}
__rmqueue_smallest():
-
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	/*
	 * The core of the buddy algorithm: start at the requested order
	 * and walk upwards until some order has a free block.
	 */
	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		/* free_area for this order. */
		area = &(zone->free_area[current_order]);
		/* Nothing free at this order: try the next larger one. */
		if (list_empty(&area->free_list[migratetype]))
			continue;
		/* Found one: take the first block off the free list. */
		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		/* Remove it from the free list. */
		list_del(&page->lru);
		rmv_page_order(page);
		/* One less free block at this order. */
		area->nr_free--;
		/*
		 * If the block is larger than requested, split it: the
		 * unused halves go back onto the lower-order free lists,
		 * re-arranging the buddy layout.
		 */
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}
expand():
-
static inline void expand(struct zone *zone, struct page *page,
-
int low, int high, struct free_area *area,
-
int migratetype)
-
{
-
unsigned long size = 1 << high;
-
/*low和high分别表示要申请的order和现在找到的order*/
-
while (high > low) {
-
/*使用低一阶的area, order小一阶, size也减半*/
-
area--;
-
high--;
-
size >>= 1;
-
VM_BUG_ON(bad_range(zone, &page[size]));
-
/*从size开始的page插入到当前area的freelist中*/
-
list_add(&page[size].lru, &area->free_list[migratetype]);
-
area->nr_free++;
-
/*保存当前order当struct page中。*/
-
set_page_order(&page[size], high);
-
}
-
}
所以如果第一次分配就成功,buddy算法流程相当简单的。如果当前迁移类型分配失败,
那么就要从下一个迁移类型上去分配了!来看__rmqueue_fallback().
__rmqueue_fallback():
-
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
	struct free_area * area;
	int current_order;
	struct page *page;
	int migratetype, i;

	/*
	 * Search from the highest order downwards: stealing (and then
	 * splitting) one large block causes less fragmentation than
	 * stealing many small ones.
	 */
	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0;; i++) {
			/* Next migrate type in the fallback order. */
			migratetype = fallbacks[start_migratetype][i];

			/* MIGRATE_RESERVE handled later if necessary */
			if (migratetype == MIGRATE_RESERVE)
				break;

			/* Empty list: this migrate type has nothing free either. */
			area = &(zone->free_area[current_order]);
			if (list_empty(&area->free_list[migratetype]))
				continue;

			page = list_entry(area->free_list[migratetype].next,
					struct page, lru);
			area->nr_free--;

			/*
			 * If breaking a large block of pages, move all its
			 * free pages over to the originally requested type.
			 * Done when the stolen type is not CMA and either the
			 * order is at least half a pageblock, the request is
			 * for a reclaimable allocation, or grouping pages by
			 * mobility is disabled.
			 */
			if (!is_migrate_cma(migratetype) &&
				(unlikely(current_order >= pageblock_order / 2) ||
					start_migratetype == MIGRATE_RECLAIMABLE ||
					page_group_by_mobility_disabled)) {
				int pages;
				/* Move the free pages over to start_migratetype. */
				pages = move_freepages_block(zone, page,
								start_migratetype);

				/* Claim the whole block if over half of it is free */
				if (pages >= (1 << (pageblock_order-1)) ||
						page_group_by_mobility_disabled)
					set_pageblock_migratetype(page,
								start_migratetype);

				migratetype = start_migratetype;
			}

			/* Remove the page from the freelists */
			list_del(&page->lru);
			rmv_page_order(page);

			/* Take ownership for orders >= pageblock_order */
			if (current_order >= pageblock_order &&
					!is_migrate_cma(migratetype))
				change_pageblock_range(page, current_order,
							start_migratetype);
			/* Split the block down to the requested order. */
			expand(zone, page, order, current_order, area,
				is_migrate_cma(migratetype)
				? migratetype : start_migratetype);

			trace_mm_page_alloc_extfrag(page, order, current_order,
				start_migratetype, migratetype);

			return page;
		}
	}

	return NULL;
}
以上的内存分配都是基于分配比较顺利的情况,如果分配依然失败,那么只能使用慢速分配机制了!so 继续看__alloc_pages_slowpath().
__alloc_pages_slowpath():
-
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int migratetype)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct page *page = NULL;
	int alloc_flags;
	unsigned long pages_reclaimed = 0;
	unsigned long did_some_progress;
	bool sync_migration = false;
	bool deferred_compaction = false;
~~snip
restart:
	/*
	 * Unless reclaim is forbidden, wake kswapd to reclaim pages by
	 * writing back or swapping out rarely used pages.
	 */
	if (!(gfp_mask & __GFP_NO_KSWAPD))
		wake_all_kswapd(order, zonelist, high_zoneidx,
						zone_idx(preferred_zone));

	/*
	 * Try harder this time: derive alloc flags that lower the
	 * watermark so the allocation is more likely to succeed.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * Find the true preferred zone if the allocation is unconstrained by
	 * cpusets.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
		first_zones_zonelist(zonelist, high_zoneidx, NULL,
					&preferred_zone);

rebalance:
	/* Retry the allocation with the lowered watermark. */
	/* This is the last chance, in general, before the goto nopage. */
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
			preferred_zone, migratetype);
	if (page)
		goto got_pg;

	/*
	 * Next, try ignoring the watermarks entirely. With __GFP_NOFAIL
	 * this path keeps retrying until the allocation succeeds.
	 */
	/* Allocate without watermarks if the context allows */
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		page = __alloc_pages_high_priority(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
		if (page)
			goto got_pg;
	}

	/*
	 * Caller cannot sleep (e.g. an atomic allocation from interrupt
	 * context): fail immediately.
	 */
	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/* The allocator itself needs more memory: avoid recursing. */
	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/*
	 * TIF_MEMDIE is only set on threads chosen by the OOM killer.
	 * Unless __GFP_NOFAIL demands looping until other threads free
	 * memory, bail out here.
	 */
	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/* Try compacting memory, then allocate again. */
	page = __alloc_pages_direct_compact(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, sync_migration,
					&deferred_compaction,
					&did_some_progress);
	if (page)
		goto got_pg;
~~snip
	/* Reclaim pages ourselves (direct reclaim), then try to allocate. */
	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, &did_some_progress);
	if (page)
		goto got_pg;

	/* Still no progress: let the OOM killer kill some task, then retry. */
	if (!did_some_progress) {
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
			if (oom_killer_disabled)
				goto nopage;
			/* Coredumps can quickly deplete all memory reserves */
			if ((current->flags & PF_DUMPCORE) &&
				!(gfp_mask & __GFP_NOFAIL))
				goto nopage;
			page = __alloc_pages_may_oom(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask, preferred_zone,
					migratetype);
			if (page)
				goto got_pg;

			if (!(gfp_mask & __GFP_NOFAIL)) {
				/*
				 * The oom killer is not called for high-order
				 * allocations that may fail, so if no progress
				 * is being made, there are no other options and
				 * retrying is unlikely to help.
				 */
				if (order > PAGE_ALLOC_COSTLY_ORDER)
					goto nopage;
				/*
				 * The oom killer is not called for lowmem
				 * allocations to prevent needlessly killing
				 * innocent tasks.
				 */
				if (high_zoneidx < ZONE_NORMAL)
					goto nopage;
			}

			goto restart;
		}
	}

	/* Should we wait a moment and retry the allocation? */
	/* Check if we should retry the allocation */
	pages_reclaimed += did_some_progress;
	if (should_alloc_retry(gfp_mask, order, did_some_progress,
						pages_reclaimed)) {
		/* Wait for some write requests to complete then retry */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
		goto rebalance;
	} else {
		/* Reclaim has run: give compaction one more chance. */
		page = __alloc_pages_direct_compact(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, sync_migration,
					&deferred_compaction,
					&did_some_progress);
		if (page)
			goto got_pg;
	}

	/* Reaching here means the allocation really failed: warn about it. */
nopage:
	warn_alloc_failed(gfp_mask, order, NULL);
	return page;
got_pg:
	if (kmemcheck_enabled)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
	return page;

}
内存分配一波三折,小结一下:
1. 先尝试快速分配,其中会从不同的zone以及迁移类型上去尝试,失败的话就进入慢速分配,里面会再划分单页面从pcp上分配以及多页面从伙伴系统中分配。
2. 尝试慢速分配,一般流程就是唤醒内存页面回收线程,然后尝试低水位分配 -> 忽略水位分配 -> 压缩内存分配 -> 直接回收内存分配 -> oom killer杀死线程分配 -> 压缩内存分配。
内存释放:
关于内存释放,使用的最终公共接口为__free_pages, 流程部分还是比较清晰的,
这里不对代码作具体分析了。分单页和多页释放。
单页:释放到pcp缓冲中,如果pcp中的空闲页面数过多,就会移动一部分到伙伴系统中。
多页:释放多页到伙伴系统,当当前释放的页面块和相邻的空闲页面块(伙伴)阶数相同时,就将两者合并,然后放到更高一阶的order链表上面,依次循环执行此操作直到不能合并为止。