__alloc_pages源代码分析-litary1986-ChinaUnix博客

嵌入式系统开发——皓月

首页　| 　博文目录　| 　关于我

litary1986

博客访问： 497539
博文数量： 115
博客积分： 5016
博客等级：大校
技术积分： 1401
用户组：普通用户
注册时间： 2008-09-21 16:03

文章分类

全部博文（115）

工作有感（0）
毕业设计（4）
觅职记（3）
英语学习（1）
心情日札（17）
技术天地（90）

就业相关（14）

项目相关（21）

VI使用（6）

心得体会（3）

linux程序设计（11）

linux应用（28）

linux内核学习（7）
未分配的博文（0）

文章存档

2013年（1）

2010年（17）

2009年（76）

2008年（21）

我的朋友

Guobutte

相关博文

__alloc_pages源代码分析

分类： LINUX

2009-04-25 12:38:41

* This is the 'heart' of the zoned buddy allocator.

struct page * fastcall

__alloc_pages(gfp_t gfp_mask, unsigned int order,

struct zonelist *zonelist)

{

const gfp_t wait = gfp_mask & __GFP_WAIT;

struct zone **z;

struct page *page;

struct reclaim_state reclaim_state;

struct task_struct *p = current;

int do_retry;

int alloc_flags;

int did_some_progress;

might_sleep_if(wait);

/*一系列宏定义，最后调用cond_resched()

might_sleep_if-> might_sleep->might_resched->cond_resched

如果在gfp_mask中设置了__GFP_WAIT位，表明内核可以阻塞当前进程，来等待空闲页面。在分配开始之前即阻塞，目的是为了等待其它进程释放更多的页面？？

if (should_fail_alloc_page(gfp_mask, order))

/*通过简单算法在真正分配前检查分配是否会失败，避免进入真正的分配程序后浪费系统时间*/

return NULL;

restart:

z = zonelist->zones; /* the list of zones suitable for gfp_mask */

if (unlikely(*z == NULL)) {

/* Should this ever happen?? */

return NULL;

}

page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,

zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);

/*get_page_from_freelist以指定的watermark来分配页面

每个zone struct中定义了三个watermark：pages_min, pages_low, pages_high，表示zone中应保持的空闲页面的阈值。

get_page_from_freelist函数通过设置Alloc flags来选择watermark。

#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */

#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */

#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */

#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */

#define ALLOC_HARDER 0x10 /* try to alloc harder */

#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */

#define ALLOC_CPUSET 0x40 /* check for correct cpuset */

/*首先以pages_low watermark分配页面，如果分配成功，则跳转到got_pg*/

if (page)

goto got_pg;

* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and

* __GFP_NOWARN set) should not cause reclaim since the subsystem

* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim

* using a larger set of nodes after it has established that the

* allowed per node queues are empty and that nodes are

* over allocated.

/*如果pages_low watermark分配失败的话，检查gfp_mask，如果GFP_THISNODE标志被设置，表明不能重试，因此跳转到nopage，返回失败!*/

if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)

goto nopage;

/*否则调用kswapd对zonelist中的所有zone进行页面回首，期待能将一些闲置

页面交换到文件系统中*/

for (z = zonelist->zones; *z; z++)

wakeup_kswapd(*z, order);

* OK, we're below the kswapd watermark and have kicked background

* reclaim. Now things get more complex, so set up alloc_flags according

* to how we want to proceed.

* The caller may dip into page reserves a bit more if the caller

* cannot run direct reclaim, or if the caller has realtime scheduling

* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will

* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).

alloc_flags = ALLOC_WMARK_MIN;

if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)

alloc_flags |= ALLOC_HARDER;

if (gfp_mask & __GFP_HIGH)

alloc_flags |= ALLOC_HIGH;

if (wait)

alloc_flags |= ALLOC_CPUSET;

* Go through the zonelist again. Let __GFP_HIGH and allocations

* coming from realtime tasks go deeper into reserves.

* This is the last chance, in general, before the goto nopage.

* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.

* See also cpuset_zone_allowed() comment in kernel/cpuset.c.

/*降低对zone中空闲页面得要求，以pages_min再次分配页面*/

page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);

if (page)

goto got_pg;

/* This allocation should allow future memory freeing. */

rebalance:

if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))

&& !in_interrupt()) {

if (!(gfp_mask & __GFP_NOMEMALLOC)) {

nofail_alloc:

/*如果gfp_mask设置不需要保留紧急内存区域，

以不设water_mark再次分配页面*/

/* go through the zonelist yet again, ignoring mins */

page = get_page_from_freelist(gfp_mask, order,

zonelist, ALLOC_NO_WATERMARKS);

if (page)

goto got_pg;

/*如果gfp_mask设置了__GFP_NOFAIL，则不断重试，直到分配成功*/

if (gfp_mask & __GFP_NOFAIL) {

congestion_wait(WRITE, HZ/50);

goto nofail_alloc;

}

goto nopage;

}

/* Atomic allocations - we can't balance anything */

if (!wait)

goto nopage;

/*重新调度之后，试图释放一些不常用的页面*/

cond_resched();

/* We now go into synchronous reclaim */

cpuset_memory_divssure_bump();

p->flags |= PF_MEMALLOC;

reclaim_state.reclaimed_slab = 0;

p->reclaim_state = &reclaim_state;

did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

p->reclaim_state = NULL;

p->flags &= ~PF_MEMALLOC;

cond_resched();

/*调度之后，如果确实释放了一部分页面，则重新分配页面*/

if (likely(did_some_progress)) {

page = get_page_from_freelist(gfp_mask, order,

zonelist, alloc_flags);

if (page)

goto got_pg;

/*如果没有释放页面，并且允许重试，则以pages_high watermark分配页面，因为以pages_high分配，所以除非此时其它的cpu杀死了某个进程，释放了一部分页面，这次分配肯定会失败，失败之后内核通过调用out_of_momery函数杀死某个经过选择的进程，获得一部分空闲页面。采用pages_high watermark可以避免两个cpu都选择某个进程来杀死，从而造成不必要的损失。*/

} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {

* Go through the zonelist yet one more time, keep

* very high watermark here, this is only to catch

* a parallel oom killing, we must fail if we're still

* under heavy divssure.

page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,

zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);

if (page)

goto got_pg;

/*在这里选择某个进程杀死，释放部分空闲页面*/

out_of_memory(zonelist, gfp_mask, order);

goto restart;

}

* Don't let big-order allocations loop unless the caller explicitly

* requests that. Wait for some write requests to complete then retry.

* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order

* <= 3, but that may not be true in other implementations.

do_retry = 0;

if (!(gfp_mask & __GFP_NORETRY)) {

if ((order <= 3) || (gfp_mask & __GFP_REPEAT))

do_retry = 1;

if (gfp_mask & __GFP_NOFAIL)

do_retry = 1;

}

if (do_retry) {

congestion_wait(WRITE, HZ/50);

goto rebalance;

}

nopage:

if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {

printk(KERN_WARNING "%s: page allocation failure."

" order:%d, mode:0x%x\n",

p->comm, order, gfp_mask);

dump_stack();

show_mem();

}

got_pg:

return page;

}

阅读(1241) | 评论(0) | 转发(0) |

上一篇：kmalloc详解

下一篇：关于进程的休眠和唤醒

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6