《LINUX3.0内核源代码分析》第四章：内存管理(3)-hezhi11-ChinaUnix博客

hezhi11的ChinaUnix博客

首页　| 　博文目录　| 　关于我

hezhi11

博客访问： 166271
博文数量： 84
博客积分： 0
博客等级：民兵
技术积分： 1
用户组：普通用户
注册时间： 2014-03-09 10:55

文章分类

全部博文（84）

性能优化（2）
链接加载（1）
应用开发（4）

应用开发实例（2）
程序员面试（1）

算法（1）
Linux内核（75）

内核编程（2）

启动与初始化（2）

内核与用户通信（2）

模块机制（1）

内核数据结构（1）

输入输出（1）

系统调用（1）

其他机制（8）

时钟机制（3）

网络协议栈（3）

内核构建（2）

内核调试（3）

设备驱动（3）

同步机制（14）

文件系统（2）

进程管理（1）

内存管理（12）

中断（14）
未分配的博文（1）

文章存档

2014年（84）

我的朋友

相关博文

《LINUX3.0内核源代码分析》第四章：内存管理(3)

分类： LINUX

2014-05-15 15:29:23

原文地址：《LINUX3.0内核源代码分析》第四章：内存管理(3) 作者：xiebaoyou

摘要：本文主要讲述linux如何处理ARM cortex A9多核处理器的内存管理部分。主要包括对页面快速分配流程和慢速分配流程的介绍。

法律声明：《LINUX3.0内核源代码分析》系列文章由谢宝友发表于http://xiebaoyou.blog.chinaunix.net，文章中的LINUX3.0源代码遵循GPL协议。除此以外，文档中的其他内容由作者保留所有版权。谢绝转载。

1.1.1.1 快速分配流程

/**

* 遍历管理区列表，分配一个页面。

* gfp_mask: 分配标志。

* nodemask: 在哪些节点中进行分配，一般未指定。

* order: 分配的页面数量为2^order。

* zonelist: 在哪些管理区中分配，根据gfs_mask确定。

* high_zoneidx:从哪一个管理区开始分配。

* alloc_flags:分配标志，此标志可以用于控制水线。

* preferred_zone:优先从这个管理区开始分配。

* migratetype:页面迁移类型.

static struct page *

get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,

struct zonelist *zonelist, int high_zoneidx, int alloc_flags,

struct zone *preferred_zone, int migratetype)

{

struct zoneref *z;

struct page *page = NULL;

int classzone_idx;

struct zone *zone;

nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */

int zlc_active = 0; /* set if using zonelist_cache */

int did_zlc_setup = 0; /* just call zlc_setup() one time */

/**

* 首先从preferred_zone这个管理区开始分配页面

classzone_idx = zone_idx(preferred_zone);

zonelist_scan:

* Scan zonelist, looking for a zone with enough free.

* See also cpuset_zone_allowed() comment in kernel/cpuset.c.

/**

* 在允许的节点中，遍历满足要求的管理区。

for_each_zone_zonelist_nodemask(zone, z, zonelist,

high_zoneidx, nodemask) {

if (NUMA_BUILD && zlc_active &&/* 是第一遍分配，在其他管理区中分配页面时需要考虑其页面是否充足 */

!zlc_zone_worth_trying(zonelist, z, allowednodes))/* 该管理区页面不是很充足，考虑下一个管理区 */

continue;

if ((alloc_flags & ALLOC_CPUSET) &&

!cpuset_zone_allowed_softwall(zone, gfp_mask))/* 当前分配标志不允许在该管理区中分配页面。 */

goto try_next_zone;

BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);

if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {/* 分配时需要考虑水线 */

unsigned long mark;

int ret;

/* 根据分配标志，确定使用哪一个水线 */

mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

if (zone_watermark_ok(zone, order, mark,

classzone_idx, alloc_flags))/* 该管理区的可用内存可以满足本次分配的要求 */

goto try_this_zone;

if (zone_reclaim_mode == 0)

goto this_zone_full;

/* 运行到此，说明该管理区中内存不足，需要对该管理区进行回收 */

ret = zone_reclaim(zone, gfp_mask, order);

switch (ret) {

case ZONE_RECLAIM_NOSCAN:/* 当前管理区还没有进行回收 */

/* did not scan */

goto try_next_zone;

case ZONE_RECLAIM_FULL:/* 进行了回收，但是没有可回收的内存 */

/* scanned but unreclaimable */

goto this_zone_full;

default:/* 回收部分内存 */

/* did we reclaim enough */

if (!zone_watermark_ok(zone, order, mark,

classzone_idx, alloc_flags))/* 回收的内存较少，仍然不满足分配要求 */

goto this_zone_full;

}

/**

* 当前管理区中有足够的可用内存，试图在此管理区中分配内存

try_this_zone:

/**

* 调用伙伴系统的分配函数

page = buffered_rmqueue(preferred_zone, zone, order,

gfp_mask, migratetype);

if (page)/* 从伙伴系统分配成功，退出 */

break;

/* 当管理区中不能满足分配要求时，运行到此 */

this_zone_full:

if (NUMA_BUILD)/* 标记该管理区空间不足，下次分配时将略过本管理区，避免浪费太多时间 */

zlc_mark_zone_full(zonelist, z);

try_next_zone:

/**

* 当前管理区内存不足，需要加大在其他区中的分配力度。

if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {

* we do zlc_setup after the first zone is tried but only

* if there are multiple nodes make it worthwhile

allowednodes = zlc_setup(zonelist, alloc_flags);

zlc_active = 1;

did_zlc_setup = 1;

}

/**

* 第一遍分配不成功，则取消zlc_active，这样会尽量从其他节点中分配内存。

if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {

/* Disable zlc cache for second zonelist scan */

zlc_active = 0;

goto zonelist_scan;

}

/**

* 两次扫描节点，不管是否成功，都向上层返回分配结果。

return page;

}

1.1.1.2 慢速分配流程

/**

* 当无法快速分配页面时，如果调用者允许等待，则通过本函数进行慢速分配。

* 此时允许进行内存回收。

static inline struct page *

__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,

struct zonelist *zonelist, enum zone_type high_zoneidx,

nodemask_t *nodemask, struct zone *preferred_zone,

int migratetype)

{

const gfp_t wait = gfp_mask & __GFP_WAIT;

struct page *page = NULL;

int alloc_flags;

unsigned long pages_reclaimed = 0;

unsigned long did_some_progress;

bool sync_migration = false;

* In the slowpath, we sanity check order to avoid ever trying to

* reclaim >= MAX_ORDER areas which will never succeed. Callers may

* be using allocators in order of preference for an area that is

* too large.

if (order >= MAX_ORDER) {/* 这里进行参数合法性检测 */

WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));

return NULL;

}

* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and

* __GFP_NOWARN set) should not cause reclaim since the subsystem

* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim

* using a larger set of nodes after it has established that the

* allowed per node queues are empty and that nodes are

* over allocated.

/**

* 调用者指定了GFP_THISNODE标志，表示不能进行内存回收。

* 上层调用者应当在指定了GFP_THISNODE失败后，使用其他标志进行分配。

if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)

goto nopage;

restart:

if (!(gfp_mask & __GFP_NO_KSWAPD))/* 如果调用者没有禁止kswapd，则唤醒该线程进行内存回收。 */

wake_all_kswapd(order, zonelist, high_zoneidx,

zone_idx(preferred_zone));

* OK, we're below the kswapd watermark and have kicked background

* reclaim. Now things get more complex, so set up alloc_flags according

* to how we want to proceed.

alloc_flags = gfp_to_alloc_flags(gfp_mask);/* 根据分配标志确定内部标志，主要是用于水线 */

* Find the true preferred zone if the allocation is unconstrained by

* cpusets.

/**

* 如果不受cpuset的限制，那么找到真正的优先用于分配的管理区

if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)

first_zones_zonelist(zonelist, high_zoneidx, NULL,

&preferred_zone);

rebalance:

/* This is the last chance, in general, before the goto nopage. */

/**

* 与快速分配流程相比，这里的分配标志使用了低的水线。

* 在进行内存回收操作前，我们使用低水线再尝试分配一下。

* 当然，不管是否允许ALLOC_NO_WATERMARKS标志，我们都将它清除。

page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,

high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,

preferred_zone, migratetype);

if (page)/* 庆幸的是，分配到内存了，退出。 */

goto got_pg;

/* Allocate without watermarks if the context allows */

if (alloc_flags & ALLOC_NO_WATERMARKS) {/* 某些上下文，如内存回收进程及被杀死的任务，都允许它完全突破水线的限制分配内存。 */

page = __alloc_pages_high_priority(gfp_mask, order,

zonelist, high_zoneidx, nodemask,

preferred_zone, migratetype);

if (page)/* 在不考虑水线的情况下，分配到了内存 */

goto got_pg;

}

/* Atomic allocations - we can't balance anything */

if (!wait)/* 调用者希望原子分配内存，此时不能等待内存回收，返回NULL */

goto nopage;

/* Avoid recursion of direct reclaim */

if (current->flags & PF_MEMALLOC)/* 调用者本身就是内存回收进程，不能进入后面的内存回收处理流程，否则死锁 */

goto nopage;

/* Avoid allocations with no watermarks from looping endlessly */

/**

* 当前线程正在被杀死，它可以完全突破水线分配内存。这里向上层返回NULL，是为了避免系统进入死循环。

* 当然，如果上层调用不允许失败，则死循环继续分配，等待其他线程释放一点点内存。

if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))

goto nopage;

* Try direct compaction. The first pass is asynchronous. Subsequent

* attempts after direct reclaim are synchronous

/**

* 尝试压缩内存。这样可以将一些小的外碎片合并成大页面，这样也许能够满足调用者的内存分配要求。

* 内存压缩是通过页面迁移实现的。

* 第一次调用的时候，是非同步的。第二次调用则是同步方式。

page = __alloc_pages_direct_compact(gfp_mask, order,

zonelist, high_zoneidx,

nodemask,

alloc_flags, preferred_zone,

migratetype, &did_some_progress,

sync_migration);

if (page)/* 庆幸，通过压缩内存，分配到了内存 */

goto got_pg;

sync_migration = true;/* 将页面迁移标志设置为同步方式，这样第二次页面迁移都会使用同步方式 */

/* Try direct reclaim and then allocating */

/**

* 直接在内存分配上下文中进行内存回收操作。

page = __alloc_pages_direct_reclaim(gfp_mask, order,

zonelist, high_zoneidx,

nodemask,

alloc_flags, preferred_zone,

migratetype, &did_some_progress);

if (page)/* 庆幸，回收了一些内存后，满足了上层分配需求 */

goto got_pg;

* If we failed to make any progress reclaiming, then we are

* running out of options and have to consider going OOM

if (!did_some_progress) {/* 内存回收过程没有回收到内存，系统真的内存不足了 */

/**

* 调用者不是文件系统的代码，允许进行文件系统操作，并且允许重试。

* 这里需要__GFP_FS标志可能是进入OOM流程后会杀进程或进入panic，需要文件操作。

if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {

if (oom_killer_disabled)/* 系统禁止了OOM，向上层返回NULL */

goto nopage;

/**

* 杀死其他进程后再尝试分配内存

page = __alloc_pages_may_oom(gfp_mask, order,

zonelist, high_zoneidx,

nodemask, preferred_zone,

migratetype);

if (page)

goto got_pg;

if (!(gfp_mask & __GFP_NOFAIL)) {/* 调用者允许失败 */

* The oom killer is not called for high-order

* allocations that may fail, so if no progress

* is being made, there are no other options and

* retrying is unlikely to help.

if (order > PAGE_ALLOC_COSTLY_ORDER)/* 要求的页面数量较多，再试意义不大 */

goto nopage;

* The oom killer is not called for lowmem

* allocations to prevent needlessly killing

* innocent tasks.

if (high_zoneidx < ZONE_NORMAL)/* 是从DMA区域中分配内存，容易出现内存不足，重试可能也释放不了多少内存 */

goto nopage;

}

goto restart;

}

/* Check if we should retry the allocation */

/* 内存回收过程回收了一些内存，接下来判断是否有必要继续重试 */

pages_reclaimed += did_some_progress;

if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {/* 如果需要重试，则等待后重试 */

/* Wait for some write requests to complete then retry */

wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);

goto rebalance;

} else {/* 不需要再重试了 */

* High-order allocations do not necessarily loop after

* direct reclaim and reclaim/compaction depends on compaction

* being called after reclaim so call directly if necessary

/* 再次压缩内存，这次是同步方式，如果还分配不到内存就放弃了 */

page = __alloc_pages_direct_compact(gfp_mask, order,

zonelist, high_zoneidx,

nodemask,

alloc_flags, preferred_zone,

migratetype, &did_some_progress,

sync_migration);

if (page)

goto got_pg;

}

nopage:

/* 内存分配失败了，打印内存分配失败的警告 */

warn_alloc_failed(gfp_mask, order, NULL);

return page;

got_pg:

if (kmemcheck_enabled)/* 运行到这里，说明成功分配了内存，这里进行内存检测调试 */

kmemcheck_pagealloc_alloc(page, order, gfp_mask);

return page;

}

1.1.1.3 辅助函数

__zone_watermark_ok函数用于判断管理区在扣除本次分配后，空闲页面是否仍高于指定的水线：

/**

* 计算管理区中的水线是否充足

static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,

int classzone_idx, int alloc_flags, long free_pages)

{

/* free_pages my go negative - that's OK */

long min = mark;

int o;

/**

* 本次内存分配后，剩余的内存数量

free_pages -= (1 << order) + 1;

if (alloc_flags & ALLOC_HIGH)/* 在紧急内存中分配，首先将水线减半 */

min -= min / 2;

if (alloc_flags & ALLOC_HARDER)/* 内存稍显不足，指定此标志表示适当降低水线 */

min -= min / 4;

/* 当前管理区已经低于水线了，退出 */

if (free_pages <= min + z->lowmem_reserve[classzone_idx])

return false;

for (o = 0; o < order; o++) {/* 如果是分配大块内存，那么需要将低阶的伙伴系统中的内存去除再判断 */

/* At the next order, this order's pages become unavailable */

free_pages -= z->free_area[o].nr_free << o;/* 当前阶的空闲内存要从总的空闲内存中去除 */

/* Require fewer higher order pages to be free */

min >>= 1;/* 越高阶，就越要降低内存水线 */

if (free_pages <= min)/* 水线不足了，alloc_pages函数应当考虑从其他管理区或者其他节点分配内存，或者传入另外的标志，降低水线标准 */

return false;

}

return true;

}

在内存分配过程中，需要与文件系统的缓存管理、页面回写交互，以进行内存回收，这部分内容在以后的文件系统分析中介绍。

另，内存回收过程是容易出现故障的地方。前一段时间接连处理了多个与内存管理相关的故障，涉及到死循环、dead lock、live lock等等。感兴趣的朋友可以参见http://www.spinics.net/lists/linux-fsdevel/msg45346.html，这里列出了18个相关的补丁。正是由于忙于处理故障，这两周才没有更新日志。

接下来我们将描述一下伙伴系统的主要函数。敬请关注。

阅读(587) | 评论(0) | 转发(0) |

上一篇：《LINUX3.0内核源代码分析》第四章：内存管理(4)

下一篇：《LINUX3.0内核源代码分析》第四章：内存管理(2)

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6