Category: LINUX

2015-07-02 23:57:06

Linux memory management includes a mechanism known as the OOM killer (Out-Of-Memory killer). It monitors the memory usage of processes and, when the system has exhausted its memory, selectively kills some of them according to a scoring algorithm. The out-of-memory protection mechanism analyzed in this article is precisely this OOM killer.

Recall __alloc_pages_nodemask(), one of the functions involved in the buddy allocator. The __alloc_pages_slowpath() it calls was not examined in depth there, and it is inside this function that the out-of-memory protection mechanism lives.

Let us first look at the implementation of __alloc_pages_slowpath():

  1. 【file: mm/page_alloc.c】
  2. static inline struct page *
  3. __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  4.     struct zonelist *zonelist, enum zone_type high_zoneidx,
  5.     nodemask_t *nodemask, struct zone *preferred_zone,
  6.     int migratetype)
  7. {
  8.     const gfp_t wait = gfp_mask & __GFP_WAIT;
  9.     struct page *page = NULL;
  10.     int alloc_flags;
  11.     unsigned long pages_reclaimed = 0;
  12.     unsigned long did_some_progress;
  13.     bool sync_migration = false;
  14.     bool deferred_compaction = false;
  15.     bool contended_compaction = false;
  16.  
  17.     /*
  18.      * In the slowpath, we sanity check order to avoid ever trying to
  19.      * reclaim >= MAX_ORDER areas which will never succeed. Callers may
  20.      * be using allocators in order of preference for an area that is
  21.      * too large.
  22.      */
  23.     if (order >= MAX_ORDER) {
  24.         WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
  25.         return NULL;
  26.     }
  27.  
  28.     /*
  29.      * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
  30.      * __GFP_NOWARN set) should not cause reclaim since the subsystem
  31.      * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
  32.      * using a larger set of nodes after it has established that the
  33.      * allowed per node queues are empty and that nodes are
  34.      * over allocated.
  35.      */
  36.     if (IS_ENABLED(CONFIG_NUMA) &&
  37.         (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
  38.         goto nopage;
  39.  
  40. restart:
  41.     if (!(gfp_mask & __GFP_NO_KSWAPD))
  42.         wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
  43.  
  44.     /*
  45.      * OK, we're below the kswapd watermark and have kicked background
  46.      * reclaim. Now things get more complex, so set up alloc_flags according
  47.      * to how we want to proceed.
  48.      */
  49.     alloc_flags = gfp_to_alloc_flags(gfp_mask);
  50.  
  51.     /*
  52.      * Find the true preferred zone if the allocation is unconstrained by
  53.      * cpusets.
  54.      */
  55.     if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
  56.         first_zones_zonelist(zonelist, high_zoneidx, NULL,
  57.                     &preferred_zone);
  58.  
  59. rebalance:
  60.     /* This is the last chance, in general, before the goto nopage. */
  61.     page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
  62.             high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
  63.             preferred_zone, migratetype);
  64.     if (page)
  65.         goto got_pg;
  66.  
  67.     /* Allocate without watermarks if the context allows */
  68.     if (alloc_flags & ALLOC_NO_WATERMARKS) {
  69.         /*
  70.          * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
  71.          * the allocation is high priority and these type of
  72.          * allocations are system rather than user orientated
  73.          */
  74.         zonelist = node_zonelist(numa_node_id(), gfp_mask);
  75.  
  76.         page = __alloc_pages_high_priority(gfp_mask, order,
  77.                 zonelist, high_zoneidx, nodemask,
  78.                 preferred_zone, migratetype);
  79.         if (page) {
  80.             goto got_pg;
  81.         }
  82.     }
  83.  
  84.     /* Atomic allocations - we can't balance anything */
  85.     if (!wait) {
  86.         /*
  87.          * All existing users of the deprecated __GFP_NOFAIL are
  88.          * blockable, so warn of any new users that actually allow this
  89.          * type of allocation to fail.
  90.          */
  91.         WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
  92.         goto nopage;
  93.     }
  94.  
  95.     /* Avoid recursion of direct reclaim */
  96.     if (current->flags & PF_MEMALLOC)
  97.         goto nopage;
  98.  
  99.     /* Avoid allocations with no watermarks from looping endlessly */
  100.     if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  101.         goto nopage;
  102.  
  103.     /*
  104.      * Try direct compaction. The first pass is asynchronous. Subsequent
  105.      * attempts after direct reclaim are synchronous
  106.      */
  107.     page = __alloc_pages_direct_compact(gfp_mask, order,
  108.                     zonelist, high_zoneidx,
  109.                     nodemask,
  110.                     alloc_flags, preferred_zone,
  111.                     migratetype, sync_migration,
  112.                     &contended_compaction,
  113.                     &deferred_compaction,
  114.                     &did_some_progress);
  115.     if (page)
  116.         goto got_pg;
  117.     sync_migration = true;
  118.  
  119.     /*
  120.      * If compaction is deferred for high-order allocations, it is because
  121.      * sync compaction recently failed. In this is the case and the caller
  122.      * requested a movable allocation that does not heavily disrupt the
  123.      * system then fail the allocation instead of entering direct reclaim.
  124.      */
  125.     if ((deferred_compaction || contended_compaction) &&
  126.                         (gfp_mask & __GFP_NO_KSWAPD))
  127.         goto nopage;
  128.  
  129.     /* Try direct reclaim and then allocating */
  130.     page = __alloc_pages_direct_reclaim(gfp_mask, order,
  131.                     zonelist, high_zoneidx,
  132.                     nodemask,
  133.                     alloc_flags, preferred_zone,
  134.                     migratetype, &did_some_progress);
  135.     if (page)
  136.         goto got_pg;
  137.  
  138.     /*
  139.      * If we failed to make any progress reclaiming, then we are
  140.      * running out of options and have to consider going OOM
  141.      */
  142.     if (!did_some_progress) {
  143.         if (oom_gfp_allowed(gfp_mask)) {
  144.             if (oom_killer_disabled)
  145.                 goto nopage;
  146.             /* Coredumps can quickly deplete all memory reserves */
  147.             if ((current->flags & PF_DUMPCORE) &&
  148.                 !(gfp_mask & __GFP_NOFAIL))
  149.                 goto nopage;
  150.             page = __alloc_pages_may_oom(gfp_mask, order,
  151.                     zonelist, high_zoneidx,
  152.                     nodemask, preferred_zone,
  153.                     migratetype);
  154.             if (page)
  155.                 goto got_pg;
  156.  
  157.             if (!(gfp_mask & __GFP_NOFAIL)) {
  158.                 /*
  159.                  * The oom killer is not called for high-order
  160.                  * allocations that may fail, so if no progress
  161.                  * is being made, there are no other options and
  162.                  * retrying is unlikely to help.
  163.                  */
  164.                 if (order > PAGE_ALLOC_COSTLY_ORDER)
  165.                     goto nopage;
  166.                 /*
  167.                  * The oom killer is not called for lowmem
  168.                  * allocations to prevent needlessly killing
  169.                  * innocent tasks.
  170.                  */
  171.                 if (high_zoneidx < ZONE_NORMAL)
  172.                     goto nopage;
  173.             }
  174.  
  175.             goto restart;
  176.         }
  177.     }
  178.  
  179.     /* Check if we should retry the allocation */
  180.     pages_reclaimed += did_some_progress;
  181.     if (should_alloc_retry(gfp_mask, order, did_some_progress,
  182.                         pages_reclaimed)) {
  183.         /* Wait for some write requests to complete then retry */
  184.         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
  185.         goto rebalance;
  186.     } else {
  187.         /*
  188.          * High-order allocations do not necessarily loop after
  189.          * direct reclaim and reclaim/compaction depends on compaction
  190.          * being called after reclaim so call directly if necessary
  191.          */
  192.         page = __alloc_pages_direct_compact(gfp_mask, order,
  193.                     zonelist, high_zoneidx,
  194.                     nodemask,
  195.                     alloc_flags, preferred_zone,
  196.                     migratetype, sync_migration,
  197.                     &contended_compaction,
  198.                     &deferred_compaction,
  199.                     &did_some_progress);
  200.         if (page)
  201.             goto got_pg;
  202.     }
  203.  
  204. nopage:
  205.     warn_alloc_failed(gfp_mask, order, NULL);
  206.     return page;
  207. got_pg:
  208.     if (kmemcheck_enabled)
  209.         kmemcheck_pagealloc_alloc(page, order, gfp_mask);
  210.  
  211.     return page;
  212. }

The function first checks whether the caller forbids waking the kswapd threads; if not, it wakes them so that background reclaim can begin. It then adjusts the allocation flags via gfp_to_alloc_flags() and calls get_page_from_freelist() once more to retry the allocation; if a page is obtained, the function returns it. Otherwise the attempt continues: it checks whether the ALLOC_NO_WATERMARKS flag is set, and if so the watermarks are ignored and __alloc_pages_high_priority() is called to perform the allocation.

The implementation of __alloc_pages_high_priority():

  1. 【file: mm/page_alloc.c】
  2. /*
  3.  * This is called in the allocator slow-path if the allocation request is of
  4.  * sufficient urgency to ignore watermarks and take other desperate measures
  5.  */
  6. static inline struct page *
  7. __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  8.     struct zonelist *zonelist, enum zone_type high_zoneidx,
  9.     nodemask_t *nodemask, struct zone *preferred_zone,
  10.     int migratetype)
  11. {
  12.     struct page *page;
  13.  
  14.     do {
  15.         page = get_page_from_freelist(gfp_mask, nodemask, order,
  16.             zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
  17.             preferred_zone, migratetype);
  18.  
  19.         if (!page && gfp_mask & __GFP_NOFAIL)
  20.             wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
  21.     } while (!page && (gfp_mask & __GFP_NOFAIL));
  22.  
  23.     return page;
  24. }

As can be seen, when the __GFP_NOFAIL flag is set this function keeps calling get_page_from_freelist() in a loop until memory is obtained.

Back in __alloc_pages_slowpath(): after __alloc_pages_high_priority() returns, the function checks whether the __GFP_WAIT flag is set. If it is, the allocation is allowed to sleep; if not, the function exits immediately, reporting an allocation failure. It then calls __alloc_pages_direct_compact() and __alloc_pages_direct_reclaim() to try to compact and reclaim memory and allocate again. If all of the attempts above still fail, __alloc_pages_may_oom() is called to trigger the OOM killer. After the OOM killer has killed a process, the allocation is retried once more, and the function finishes with the clean-up handling for either a failed or a successful allocation.
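
To watch this path fire in practice, here is a self-contained userspace sketch (an illustration of the mechanism, not part of the kernel code above; the chunk size is arbitrary). It keeps allocating and touching anonymous memory until RAM and swap are exhausted, at which point the attempts described above all fail, __alloc_pages_may_oom() is reached, and the OOM killer terminates the program. Run it only inside a throwaway VM or a memory-limited cgroup, and watch dmesg for the OOM killer's report.

    /* oom_demo.c - deliberately exhaust memory to provoke the OOM killer.
     * Illustration only: run it in a disposable VM or a memory-limited cgroup,
     * never on a production machine.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define CHUNK (64UL * 1024 * 1024)   /* allocate 64 MiB at a time */

    int main(void)
    {
        unsigned long total = 0;

        for (;;) {
            char *p = malloc(CHUNK);
            if (!p) {                     /* overcommit usually prevents this */
                perror("malloc");
                break;
            }
            memset(p, 0xaa, CHUNK);       /* touch the pages so they are really backed */
            total += CHUNK;
            printf("allocated %lu MiB\n", total >> 20);
        }
        return 0;
    }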

We will leave __alloc_pages_slowpath() at that for now and turn to the key function of this article, __alloc_pages_may_oom(), for a closer look.

  1. 【file: mm/page_alloc.c】
  2. static inline struct page *
  3. __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  4.     struct zonelist *zonelist, enum zone_type high_zoneidx,
  5.     nodemask_t *nodemask, struct zone *preferred_zone,
  6.     int migratetype)
  7. {
  8.     struct page *page;
  9.  
  10.     /* Acquire the OOM killer lock for the zones in zonelist */
  11.     if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
  12.         schedule_timeout_uninterruptible(1);
  13.         return NULL;
  14.     }
  15.  
  16.     /*
  17.      * Go through the zonelist yet one more time, keep very high watermark
  18.      * here, this is only to catch a parallel oom killing, we must fail if
  19.      * we're still under heavy pressure.
  20.      */
  21.     page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
  22.         order, zonelist, high_zoneidx,
  23.         ALLOC_WMARK_HIGH|ALLOC_CPUSET,
  24.         preferred_zone, migratetype);
  25.     if (page)
  26.         goto out;
  27.  
  28.     if (!(gfp_mask & __GFP_NOFAIL)) {
  29.         /* The OOM killer will not help higher order allocs */
  30.         if (order > PAGE_ALLOC_COSTLY_ORDER)
  31.             goto out;
  32.         /* The OOM killer does not needlessly kill tasks for lowmem */
  33.         if (high_zoneidx < ZONE_NORMAL)
  34.             goto out;
  35.         /*
  36.          * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
  37.          * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
  38.          * The caller should handle page allocation failure by itself if
  39.          * it specifies __GFP_THISNODE.
  40.          * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
  41.          */
  42.         if (gfp_mask & __GFP_THISNODE)
  43.             goto out;
  44.     }
  45.     /* Exhausted what can be done so it's blamo time */
  46.     out_of_memory(zonelist, gfp_mask, order, nodemask, false);
  47.  
  48. out:
  49.     clear_zonelist_oom(zonelist, gfp_mask);
  50.     return page;
  51. }

The function first uses try_set_zonelist_oom() to check whether the OOM killer is already running on another CPU; if it is not, try_set_zonelist_oom() takes the per-zonelist OOM lock internally, ensuring that only one CPU carries out the killing. It then calls get_page_from_freelist() with the high watermark to try the allocation one more time; this is only meant to catch a parallel OOM kill, and under heavy memory pressure it is expected to fail. Next comes the call to the key function out_of_memory(). Finally, before returning, clear_zonelist_oom() is called to release the lock taken in try_set_zonelist_oom().

Let us now analyze out_of_memory() in detail:

  1. 【file: mm/oom_kill.c】
  2. /**
  3.  * out_of_memory - kill the "best" process when we run out of memory
  4.  * @zonelist: zonelist pointer
  5.  * @gfp_mask: memory allocation flags
  6.  * @order: amount of memory being requested as a power of 2
  7.  * @nodemask: nodemask passed to page allocator
  8.  * @force_kill: true if a task must be killed, even if others are exiting
  9.  *
  10.  * If we run out of memory, we have the choice between either
  11.  * killing a random task (bad), letting the system crash (worse)
  12.  * OR try to be smart about which process to kill. Note that we
  13.  * don't have to be perfect here, we just have to be good.
  14.  */
  15. void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  16.         int order, nodemask_t *nodemask, bool force_kill)
  17. {
  18.     const nodemask_t *mpol_mask;
  19.     struct task_struct *p;
  20.     unsigned long totalpages;
  21.     unsigned long freed = 0;
  22.     unsigned int uninitialized_var(points);
  23.     enum oom_constraint constraint = CONSTRAINT_NONE;
  24.     int killed = 0;
  25.  
  26.     blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
  27.     if (freed > 0)
  28.         /* Got some memory back in the last second. */
  29.         return;
  30.  
  31.     /*
  32.      * If current has a pending SIGKILL or is exiting, then automatically
  33.      * select it. The goal is to allow it to allocate so that it may
  34.      * quickly exit and free its memory.
  35.      */
  36.     if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
  37.         set_thread_flag(TIF_MEMDIE);
  38.         return;
  39.     }
  40.  
  41.     /*
  42.      * Check if there were limitations on the allocation (only relevant for
  43.      * NUMA) that may require different handling.
  44.      */
  45.     constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
  46.                         &totalpages);
  47.     mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
  48.     check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
  49.  
  50.     if (sysctl_oom_kill_allocating_task && current->mm &&
  51.         !oom_unkillable_task(current, NULL, nodemask) &&
  52.         current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
  53.         get_task_struct(current);
  54.         oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
  55.                  nodemask,
  56.                  "Out of memory (oom_kill_allocating_task)");
  57.         goto out;
  58.     }
  59.  
  60.     p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
  61.     /* Found nothing?!?! Either we hang forever, or we panic. */
  62.     if (!p) {
  63.         dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
  64.         panic("Out of memory and no killable processes...\n");
  65.     }
  66.     if (p != (void *)-1UL) {
  67.         oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
  68.                  nodemask, "Out of memory");
  69.         killed = 1;
  70.     }
  71. out:
  72.     /*
  73.      * Give the killed threads a good chance of exiting before trying to
  74.      * allocate memory again.
  75.      */
  76.     if (killed)
  77.         schedule_timeout_killable(1);
  78. }

The function first calls blocking_notifier_call_chain() to run the callbacks on the OOM notifier chain; if they managed to free some memory, it simply returns. The following check, if (fatal_signal_pending(current) || current->flags & PF_EXITING), tests whether the current task already has a SIGKILL pending or is in the middle of exiting; if so, it is marked with TIF_MEMDIE and the function returns. Next, constrained_alloc() examines any constraints on the allocation (only relevant on NUMA) and check_panic_on_oom() decides whether the kernel should panic instead of killing. Then, if sysctl_oom_kill_allocating_task is set and the current task passes the eligibility checks, the task that is doing the allocation is killed directly. Otherwise, select_bad_process() picks the most suitable victim and oom_kill_process() is called to kill it.

Finally, let us analyze select_bad_process() and oom_kill_process(), starting with the implementation of select_bad_process():

  1. 【file: mm/oom_kill.c】
  2. /*
  3.  * Simple selection loop. We chose the process with the highest
  4.  * number of 'points'. Returns -1 on scan abort.
  5.  *
  6.  * (not docbooked, we don't want this one cluttering up the manual)
  7.  */
  8. static struct task_struct *select_bad_process(unsigned int *ppoints,
  9.         unsigned long totalpages, const nodemask_t *nodemask,
  10.         bool force_kill)
  11. {
  12.     struct task_struct *g, *p;
  13.     struct task_struct *chosen = NULL;
  14.     unsigned long chosen_points = 0;
  15.  
  16.     rcu_read_lock();
  17.     for_each_process_thread(g, p) {
  18.         unsigned int points;
  19.  
  20.         switch (oom_scan_process_thread(p, totalpages, nodemask,
  21.                         force_kill)) {
  22.         case OOM_SCAN_SELECT:
  23.             chosen = p;
  24.             chosen_points = ULONG_MAX;
  25.             /* fall through */
  26.         case OOM_SCAN_CONTINUE:
  27.             continue;
  28.         case OOM_SCAN_ABORT:
  29.             rcu_read_unlock();
  30.             return (struct task_struct *)(-1UL);
  31.         case OOM_SCAN_OK:
  32.             break;
  33.         };
  34.         points = oom_badness(p, NULL, nodemask, totalpages);
  35.         if (!points || points < chosen_points)
  36.             continue;
  37.         /* Prefer thread group leaders for display purposes */
  38.         if (points == chosen_points && thread_group_leader(chosen))
  39.             continue;
  40.  
  41.         chosen = p;
  42.         chosen_points = points;
  43.     }
  44.     if (chosen)
  45.         get_task_struct(chosen);
  46.     rcu_read_unlock();
  47.  
  48.     *ppoints = chosen_points * 1000 / totalpages;
  49.     return chosen;
  50. }

This function walks every thread of every process with the for_each_process_thread() macro and uses oom_scan_process_thread() to classify each one, handling the special cases in the switch statement: for example, the scan is aborted if some task is already on its way out, and a task that hogs memory and has been flagged to be killed first is selected outright. In the normal case, oom_badness() computes a score for the task, and the task_struct with the highest score is returned.

While we are at it, let us also examine the implementation of oom_badness():

  1. 【file: mm/oom_kill.c】
  2. /**
  3.  * oom_badness - heuristic function to determine which candidate task to kill
  4.  * @p: task struct of which task we should calculate
  5.  * @totalpages: total present RAM allowed for page allocation
  6.  *
  7.  * The heuristic for determining which task to kill is made to be as simple and
  8.  * predictable as possible. The goal is to return the highest value for the
  9.  * task consuming the most memory to avoid subsequent oom failures.
  10.  */
  11. unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
  12.               const nodemask_t *nodemask, unsigned long totalpages)
  13. {
  14.     long points;
  15.     long adj;
  16.  
  17.     if (oom_unkillable_task(p, memcg, nodemask))
  18.         return 0;
  19.  
  20.     p = find_lock_task_mm(p);
  21.     if (!p)
  22.         return 0;
  23.  
  24.     adj = (long)p->signal->oom_score_adj;
  25.     if (adj == OOM_SCORE_ADJ_MIN) {
  26.         task_unlock(p);
  27.         return 0;
  28.     }
  29.  
  30.     /*
  31.      * The baseline for the badness score is the proportion of RAM that each
  32.      * task's rss, pagetable and swap space use.
  33.      */
  34.     points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
  35.          get_mm_counter(p->mm, MM_SWAPENTS);
  36.     task_unlock(p);
  37.  
  38.     /*
  39.      * Root processes get 3% bonus, just like the __vm_enough_memory()
  40.      * implementation used by LSMs.
  41.      */
  42.     if (has_capability_noaudit(p, CAP_SYS_ADMIN))
  43.         points -= (points * 3) / 100;
  44.  
  45.     /* Normalize to oom_score_adj units */
  46.     adj *= totalpages / 1000;
  47.     points += adj;
  48.  
  49.     /*
  50.      * Never return 0 for an eligible task regardless of the root bonus and
  51.      * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
  52.      */
  53.     return points > 0 ? points : 1;
  54. }

This scoring function first excludes tasks that may not be OOM-killed as well as tasks whose oom_score_adj equals OOM_SCORE_ADJ_MIN (i.e. -1000); oom_score_adj ranges from -1000 to 1000. It then computes, as the baseline score, the task's RSS, page-table and swap usage as a share of available RAM; if the task has root privileges, 3% of the score is deducted as a bonus. Finally oom_score_adj is normalized into the same units and added to the score; any result that is not positive is returned as 1, otherwise the value itself is returned. In short, the lower a task's score, the less likely it is to be killed, and the score can be tuned by changing its oom_score_adj.
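
To make the arithmetic concrete, here is a small userspace sketch that mirrors the scoring formula above (the task figures and totalpages are invented for illustration; real scores are computed by the kernel, not by this program). It shows how a task's memory footprint sets the baseline and how oom_score_adj shifts the result by a fraction of totalpages.

    /* badness_demo.c - userspace mirror of the oom_badness() arithmetic above. */
    #include <stdio.h>

    static long badness(long rss, long nr_ptes, long swapents,
                        long oom_score_adj, long totalpages, int is_root)
    {
        long points = rss + nr_ptes + swapents;         /* baseline: footprint in pages */

        if (is_root)
            points -= (points * 3) / 100;               /* 3% bonus for CAP_SYS_ADMIN tasks */

        points += oom_score_adj * (totalpages / 1000);  /* normalize adj to page units */

        return points > 0 ? points : 1;                 /* eligible tasks never score 0 */
    }

    int main(void)
    {
        long totalpages = 1048576;                      /* pretend 4 GiB of 4 KiB pages */

        /* 1 GiB of RSS, neutral oom_score_adj, unprivileged task */
        printf("score = %ld\n", badness(262144, 600, 0, 0, totalpages, 0));

        /* same task with oom_score_adj raised to 500: far more likely to be picked */
        printf("score = %ld\n", badness(262144, 600, 0, 500, totalpages, 0));

        return 0;
    }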

Finally, having found the most "bad" process, let us look at the "treatment" it receives in oom_kill_process():

  1. 【file: mm/oom_kill.c】
  2. /*
  3.  * Must be called while holding a reference to p, which will be released upon
  4.  * returning.
  5.  */
  6. void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
  7.               unsigned int points, unsigned long totalpages,
  8.               struct mem_cgroup *memcg, nodemask_t *nodemask,
  9.               const char *message)
  10. {
  11.     struct task_struct *victim = p;
  12.     struct task_struct *child;
  13.     struct task_struct *t;
  14.     struct mm_struct *mm;
  15.     unsigned int victim_points = 0;
  16.     static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
  17.                           DEFAULT_RATELIMIT_BURST);
  18.  
  19.     /*
  20.      * If the task is already exiting, don't alarm the sysadmin or kill
  21.      * its children or threads, just set TIF_MEMDIE so it can die quickly
  22.      */
  23.     if (p->flags & PF_EXITING) {
  24.         set_tsk_thread_flag(p, TIF_MEMDIE);
  25.         put_task_struct(p);
  26.         return;
  27.     }
  28.  
  29.     if (__ratelimit(&oom_rs))
  30.         dump_header(p, gfp_mask, order, memcg, nodemask);
  31.  
  32.     task_lock(p);
  33.     pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
  34.         message, task_pid_nr(p), p->comm, points);
  35.     task_unlock(p);
  36.  
  37.     /*
  38.      * If any of p's children has a different mm and is eligible for kill,
  39.      * the one with the highest oom_badness() score is sacrificed for its
  40.      * parent. This attempts to lose the minimal amount of work done while
  41.      * still freeing memory.
  42.      */
  43.     read_lock(&tasklist_lock);
  44.     for_each_thread(p, t) {
  45.         list_for_each_entry(child, &t->children, sibling) {
  46.             unsigned int child_points;
  47.  
  48.             if (child->mm == p->mm)
  49.                 continue;
  50.             /*
  51.              * oom_badness() returns 0 if the thread is unkillable
  52.              */
  53.             child_points = oom_badness(child, memcg, nodemask,
  54.                                 totalpages);
  55.             if (child_points > victim_points) {
  56.                 put_task_struct(victim);
  57.                 victim = child;
  58.                 victim_points = child_points;
  59.                 get_task_struct(victim);
  60.             }
  61.         }
  62.     }
  63.     read_unlock(&tasklist_lock);
  64.  
  65.     p = find_lock_task_mm(victim);
  66.     if (!p) {
  67.         put_task_struct(victim);
  68.         return;
  69.     } else if (victim != p) {
  70.         get_task_struct(p);
  71.         put_task_struct(victim);
  72.         victim = p;
  73.     }
  74.  
  75.     /* mm cannot safely be dereferenced after task_unlock(victim) */
  76.     mm = victim->mm;
  77.     pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
  78.         task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
  79.         K(get_mm_counter(victim->mm, MM_ANONPAGES)),
  80.         K(get_mm_counter(victim->mm, MM_FILEPAGES)));
  81.     task_unlock(victim);
  82.  
  83.     /*
  84.      * Kill all user processes sharing victim->mm in other thread groups, if
  85.      * any. They don't get access to memory reserves, though, to avoid
  86.      * depletion of all memory. This prevents mm->mmap_sem livelock when an
  87.      * oom killed thread cannot exit because it requires the semaphore and
  88.      * its contended by another thread trying to allocate memory itself.
  89.      * That thread will now get access to memory reserves since it has a
  90.      * pending fatal signal.
  91.      */
  92.     rcu_read_lock();
  93.     for_each_process(p)
  94.         if (p->mm == mm && !same_thread_group(p, victim) &&
  95.             !(p->flags & PF_KTHREAD)) {
  96.             if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
  97.                 continue;
  98.  
  99.             task_lock(p); /* Protect ->comm from prctl() */
  100.             pr_err("Kill process %d (%s) sharing same memory\n",
  101.                 task_pid_nr(p), p->comm);
  102.             task_unlock(p);
  103.             do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
  104.         }
  105.     rcu_read_unlock();
  106.  
  107.     set_tsk_thread_flag(victim, TIF_MEMDIE);
  108.     do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
  109.     put_task_struct(victim);
  110. }

This function first checks the state of the process to be killed: if it is already exiting, TIF_MEMDIE is simply set on it and no kill is performed. It then walks the victim's children with list_for_each_entry(); if a child owns a different mm and is eligible to be killed, the child with the highest oom_badness() score is sacrificed in place of its parent, which loses less completed work than killing the parent and avoids the overhead of having the parent's children taken over. Next, find_lock_task_mm() locates a thread of the victim that still holds a valid mm; if none is found the function returns, and if the thread found is not the one passed in, victim is updated to point at it. After that, for_each_process() is used to find the other processes sharing the victim's mm, and they are killed along with it. Before the kill, the victim is marked with TIF_MEMDIE; the kill itself is performed by sending SIGKILL to the process, which handles the signal when it returns from kernel mode to user mode.

This concludes the analysis of the OOM kill handling.
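
As a closing practical aside (userspace code, not part of the kernel sources analyzed above): the oom_score_adj value consumed by oom_badness() is exposed through procfs, and the badness the kernel currently assigns to a task can be read from /proc/<pid>/oom_score. The sketch below raises the calling process's own adjustment, making it a preferred victim, and prints its resulting score; writing a negative value instead would protect the process, but lowering the value typically requires CAP_SYS_RESOURCE.

    /* oom_adj_demo.c - tune and inspect this process's OOM score via procfs. */
    #include <stdio.h>

    int main(void)
    {
        char buf[64];
        FILE *f;

        /* make this process a preferred OOM victim */
        f = fopen("/proc/self/oom_score_adj", "w");
        if (!f) { perror("oom_score_adj"); return 1; }
        fprintf(f, "300\n");
        fclose(f);

        /* oom_score shows the badness the kernel currently computes for us */
        f = fopen("/proc/self/oom_score", "r");
        if (!f) { perror("oom_score"); return 1; }
        if (fgets(buf, sizeof(buf), f))
            printf("current oom_score: %s", buf);
        fclose(f);

        return 0;
    }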
