Chinaunix首页 | 论坛 | 博客
  • 博客访问: 491913
  • 博文数量: 115
  • 博客积分: 5016
  • 博客等级: 大校
  • 技术积分: 1401
  • 用 户 组: 普通用户
  • 注册时间: 2008-09-21 16:03
文章分类

全部博文(115)

文章存档

2013年(1)

2010年(17)

2009年(76)

2008年(21)

我的朋友

分类: LINUX

2009-04-25 12:38:41

/*
 * This is the 'heart' of the zoned buddy allocator.
 *
 * Tries to obtain a page block of the requested order from the zones in
 * @zonelist, escalating step by step when an attempt fails:
 *
 *   1. allocate against the pages_low watermark;
 *   2. wake kswapd on every zone, then allocate against pages_min
 *      (optionally with ALLOC_HARDER / ALLOC_HIGH);
 *   3. for PF_MEMALLOC / TIF_MEMDIE tasks outside interrupt context,
 *      allocate ignoring watermarks altogether;
 *   4. if the caller may wait, run synchronous reclaim via
 *      try_to_free_pages() and allocate again;
 *   5. if reclaim made no progress, call out_of_memory() and restart;
 *   6. otherwise wait on write congestion and retry, depending on
 *      @order and the __GFP_NORETRY/__GFP_REPEAT/__GFP_NOFAIL bits.
 *
 * @gfp_mask: allocation flags; the bits tested here are __GFP_WAIT,
 *            __GFP_HIGH, __GFP_NOMEMALLOC, __GFP_NOFAIL, __GFP_FS,
 *            __GFP_NORETRY, __GFP_REPEAT, __GFP_NOWARN and the
 *            GFP_THISNODE combination.
 * @order:    allocation order, passed through to get_page_from_freelist().
 * @zonelist: zones to try; zonelist->zones is walked as a NULL-terminated
 *            array of zone pointers.
 *
 * Returns the page obtained from get_page_from_freelist(), or NULL when
 * the fault-injection check fires or the zonelist is empty.  On the
 * nopage path the result of the last failed attempt is returned.
 */
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
              struct zonelist *zonelist)
{
       const gfp_t wait = gfp_mask & __GFP_WAIT;
       struct zone **z;
       struct page *page;
       struct reclaim_state reclaim_state;
       struct task_struct *p = current;
       int do_retry;
       int alloc_flags;
       int did_some_progress;

       /*
        * Only relevant when __GFP_WAIT is set, i.e. the caller allows
        * this allocation to block while waiting for free pages.
        * NOTE(review): per the original annotation this expands through
        * might_sleep_if -> might_sleep -> might_resched -> cond_resched;
        * confirm against the headers of the kernel version in use.
        */
       might_sleep_if(wait);

       /*
        * Bail out before doing any real work when should_fail_alloc_page()
        * says this allocation is to fail.
        */
       if (should_fail_alloc_page(gfp_mask, order))
              return NULL;

restart:
       z = zonelist->zones; /* the list of zones suitable for gfp_mask */

       if (unlikely(*z == NULL)) {
              /* Should this ever happen?? */
              return NULL;
       }

       /*
        * get_page_from_freelist() allocates against the watermark selected
        * by the alloc flags given as its last argument.  Each zone keeps
        * three watermarks (pages_min, pages_low, pages_high).  The flags,
        * as defined alongside this function, are:
        *
        *   ALLOC_NO_WATERMARKS  0x01  don't check watermarks at all
        *   ALLOC_WMARK_MIN      0x02  use pages_min watermark
        *   ALLOC_WMARK_LOW      0x04  use pages_low watermark
        *   ALLOC_WMARK_HIGH     0x08  use pages_high watermark
        *   ALLOC_HARDER         0x10  try to alloc harder
        *   ALLOC_HIGH           0x20  __GFP_HIGH set
        *   ALLOC_CPUSET         0x40  check for correct cpuset
        *
        * First attempt: allocate against the pages_low watermark and
        * leave immediately on success.
        */
       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
                            zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
       if (page)
              goto got_pg;

       /*
        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
        * __GFP_NOWARN set) should not cause reclaim since the subsystem
        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
        * using a larger set of nodes after it has established that the
        * allowed per node queues are empty and that nodes are
        * over allocated.
        */
       /* The pages_low attempt failed and no retry is allowed: give up. */
       if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
              goto nopage;

       /* Kick kswapd for every zone in the zonelist (background reclaim). */
       for (z = zonelist->zones; *z; z++)
              wakeup_kswapd(*z, order);

       /*
        * OK, we're below the kswapd watermark and have kicked background
        * reclaim. Now things get more complex, so set up alloc_flags according
        * to how we want to proceed.
        *
        * The caller may dip into page reserves a bit more if the caller
        * cannot run direct reclaim, or if the caller has realtime scheduling
        * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
        */
       alloc_flags = ALLOC_WMARK_MIN;
       if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
              alloc_flags |= ALLOC_HARDER;
       if (gfp_mask & __GFP_HIGH)
              alloc_flags |= ALLOC_HIGH;
       if (wait)
              alloc_flags |= ALLOC_CPUSET;

       /*
        * Go through the zonelist again. Let __GFP_HIGH and allocations
        * coming from realtime tasks go deeper into reserves.
        *
        * This is the last chance, in general, before the goto nopage.
        * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
        * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
        */
       /* Second attempt: relax the requirement to the pages_min watermark. */
       page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
       if (page)
              goto got_pg;

       /* This allocation should allow future memory freeing. */

rebalance:
       if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                     && !in_interrupt()) {
              if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
                     /*
                      * The caller did not forbid touching the emergency
                      * reserves (__GFP_NOMEMALLOC clear), so try once more
                      * without any watermark check.
                      */
                     /* go through the zonelist yet again, ignoring mins */
                     page = get_page_from_freelist(gfp_mask, order,
                            zonelist, ALLOC_NO_WATERMARKS);
                     if (page)
                            goto got_pg;
                     /* __GFP_NOFAIL: keep retrying until it succeeds. */
                     if (gfp_mask & __GFP_NOFAIL) {
                            congestion_wait(WRITE, HZ/50);
                            goto nofail_alloc;
                     }
              }
              goto nopage;
       }

       /* Atomic allocations - we can't balance anything */
       if (!wait)
              goto nopage;

       /* Reschedule point before entering synchronous reclaim. */
       cond_resched();

       /* We now go into synchronous reclaim */
       cpuset_memory_pressure_bump();
       /*
        * PF_MEMALLOC is set only for the duration of try_to_free_pages()
        * and cleared again right after; reclaim_state lives on this stack
        * frame, so p->reclaim_state is reset to NULL before continuing.
        */
       p->flags |= PF_MEMALLOC;
       reclaim_state.reclaimed_slab = 0;
       p->reclaim_state = &reclaim_state;

       did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

       p->reclaim_state = NULL;
       p->flags &= ~PF_MEMALLOC;

       cond_resched();

       /* Reclaim reported progress: try the allocation again. */
       if (likely(did_some_progress)) {
              page = get_page_from_freelist(gfp_mask, order,
                                          zonelist, alloc_flags);
              if (page)
                     goto got_pg;
       /*
        * Reclaim made no progress and retrying is allowed: allocate against
        * the pages_high watermark.  Per the original annotation, this is
        * expected to fail unless another CPU has just killed a task and
        * freed memory; using pages_high thereby avoids two CPUs each
        * picking a victim needlessly.  On failure out_of_memory() is called
        * and the whole sequence restarts.
        */
       } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
              /*
               * Go through the zonelist yet one more time, keep
               * very high watermark here, this is only to catch
               * a parallel oom killing, we must fail if we're still
               * under heavy pressure.
               */
              page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
                            zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
              if (page)
                     goto got_pg;
              /* Invoke the OOM handling, then start over from the top. */
              out_of_memory(zonelist, gfp_mask, order);
              goto restart;
       }

       /*
        * Don't let big-order allocations loop unless the caller explicitly
        * requests that. Wait for some write requests to complete then retry.
        *
        * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
        * <= 3, but that may not be true in other implementations.
        */
       do_retry = 0;
       if (!(gfp_mask & __GFP_NORETRY)) {
              if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
                     do_retry = 1;
              if (gfp_mask & __GFP_NOFAIL)
                     do_retry = 1;
       }
       if (do_retry) {
              congestion_wait(WRITE, HZ/50);
              goto rebalance;
       }

nopage:
       /* Rate-limited failure report, suppressed by __GFP_NOWARN. */
       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
              printk(KERN_WARNING "%s: page allocation failure."
                     " order:%d, mode:0x%x\n",
                     p->comm, order, gfp_mask);
              dump_stack();
              show_mem();
       }
got_pg:
       return page;
}
阅读(1214) | 评论(0) | 转发(0) |
0

上一篇:kmalloc详解

下一篇:关于进程的休眠和唤醒

给主人留下些什么吧!~~