Chinaunix首页 | 论坛 | 博客
  • 博客访问: 616375
  • 博文数量: 113
  • 博客积分: 2554
  • 博客等级: 少校
  • 技术积分: 1428
  • 用 户 组: 普通用户
  • 注册时间: 2011-12-21 19:53
文章分类

全部博文(113)

文章存档

2014年(1)

2013年(2)

2012年(94)

2011年(16)

分类: LINUX

2011-12-21 19:53:40

 

  1. __alloc_pages处于内核内存管理的最底层,无论slab,vmalloc,kmalloc,mmap,brk
  2. 还是page cache,buffer都要通过__alloc_pages获取最基本的物理内存pages.
  3.   linux执行这样一种内存管理策略:
  4.   a)充分利用物理内存,建立各种cache,优化程序性能,减少磁盘操作.这一点和
  5. windows系统不同,windows系统中总是有很多内存空闲,即便是进行了大量的磁盘操作后.
  6. 而linux中真正空闲的物理内存几乎就看不到.

  7.   b)保证有足够的潜在物理内存(页面),可以立即加以回收,或称潜在可分配页面.
  8. 通过内核的守护进程kswapd,bdflush,kreclaimd的定期处理,加上每次内存分配对系统
  9. 的调整,即通过__alloc_pages所遇到的各种内存分配压力,不断的调整守护进程的工
  10. 作方向,保证系统拥有足够的潜在可回收内存.
  11.   
  12.   先看看对内存页面有些什么样的保有量要求:
  13.   1)可分配页面的保有量要求:inactive_clean+free pages(in buddy pages)
  14.   
  15.   系统的期望值是freepages.high + inactive_target / 3,inactive_target就是
  16. min((memory_pressure >> INACTIVE_SHIFT),num_physpages / 4).可见期望的保
  17. 有量有动态的因素在内.
  18.   现在的保有量是nr_free_pages() + nr_inactive_clean_pages();
  19.   mm/vmscan.c中的函数free_shortage,计算期望的可分配页面和现实之差距.如果
  20. 保有量合格,还要再看各zone中的inbuddy free pages是否比期望值少.只要有一个保有量不
  21. 合格,就必须立即加以调整.free_shortage请自己阅读.

  22.   2)潜在可分配页面的保有量要求:(buddyfree+inactiveclean+inactive_dirty)
  23.   期望保有量:freepages.high+inactive_target
  24.   现存量:
  25.       nr_free_pages()+nr_inactive_clean_pages()+nr_inactive_dirty_pages.
  26.       

  27. 所做分析已注入代码:
  28. /*
  29.  * 基于区的buddy 系统的核心策略
  30.  * This is the 'heart' of the zoned buddy allocator:
  31.  */
  32. struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
  33. {
  34.     zone_t **zone;
  35.     int direct_reclaim = 0;
  36.     unsigned int gfp_mask = zonelist->gfp_mask;
  37.     struct page * page;

  38.     /*
  39.      * Allocations put pressure on the VM subsystem.
  40.      */
  41.     memory_pressure++;

  42.     /*
  43.      * (If anyone calls gfp from interrupts nonatomically then it
  44.      * will sooner or later tripped up by a schedule().)
  45.      *
  46.      * We are falling back to lower-level zones if allocation
  47.      * in a higher zone fails.
  48.      */

  49.     /*
  50.      * Can we take pages directly from the inactive_clean
  51.      * list?
  52.      */
  53.     /* PF_MEMALLOC 代表是为管理目的而请求分配pages */
  54.     if (order == 0 && (gfp_mask & __GFP_WAIT) &&
  55.             !(current->flags & PF_MEMALLOC))
  56.         direct_reclaim = 1;

  57.     /*
  58.      * If we are about to get low on free pages and we also have
  59.      * an inactive page shortage, wake up kswapd.
  60.      */
  61.     if (inactive_shortage() > inactive_target / 2 &&free_shortage())
  62.         wakeup_kswapd(0);/*用各种办法保持潜在可分配页面的数量*/
  63.     /*
  64.      * If we are about to get low on free pages and cleaning
  65.      * the inactive_dirty pages would fix the situation,
  66.      * wake up bdflush.
  67.      */
  68.     else if (free_shortage() && nr_inactive_dirty_pages >free_shortage()
  69.             && nr_inactive_dirty_pages >= freepages.high)
  70.         wakeup_bdflush(0);/*加速将buffer中的数据写入磁盘的过程*/

  71. try_again:
  72.     /*
  73.      * 首先,选取那些拥有许多的空闲内存的zone
  74.      * We allocate free memory first because it doesn't contain
  75.      * any data ...
  76.      */
  77.      /* 这轮分配只看绝对空闲页的水位*/
  78.     zone = zonelist->zones;
  79.     for (;;) {
  80.         zone_t *z = *(zone++);
  81.         if (!z)
  82.             break;
  83.         if (!z->size)
  84.             BUG();

  85.         if (z->free_pages >= z->pages_low) {//空闲页面保有量合格
  86.             page = rmqueue(z, order);
  87.             if (page)
  88.                 return page;
  89.         } else if (z->free_pages < z->pages_min &&
  90.                     waitqueue_active(&kreclaimd_wait)) {
  91.                 wake_up_interruptible(&kreclaimd_wait);
  92.                 /* kreclaimd:从zone_t->inactive_clean_list 队列中回收页面 */
  93.         }
  94.     }


  95.      /* If there is a lot of activity, inactive_target
  96.      * will be high and we'll have a good chance of
  97.      * finding a page using the HIGH limit.
  98.      */
  99.    /*既然找不到空闲页面较多的zone,就找inactive_clean页面很
  100.     *丰富的zone试试
  101.     */
  102.     page = __alloc_pages_limit(zonelist, order, PAGES_HIGH,direct_reclaim);
  103.     if (page)
  104.         return page;

  105.     /*
  106.      * 还不行就找inactive_clean页面还行的zone
  107.      * zone->pages_low < free + inactive_clean
  108.      * When the working set is very large and VM activity
  109.      * is low, we're most likely to have our allocation
  110.      * succeed here.
  111.      */
  112.     page = __alloc_pages_limit(zonelist, order, PAGES_LOW,direct_reclaim);
  113.     if (page)
  114.         return page;

  115.     /*
  116.      * 没有zone 的空闲页面(buddy+inactive clean)能够满足需求了
  117.      *
  118.      * We wake up kswapd, in the hope that kswapd will
  119.      * resolve this situation before memory gets tight.
  120.      *
  121.      * We also yield the CPU, because that:
  122.      * - gives kswapd a chance to do something
  123.      * - slows down allocations, in particular the
  124.      * allocations from the fast allocator that's
  125.      * causing the problems ...
  126.      * - ... which minimises the impact the "bad guys"
  127.      * have on the rest of the system
  128.      * - if we don't have __GFP_IO set, kswapd may be
  129.      * able to free some memory we can't free ourselves
  130.      */
  131.     wakeup_kswapd(0); /* 参数0, 代表不睡眠*/
  132.     /* kswapd -->致力于保持潜在可分配页面的保有量*/
  133.     if (gfp_mask & __GFP_WAIT) {
  134.         __set_current_state(TASK_RUNNING);
  135.         current->policy |= SCHED_YIELD;
  136.         schedule();
  137.     }

  138.     /*
  139.      * After waking up kswapd, we try to allocate a page
  140.      * from any zone which isn't critical yet.
  141.      *
  142.      * 也许我们不能等Kswapd 完成他的工作
  143.      * 先以更低的水位要求试试
  144.      */
  145.     page = __alloc_pages_limit(zonelist, order, PAGES_MIN,direct_reclaim);
  146.     if (page)
  147.         return page;


  148.     /*
  149.      * Damn, we didn't succeed.
  150.      *
  151.      */
  152.   /* 对于普通进程,还有一些情况可以考虑 */
  153.     if (!(current->flags & PF_MEMALLOC)) {
  154.     
  155.     if (order > 0 && (gfp_mask & __GFP_WAIT)) {
  156.         /* 我们在处理 higher order 的分配,并且可以等待 */
  157.             zone = zonelist->zones;
  158.         /*将dirty页面写入磁盘*/
  159.             current->flags |= PF_MEMALLOC; //page_launder也可能分配页面
  160.             page_launder(gfp_mask, 1);//这个进程作为调用环境,提升其
  161.             current->flags &= ~PF_MEMALLOC; //优先级避免递归运行到这里
  162.             for (;;) {
  163.                 zone_t *z = *(zone++);
  164.                 if (!z)
  165.                     break;
  166.                 if (!z->size)
  167.                     continue;
  168.                 while (z->inactive_clean_pages) {
  169.                     /*补充空闲页面到buddy*/
  170.                     struct page * page;
  171.                     /* Move one page to the free list. */
  172.                     page = reclaim_page(z);
  173.                     if (!page)
  174.                         break;
  175.                     __free_page(page); //释放到buddy
  176.                     /*也许就有连续页面了*/
  177.                     /* Try if the allocation succeeds. */
  178.                     page = rmqueue(z, order); //再试试high_order的分配
  179.                     if (page)
  180.                         return page;
  181.                 }
  182.             }
  183.         }


  184.         /*
  185.          * We have to do this because something else might eat
  186.          * the memory kswapd frees for us and we need to be
  187.          * reliable.
  188.          */
  189.         if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) ==(__GFP_WAIT|__GFP_IO)) {
  190.             /* 如果容许io操作,并可以等待,唤醒kswapd
  191.              * 并等待kswapd 恢复内存的平衡状态
  192.              */
  193.             wakeup_kswapd(1); /* 参数1, 代表可以阻塞*/
  194.             memory_pressure++;
  195.             if (!order) // 注意:我们在higher order 时不'again',
  196.           // 因为,可能kswapd 永远( *ever* )不能为我们
  197.          // 释放出一个大的连续区域.
  198.                 goto try_again;
  199.         /*
  200.          * If __GFP_IO isn't set, we can't wait on kswapd because
  201.          * kswapd just might need some IO locks /we/ are holding ...
  202.          *
  203.          * SUBTLE: The scheduling point above makes sure that
  204.          * kswapd does get the chance to free memory we can't
  205.          * free ourselves...
  206.          */
  207.         } else if (gfp_mask & __GFP_WAIT) {
  208.          //不能进行io的情况下代替kswapd做些
  209.          //不需要io的努力
  210.             try_to_free_pages(gfp_mask);
  211.             memory_pressure++;
  212.             if (!order)
  213.                 goto try_again;
  214.         }

  215.     }

  216.     /*
  217.      * Final phase: allocate anything we can!
  218.      *
  219.      * Higher order allocations, GFP_ATOMIC allocations and
  220.      * recursive allocations (PF_MEMALLOC) end up here.
  221.      *
  222.      * Only recursive allocations can use the very last pages
  223.      * in the system, otherwise it would be just too easy to
  224.      * deadlock the system...
  225.      */
  226.     zone = zonelist->zones;
  227.     for (;;) {
  228.         zone_t *z = *(zone++);
  229.         struct page * page = NULL;
  230.         if (!z)
  231.             break;
  232.         if (!z->size)
  233.             BUG();

  234.         /*
  235.          * SUBTLE: direct_reclaim is only possible if the task
  236.          * becomes PF_MEMALLOC while looping above. This will
  237.          * happen when the OOM killer selects this task for
  238.          * instant execution...(看英文吧)
  239.          */
  240.         if (direct_reclaim) {
  241.             page = reclaim_page(z);
  242.             if (page)
  243.                 return page;
  244.         }

  245.         /* XXX: is pages_min/4 a good amount to reserve for this? */
  246.         if (z->free_pages < z->pages_min / 4 &&
  247.                 !(current->flags & PF_MEMALLOC))
  248.             continue;
  249.         page = rmqueue(z, order);
  250.         if (page)
  251.             return page;
  252.     }

  253.     /* No luck.. */
  254.     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
  255.     return NULL;
  256. }
  257.    
  258.   与内存分配有关的函数还有:
  259. unsigned long get_zeroed_page(int gfp_mask)
  260. void __free_pages(struct page *page, unsigned long order)
  261. void free_pages(unsigned long addr, unsigned long order)
  262.   另外还有几个用于统计内存压力的函数:
  263.   unsigned int nr_free_pages (void)
  264.   unsigned int nr_inactive_clean_pages (void)
  265.   unsigned int nr_free_buffer_pages (void)
  266.   unsigned int nr_free_highpages (void)
  267.   这些函数较为简单,不再分析.
阅读(3622) | 评论(0) | 转发(0) |
0

上一篇:没有了

下一篇:expand函数

给主人留下些什么吧!~~