(4)linux内存管理-returnx-ChinaUnix博客

returnx

首页　| 　博文目录　| 　关于我

returnx

博客访问： 75721
博文数量： 22
博客积分： 1475
博客等级：上尉
技术积分： 260
用户组：普通用户
注册时间： 2010-03-04 23:39

文章分类

全部博文（22）

qemu（1）
Android（6）
ubuntu（3）
数据结构（1）
linux启动过程（2）
未分配的博文（9）

文章存档

2013年（1）

2011年（6）

2010年（15）

我的朋友

相关数据结构：

meminfo 是 struct meminfo 类型的全局变量，根据 bootloader 传递的参数建立；它把物理内存描述为若干个 bank，每个 bank 属于某一个结点。

/*
 * Physical memory layout expressed as a list of banks. The code in this
 * file walks bank[0 .. nr_banks-1] and groups banks by their node number.
 */
struct meminfo {
/* number of valid entries in bank[] */
int nr_banks;
struct {
  /* start address and size of this SDRAM bank */
  unsigned long start;
  unsigned long size;
  /* number of the node this bank belongs to */
  int           node;
} bank[NR_BANKS];
};

每个平台都要根据自己的实际情况填充 struct machine_desc 里的成员。

/*
 * Per-machine (board) descriptor. In this file setup_arch() reads name,
 * soft_reboot, boot_params, fixup, init_irq, timer and init_machine, and
 * paging_init() calls map_io.
 */
struct machine_desc {
  unsigned int  nr;  /* architecture number */
unsigned int  phys_ram; /* start of physical ram */
unsigned int  phys_io; /* start of physical io */
unsigned int  io_pg_offst; /* byte offset for io
       * page table entry */

const char *name; /* architecture name */
unsigned long boot_params; /* tagged list */

unsigned int video_start; /* start of video RAM */
unsigned int video_end; /* end of video RAM */

unsigned int  reserve_lp0 :1; /* never has lp0 */
unsigned int  reserve_lp1 :1; /* never has lp1 */
unsigned int  reserve_lp2 :1; /* never has lp2 */
unsigned int  soft_reboot :1; /* soft reboot  */
/* optional hook; setup_arch() calls it before parsing the tag list */
void   (*fixup)(struct machine_desc *,
      struct tag *, char **,
      struct meminfo *);
void   (*map_io)(void);/* IO mapping function */
void   (*init_irq)(void);
struct sys_timer *timer;  /* system tick timer */
void   (*init_machine)(void);
};

// One of these structures exists for every memory node.
typedef struct pglist_data {
// The zones of this node; free_area_init_core() initialises each entry.
struct zone node_zones[MAX_NR_ZONES];
// One zone list per GFP zone type. build_zonelists() fills each list starting
// with this node and then continues with the other online nodes.
struct zonelist node_zonelists[GFP_ZONETYPES];
// Number of zones. free_area_init_core() resets it to 0 and then records
// (index of the last non-empty zone) + 1.
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
// Head of the struct page array for this node (one descriptor per page);
// allocated in alloc_node_mem_map().
struct page *node_mem_map;
#endif
// Boot-time allocator state for this node.
// NOTE(review): the original note says it is statically allocated and assigned - confirm.
struct bootmem_data *bdata;
// PFN of the first page of this node; set in free_area_init_node().
unsigned long node_start_pfn;
// Size of the node in pages with holes subtracted.
unsigned long node_present_pages; /* total number of physical pages */
// Size of the node in pages including holes.
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
// ID of this node.
int node_id;
// Next node; init_bootmem_core() links nodes through this onto pgdat_list.
struct pglist_data *pgdat_next;
// kswapd-related state.
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
// Initialised to 0 in free_area_init_core().
int kswapd_max_order;
} pg_data_t;

/* Descriptor of one memory zone inside a node. */
struct zone {
// Number of free pages; __rmqueue() subtracts from it, __free_pages_bulk() adds to it.
unsigned long free_pages;
// Per the original note: reserved pages, lower reclaim bound, upper reclaim bound.
unsigned long pages_min, pages_low, pages_high;

// Per the original note: page frames every zone must keep in reserve for
// handling low-memory situations.
unsigned long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
struct per_cpu_pageset *pageset[NR_CPUS];
#else
// Per-CPU page frame cache.
struct per_cpu_pageset pageset[NR_CPUS];
#endif
/*
* free areas of different sizes
*/
spinlock_t lock;
// Free page-frame blocks of the zone, one entry per order (buddy system).
struct free_area free_area[MAX_ORDER];

ZONE_PADDING(_pad1_)

/* Fields commonly accessed by the page reclaim scanner */
spinlock_t  lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long  nr_scan_active;
unsigned long  nr_scan_inactive;
unsigned long  nr_active;
unsigned long  nr_inactive;
unsigned long  pages_scanned;    /* since last reclaim */
int   all_unreclaimable; /* All pages pinned */

atomic_t reclaim_in_progress;

int temp_priority;
int prev_priority;

ZONE_PADDING(_pad2_)

// Hash table of wait queues for processes waiting on pages of this zone.
wait_queue_head_t * wait_table;
// Number of entries in the hash table.
unsigned long wait_table_size;

// Derived from wait_table_size via wait_table_bits() in free_area_init_core().
unsigned long wait_table_bits;

// Descriptor of the node this zone belongs to.
struct pglist_data *zone_pgdat;
// Pointer to the first page descriptor of the zone.
struct page  *zone_mem_map;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
// PFN of the first page of the zone.
unsigned long  zone_start_pfn;
// Total size of the zone.
unsigned long  spanned_pages; /* total size, including holes */
// Total size with holes subtracted.
unsigned long  present_pages; /* amount of memory (excluding holes) */
// Name of the zone.
char   *name;
} ____cacheline_maxaligned_in_smp;

// Boot-time (bootmem) allocator state for one node; filled in by init_bootmem_core().
typedef struct bootmem_data {
// Physical start address of the node (start PFN << PAGE_SHIFT).
unsigned long node_boot_start;
// PFN marking the end of the node.
unsigned long node_low_pfn;
// Virtual address of the allocation bitmap: one bit per page, a set bit
// meaning "reserved" (init_bootmem_core() fills the map with 0xff).
// NOTE(review): the original note places it just past the end of the kernel image - confirm.
void *node_bootmem_map;
unsigned long last_offset;
unsigned long last_pos;
unsigned long last_success; /* Previous allocation point. To speed
* up searching */
} bootmem_data_t;

提示：整个SDRAM可以分为多个结点，每个结点又可以分为多个管理区。struct meminfo 描述整个SDRAM的物理布局，struct pglist_data 代表一个结点，struct zone 代表一个管理区。

/*
 * setup_arch - architecture-specific boot-time setup.
 * @cmdline_p: handed to parse_cmdline(); dereferenced afterwards for the
 *             debug print, so parse_cmdline() is expected to fill it in.
 *
 * Looks up the machine descriptor, locates and parses the boot loader's
 * tag list, records the kernel image boundaries in init_mm, initialises
 * physical memory via paging_init() and finally publishes the machine's
 * irq/timer/init hooks and the console driver.
 */
void __init setup_arch(char **cmdline_p)
{
	struct tag *tags = (struct tag *)&init_tags;
	struct machine_desc *mdesc;
	char *from = default_command_line;

	/* Initialise the CPU-related variables. */
	setup_processor();
	/* Fetch the machine (platform) descriptor. */
	mdesc = setup_machine(machine_arch_type);
	machine_name = mdesc->name;

	if (mdesc->soft_reboot)
		reboot_setup("s");
	/* Location where the boot loader stored its parameters. */
	if (mdesc->boot_params)
		tags = phys_to_virt(mdesc->boot_params);

	/*
	 * If we have the old style parameters, convert them to
	 * a tag list. If that still does not yield a tag list,
	 * fall back to the built-in default tags.
	 */
	if (tags->hdr.tag != ATAG_CORE)
		convert_to_tag_list(tags);
	if (tags->hdr.tag != ATAG_CORE)
		tags = (struct tag *)&init_tags;

	/* The fixup hook is optional (the original note says it is NULL on this board). */
	if (mdesc->fixup)
		mdesc->fixup(mdesc, tags, &from, &meminfo);
	else
		printk("fixup is null\n");

	/* Parse the parameters the boot loader left in the parameter area. */
	if (tags->hdr.tag == ATAG_CORE) {
		/* meminfo was already populated: drop the memory tags first. */
		if (meminfo.nr_banks != 0)
			squash_mem_tags(tags);
		parse_tags(tags);
	}

	init_mm.start_code = (unsigned long) &_text;
	init_mm.end_code   = (unsigned long) &_etext;
	init_mm.end_data   = (unsigned long) &_edata;
	init_mm.brk	   = (unsigned long) &_end;

	memcpy(saved_command_line, from, COMMAND_LINE_SIZE);
	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';

	printk("parse_cmdline before cmdline [%s]\n", from);
	parse_cmdline(cmdline_p, from);
	/* This print runs after parsing, so label it accordingly. */
	printk("parse_cmdline after cmdline [%s]\n", *cmdline_p);

	/* bank[].start and bank[].size are unsigned long: use %lu, not %d. */
	printk("meminfo [%d][%lu][%lu][%d]\n", meminfo.nr_banks,
	       meminfo.bank[0].start, meminfo.bank[0].size,
	       meminfo.bank[0].node);

	/* Full initialisation of physical memory. */
	paging_init(&meminfo, mdesc);
	request_standard_resources(&meminfo, mdesc);

	/*
	 * Set up the stacks used for interrupts and exceptions
	 * (the original note remarks that they are only three words).
	 */
	cpu_init();

	/*
	 * Set up various architecture-specific pointers
	 */
	init_arch_irq = mdesc->init_irq;
	system_timer = mdesc->timer;
	init_machine = mdesc->init_machine;

	/* Select the console driver. */
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
	printk("conswitchp=%p\n", conswitchp);
#endif
#endif
}

paging_init 根据SDRAM内各结点的物理布局以及平台相关的变量，对物理内存作全方位的初始化。

/*
 * paging_init - initialise physical memory management for this machine.
 * @mi:    memory bank layout
 * @mdesc: machine descriptor; its optional map_io hook is invoked here
 *
 * Brings up the bootmem allocator, builds the kernel page tables, lets the
 * platform map its IO, initialises the zones of every online node and
 * finally prepares the zero page.
 */
void __init paging_init(struct meminfo *mi, struct machine_desc *mdesc)
{
void *zero_page;
int node;

// Initialise the bootmem allocator over the SDRAM; the code below
// allocates from bootmem once this returns.
bootmem_init(mi);

memcpy(&meminfo, mi, sizeof(meminfo));

/*
* allocate the zero page. Note that we count on this going ok.
*/
zero_page = alloc_bootmem_low_pages(PAGE_SIZE);

/*
* initialise the page tables.
*/
// Build the kernel page tables: every SDRAM bank is mapped into its
// corresponding virtual address range.
memtable_init(mi);

// Call the platform's IO mapping hook so that the IO registers get mapped
// into virtual address space.
if (mdesc->map_io)
{
printk("[setup_arch -> machine_desc.map_io]\n");
mdesc->map_io();
}
local_flush_tlb_all();

/*
* initialise the zones within each node
*/
// Initialise every online node.
for_each_online_node(node) {
  unsigned long zone_size[MAX_NR_ZONES];
  unsigned long zhole_size[MAX_NR_ZONES];
  struct bootmem_data *bdata;
  pg_data_t *pgdat;
  int i;

  /*
   * Initialise the zone size information.
   */
  for (i = 0; i < MAX_NR_ZONES; i++) {
   zone_size[i] = 0;
   zhole_size[i] = 0;
  }

pgdat = NODE_DATA(node);
bdata = pgdat->bdata;

  /*
   * The size of this node has already been determined.
   * If we need to do anything fancy with the allocation
   * of this memory to the zones, now is the time to do
   * it.
   */
  zone_size[0] = bdata->node_low_pfn -
    (bdata->node_boot_start >> PAGE_SHIFT);

  /*
   * If this zone has zero size, skip it.
   */
  if (!zone_size[0])
   continue;

  /*
   * For each bank in this node, calculate the size of the
   * holes. holes = node_size - sum(bank_sizes_in_node)
   */
  zhole_size[0] = zone_size[0];
  for (i = 0; i < mi->nr_banks; i++) {
   if (mi->bank[i].node != node)
    continue;

zhole_size[0] -= mi->bank[i].size >> PAGE_SHIFT;
}

  /*
   * Adjust the sizes according to any special
   * requirements for this machine type.
   */
  arch_adjust_zones(node, zone_size, zhole_size);

  // Initialise the node from the zone size / hole size parameters.
  free_area_init_node(node, pgdat, zone_size,
    bdata->node_boot_start >> PAGE_SHIFT, zhole_size);
}

/*
* finish off the bad pages once
* the mem_map is initialised
*/
memzero(zero_page, PAGE_SIZE);
empty_zero_page = virt_to_page(zero_page);
flush_dcache_page(empty_zero_page);
}

/*
 * bootmem_init - set up the bootmem allocator for every online node.
 * @mi: memory bank layout
 *
 * Determines the PFN range of each node, picks a location for the bootmem
 * bitmaps, initialises each node's bitmap and reserves the regions of
 * node 0 (and the initrd, if any) that are already in use.
 */
static void __init bootmem_init(struct meminfo *mi)
{
struct node_info node_info[MAX_NUMNODES], *np = node_info;
unsigned int bootmap_pages, bootmap_pfn, map_pg;
int node, initrd_node;

printk("MAX_NUMNODES %d\n",MAX_NUMNODES);
// Returns the number of pages needed for the bootmem bitmaps and fills
// node_info with the PFN range of every node.
bootmap_pages = find_memend_and_nodes(mi, np);
// Look in node 0 for the first PFN above the kernel image with room for
// the bitmaps; the bootmem bitmaps start there.
bootmap_pfn = find_bootmap_pfn(0, mi, bootmap_pages);
initrd_node = check_initrd(mi);

map_pg = bootmap_pfn;

/*
* Initialise the bootmem nodes.
*
* What we really want to do is:
*
*   unmap_all_regions_except_kernel();
*   for_each_node_in_reverse_order(node) {
*     map_node(node);
*     allocate_bootmem_map(node);
*     init_bootmem_node(node);
*     free_bootmem_node(node);
*   }
*
* but this is a 2.5-type change. For now, we just set
* the nodes up in reverse order.
*
* (we could also do with rolling bootmem_init and paging_init
* into one generic "memory_init" type function).
*/
// Handle the nodes one by one, from the highest node number down to 0.
np += num_online_nodes() - 1;
for (node = num_online_nodes() - 1; node >= 0; node--, np--) {
  /*
   * If there are no pages in this node, ignore it.
   * Note that node 0 must always have some pages.
   */
  if (np->end == 0 || !node_online(node)) {
   if (node == 0)
    BUG();
   continue;
  }

  /*
   * Initialise the bootmem allocator.
   */
  // NODE_DATA(node) yields the node's struct pglist_data. init_bootmem_node()
  // sets up its bootmem_data and marks every page of the node as reserved in
  // the bitmap located at page map_pg.
  init_bootmem_node(NODE_DATA(node), map_pg, np->start, np->end);
  // Per the original note: marks the banks of this node as available.
  free_bootmem_node_bank(node, mi);
  // Advance past the bitmap pages consumed by this node.
  map_pg += np->bootmap_pages;

  /*
   * If this is node 0, we need to reserve some areas ASAP -
   * we may use bootmem on node 0 to setup the other nodes.
   */
  // Node 0 was just marked available as a whole, but parts of it are in use
  // (kernel code and data, the bootmem bitmap pages, ...) and must be
  // reserved again.
  if (node == 0)
   reserve_node_zero(bootmap_pfn, bootmap_pages);
}

#ifdef CONFIG_BLK_DEV_INITRD
if (phys_initrd_size && initrd_node >= 0) {
  reserve_bootmem_node(NODE_DATA(initrd_node), phys_initrd_start,
         phys_initrd_size);
  initrd_start = __phys_to_virt(phys_initrd_start);
  initrd_end = initrd_start + phys_initrd_size;
}
#endif

// Sanity check: the per-node bitmaps must add up to exactly the space reserved for them.
BUG_ON(map_pg != bootmap_pfn + bootmap_pages);
}

/*
* Scan the memory info structure and pull out:
* - the end of memory
* - the number of nodes
* - the pfn range of each node
* - the number of bootmem bitmap pages
*
* Returns the total number of pages needed for the bootmem bitmaps of all
* online nodes. As side effects it marks nodes online, tags empty banks
* with node -1 and sets high_memory, max_low_pfn and max_pfn.
*/
static unsigned int __init
find_memend_and_nodes(struct meminfo *mi, struct node_info *np)
{
unsigned int i, bootmem_pages = 0, memend_pfn = 0;

// Start every node with an empty range (start = max value, end = 0).
for (i = 0; i < MAX_NUMNODES; i++) {
  np[i].start = -1U;
  np[i].end = 0;
  np[i].bootmap_pages = 0;
}

for (i = 0; i < mi->nr_banks; i++) {
unsigned long start, end;
int node;

  if (mi->bank[i].size == 0) {
   /*
    * Mark this bank with an invalid node number
    */
   mi->bank[i].node = -1;
   continue;
  }

node = mi->bank[i].node;

  /*
   * Make sure we haven't exceeded the maximum number of nodes
   * that we have in this configuration. If we have, we're in
   * trouble. (maybe we ought to limit, instead of bugging?)
   */
  if (node >= MAX_NUMNODES)
   BUG();
  // Set this node's bit in the online-node bitmap.
  node_set_online(node);

  /*
   * Get the start and end pfns for this bank
   */
  // Convert the bank's start and end physical addresses to page units.
  start = mi->bank[i].start >> PAGE_SHIFT;
  end   = (mi->bank[i].start + mi->bank[i].size) >> PAGE_SHIFT;

  // Widen the node's PFN range so that it covers this bank.
  if (np[node].start > start)
   np[node].start = start;

  if (np[node].end < end)
   np[node].end = end;
  // Track the highest end PFN across all nodes.
  if (memend_pfn < end)
   memend_pfn = end;
}

/*
* Calculate the number of pages we require to
* store the bootmem bitmaps.
*/
// Walk all online nodes.
for_each_online_node(i) {
  if (np[i].end == 0)
   continue;
  // Number of pages needed for this node's bitmap (one bit per page).
  np[i].bootmap_pages = bootmem_bootmap_pages(np[i].end -
           np[i].start);
  // Accumulate the total number of bitmap pages over all nodes.
  bootmem_pages += np[i].bootmap_pages;
}

// Virtual address corresponding to the highest physical memory address.
high_memory = __va(memend_pfn << PAGE_SHIFT);

/*
* This doesn't seem to be used by the Linux memory
* manager any more. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*
* Note: max_low_pfn and max_pfn reflect the number
* of _pages_ in the system, not the maximum PFN.
*/
// Total number of pages from PHYS_OFFSET to the end of memory, holes included.
max_low_pfn = memend_pfn - O_PFN_DOWN(PHYS_OFFSET);
max_pfn = memend_pfn - O_PFN_DOWN(PHYS_OFFSET);

return bootmem_pages;
}

/*
 * check_initrd - find the node whose memory holds the initial ramdisk.
 * @mi: memory bank layout
 *
 * Returns -2 when no initrd was supplied (or initrd support is compiled
 * out), the node number of the bank that fully contains the initrd, or -1
 * when no bank contains it; in the latter case the initrd is disabled by
 * zeroing phys_initrd_start and phys_initrd_size.
 */
static int __init check_initrd(struct meminfo *mi)
{
	int found = -2;
#ifdef CONFIG_BLK_DEV_INITRD
	unsigned long initrd_end = phys_initrd_start + phys_initrd_size;

	/*
	 * Make sure that the initrd is within a valid area of
	 * memory.
	 */
	if (phys_initrd_size) {
		unsigned int idx = 0;

		found = -1;

		while (idx < mi->nr_banks) {
			unsigned long lo = mi->bank[idx].start;
			unsigned long hi = lo + mi->bank[idx].size;

			/* the initrd must lie completely inside the bank */
			if (lo <= phys_initrd_start && initrd_end <= hi)
				found = mi->bank[idx].node;

			idx++;
		}
	}

	if (found == -1) {
		printk(KERN_ERR "initrd (0x%08lx - 0x%08lx) extends beyond "
		       "physical memory - disabling initrd\n",
		       phys_initrd_start, initrd_end);
		phys_initrd_start = phys_initrd_size = 0;
	}
#endif

	return found;
}

/*
 * find_bootmap_pfn - choose where the bootmem bitmaps are placed.
 * @node:          node to search
 * @mi:            memory bank layout
 * @bootmap_pages: number of contiguous pages required
 *
 * Returns the first PFN, in a bank of @node and not below the end of the
 * kernel image, that is followed by at least @bootmap_pages pages of that
 * bank. Triggers BUG() when no bank qualifies.
 */
static unsigned int __init
find_bootmap_pfn(int node, struct meminfo *mi, unsigned int bootmap_pages)
{
	/* PFN just past the end of the kernel image, rounded up */
	unsigned int first_free = O_PFN_UP(__pa(&_end));
	unsigned int result = 0;
	unsigned int idx;

	for (idx = 0; idx < mi->nr_banks; idx++) {
		unsigned int lo, hi;

		if (mi->bank[idx].node != node)
			continue;

		lo = mi->bank[idx].start >> PAGE_SHIFT;
		hi = (mi->bank[idx].size +
		      mi->bank[idx].start) >> PAGE_SHIFT;

		/* never hand out pages occupied by the kernel image */
		if (lo < first_free)
			lo = first_free;

		/* bank must extend beyond lo and leave enough room */
		if (hi > lo && hi - lo >= bootmap_pages) {
			result = lo;
			break;
		}
	}

	if (result == 0)
		BUG();

	return result;
}

/*
 * init_bootmem_node - public wrapper around init_bootmem_core().
 * All four arguments are forwarded unchanged; the core's return value
 * (size of the node's bitmap in bytes) is passed back to the caller.
 */
unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
{
	unsigned long map_bytes;

	map_bytes = init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
	return map_bytes;
}

/*
 * init_bootmem_core - initialise the bootmem state of one node.
 * @pgdat:    node descriptor; its bdata is filled in
 * @mapstart: PFN at which the node's bitmap is stored
 * @start:    first PFN of the node
 * @end:      end PFN of the node
 *
 * Links the node onto pgdat_list, records the node's range in its
 * bootmem_data and marks every page as reserved. Returns the size of the
 * bitmap in bytes, rounded up to a multiple of sizeof(long).
 */
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
	bootmem_data_t *bd = pgdat->bdata;
	unsigned long npages = end - start;
	/* one bit per page, rounded up to whole bytes */
	unsigned long bitmap_bytes = (npages + 7) / 8;

	/* push this node onto the front of the global node list */
	pgdat->pgdat_next = pgdat_list;
	pgdat_list = pgdat;

	/* round the bitmap size up to a whole number of longs */
	bitmap_bytes = ALIGN(bitmap_bytes, sizeof(long));

	bd->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
	bd->node_boot_start = (start << PAGE_SHIFT);
	bd->node_low_pfn = end;

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	memset(bd->node_bootmem_map, 0xff, bitmap_bytes);

	return bitmap_bytes;
}

/*
* Reserve the various regions of node 0
*
* bootmap_pfn / bootmap_pages describe the pages holding the bootmem
* bitmaps; they are reserved together with the kernel image, the kernel
* page directory and a machine-specific amount of low memory.
*/
static __init void reserve_node_zero(unsigned int bootmap_pfn, unsigned int bootmap_pages)
{
pg_data_t *pgdat = NODE_DATA(0);
unsigned long res_size = 0;

/*
* Register the kernel text and data with bootmem.
* Note that this can only be in node 0.
*/
#ifdef CONFIG_XIP_KERNEL
reserve_bootmem_node(pgdat, __pa(&__data_start), &_end - &__data_start);
#else
// Reserve the kernel image (_stext up to _end).
reserve_bootmem_node(pgdat, __pa(&_stext), &_end - &_stext);
#endif

/*
* Reserve the page tables. These are already in use,
* and can only be in node 0.
*/
// Reserve the kernel page directory.
reserve_bootmem_node(pgdat, __pa(swapper_pg_dir),
PTRS_PER_PGD * sizeof(pgd_t));

/*
* And don't forget to reserve the allocator bitmap,
* which will be freed later.
*/
// Reserve the pages bootmem itself uses for its bitmaps.
reserve_bootmem_node(pgdat, bootmap_pfn << PAGE_SHIFT,
bootmap_pages << PAGE_SHIFT);

/*
* Hmm... This should go elsewhere, but we really really need to
* stop things allocating the low memory; ideally we need a better
* implementation of GFP_DMA which does not assume that DMA-able
* memory starts at zero.
*/
if (machine_is_integrator() || machine_is_cintegrator())
res_size = __pa(swapper_pg_dir) - PHYS_OFFSET;

/*
* These should likewise go elsewhere. They pre-reserve the
* screen memory region at the start of main system memory.
*/
if (machine_is_edb7211())
res_size = 0x00020000;
if (machine_is_p720t())
res_size = 0x00014000;

#ifdef CONFIG_SA1111
/*
* Because of the SA1111 DMA bug, we want to preserve our
* precious DMA-able memory...
*/
// Note: this unconditionally overrides any res_size chosen above.
res_size = __pa(swapper_pg_dir) - PHYS_OFFSET;
#endif
// Reserve res_size bytes starting at the beginning of RAM, if requested.
if (res_size)
reserve_bootmem_node(pgdat, PHYS_OFFSET, res_size);
}

/*
 * memtable_init - build the initial kernel page tables.
 * @mi: memory bank layout
 *
 * Collects a list of mapping descriptors (XIP kernel text, every
 * non-empty memory bank, optional cache flush areas) in a bootmem page,
 * then walks the address space creating those mappings and clearing every
 * page directory entry not covered by one. Finally the same page is
 * reused as the backing page of the exception vectors mapping.
 */
void __init memtable_init(struct meminfo *mi)
{
	struct map_desc *init_maps, *p, *q;
	unsigned long address = 0;
	int i;

	/* Initialise the MMU-related memory type table. */
	build_mem_type_table();

	/* p is the write cursor into the descriptor list. */
	init_maps = p = alloc_bootmem_low_pages(PAGE_SIZE);

#ifdef CONFIG_XIP_KERNEL
	p->physical   = CONFIG_XIP_PHYS_ADDR & PMD_MASK;
	p->virtual    = (unsigned long)&_stext & PMD_MASK;
	p->length     = ((unsigned long)&_etext - p->virtual + ~PMD_MASK) & PMD_MASK;
	p->type       = MT_ROM;
	p ++;
#endif

	/* One descriptor per non-empty memory bank. */
	for (i = 0; i < mi->nr_banks; i++) {
		if (mi->bank[i].size == 0)
			continue;

		p->physical   = mi->bank[i].start;
		p->virtual    = __phys_to_virt(p->physical);
		p->length     = mi->bank[i].size;
		p->type       = MT_MEMORY;
		p ++;
	}

#ifdef FLUSH_BASE
	/*
	 * All fields belong to the descriptor at p. The read cursor q is
	 * not initialised until the walk below, so it must not be used here.
	 */
	p->physical   = FLUSH_BASE_PHYS;
	p->virtual    = FLUSH_BASE;
	p->length     = PGDIR_SIZE;
	p->type       = MT_CACHECLEAN;
	p ++;
#endif

#ifdef FLUSH_BASE_MINICACHE
	p->physical   = FLUSH_BASE_PHYS + PGDIR_SIZE;
	p->virtual    = FLUSH_BASE_MINICACHE;
	p->length     = PGDIR_SIZE;
	p->type       = MT_MINICLEAN;
	p ++;
#endif

	/*
	 * Go through the initial mappings, but clear out any
	 * pgdir entries that are not in the description.
	 *
	 * The walk covers the whole address space in PGDIR_SIZE steps and
	 * stops when the address wraps around to 0.
	 */
	q = init_maps;
	do {
		if (address < q->virtual || q == p) {
			/*
			 * Below the next descriptor (a hole between banks) or
			 * past the last descriptor: remove the mapping.
			 */
			clear_mapping(address);
			address += PGDIR_SIZE;
		} else {
			/* Map this region. */
			create_mapping(q);

			/* Continue at the next PGDIR boundary after it. */
			address = q->virtual + q->length;
			address = (address + PGDIR_SIZE - 1) & PGDIR_MASK;

			q ++;
		}
	} while (address != 0);

	/*
	 * Create a mapping for the machine vectors at the high-vectors
	 * location (0xffff0000). If we aren't using high-vectors, also
	 * create a mapping at the low-vectors virtual address.
	 *
	 * The descriptor page itself becomes the page backing the vectors.
	 */
	init_maps->physical   = virt_to_phys(init_maps);
	init_maps->virtual    = 0xffff0000;
	init_maps->length     = PAGE_SIZE;
	init_maps->type       = MT_HIGH_VECTORS;
	create_mapping(init_maps);

	if (!vectors_high()) {
		init_maps->virtual = 0;
		init_maps->type = MT_LOW_VECTORS;
		create_mapping(init_maps);
	}

	flush_cache_all();
	local_flush_tlb_all();

	top_pmd = pmd_off_k(0xffff0000);
}

// Platform IO mapping hook for the SMDK2410 board. It is static, so it is
// presumably installed as machine_desc.map_io (not visible here); that hook
// is called from paging_init() to map the IO registers into virtual address space.

static void __init smdk2410_map_io(void)
{
// Register the board's IO mapping table.
s3c24xx_init_io(smdk2410_iodesc, ARRAY_SIZE(smdk2410_iodesc));
s3c24xx_init_clocks(0);
s3c24xx_init_uarts(smdk2410_uartcfgs, ARRAY_SIZE(smdk2410_uartcfgs));
s3c24xx_set_board(&smdk2410_board);
// The two calls below are local additions by the blog author (LCD and USB setup).
set_s3c2410fb_info(&smdk2410_lcdcfg); /* add by gjl */
usb_sbc2410_init(); /* by gjl */

}

/*
 * free_area_init_node - initialise one memory node.
 * @nid:            node id
 * @pgdat:          node descriptor to fill in
 * @zones_size:     size of every zone in pages (holes included)
 * @node_start_pfn: PFN of the first page of the node
 * @zholes_size:    size of the holes in every zone, in pages
 */
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
  unsigned long *zones_size, unsigned long node_start_pfn,
  unsigned long *zholes_size)
{
	/* identity of the node; needed by the helpers called below */
	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_id = nid;

	/* step 1: record the spanned/present page totals in the descriptor */
	calculate_zone_totalpages(pgdat, zones_size, zholes_size);

	/* step 2: allocate the struct page array sized for this node */
	alloc_node_mem_map(pgdat);

	/* step 3: initialise every zone of the node */
	free_area_init_core(pgdat, zones_size, zholes_size);
}

/*
 * calculate_zone_totalpages - record the page totals of a node.
 * @pgdat:       node descriptor to update
 * @zones_size:  per-zone sizes in pages, holes included
 * @zholes_size: per-zone hole sizes in pages; may be NULL
 *
 * Stores the sum of all zone sizes in node_spanned_pages and that sum
 * minus all holes in node_present_pages, then logs the latter.
 */
static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0;
	unsigned long holes = 0;
	unsigned long present;
	int zone_idx;

	for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
		spanned += zones_size[zone_idx];
		if (zholes_size)
			holes += zholes_size[zone_idx];
	}

	present = spanned - holes;

	pgdat->node_spanned_pages = spanned;
	pgdat->node_present_pages = present;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, present);
}

/*
 * alloc_node_mem_map - allocate the struct page array of a node.
 * Does nothing for nodes that span no pages. Only has an effect when
 * CONFIG_FLAT_NODE_MEM_MAP is defined.
 */
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size;
struct page *map;

  /* one descriptor per spanned page, plus one extra entry */
  size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
  /* try alloc_remap() first, fall back to the bootmem allocator */
  map = alloc_remap(pgdat->node_id, size);
  if (!map)
   map = alloc_bootmem_node(pgdat, size);
  pgdat->node_mem_map = map;
}
#ifdef CONFIG_FLATMEM
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0))
  mem_map = NODE_DATA(0)->node_mem_map;
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}

/*
* Set up the zone data structures:
*   - mark all pages reserved
*   - mark all memory queues empty
*   - clear the memory bitmaps
*
* pgdat is the node being initialised, zones_size holds the size of each
* zone in pages (holes included) and zholes_size the holes per zone (may
* be NULL).
*/
static void __init free_area_init_core(struct pglist_data *pgdat,
  unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long i, j;
int cpu, nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;

for (j = 0; j < MAX_NR_ZONES; j++) {
  struct zone *zone = pgdat->node_zones + j;
  unsigned long size, realsize;
  unsigned long batch;

  /* size includes holes, realsize excludes them */
  realsize = size = zones_size[j];
  if (zholes_size)
   realsize -= zholes_size[j];

  /* only DMA and NORMAL zones count towards nr_kernel_pages */
  if (j == ZONE_DMA || j == ZONE_NORMAL)
   nr_kernel_pages += realsize;
  nr_all_pages += realsize;

  zone->spanned_pages = size;
  zone->present_pages = realsize;
  zone->name = zone_names[j];
  spin_lock_init(&zone->lock);
  spin_lock_init(&zone->lru_lock);
  zone->zone_pgdat = pgdat;
  zone->free_pages = 0;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

batch = zone_batchsize(zone);

  /* set up the per-CPU page sets of this zone */
  for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
   /* Early boot. Slab allocator not functional yet */
   zone->pageset[cpu] = &boot_pageset[cpu];
   setup_pageset(&boot_pageset[cpu],0);
#else
   setup_pageset(zone_pcp(zone,cpu), batch);
#endif
  }
  printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
    zone_names[j], realsize, batch);
  INIT_LIST_HEAD(&zone->active_list);
  INIT_LIST_HEAD(&zone->inactive_list);
  zone->nr_scan_active = 0;
  zone->nr_scan_inactive = 0;
  zone->nr_active = 0;
  zone->nr_inactive = 0;
  atomic_set(&zone->reclaim_in_progress, 0);
  /* empty zones need none of the setup below */
  if (!size)
   continue;

  /*
   * The per-page waitqueue mechanism uses hashed waitqueues
   * per zone.
   */
  zone->wait_table_size = wait_table_size(size);
  zone->wait_table_bits =
   wait_table_bits(zone->wait_table_size);
  zone->wait_table = (wait_queue_head_t *)
   alloc_bootmem_node(pgdat, zone->wait_table_size
      * sizeof(wait_queue_head_t));

for(i = 0; i < zone->wait_table_size; ++i)
init_waitqueue_head(zone->wait_table + i);

/* remember the highest non-empty zone seen so far */
pgdat->nr_zones = j+1;

zone->zone_mem_map = pfn_to_page(zone_start_pfn);
zone->zone_start_pfn = zone_start_pfn;

// Initialise all struct page descriptors of this zone.
memmap_init(size, nid, j, zone_start_pfn);

// Add the zone to the zone table.
zonetable_add(zone, nid, j, zone_start_pfn, size);

/* the next zone starts right after this one */
zone_start_pfn += size;

// Initialise the buddy system free lists of this zone (the original note
// counts 11 of them).
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
}
}

build_all_zonelists 在 start_kernel 里被调用。

/*
 * build_all_zonelists - build the zone lists of every online node,
 * report how many nodes were processed and initialise the current
 * task's cpuset memory mask.
 */

void __init build_all_zonelists(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *node_data = NODE_DATA(nid);

		build_zonelists(node_data);
	}

	printk("Built %i zonelists\n", num_online_nodes());
	cpuset_init_current_mems_allowed();
}

/*
 * build_zonelists - fill in all zone lists of one node.
 * For every GFP zone type the list starts with zones of the node itself,
 * followed by the zones of the other online nodes, and is NULL-terminated.
 */
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;

local_node = pgdat->node_id;
// One pass per GFP zone type.
for (i = 0; i < GFP_ZONETYPES; i++) {
struct zonelist *zonelist;

zonelist = pgdat->node_zonelists + i;

  /* j: next free slot in the list; k: zone type handed to build_zonelists_node() */
  j = 0;
  k = ZONE_NORMAL;
  if (i & __GFP_HIGHMEM)
   k = ZONE_HIGHMEM;
  if (i & __GFP_DMA)
   k = ZONE_DMA;

  // Start the list with the zones of the local node.
   j = build_zonelists_node(pgdat, zonelist, j, k);
   /*
    * Now we build the zonelist so that it contains the zones
    * of all the other nodes.
    * We don't want to pressure a particular node, so when
    * building the zones for node N, we make sure that the
    * zones coming right after the local ones are those from
    * node N+1 (modulo N)
    */
  for (node = local_node + 1; node < MAX_NUMNODES; node++) {
   if (!node_online(node))
    continue;
   j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  }
  for (node = 0; node < local_node; node++) {
   if (!node_online(node))
    continue;
   j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  }

/* terminate the list */
zonelist->zones[j] = NULL;
}
}

/*
* mem_init() marks the free areas in the mem_map and tells us how much
* memory is free. This is done after various parts of the system have
* claimed their memory after the kernel image.
*/

// Per the original note: called from start_kernel. Once this function has
// finished, bootmem has done its job and memory is handed over to the buddy system.
void __init mem_init(void)
{
unsigned int codepages, datapages, initpages;
int i, node;

// Sizes of the kernel code, the kernel data and the init code/data,
// computed from the linker symbols.
codepages = &_etext - &_text;
datapages = &_end - &__data_start;
initpages = &__init_end - &__init_begin;

#ifndef CONFIG_DISCONTIGMEM
max_mapnr = virt_to_page(high_memory) - mem_map;
#endif

/* this will put all unused low memory onto the freelists */
// Hand the unused pages of every node over to the page frame allocator.
for_each_online_node(node) {
pg_data_t *pgdat = NODE_DATA(node);

free_unused_memmap_node(node, &meminfo);

if (pgdat->node_spanned_pages != 0)
totalram_pages += free_all_bootmem_node(pgdat);
}

#ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
free_area(PAGE_OFFSET, (unsigned long)swapper_pg_dir, NULL);
#endif

/*
* Since our memory may not be contiguous, calculate the
* real number of pages we have in this system
*/
// Print the size of every bank and the total amount of physical memory.
printk(KERN_INFO "Memory:");
num_physpages = 0;
for (i = 0; i < meminfo.nr_banks; i++) {
num_physpages += meminfo.bank[i].size >> PAGE_SHIFT;
printk(" %ldMB", meminfo.bank[i].size >> 20);
}
printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));

// Print the amount of free memory and the space taken by kernel code,
// data and init sections.
printk(KERN_NOTICE "Memory: %luKB available (%dK code, "
  "%dK data, %dK init)\n",
  (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
  codepages >> 10, datapages >> 10, initpages >> 10);

if (PAGE_SIZE >= 16384 && num_physpages <= 128) {
  extern int sysctl_overcommit_memory;
  /*
   * On a machine this small we won't get
   * anywhere without overcommit, so turn
   * it on by default.
   */
  sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}

/*
 * free_initmem - release the memory between __init_begin and __init_end.
 * Nothing is freed on Integrator and CIntegrator machines.
 */
void free_initmem(void)
{
	unsigned long init_start, init_stop;

	if (machine_is_integrator() || machine_is_cintegrator())
		return;

	init_start = (unsigned long)(&__init_begin);
	init_stop = (unsigned long)(&__init_end);
	free_area(init_start, init_stop, "init");
}

/*
 * free_all_bootmem_node - public wrapper around free_all_bootmem_core().
 * Returns the number of pages the core reports as released for @pgdat.
 */
unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
{
	unsigned long released;

	released = free_all_bootmem_core(pgdat);
	return released;
}

/*
 * free_all_bootmem_core - hand all free bootmem pages of a node to the
 * page allocator and then release the bootmem bitmap itself.
 * Returns the number of pages released (free pages plus bitmap pages).
 */
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
struct page *page;
unsigned long pfn;
bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
unsigned long *map;
int gofast = 0;

BUG_ON(!bdata->node_bootmem_map);

count = 0;
/* first extant page of the node */
pfn = bdata->node_boot_start >> PAGE_SHIFT;
/* idx: number of pages covered by the bitmap */
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
map = bdata->node_bootmem_map;
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
if (bdata->node_boot_start == 0 ||
ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
gofast = 1;

for (i = 0; i < idx; ) {
/* the bitmap stores 1 = reserved; invert so that 1 = free */
unsigned long v = ~map[i / BITS_PER_LONG];

  // All BITS_PER_LONG pages covered by this bitmap word are free
  // (32 pages according to the original note).
  if (gofast && v == ~0UL) {
   int j, order;

   page = pfn_to_page(pfn);
   count += BITS_PER_LONG;
   // Clear the reserved flag of the first page.
   __ClearPageReserved(page);
   order = ffs(BITS_PER_LONG) - 1;
   // Per the original note: resets the usage count.
   set_page_refs(page, order);
   for (j = 1; j < BITS_PER_LONG; j++) {
    if (j + 16 < BITS_PER_LONG)
     prefetchw(page + j + 16);
    // Clear the reserved flag of the remaining pages.
    __ClearPageReserved(page + j);
   }
   // Release the whole block of BITS_PER_LONG page frames at once.
   __free_pages(page, order);
   i += BITS_PER_LONG;
   page += BITS_PER_LONG;
  }
  // Only some of the pages covered by this word are free.
  else if (v)
  {
   unsigned long m;

   page = pfn_to_page(pfn);
   /* test one bit (one page) at a time */
   for (m = 1; m && i < idx; m<<=1, page++, i++) {
    if (v & m) {
     count++;
     // Clear the reserved flag of the page.
     __ClearPageReserved(page);
     // Per the original note: resets the usage count.
     set_page_refs(page, 0);
     // Release the single page frame.
     __free_page(page);
    }
   }
  }
  // No page covered by this word is free: skip the word.
  else
  {
   i+=BITS_PER_LONG;
  }

  pfn += BITS_PER_LONG;
}
total += count;

/*
* Now free the allocator bitmap itself, it's not
* needed anymore:
*/
// Release the pages bootmem used for the page bitmap.
page = virt_to_page(bdata->node_bootmem_map);
count = 0;
for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
  count++;
  __ClearPageReserved(page);
  set_page_count(page, 1);
  __free_page(page);
}
total += count;
/* the bitmap is gone; the BUG_ON at the top catches a second call */
bdata->node_bootmem_map = NULL;

return total;
}

2、伙伴系统、slab分配器、非连续内存的管理；

1）伙伴系统

工作原理：页框分配器是对外的接口，接受外部的分配和释放请求，然后调用相应管理区的伙伴系统。伙伴系统有11个链表，每个链表里放的都是空闲页块，但各链表中页块所含的连续页数不同，分别是2^0、2^1、2^2……2^10个页。如果想分配8个连续页的一块内存，而8页的链表为空，就向上找更大的块；例如找到了一个32页的块，就把它拆开：取出8页用于分配，剩下的24页拆成一个16页的块和一个8页的块，分别挂回对应的空闲链表。释放时过程相反：先查找被释放块的伙伴，如果伙伴也是空闲的，就合并成更大的块，并继续向上尝试合并。伙伴系统就是这样完成对连续页框的分配和释放请求的。

数据结构。

/*
 * Page descriptor: one per physical page frame.
 */
struct page {
	/*
	 * Flag bits. NOTE(review): the original note says this field also
	 * encodes the node, the zone, the PFN etc. - confirm.
	 */
	page_flags_t flags;
	/* Usage count; per the original note -1 means the page is unused. */
	atomic_t _count;		/* Usage count, see below. */
	/* Per the original note: number of page table entries referring to this frame. */
	atomic_t _mapcount;

	unsigned long private;

	struct address_space *mapping;

	pgoff_t index;			/* Our offset within mapping. */
	/*
	 * List linkage: free pages are chained through it on the buddy
	 * free lists and on the per-CPU page lists.
	 */
	struct list_head lru;
	/* A preprocessor directive must start on a line of its own. */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
};

// Core data structure of the buddy system: struct zone holds MAX_ORDER of
// them, one per block order (11 according to the original note).

struct free_area {
// Head of the list of free blocks of this order.
struct list_head free_list;
// Number of blocks currently on free_list.
unsigned long nr_free;
};

/*
 * __rmqueue - take a block of 2^order contiguous page frames from a zone.
 *
 * Searches the free lists from @order upwards and uses the first
 * non-empty one. The block is unlinked, the counters are updated and
 * expand() hands back whatever part of a larger block is not needed.
 * Returns the first page of the block, or NULL when every list from
 * @order to MAX_ORDER-1 is empty.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
	unsigned int cur;

	for (cur = order; cur < MAX_ORDER; cur++) {
		struct free_area *fa = zone->free_area + cur;
		struct page *block;

		if (!list_empty(&fa->free_list)) {
			/* first block on this order's free list */
			block = list_entry(fa->free_list.next, struct page, lru);
			list_del(&block->lru);
			/* the block no longer carries a buddy order */
			rmv_page_order(block);
			fa->nr_free--;
			/* the zone loses exactly the requested number of pages */
			zone->free_pages -= 1UL << order;
			/* return the surplus of a larger block to the lower lists */
			return expand(zone, block, order, cur, fa);
		}
	}

	return NULL;
}

/*
 * rmqueue_bulk - allocate up to @count blocks of 2^order page frames
 * from @zone and append each one to @list.
 *
 * The zone lock is held with interrupts disabled for the whole batch.
 * Returns the number of blocks actually obtained, which is smaller than
 * @count when the zone runs out of suitable blocks.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
   unsigned long count, struct list_head *list)
{
	unsigned long irq_state;
	int taken = 0;

	spin_lock_irqsave(&zone->lock, irq_state);
	while (taken < count) {
		struct page *block = __rmqueue(zone, order);

		if (!block)
			break;

		list_add_tail(&block->lru, list);
		taken++;
	}
	spin_unlock_irqrestore(&zone->lock, irq_state);

	return taken;
}

// Allocate 2^order page frames from the given zone according to gfp_flags.
// Single-page requests are served from the per-CPU page cache, which is
// refilled from the buddy system when it runs low.
// Returns the first page of the block, or NULL on failure.
static struct page *
buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
struct page *page = NULL;
int cold = !!(gfp_flags & __GFP_COLD);

// Single page frame: take it from the per-CPU cache.
if (order == 0) {
  struct per_cpu_pages *pcp;
  // Select the current CPU's hot or cold cache.
  pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
  local_irq_save(flags);
  // Cache at or below its low mark: refill it with a batch from the buddy system.
  if (pcp->count <= pcp->low)
   pcp->count += rmqueue_bulk(zone, 0,
      pcp->batch, &pcp->list);
  if (pcp->count) {
   page = list_entry(pcp->list.next, struct page, lru);
   list_del(&page->lru);
   pcp->count--;
  }
  local_irq_restore(flags);
  put_cpu();
}

// Nothing obtained so far (order > 0, or the cache stayed empty):
// allocate directly from the buddy system.
if (page == NULL) {
  spin_lock_irqsave(&zone->lock, flags);
  page = __rmqueue(zone, order);
  spin_unlock_irqrestore(&zone->lock, flags);
}

if (page != NULL) {
  BUG_ON(bad_range(zone, page));
  mod_page_state_zone(zone, pgalloc, 1 << order);
  prep_new_page(page, order);

if (gfp_flags & __GFP_ZERO)
prep_zero_page(page, order, gfp_flags);

if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
}
return page;
}

// Return a block of 2^order page frames to the buddy system of a zone,
// merging it with its free buddy as often as possible.
// The caller free_pages_bulk() holds zone->lock around this call.
static inline void __free_pages_bulk (struct page *page,
struct zone *zone, unsigned int order)
{
unsigned long page_idx;
int order_size = 1 << order;

if (unlikely(order))
destroy_compound_page(page, order);

/* index of the page within its MAX_ORDER-sized, aligned group */
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

/* the block must be aligned to its own size */
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));

zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
  unsigned long combined_idx;
  struct free_area *area;
  struct page *buddy;

  // Locate the buddy block and the index of the merged block.
  combined_idx = __find_combined_index(page_idx, order);
  buddy = __page_find_buddy(page, page_idx, order);

  /* stop merging when the buddy is outside the zone or not a free block of this order */
  if (bad_range(zone, buddy))
   break;
  if (!page_is_buddy(buddy, order))
   break;
  /* Move the buddy up one level. */
  list_del(&buddy->lru);
  area = zone->free_area + order;
  area->nr_free--;
  rmv_page_order(buddy);
  /* continue with the merged block, one order higher */
  page = page + (combined_idx - page_idx);
  page_idx = combined_idx;
  order++;
}
/* put the (possibly merged) block on the free list of its final order */
set_page_order(page, order);
list_add(&page->lru, &zone->free_area[order].free_list);
zone->free_area[order].nr_free++;
}

/*
 * Free up to 'count' blocks of 2^order pages taken from the tail of 'list'
 * back into 'zone'.  Also clears the zone's all_unreclaimable flag and its
 * pages_scanned counter.  Returns the number of blocks freed.
 */
static int
free_pages_bulk(struct zone *zone, int count,
		struct list_head *list, unsigned int order)
{
	unsigned long irq_state;
	int freed = 0;

	spin_lock_irqsave(&zone->lock, irq_state);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	for (; !list_empty(list) && count--; freed++) {
		struct page *victim = list_entry(list->prev, struct page, lru);

		/* have to delete it as __free_pages_bulk list manipulates */
		list_del(&victim->lru);
		__free_pages_bulk(victim, zone, order);
	}
	spin_unlock_irqrestore(&zone->lock, irq_state);

	return freed;
}

/*
 * Release a single page frame into the current CPU's hot (cold == 0) or
 * cold (cold != 0) per-cpu list of the page's zone.  When the list reaches
 * its high mark, one batch of pages is handed back to the buddy allocator.
 */
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *cache;
	unsigned long irq_state;

	arch_free_page(page, 0);

	kernel_map_pages(page, 1, 0);
	inc_page_state(pgfree);
	if (PageAnon(page)) {
		page->mapping = NULL;
	}
	free_pages_check(__FUNCTION__, page);

	cache = &zone_pcp(zone, get_cpu())->pcp[cold];
	local_irq_save(irq_state);
	list_add(&page->lru, &cache->list);
	if (++cache->count >= cache->high) {
		cache->count -= free_pages_bulk(zone, cache->batch,
						&cache->list, 0);
	}
	local_irq_restore(irq_state);
	put_cpu();
}

/*
 * Core of the zone allocator: allocate a block of 2^order pages from the
 * zones in 'zonelist' according to gfp_mask.
 *
 * The zonelist is walked several times with progressively weaker conditions:
 *   1. against the pages_low watermark (optionally with one early
 *      zone_reclaim() attempt per zone);
 *   2. after waking kswapd, against pages_min, relaxed further by
 *      can_try_harder and __GFP_HIGH;
 *   3. for PF_MEMALLOC / TIF_MEMDIE callers outside interrupt context,
 *      with no watermark check at all (unless __GFP_NOMEMALLOC);
 *   4. if the caller may sleep, after try_to_free_pages(); when that makes
 *      no progress and the mask allows it, out_of_memory() is called and the
 *      whole procedure restarts.
 *
 * Returns the allocated page, or NULL on failure (a rate-limited warning is
 * printed unless __GFP_NOWARN is set).
 */
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
struct zone **zones, *z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
int classzone_idx;
int do_retry;
int can_try_harder;
int did_some_progress;

might_sleep_if(wait);

/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or is the caller has realtime scheduling
* policy
*/
can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;

zones = zonelist->zones; /* the list of zones suitable for gfp_mask */

if (unlikely(zones[0] == NULL)) {
/* Should this ever happen?? */
return NULL;
}

classzone_idx = zone_idx(zones[0]);

restart:
/*
* Go through the zonelist once, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
int do_reclaim = should_reclaim_zone(z, gfp_mask);

if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
continue;

  /*
   * If the zone is to attempt early page reclaim then this loop
   * will try to reclaim pages and check the watermark a second
   * time before giving up and falling back to the next zone.
   */
zone_reclaim_retry:
  // Check whether the zone has enough free page frames (pages_low watermark).
  if (!zone_watermark_ok(z, order, z->pages_low,
           classzone_idx, 0, 0))
  {
   // No early reclaim for this zone: move on to the next zone.
   if (!do_reclaim)
    continue;
   else
   {
    // Reclaim from this zone, then re-check the watermark once;
    // if it fails again the loop moves on to the next zone.
    zone_reclaim(z, gfp_mask, order);
    /* Only try reclaim once */
    do_reclaim = 0;
    goto zone_reclaim_retry;
   }
  }

  // Watermark satisfied: try to allocate and return.
  page = buffered_rmqueue(z, order, gfp_mask);
  if (page)
   goto got_pg;
}

// Wake kswapd for every zone in the list.
for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z, order);

/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
*
* This is the last chance, in general, before the goto nopage.
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for (i = 0; (z = zones[i]) != NULL; i++)
{
  // Re-check against the lower pages_min watermark (relaxed further
  // by can_try_harder and __GFP_HIGH).
  if (!zone_watermark_ok(z, order, z->pages_min,
           classzone_idx, can_try_harder,
           gfp_mask & __GFP_HIGH))
   continue;

  if (wait && !cpuset_zone_allowed(z, gfp_mask))
   continue;
  // Eligible zone: try the allocation.
  page = buffered_rmqueue(z, order, gfp_mask);
  if (page)
   goto got_pg;
}

/* This allocation should allow future memory freeing. */

// If the caller is itself allocating in order to free memory
// (PF_MEMALLOC or TIF_MEMDIE), skip the watermark checks entirely.
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
   && !in_interrupt())
{
  if (!(gfp_mask & __GFP_NOMEMALLOC))
  {
   /* go through the zonelist yet again, ignoring mins */
   for (i = 0; (z = zones[i]) != NULL; i++) {
    if (!cpuset_zone_allowed(z, gfp_mask))
     continue;
    page = buffered_rmqueue(z, order, gfp_mask);
    if (page)
     goto got_pg;
   }
  }
  goto nopage;
}

// Callers that may not sleep fail here.
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;

rebalance:
cond_resched();

/* We now go into synchronous reclaim */
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
// Run reclaim synchronously in the caller's context.
did_some_progress = try_to_free_pages(zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

cond_resched();
// Reclaim made some progress.
if (likely(did_some_progress))
{
  // Scan the zonelist again.
  for (i = 0; (z = zones[i]) != NULL; i++) {
   if (!zone_watermark_ok(z, order, z->pages_min,
            classzone_idx, can_try_harder,
            gfp_mask & __GFP_HIGH))
    continue;

if (!cpuset_zone_allowed(z, gfp_mask))
continue;

   page = buffered_rmqueue(z, order, gfp_mask);
   if (page)
    goto got_pg;
  }
}
// Reclaim made no progress: invoke the OOM killer, but only when
// __GFP_FS is set and __GFP_NORETRY is not.
else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
  /*
   * Go through the zonelist yet one more time, keep
   * very high watermark here, this is only to catch
   * a parallel oom killing, we must fail if we're still
   * under heavy pressure.
   */
  for (i = 0; (z = zones[i]) != NULL; i++) {
   if (!zone_watermark_ok(z, order, z->pages_high,
            classzone_idx, 0, 0))
    continue;

if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
continue;

   page = buffered_rmqueue(z, order, gfp_mask);
   if (page)
    goto got_pg;
  }

out_of_memory(gfp_mask, order);
goto restart;
}

/*
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
* <= 3, but that may not be true in other implementations.
*/
// Decide whether to loop back to rebalance: retry when order <= 3,
// __GFP_REPEAT or __GFP_NOFAIL is set, and __GFP_NORETRY is not.
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
  if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
   do_retry = 1;
  if (gfp_mask & __GFP_NOFAIL)
   do_retry = 1;
}
if (do_retry) {
  blk_congestion_wait(WRITE, HZ/50);
  goto rebalance;
}

nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
  printk(KERN_WARNING "%s: page allocation failure."
   " order:%d, mode:0x%x\n",
   p->comm, order, gfp_mask);
  dump_stack();
  show_mem();
}
return NULL;
got_pg:
zone_statistics(zonelist, z);
return page;
}

/*
 * Watermark test for zone 'z' and an allocation of 2^order pages.
 *
 * 'mark' is the watermark to test against; it is reduced by half when
 * gfp_high is set and by a further quarter when can_try_harder is set.
 * Returns 1 when the free pages left after the allocation stay above the
 * mark plus lowmem_reserve[classzone_idx], and enough of them sit in
 * blocks of sufficient order; returns 0 otherwise.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		int classzone_idx, int can_try_harder, int gfp_high)
{
	long floor = mark;
	/* avail may go negative - that's OK */
	long avail = z->free_pages - (1 << order) + 1;
	int cur = 0;

	if (gfp_high)
		floor -= floor / 2;
	if (can_try_harder)
		floor -= floor / 4;

	/* Not enough left over once the request itself is taken out. */
	if (avail <= floor + z->lowmem_reserve[classzone_idx])
		return 0;

	while (cur < order) {
		/* At the next order, this order's pages become unavailable */
		avail -= z->free_area[cur].nr_free << cur;

		/* Require fewer higher order pages to be free */
		floor >>= 1;

		if (avail <= floor)
			return 0;
		cur++;
	}
	return 1;
}

2）slab分配器

伙伴系统只能以页框为单位进行分配；slab分配器则负责处理小数据块的分配和释放请求。工作原理：一个高速缓存负责某一特定大小数据块的分配和释放请求，其中包含多个slab，每个slab包含一个或多个连续的物理页，用于存放真正的对象。

//slab分配器初始化。

/*
 * Bootstrap the slab allocator.
 *
 * Sets up the statically allocated cache_cache by hand, creates the kmalloc
 * caches (the INDEX_AC and INDEX_L3 ones first), swaps the bootstrap
 * array_cache and kmem_list3 data for kmalloc'ed copies, calls
 * enable_cpucache() on every cache in cache_chain, marks the allocator
 * FULL and registers the CPU notifier.
 */
void __init kmem_cache_init(void)
{
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
int i;

for (i = 0; i < NUM_INIT_LISTS; i++) {
  kmem_list3_init(&initkmem_list3[i]);
  if (i < MAX_NUMNODES)
   cache_cache.nodelists[i] = NULL;
}

/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
// With more than 32MB of physical memory, raise the slab page-order limit
// to BREAK_GFP_ORDER_HI.
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;

/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the kmem_cache_t
*    structures of all caches, except cache_cache itself: cache_cache
*    is statically allocated.
*    Initially an __init data area is used for the head array and the
*    kmem_list3 structures, it's replaced with a kmalloc allocated
*    array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
*    The kmem_cache_t for the new cache is allocated normally.
*    An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
*    head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
*    kmalloc cache with kmalloc allocated arrays.
* 5) Replace the __init data for kmem_list3 for cache_cache and
*    the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/

/* 1) create the cache_cache */
// Set up the first cache, cache_cache, by hand; the descriptors of all
// other caches are allocated from it.
init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
&left_over, &cache_cache.num);
if (!cache_cache.num)
BUG();

cache_cache.colour = left_over/cache_cache.colour_off;
cache_cache.colour_next = 0;
cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());

/* 2+3) create the kmalloc caches */
// Create all the caches that back kmalloc.
sizes = malloc_sizes;
names = cache_names;

/* Initialize the caches that provide memory for the array cache
* and the kmem_list3 structures first.
* Without this, further allocations will bug
*/

// These two caches must exist first, because creating the remaining kmalloc
// caches itself allocates with kmalloc.  Their struct array_cache and
// struct kmem_list3 come from static data, so setting them up does not
// need kmalloc.
sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

if (INDEX_AC != INDEX_L3)
  sizes[INDEX_L3].cs_cachep =
   kmem_cache_create(names[INDEX_L3].name,
    sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
    (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

while (sizes->cs_size != ULONG_MAX) {
  // Create every remaining kmalloc cache (entries already created above
  // are skipped by the cs_cachep check).
  /*
   * For performance, all the general caches are L1 aligned.
   * This should be particularly beneficial on SMP boxes, as it
   * eliminates "false sharing".
   * Note for systems short on memory removing the alignment will
   * allow tighter packing of the smaller caches.
   */
  if(!sizes->cs_cachep)
   sizes->cs_cachep = kmem_cache_create(names->name,
    sizes->cs_size, ARCH_KMALLOC_MINALIGN,
    (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

  /* Inc off-slab bufctl limit until the ceiling is hit. */
  if (!(OFF_SLAB(sizes->cs_cachep))) {
   offslab_limit = sizes->cs_size-sizeof(struct slab);
   offslab_limit /= sizeof(kmem_bufctl_t);
  }

  // Matching DMA cache for the same object size.
  sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
   sizes->cs_size, ARCH_KMALLOC_MINALIGN,
   (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
   NULL, NULL);

  sizes++;
  names++;
}

/* 4) Replace the bootstrap head arrays */
// Replace the static struct array_cache instances with kmalloc'ed copies.
{
  void * ptr;

  // Replace cache_cache's static struct array_cache with a dynamic one.
  ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

  local_irq_disable();
  BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
  memcpy(ptr, ac_data(&cache_cache),
    sizeof(struct arraycache_init));
  cache_cache.array[smp_processor_id()] = ptr;
  local_irq_enable();

// Same replacement for the kmalloc cache at index INDEX_AC.
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

  local_irq_disable();
  BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
    != &initarray_generic.cache);
  memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
    sizeof(struct arraycache_init));
  malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
      ptr;
  local_irq_enable();
}

/* 5) Replace the bootstrap kmem_list3's */
// Replace the static struct kmem_list3 instances of cache_cache and of the
// INDEX_AC / INDEX_L3 kmalloc caches.
{
  int node;
  /* Replace the static kmem_list3 structures for the boot cpu */

  init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
    numa_node_id());

  for_each_online_node(node) {
   init_list(malloc_sizes[INDEX_AC].cs_cachep,
     &initkmem_list3[SIZE_AC+node], node);

   if (INDEX_AC != INDEX_L3) {
    init_list(malloc_sizes[INDEX_L3].cs_cachep,
      &initkmem_list3[SIZE_L3+node],
      node);
   }
  }
}

/* 6) resize the head arrays to their final sizes */
// Re-tune the per-cpu arrays and kmem_list3 data of every cache created so
// far, via enable_cpucache().
{
  kmem_cache_t *cachep;
  down(&cache_chain_sem);
  list_for_each_entry(cachep, &cache_chain, next)
   enable_cpucache(cachep);
  up(&cache_chain_sem);
}

/* Done! */
g_cpucache_up = FULL;

/* Register a cpu startup notifier callback
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);

/* The reap timers are started later, with a module init call:
* That part of the kernel is not yet operational.
*/
}

/*
 * Start the per-cpu timer, via start_cpu_timer(), on every CPU that is
 * currently online.  Always returns 0.
 */
static int __init cpucache_init(void)
{
	int id;

	/*
	 * Register the timers that return unneeded
	 * pages to gfp.
	 */
	for_each_online_cpu(id) {
		start_cpu_timer(id);
	}

	return 0;
}

/*
 * kmem_cache_create - create a dedicated slab cache.
 * @name:  cache name; must be non-NULL and unique among existing caches
 * @size:  object size in bytes (rounded up to a word multiple)
 * @align: minimum object alignment requested by the caller
 * @flags: SLAB_* creation flags; anything outside CREATE_MASK is a BUG
 * @ctor:  optional object constructor
 * @dtor:  optional object destructor (requires @ctor)
 *
 * Must not be called from interrupt context.  Works out the final alignment
 * and object size, picks the slab page order and the number of objects per
 * slab, decides whether the slab management data lives on or off the slab,
 * sets up the per-cpu array and per-node lists (specially during bootstrap,
 * i.e. while g_cpucache_up != FULL), and finally links the cache into
 * cache_chain.
 *
 * Returns the new cache descriptor, or NULL on failure; with SLAB_PANIC set
 * a failure panics instead.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
		in_interrupt() ||
		(size < BYTES_PER_WORD) ||
		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
		(dtor && !ctor)) {
			printk(KERN_ERR "%s: Early error in slab %s\n",
					__FUNCTION__, name);
			BUG();
		}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but inital state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
				"requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}

#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(dtor);

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)
		BUG();

	/* Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) {
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
	}

	/* calculate out the final buffer alignment: */

	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Default alignment: as specified by the arch code.
		 * Except if an object is really small, then squeeze multiple
		 * objects into one cacheline.
		 */
		ralign = cache_line_size();
		while (size <= ralign/2)
			ralign /= 2;
	} else {
		/* No cache-line alignment requested: word alignment. */
		ralign = BYTES_PER_WORD;
	}

	/* 2) arch mandated alignment: disables debug if necessary */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}

	/* 3) caller mandated alignment: disables debug if necessary */
	if (ralign < align) {
		/* The caller asked for more than computed so far: honour it. */
		ralign = align;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
	}

	/* 4) Store it. Note that the debug code below can reduce
	 *    the alignment to BYTES_PER_WORD.
	 */
	align = ralign;

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto opps;
	memset(cachep, 0, sizeof(kmem_cache_t));

#if DEBUG
	cachep->reallen = size;

	if (flags & SLAB_RED_ZONE) {
		/* redzoning only works with word aligned caches */
		align = BYTES_PER_WORD;

		/* add space for red zone words */
		cachep->dbghead += BYTES_PER_WORD;
		size += 2*BYTES_PER_WORD;
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires word alignment and
		 * one word storage behind the end of the real
		 * object.
		 */
		align = BYTES_PER_WORD;
		size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
		cachep->dbghead += PAGE_SIZE - size;
		size = PAGE_SIZE;
	}
#endif
#endif

	/* Determine if the slab management is 'on' or 'off' slab. */
	if (size >= (PAGE_SIZE>>3))
		/*
		 * Size is large (at least 1/8 of a page), assume best to
		 * place the slab management obj off-slab (should allow
		 * better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	/* Round the object size up to the alignment computed above. */
	size = ALIGN(size, align);

	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		/* One page per slab. */
		cachep->gfporder = 0;
		/* Objects per slab and the space left over for colouring. */
		cache_estimate(cachep->gfporder, size, align, flags,
					&left_over, &cachep->num);
	} else {
		/*
		 * Calculate size (in pages) of slabs, and the num of objs per
		 * slab. This could be made much more intelligent. For now,
		 * try to avoid using high page-orders for slabs. When the
		 * gfp() funcs are more friendly towards high-order requests,
		 * this should be changed.
		 */
		do {
			unsigned int break_flag = 0;
cal_wastage:
			cache_estimate(cachep->gfporder, size, align, flags,
						&left_over, &cachep->num);
			if (break_flag)
				break;
			if (cachep->gfporder >= MAX_GFP_ORDER)
				break;
			if (!cachep->num)
				goto next;
			/* Too many objects for an off-slab bufctl array. */
			if (flags & CFLGS_OFF_SLAB &&
					cachep->num > offslab_limit) {
				/* This num of objs will cause problems. */
				cachep->gfporder--;
				break_flag++;
				goto cal_wastage;
			}

			/*
			 * Large num of objs is good, but v. large slabs are
			 * currently bad for the gfp()s.
			 */
			if (cachep->gfporder >= slab_break_gfp_order)
				break;

			/* Stop once at most 1/8 of the slab is left over. */
			if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
				break;	/* Acceptable internal fragmentation. */
next:
			cachep->gfporder++;
		} while (1);
	}

	/* No page order could hold even one object: give up. */
	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		goto opps;
	}
	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
				+ sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
	}

	/* Colour offset: one cache line, but at least the alignment. */
	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	/* Number of colours that fit into the left-over space. */
	cachep->colour = left_over/cachep->colour_off;
	/* Size of the slab management data itself. */
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (flags & SLAB_CACHE_DMA)
		cachep->gfpflags |= GFP_DMA;
	spin_lock_init(&cachep->spinlock);
	/* Final (aligned) object size. */
	cachep->objsize = size;

	/* Off-slab: pick the general cache the slab descriptors come from. */
	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
	cachep->ctor = ctor;
	cachep->dtor = dtor;
	cachep->name = name;

	/* Don't let CPUs to come and go */
	lock_cpu_hotplug();

	/* Set up the per-cpu array_cache and the per-node kmem_list3. */
	if (g_cpucache_up == FULL) {
		/* Normal case: the allocator is fully initialised. */
		enable_cpucache(cachep);
	} else {
		/* Still bootstrapping. */
		if (g_cpucache_up == NONE) {
			/* Note: the first kmem_cache_create must create
			 * the cache that's used by kmalloc(24), otherwise
			 * the creation of further caches will BUG().
			 */
			cachep->array[smp_processor_id()] =
				&initarray_generic.cache;

			/* If the cache that's used by
			 * kmalloc(sizeof(kmem_list3)) is the first cache,
			 * then we need to set up all its list3s, otherwise
			 * the creation of further caches will BUG().
			 */
			set_up_list3s(cachep, SIZE_AC);
			if (INDEX_AC == INDEX_L3)
				g_cpucache_up = PARTIAL_L3;
			else
				g_cpucache_up = PARTIAL_AC;
		} else {
			/* The array_cache can now come from kmalloc. */
			cachep->array[smp_processor_id()] =
				kmalloc(sizeof(struct arraycache_init),
						GFP_KERNEL);

			if (g_cpucache_up == PARTIAL_AC) {
				/* Second cache: its list3s are still static. */
				set_up_list3s(cachep, SIZE_L3);
				g_cpucache_up = PARTIAL_L3;
			} else {
				/* Later caches: kmalloc a list3 per node. */
				int node;
				for_each_online_node(node) {

					cachep->nodelists[node] =
						kmalloc_node(sizeof(struct kmem_list3),
								GFP_KERNEL, node);
					BUG_ON(!cachep->nodelists[node]);
					kmem_list3_init(cachep->nodelists[node]);
				}
			}
		}
		cachep->nodelists[numa_node_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;

		BUG_ON(!ac_data(cachep));
		ac_data(cachep)->avail = 0;
		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
		ac_data(cachep)->batchcount = 1;
		ac_data(cachep)->touched = 0;
		cachep->batchcount = 1;
		cachep->limit = BOOT_CPUCACHE_ENTRIES;
	}

	/* Need the semaphore to access the chain. */
	down(&cache_chain_sem);
	/* Refuse to register two caches under the same name. */
	{
		struct list_head *p;
		mm_segment_t old_fs;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		list_for_each(p, &cache_chain) {
			kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
			char tmp;
			/* This happens when the module gets unloaded and doesn't
			   destroy its slab cache and noone else reuses the vmalloc
			   area of the module. Print a warning. */
			if (__get_user(tmp,pc->name)) {
				printk("SLAB: cache with size %d has lost its name\n",
					pc->objsize);
				continue;
			}
			if (!strcmp(pc->name,name)) {
				printk("kmem_cache_create: duplicate cache %s\n",name);
				up(&cache_chain_sem);
				unlock_cpu_hotplug();
				BUG();
			}
		}
		set_fs(old_fs);
	}

	/* cache setup completed, link it into the list */
	list_add(&cachep->next, &cache_chain);
	up(&cache_chain_sem);
	unlock_cpu_hotplug();
opps:
	if (!cachep && (flags & SLAB_PANIC))
		panic("kmem_cache_create(): failed to create slab `%s'\n",
			name);
	return cachep;
}

/*
 * Work out how many objects of 'size' bytes fit into a slab of 2^gfporder
 * pages, and how many bytes are left over afterwards.
 *
 * When the slab management data lives on the slab (CFLGS_OFF_SLAB not set
 * in 'flags'), room for one struct slab plus one kmem_bufctl_t per object,
 * rounded up to 'align', is reserved as well.
 *
 * @num receives the object count (capped at SLAB_LIMIT); @left_over
 * receives the unused bytes, which the callers turn into slab colours.
 */
static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
		int flags, size_t *left_over, unsigned int *num)
{
	int i;
	/* Total bytes in the slab. */
	size_t wastage = PAGE_SIZE<<gfporder;
	size_t extra = 0;
	size_t base = 0;

	/* On-slab management: account for the descriptor and bufctl array. */
	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(struct slab);
		extra = sizeof(kmem_bufctl_t);
	}

	/* Find the first object count that no longer fits, then step back. */
	i = 0;
	while (i*size + ALIGN(base+i*extra, align) <= wastage)
		i++;
	if (i > 0)
		i--;

	if (i > SLAB_LIMIT)
		i = SLAB_LIMIT;

	*num = i;
	/* Whatever is not used by objects or management data is left over. */
	wastage -= i*size;
	wastage -= ALIGN(base+i*extra, align);
	*left_over = wastage;
}

/*
 * Choose the per-cpu array limit and the shared-array factor for 'cachep'
 * from its object size, then apply them through do_tune_cpucache() with a
 * batch count of (limit+1)/2.  A failure is only reported with printk().
 */
static void enable_cpucache(kmem_cache_t *cachep)
{
	int limit = 120;
	int shared = 0;
	int err;

	/* The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	/* Larger objects get a smaller limit; the last match below wins. */
	if (cachep->objsize > 256)
		limit = 54;
	if (cachep->objsize > 1024)
		limit = 24;
	if (cachep->objsize > PAGE_SIZE)
		limit = 8;
	if (cachep->objsize > 131072)
		limit = 1;

	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
	 * allocation behaviour: Most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing between
	 * cpus is necessary. This is provided by a shared array. The array
	 * replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus disabled by default.
	 */
#ifdef CONFIG_SMP
	if (cachep->objsize <= PAGE_SIZE)
		shared = 8;
#endif

#if DEBUG
	/* With debugging enabled, large batchcount lead to excessively
	 * long periods with disabled local interrupts. Limit the
	 * batchcount
	 */
	if (limit > 32)
		limit = 32;
#endif
	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
	if (!err)
		return;

	printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
			cachep->name, -err);
}

/*
 * Give every online CPU a fresh array_cache sized by 'limit' and
 * 'batchcount', store the new tunables in 'cachep', hand the objects that
 * were sitting in the old per-cpu arrays back to the slab lists, and then
 * (re)build the per-node lists via alloc_kmemlist().
 *
 * Returns 0 on success, or -ENOMEM when a per-cpu array cannot be
 * allocated.  A failure of alloc_kmemlist() is treated as a BUG.
 */
static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
		int shared)
{
	struct ccupdate_struct update;
	int cpu, rc;

	memset(&update.new,0,sizeof(update.new));
	/* Allocate and initialise one array_cache per online CPU. */
	for_each_online_cpu(cpu) {
		update.new[cpu] = alloc_arraycache(cpu_to_node(cpu), limit,
							batchcount);
		if (!update.new[cpu]) {
			/* Undo the allocations made so far. */
			while (--cpu >= 0)
				kfree(update.new[cpu]);
			return -ENOMEM;
		}
	}
	update.cachep = cachep;

	/*
	 * do_ccupdate_local() installs update.new[cpu] as that CPU's array
	 * and stores the previous array back into update.new[cpu].
	 */
	smp_call_function_all_cpus(do_ccupdate_local, (void *)&update);

	check_irq_on();
	spin_lock_irq(&cachep->spinlock);
	/* Record the new tunables in the cache descriptor. */
	cachep->batchcount = batchcount;
	cachep->limit = limit;
	cachep->shared = shared;
	spin_unlock_irq(&cachep->spinlock);

	for_each_online_cpu(cpu) {
		struct array_cache *old = update.new[cpu];
		int node;

		if (old) {
			node = cpu_to_node(cpu);
			spin_lock_irq(&cachep->nodelists[node]->list_lock);
			/* Return the old array's objects to the slab lists. */
			free_block(cachep, old->entry, old->avail, node);
			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
			kfree(old);
		}
	}

	/* Allocate or update the per-node struct kmem_list3. */
	rc = alloc_kmemlist(cachep);
	if (rc) {
		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
				cachep->name, -rc);
		BUG();
	}
	return 0;
}

/*
 * Swap this CPU's array_cache of info->cachep with the replacement stored
 * in info->new[] for this CPU; the previous array is left in that slot so
 * the caller can drain and free it.  'info' is a struct ccupdate_struct.
 */
static void do_ccupdate_local(void *info)
{
	struct ccupdate_struct *update = (struct ccupdate_struct *)info;
	struct array_cache *previous;
	int cpu;

	check_irq_off();
	previous = ac_data(update->cachep);

	cpu = smp_processor_id();
	update->cachep->array[cpu] = update->new[cpu];
	update->new[cpu] = previous;
}

/*
 * Allocate or refresh the per-node struct kmem_list3 of 'cachep' for every
 * online node.
 *
 * Each node gets a new shared array_cache sized shared * batchcount (and,
 * with CONFIG_NUMA, an alien cache).  If the node already has a kmem_list3,
 * the objects of its old shared array are handed back with free_block()
 * and the new arrays are installed under list_lock; otherwise a new
 * kmem_list3 is allocated and initialised.
 *
 * Returns 0 on success or -ENOMEM.  On failure the arrays allocated for
 * the failing node are released before returning; nodes that were already
 * processed keep their new lists.
 */
static int alloc_kmemlist(kmem_cache_t *cachep)
{
	int node;
	struct kmem_list3 *l3;
	int err = 0;

	for_each_online_node(node) {
		struct array_cache *nc = NULL, *new;
		struct array_cache **new_alien = NULL;
#ifdef CONFIG_NUMA
		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
			goto fail;
#endif
		if (!(new = alloc_arraycache(node, (cachep->shared*
				cachep->batchcount), 0xbaadf00d))) {
			/* Do not leak the alien cache allocated just above. */
			free_alien_cache(new_alien);
			goto fail;
		}
		if ((l3 = cachep->nodelists[node])) {

			spin_lock_irq(&l3->list_lock);

			/* Give the old shared array's objects back to the slabs. */
			if ((nc = cachep->nodelists[node]->shared))
				free_block(cachep, nc->entry,
							nc->avail, node);

			l3->shared = new;
			/* Keep an existing alien cache; install ours otherwise. */
			if (!cachep->nodelists[node]->alien) {
				l3->alien = new_alien;
				new_alien = NULL;
			}
			l3->free_limit = (1 + nr_cpus_node(node))*
				cachep->batchcount + cachep->num;
			spin_unlock_irq(&l3->list_lock);
			kfree(nc);
			free_alien_cache(new_alien);
			continue;
		}
		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
						GFP_KERNEL, node))) {
			/* Nothing owns these yet: release them before bailing out. */
			kfree(new);
			free_alien_cache(new_alien);
			goto fail;
		}

		kmem_list3_init(l3);
		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
		l3->shared = new;
		l3->alien = new_alien;
		l3->free_limit = (1 + nr_cpus_node(node))*
			cachep->batchcount + cachep->num;
		cachep->nodelists[node] = l3;
	}
	return err;
fail:
	err = -ENOMEM;
	return err;
}

/*
 * Allocate one object from 'cachep' using the given allocation flags.
 * Thin wrapper around __cache_alloc(); returns whatever it returns.
 */
void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
	void *obj = __cache_alloc(cachep, flags);

	return obj;
}

/*
 * Allocate one object from 'cachep': run the debug pre-check, call
 * ____cache_alloc() with local interrupts disabled, then run the debug
 * post-check and prefetch the object for writing.
 */
static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
	unsigned long irq_state;
	void *obj;

	cache_alloc_debugcheck_before(cachep, flags);

	/* The fast path runs with interrupts off. */
	local_irq_save(irq_state);
	obj = ____cache_alloc(cachep, flags);
	local_irq_restore(irq_state);

	obj = cache_alloc_debugcheck_after(cachep, flags, obj,
					__builtin_return_address(0));
	prefetchw(obj);
	return obj;
}

/*
 * Allocation fast path, called with interrupts off.  Pops an object from
 * the current CPU's array_cache when it has one; otherwise falls back to
 * cache_alloc_refill().
 */
static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
{
	struct array_cache *cpu_cache;

	check_irq_off();
	cpu_cache = ac_data(cachep);

	/* Per-cpu array is empty: go to the slab lists. */
	if (unlikely(!cpu_cache->avail)) {
		STATS_INC_ALLOCMISS(cachep);
		return cache_alloc_refill(cachep, flags);
	}

	/* Hit: take the most recently freed object. */
	STATS_INC_ALLOCHIT(cachep);
	cpu_cache->touched = 1;
	return cpu_cache->entry[--cpu_cache->avail];
}

static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;

check_irq_off();
ac = ac_data(cachep);
retry:
//计算每CPU高速缓存中要补充的对像个数。
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
  /* if there was little recent activity on this
   * cache, then perform only a partial refill.
   * Otherwise we could generate refill bouncing.
   */
  batchcount = BATCHREFILL_LIMIT;
}
l3 = cachep->nodelists[numa_node_id()];

BUG_ON(ac->avail > 0 || !l3);
spin_lock(&l3->list_lock);

//如果所有CPU共亨的高速缓存中有对像。
if (l3->shared) {
  struct array_cache *shared_array = l3->shared;
  if (shared_array->avail) {
   if (batchcount > shared_array->avail)
    batchcount = shared_array->avail;
   shared_array->avail -= batchcount;
   ac->avail = batchcount;
   //复制到当前CPU高速缓存。
   memcpy(ac->entry,
    &(shared_array->entry[shared_array->avail]),
    sizeof(void*)*batchcount);
   shared_array->touched = 1;
   goto alloc_done;
  }
}

//从空闲和半空闲连表移动对像到当前CPU高速缓存。
while (batchcount > 0) {
  struct list_head *entry;
  struct slab *slabp;

  /* Get slab alloc is to come from. */
  //确定从哪个链表里分配对像。
  entry = l3->slabs_partial.next;
  if (entry == &l3->slabs_partial) {
   l3->free_touched = 1;
   entry = l3->slabs_free.next;
   if (entry == &l3->slabs_free)
    //两个链表都没有。
    goto must_grow;
  }

  slabp = list_entry(entry, struct slab, list);
  check_slabp(cachep, slabp);
  check_spinlock_acquired(cachep);
  //从SLAB中移动对像到当前CPU高速缓存。
  while (slabp->inuse < cachep->num && batchcount--) {
   kmem_bufctl_t next;
   STATS_INC_ALLOCED(cachep);
   STATS_INC_ACTIVE(cachep);
   STATS_SET_HIGH(cachep);

   /* get obj pointer */
   //移动对像。
   ac->entry[ac->avail++] = slabp->s_mem +
    slabp->free*cachep->objsize;

   slabp->inuse++;
   //更新空闲对像位置。
   next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
   slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
#endif
      slabp->free = next;
  }
  check_slabp(cachep, slabp);

  /* move slabp to correct slabp list: */
  //移动SLAB到相应的链表。
  list_del(&slabp->list);
  if (slabp->free == BUFCTL_END)
   list_add(&slabp->list, &l3->slabs_full);
  else
   list_add(&slabp->list, &l3->slabs_partial);
}

must_grow:
//高速缓存中空闲对像的个数要减少。
l3->free_objects -= ac->avail;
alloc_done:
spin_unlock(&l3->list_lock);

//如果经过以上努力当前CPU高速缓存还是空的。
if (unlikely(!ac->avail)) {
  int x;
  //添加SLAB到高速缓存。
  x = cache_grow(cachep, flags, numa_node_id());

  // cache_grow can reenable interrupts, then ac could change.
  ac = ac_data(cachep);
  if (!x && ac->avail == 0) // no objects in sight? abort
   return NULL;

if (!ac->avail) // objects refilled by interrupt?
goto retry;
}
ac->touched = 1;
//成功分配对象。
return ac->entry[--ac->avail];
}

/*
 * cache_grow - add one new slab to @cachep on node @nodeid.
 *
 * Pages are obtained with kmem_getpages(), a slab descriptor is set up by
 * alloc_slabmgmt(), the objects are initialised by cache_init_objs(), and
 * the slab is finally queued at the tail of the node's slabs_free list.
 *
 * Interrupts must be off on entry (check_irq_off()).  When @flags contains
 * __GFP_WAIT they are enabled while the pages are allocated and the objects
 * are initialised, and disabled again before returning on every path.
 *
 * Returns 1 when a slab was added, 0 when SLAB_NO_GROW was passed or when
 * either allocation failed.
 */
static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
	struct kmem_list3 *l3;
	struct slab *slabp;
	unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
	unsigned int local_flags;
	size_t offset;
	void *objp;
	int may_sleep;

	/*
	 * Be lazy and only validate the flags here, keeping the check out
	 * of the critical path in kmem_cache_alloc().
	 */
	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
		BUG();
	if (flags & SLAB_NO_GROW)
		return 0;

	local_flags = (flags & SLAB_LEVEL_MASK);
	may_sleep = (local_flags & __GFP_WAIT) != 0;

	/*
	 * Not allowed to sleep: tell the constructor about it, it might
	 * need to know.
	 */
	if (!may_sleep)
		ctor_flags |= SLAB_CTOR_ATOMIC;

	/* About to touch non-constant cache members - take the lock. */
	check_irq_off();
	spin_lock(&cachep->spinlock);

	/*
	 * Pick the colour for this slab and advance (with wrap-around) the
	 * colour for the next one; the byte offset is colour * colour_off.
	 */
	offset = cachep->colour_next++;
	if (cachep->colour_next >= cachep->colour)
		cachep->colour_next = 0;
	offset *= cachep->colour_off;

	spin_unlock(&cachep->spinlock);

	check_irq_off();
	if (may_sleep)
		local_irq_enable();

	/*
	 * The test for a missing atomic flag is performed here rather than
	 * in the more obvious place, simply to shorten the critical path in
	 * kmem_cache_alloc().  A seriously misbehaving caller will
	 * eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);

	/* Get the memory for the objects, from node 'nodeid'. */
	objp = kmem_getpages(cachep, flags, nodeid);
	if (!objp)
		goto out_irq;

	/* Get and initialise the slab management structure. */
	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
	if (!slabp)
		goto out_pages;

	slabp->nodeid = nodeid;
	/* Link the pages back to their cache and slab. */
	set_slab_attr(cachep, slabp, objp);

	/* Initialise the per-object state inside the slab. */
	cache_init_objs(cachep, slabp, ctor_flags);

	if (may_sleep)
		local_irq_disable();
	check_irq_off();

	l3 = cachep->nodelists[nodeid];
	spin_lock(&l3->list_lock);

	/* Make the slab active: queue it on the free list. */
	list_add_tail(&slabp->list, &(l3->slabs_free));
	STATS_INC_GROWN(cachep);
	/* Account for the newly available free objects. */
	l3->free_objects += cachep->num;

	spin_unlock(&l3->list_lock);
	return 1;

out_pages:
	kmem_freepages(cachep, objp);
out_irq:
	if (may_sleep)
		local_irq_disable();
	return 0;
}

/*
 * kmem_getpages - obtain the pages that back one slab of @cachep.
 *
 * The cache's gfpflags are OR-ed into @flags and an order-gfporder block is
 * requested: via alloc_pages() when @nodeid is -1, via alloc_pages_node()
 * otherwise.  On success the nr_slab page state (and slab_reclaim_pages for
 * SLAB_RECLAIM_ACCOUNT caches) is increased by the page count and
 * SetPageSlab() is applied to every page of the block.
 *
 * Returns page_address() of the first page, or NULL if the allocation
 * failed.
 */
static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	void *addr;
	int nr_pages;

	flags |= cachep->gfpflags;

	/* Allocate according to the gfporder chosen when the cache was made. */
	if (likely(nodeid == -1))
		page = alloc_pages(flags, cachep->gfporder);
	else
		page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	if (!page)
		return NULL;

	addr = page_address(page);
	nr_pages = (1 << cachep->gfporder);

	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_add(nr_pages, &slab_reclaim_pages);
	add_page_state(nr_slab, nr_pages);

	/* Flag every page of the block as being used by the slab allocator. */
	for (; nr_pages > 0; nr_pages--, page++)
		SetPageSlab(page);

	return addr;
}

/*
 * alloc_slabmgmt - obtain and initialise the descriptor for a new slab.
 *
 * @objp is the start of the slab's pages and @colour_off the colouring
 * offset in bytes.  For an OFF_SLAB() cache the descriptor is allocated from
 * cachep->slabp_cache using @local_flags; otherwise it is placed inside the
 * slab at @objp + @colour_off and the objects start slab_size bytes later.
 *
 * Sets inuse, colouroff and s_mem.  Returns the descriptor, or NULL when the
 * off-slab allocation fails.
 */
static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
		int colour_off, gfp_t local_flags)
{
	struct slab *desc;
	int obj_start = colour_off;

	if (OFF_SLAB(cachep)) {
		/* Slab management object lives outside the slab. */
		desc = kmem_cache_alloc(cachep->slabp_cache, local_flags);
		if (!desc)
			return NULL;
	} else {
		/* Descriptor sits right after the colouring area. */
		desc = objp + colour_off;
		obj_start += cachep->slab_size;
	}

	desc->inuse = 0;
	desc->colouroff = obj_start;
	desc->s_mem = objp + obj_start;

	return desc;
}

/*
 * set_slab_attr - tie every page of a slab to its cache and slab.
 *
 * Walks the 2^gfporder pages starting at virt_to_page(@objp) and applies
 * SET_PAGE_CACHE(page, @cachep) and SET_PAGE_SLAB(page, @slabp) to each one
 * (per the original note these are kept in page->lru.next and
 * page->lru.prev respectively).
 */
static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
{
	int remaining = 1 << cachep->gfporder;
	struct page *cur = virt_to_page(objp);

	do {
		SET_PAGE_CACHE(cur, cachep);
		SET_PAGE_SLAB(cur, slabp);
		cur++;
		remaining--;
	} while (remaining);
}

/*
 * cache_init_objs - initialise every object of a freshly created slab.
 *
 * For each of the cachep->num objects the constructor (if any) is invoked
 * with @ctor_flags, and the object's bufctl entry is pointed at the next
 * object so that the free objects form a chain starting at index 0 and
 * ending in BUFCTL_END.  In DEBUG builds poisoning, the user word and the
 * red zones are set up as well, and the red zones are re-checked after the
 * constructor has run.
 */
static void cache_init_objs(kmem_cache_t *cachep,
		struct slab *slabp, unsigned long ctor_flags)
{
	int idx = 0;

	while (idx < cachep->num) {
		void *obj = slabp->s_mem + cachep->objsize * idx;

#if DEBUG
		/* Need to poison the object? */
		if (cachep->flags & SLAB_POISON)
			poison_obj(cachep, obj, POISON_FREE);
		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, obj) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, obj) = RED_INACTIVE;
			*dbg_redzone2(cachep, obj) = RED_INACTIVE;
		}

		/*
		 * Constructors are not allowed to allocate memory from the
		 * same cache which they are a constructor for.  Otherwise,
		 * deadlock.  They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
			cachep->ctor(obj + obj_dbghead(cachep), cachep, ctor_flags);

		/* Catch a constructor that wrote outside its object. */
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
						" end of an object");
			if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
						" start of an object");
		}
		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) &&
		    (cachep->flags & SLAB_POISON))
			kernel_map_pages(virt_to_page(obj),
					cachep->objsize / PAGE_SIZE, 0);
#else
		/* Run the constructor, if the cache has one. */
		if (cachep->ctor)
			cachep->ctor(obj, cachep, ctor_flags);
#endif
		/* Chain this object to the one that follows it. */
		slab_bufctl(slabp)[idx] = idx + 1;
		idx++;
	}

	/* Terminate the chain at the last object; the first one is free. */
	slab_bufctl(slabp)[idx - 1] = BUFCTL_END;
	slabp->free = 0;
}

/*
 * kmem_cache_free - release the object @objp back to the cache @cachep.
 *
 * Thin wrapper that brackets __cache_free() with
 * local_irq_save()/local_irq_restore().
 */
void kmem_cache_free(kmem_cache_t *cachep, void *objp)
{
	unsigned long irq_state;

	local_irq_save(irq_state);
	__cache_free(cachep, objp);
	local_irq_restore(irq_state);
}

/*
 * __cache_free - put @objp back into the current CPU's array cache.
 *
 * Runs with interrupts off (check_irq_off()).  If the array cache is already
 * at its limit, cache_flusharray() is called first to make room.
 *
 * With CONFIG_NUMA, an object whose slab belongs to another node is not put
 * into this CPU's array: it goes into that node's alien array cache (after
 * __drain_alien_cache() if the array is full) or, when no alien cache
 * exists, straight to free_block() under the owning node's list_lock.
 */
static inline void __cache_free(kmem_cache_t *cachep, void *objp)
{
	struct array_cache *ac = ac_data(cachep);

	check_irq_off();
	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));

	/*
	 * Make sure we are not freeing an object from another node to the
	 * array cache on this cpu.
	 */
#ifdef CONFIG_NUMA
	{
		struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));

		if (unlikely(slabp->nodeid != numa_node_id())) {
			int nodeid = slabp->nodeid;
			struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
			struct array_cache *alien = NULL;

			STATS_INC_NODEFREES(cachep);
			if (l3->alien)
				alien = l3->alien[nodeid];

			if (alien) {
				spin_lock(&alien->lock);
				if (unlikely(alien->avail == alien->limit))
					__drain_alien_cache(cachep, alien, nodeid);
				alien->entry[alien->avail++] = objp;
				spin_unlock(&alien->lock);
			} else {
				spin_lock(&(cachep->nodelists[nodeid])->list_lock);
				free_block(cachep, &objp, 1, nodeid);
				spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
			}
			return;
		}
	}
#endif
	if (likely(ac->avail < ac->limit)) {
		/* There is still room in this CPU's array cache. */
		STATS_INC_FREEHIT(cachep);
	} else {
		/* Array cache is full: move a batch out of it first. */
		STATS_INC_FREEMISS(cachep);
		cache_flusharray(cachep, ac);
	}
	ac->entry[ac->avail++] = objp;
}

/*
 * cache_flusharray - move a batch of objects out of a per-CPU array cache.
 *
 * Up to ac->batchcount objects are taken from the front of @ac.  If the
 * node has a shared array cache with free room they are copied there
 * (clipped to the room available); otherwise they are handed to
 * free_block().  The remaining entries of @ac are then shifted down to the
 * start of the array.  Runs with interrupts off and takes the node's
 * list_lock.
 */
static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
{
	int node = numa_node_id();
	int batchcount = ac->batchcount;
	struct kmem_list3 *l3;
	struct array_cache *shared_array;
	int max = 0;

#if DEBUG
	BUG_ON(!batchcount || batchcount > ac->avail);
#endif
	check_irq_off();
	l3 = cachep->nodelists[node];
	spin_lock(&l3->list_lock);

	/* Room left in the cache shared by all CPUs, if there is one. */
	shared_array = l3->shared;
	if (shared_array)
		max = shared_array->limit - shared_array->avail;

	if (max) {
		/* Prefer the shared array cache. */
		if (batchcount > max)
			batchcount = max;
		memcpy(&(shared_array->entry[shared_array->avail]),
				ac->entry,
				sizeof(void*)*batchcount);
		shared_array->avail += batchcount;
	} else {
		/* Give the objects back to their slabs. */
		free_block(cachep, ac->entry, batchcount, node);
	}

#if STATS
	{
		/* Count the slabs currently sitting on the free list. */
		int nr_free = 0;
		struct list_head *p;

		for (p = l3->slabs_free.next; p != &(l3->slabs_free); p = p->next) {
			struct slab *slabp = list_entry(p, struct slab, list);

			BUG_ON(slabp->inuse);
			nr_free++;
		}
		STATS_SET_FREEABLE(cachep, nr_free);
	}
#endif
	spin_unlock(&l3->list_lock);

	/* Close the gap left at the front of the array. */
	ac->avail -= batchcount;
	memmove(ac->entry, &(ac->entry[batchcount]),
			sizeof(void*)*ac->avail);
}

/*
 * free_block - return @nr_objects objects from @objpp to their slabs.
 *
 * Each object is pushed onto the front of its slab's free chain and the
 * slab is re-queued on the list of node @node that matches its new state:
 * a slab that still has objects in use goes to the tail of slabs_partial;
 * a slab with no objects in use goes to slabs_free, or is destroyed with
 * slab_destroy() when the node already holds more than free_limit free
 * objects.
 *
 * The node's list_lock is expected to be held
 * (check_spinlock_acquired_node()).
 */
static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
{
	int i;
	struct kmem_list3 *l3;

	for (i = 0; i < nr_objects; i++) {
		void *objp = objpp[i];
		struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
		unsigned int objnr;

		l3 = cachep->nodelists[node];
		/* Unlink the slab; it is re-queued below. */
		list_del(&slabp->list);
		objnr = (objp - slabp->s_mem) / cachep->objsize;
		check_spinlock_acquired_node(cachep, node);
		check_slabp(cachep, slabp);

#if DEBUG
		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
			printk(KERN_ERR "slab: double free detected in cache "
					"'%s', objp %p\n", cachep->name, objp);
			BUG();
		}
#endif
		/* Push the object onto the head of the slab's free chain. */
		slab_bufctl(slabp)[objnr] = slabp->free;
		slabp->free = objnr;
		STATS_DEC_ACTIVE(cachep);
		slabp->inuse--;
		l3->free_objects++;
		check_slabp(cachep, slabp);

		/* Fix up the slab chains. */
		if (slabp->inuse != 0) {
			/*
			 * Unconditionally move a slab to the end of the
			 * partial list on free - maximum time for the other
			 * objects to be freed, too.
			 */
			list_add_tail(&slabp->list, &l3->slabs_partial);
			continue;
		}

		/* Every object in the slab is free now. */
		if (l3->free_objects > l3->free_limit) {
			/* Too many free objects on this node: drop the slab. */
			l3->free_objects -= cachep->num;
			slab_destroy(cachep, slabp);
		} else {
			list_add(&slabp->list, &l3->slabs_free);
		}
	}
}

/*
 * slab_destroy - tear down one slab of @cachep.
 *
 * Runs the cache's destructor (if any) on every object, then releases the
 * slab's pages with kmem_freepages() and, for an OFF_SLAB() cache, returns
 * the descriptor to cachep->slabp_cache.  For SLAB_DESTROY_BY_RCU caches
 * the release is not done here: it is handed to kmem_rcu_free() through
 * call_rcu(), with the descriptor's own storage reused as the
 * struct slab_rcu.
 *
 * In DEBUG builds the poison pattern and the red zones of each object are
 * verified first.
 */
static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
{
	/* Start of the slab's pages: object area minus colouroff. */
	void *addr = slabp->s_mem - slabp->colouroff;

#if DEBUG
	int idx;

	for (idx = 0; idx < cachep->num; idx++) {
		void *obj = slabp->s_mem + cachep->objsize * idx;

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(obj),
						cachep->objsize/PAGE_SIZE,1);
			else
				check_poison_obj(cachep, obj);
#else
			check_poison_obj(cachep, obj);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
						"was overwritten");
			if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
						"was overwritten");
		}
		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
			(cachep->dtor)(obj + obj_dbghead(cachep), cachep, 0);
	}
#else
	if (cachep->dtor) {
		int idx;

		for (idx = 0; idx < cachep->num; idx++) {
			void *obj = slabp->s_mem + cachep->objsize * idx;

			(cachep->dtor)(obj, cachep, 0);
		}
	}
#endif

	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct slab_rcu *slab_rcu = (struct slab_rcu *) slabp;

		slab_rcu->cachep = cachep;
		slab_rcu->addr = addr;
		call_rcu(&slab_rcu->head, kmem_rcu_free);
		return;
	}

	/* Hand the pages back to the page allocator. */
	kmem_freepages(cachep, addr);
	/* An off-slab descriptor has to be freed separately. */
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slabp);
}

/*
 * kmem_cache_destroy - remove a cache together with its slabs and objects.
 *
 * The cache is unlinked from the cache chain and shrunk with
 * __cache_shrink().  If the shrink reports failure the cache is put back
 * on the chain and nothing is freed; otherwise the per-CPU arrays, the
 * per-node list structures (shared and alien caches included) and finally
 * the cache descriptor itself are released.
 *
 * BUG()s on a NULL @cachep or when called from interrupt context.
 * Returns 0 on success, 1 when not all objects could be freed.
 */
int kmem_cache_destroy(kmem_cache_t * cachep)
{
	struct kmem_list3 *l3;
	int ret = 0;
	int i;

	if (!cachep || in_interrupt())
		BUG();

	/* Don't let CPUs come and go. */
	lock_cpu_hotplug();

	/*
	 * Take the cache off the chain of caches.  The chain is never empty:
	 * cache_cache is never destroyed.
	 */
	down(&cache_chain_sem);
	list_del(&cachep->next);
	up(&cache_chain_sem);

	if (__cache_shrink(cachep)) {
		slab_error(cachep, "Can't free all objects");
		/* Still in use: put it back where it came from. */
		down(&cache_chain_sem);
		list_add(&cachep->next,&cache_chain);
		up(&cache_chain_sem);
		ret = 1;
		goto out;
	}

	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
		synchronize_rcu();

	for_each_online_cpu(i)
		kfree(cachep->array[i]);

	/* NUMA: free the list3 structures. */
	for_each_online_node(i) {
		l3 = cachep->nodelists[i];
		if (l3) {
			kfree(l3->shared);
			free_alien_cache(l3->alien);
			kfree(l3);
		}
	}
	kmem_cache_free(&cache_cache, cachep);

out:
	unlock_cpu_hotplug();
	return ret;
}

3）非连续内存的管理

3、进程地址空间的内存管理；

4、内存回收；

阅读(2651) | 评论(0) | 转发(0) |

上一篇：(1)linux启动过程

下一篇：(5)linux文件系统

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6