Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1855822
  • 博文数量: 473
  • 博客积分: 13997
  • 博客等级: 上将
  • 技术积分: 5953
  • 用 户 组: 普通用户
  • 注册时间: 2010-01-22 11:52
文章分类

全部博文(473)

文章存档

2014年(8)

2013年(38)

2012年(95)

2011年(181)

2010年(151)

分类: LINUX

2013-06-03 09:41:33

原文地址:2.6.37的setup_arch 作者:longzhang

暂时放下对内存管理的探讨、转而深入分析setup_arch,目的在于找出与内存管理初始化相关的细节。在遍历代码的过程中,我跳过了一些和特定平台相关的代码,例如EFI等。

1. 首先kernel将全局页表目录page global directory切换到swapper_pg_dir。方法是将initial_page_table的内容复制到swapper_pg_dir里面,并且重置cr3。initial_page_table的初始化可以参见http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=154125。
  1. /*
  2.      * copy kernel address range established so far and switch
  3.      * to the proper swapper page table

  4.      */
  5.     clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
  6.             initial_page_table + KERNEL_PGD_BOUNDARY,
  7.             KERNEL_PGD_PTRS);

  8.     load_cr3(swapper_pg_dir);
  9.     __flush_tlb_all();
2. 初始化中断向量和CPU
  1. early_trap_init();//初始化中断向量1的服务例程为debug所指向的中断服务例程,向量3指向int3所指向的服务例程,向量14则指向page_fault指向的服务例程。
  2. early_cpu_init(); //初始化cpu_devs[X86_VENDOR_NUM] 并以此侦测boot_cpu_data
3. 根据实模式下设置的各个启动参数boot_params设置全局变量
  1. ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
  2.     screen_info = boot_params.screen_info;
  3.     edid_info = boot_params.edid_info;
  4. #ifdef CONFIG_X86_32
  5.     apm_info.bios = boot_params.apm_bios_info;
  6.     ist_info = boot_params.ist_info;
  7.     if (boot_params.sys_desc_table.length != 0) {
  8.         set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
  9.         machine_id = boot_params.sys_desc_table.table[0];
  10.         machine_submodel_id = boot_params.sys_desc_table.table[1];
  11.         BIOS_revision = boot_params.sys_desc_table.table[2];
  12.     }
  13. #endif
  14.     saved_video_mode = boot_params.hdr.vid_mode;
  15.     bootloader_type = boot_params.hdr.type_of_loader;
  16.     if ((bootloader_type >> 4) == 0xe) {
  17.         bootloader_type &= 0xf;
  18.         bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
  19.     }
  20.     bootloader_version = bootloader_type & 0xf;
  21.     bootloader_version |= boot_params.hdr.ext_loader_ver << 4;

  22. #ifdef CONFIG_BLK_DEV_RAM
  23.     rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
  24.     rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
  25.     rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
  26. #endif
  27. #ifdef CONFIG_EFI
  28.     if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
  29. #ifdef CONFIG_X86_32
  30.          "EL32",
  31. #else
  32.          "EL64",
  33. #endif
  34.      4)) {
  35.         efi_enabled = 1;
  36.         efi_memblock_x86_reserve_range();
  37.     }
  38. #endif
        copy_edd();
if (!boot_params.hdr.root_flags)
root_mountflags &= ~MS_RDONLY;

#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
#endif

strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;

        parse_early_param();

memblock_x86_reserve_range_setup_data(); /* after early param, so could get panic from serial */

4. 初始化early_ioremap
  1. early_ioremap_init();对于early_ioremap的解释参考http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=263951
5. 调用x86_init.oem.arch_setup()。x86_init结构的初始化值在x86_init.c中定义,其具体内容如下。所以arch_setup实际执行的是x86_init_noop(). 这是一个空函数,什么都没有做。x86平台的其他非普通PC如果需要进行一些特定的初始化,可以重置x86_init的结构。
  1. /*
  2.  * The platform setup functions are preset with the default functions
  3.  * for standard PC hardware.
  4.  */
  5. struct x86_init_ops x86_init __initdata = {

  6.     .resources = {
  7.         .probe_roms        = x86_init_noop,
  8.         .reserve_resources    = reserve_standard_io_resources,
  9.         .memory_setup        = default_machine_specific_memory_setup,
  10.     },

  11.     .mpparse = {
  12.         .mpc_record        = x86_init_uint_noop,
  13.         .setup_ioapic_ids    = x86_init_noop,
  14.         .mpc_apic_id        = default_mpc_apic_id,
  15.         .smp_read_mpc_oem    = default_smp_read_mpc_oem,
  16.         .mpc_oem_bus_info    = default_mpc_oem_bus_info,
  17.         .find_smp_config    = default_find_smp_config,
  18.         .get_smp_config        = default_get_smp_config,
  19.     },

  20.     .irqs = {
  21.         .pre_vector_init    = init_ISA_irqs,
  22.         .intr_init        = native_init_IRQ,
  23.         .trap_init        = x86_init_noop,
  24.     },

  25.     .oem = {
  26.         .arch_setup        = x86_init_noop,
  27.         .banner            = default_banner,
  28.     },

  29.     .paging = {
  30.         .pagetable_setup_start    = native_pagetable_setup_start,
  31.         .pagetable_setup_done    = native_pagetable_setup_done,
  32.     },

  33.     .timers = {
  34.         .setup_percpu_clockev    = setup_boot_APIC_clock,
  35.         .tsc_pre_init        = x86_init_noop,
  36.         .timer_init        = hpet_time_init,
  37.     },

  38.     .iommu = {
  39.         .iommu_init        = iommu_init_noop,
  40.     },

  41.     .pci = {
  42.         .init            = x86_default_pci_init,
  43.         .init_irq        = x86_default_pci_init_irq,
  44.         .fixup_irqs        = x86_default_pci_fixup_irqs,
  45.     },
  46. };
6. 初始化e820结构。对于e820的初始化介绍详见http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=263951
  1. setup_memory_map();
  2.     parse_setup_data();
  3.     /* update the e820_saved too */
  4.     e820_reserve_setup_data();
  5.    finish_e820_parsing();
 7. 初始化dmi --- dmi_scan_machine();
  1. void __init dmi_scan_machine(void)
  2. {
  3.     char __iomem *p, *q;
  4.     int rc;

  5.     if (efi_enabled) { //如果存在efi则在efi_smbios中寻找dmi
  6.         if (efi.smbios == EFI_INVALID_TABLE_ADDR)
  7.             goto error;

  8.         /* This is called as a core_initcall() because it isn't
  9.          * needed during early boot. This also means we can
  10.          * iounmap the space when we're done with it.
  11.          */
  12.         p = dmi_ioremap(efi.smbios, 32);
  13.         if (p == NULL)
  14.             goto error;

  15.         rc = dmi_present(p + 0x10); /* offset of _DMI_ string */ // to decode DMI table to dmi_ident table
  16.         dmi_iounmap(p, 32);
  17.         if (!rc) {
  18.             dmi_available = 1;
  19.             goto out;
  20.         }
  21.     }
  22.     else {
  23.         /*
  24.          * no iounmap() for that ioremap(); it would be a no-op, but
  25.          * it's so early in setup that sucker gets confused into doing
  26.          * what it shouldn't if we actually call it.
  27.          */
  28.         p = dmi_ioremap(0xF0000, 0x10000); // 在非efi的BIOS中,dmi应该从 memory range 0xF0000, size 0x10000处开始寻找
  29.         if (p == NULL)
  30.             goto error;

  31.         for (q = p; q < p + 0x10000; q += 16) { //每次步进16个字节循环搜索dmi
  32.             rc = dmi_present(q);//查找dmi的标记并提取 dmi_num, dmi_len, dmi_base三个变量
  33.             if (!rc) { //发现了dmi,那么就可以跳出循环了
  34.                 dmi_available = 1;
  35.                 dmi_iounmap(p, 0x10000);
  36.                 goto out;
  37.             }
  38.         }
  39.         dmi_iounmap(p, 0x10000);
  40.     }
  41.  error:
  42.     printk(KERN_INFO "DMI not present or invalid.\n");
  43.  out:
  44.     dmi_initialized = 1;
  45. }
8. iomem_resource. iomem_resource的定义如下, 其定义了一个从0到ffffffff的内存区间。
  1. struct resource iomem_resource = {
  2.     .name    = "PCI mem",
  3.     .start    = 0,
  4.     .end    = -1,
  5.     .flags    = IORESOURCE_MEM,
  6. };
相关于iomem_resource在setup_arch里的操作如下:
  1. iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; //当x86_phys_bits为32时,iomem_resource.end=0xffffffff

  2.     code_resource.start = virt_to_phys(_text);
  3.     code_resource.end = virt_to_phys(_etext)-1;
  4.     data_resource.start = virt_to_phys(_etext);
  5.     data_resource.end = virt_to_phys(_edata)-1;
  6.     bss_resource.start = virt_to_phys(&__bss_start);
  7.     bss_resource.end = virt_to_phys(&__bss_stop)-1;


  8.     x86_init.resources.probe_roms();// call x86_init_noop which does nothing

  9.     /* after parse_early_param, so could debug it */
  10. //将code_resource, data_resource和bss_resource加入iomem_resource。iomem_resource是一个超集,code_resource, data_resource和bss_resource是iomem_resource中互不重叠的子集,这三个子集之间是平级关系,通过sibling域相互连接。
  11.     insert_resource(&iomem_resource, &code_resource);
  12.     insert_resource(&iomem_resource, &data_resource);
  13.     insert_resource(&iomem_resource, &bss_resource);

9. 计算max_pfn, max_low_pfn以确定HighMemory大小。
由于kernel采用线性映射的方式,把从0xC0000000开始的内核逻辑地址(3G-4G)映射到从0开始的物理地址(0-1G),所以内核逻辑地址的上限是0xFFFFFFFF,而0-3G的逻辑地址空间则分配给应用程序使用。前面说过,从0xFFFFFFFF往下约128M的空间被ioremap/vmalloc以及固定映射等占用,因此kernel实际能够线性映射的空间只有从0xC0000000开始的896M。当物理内存超过896M时,超出的部分无法直接通过线性映射从逻辑地址转换到物理地址,于是就出现了HighMemory:超出896M的那部分物理内存被定义为HighMemory,kernel需要使用这部分内存时必须先进行映射。
max_pfn的含义是最大的物理内存的页面号码。
max_low_pfn的含义是kernel能够直接(通过线性映射)访问的最大物理页面号码。
HighMemory就是max_low_pfn到max_pfn之间的物理内存。
highmem_pages的含义是属于HighMemory的页面数。
max_pfn和max_low_pfn的计算方式如下:
  1. max_pfn = e820_end_of_ram_pfn();//已经知道e820里面是物理内存分配的情况。e820_end_of_ram_pfn通过遍历e820结构找出最大物理地址,并将其所处的页面作为max_pfn

  2.     /* update e820 for memory not covered by WB MTRRs */ //MTRR是x86架构中的一组寄存器,用来指明物理内存中的各个区域以何种方式被CPU缓存(例如Write Back),从而加快访问速度。
  3.     mtrr_bp_init();//initialize boot cpu mtrr. Actually it clean up the mtrr to make sure only Write Back cache memory range are included in mtrr memory range
  4.     if (mtrr_trim_uncached_memory(max_pfn))//update e820 to reserve mtrr memory range
  5.         max_pfn = e820_end_of_ram_pfn();//如果由于mtrr的原因导致e820被更新,那么重新计算max_pfn

  6. #ifdef CONFIG_X86_32
  7.     /* max_low_pfn get updated here */
  8.     find_low_pfn_range(); //计算max_low_pfn,只看x86_32部分。
  9. #else
  10.     num_physpages = max_pfn;

  11.     check_x2apic();

  12.     /* How many end-of-memory variables you have, */
  13.     /* need this before calling reserve_initrd */
  14.     if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
  15.         max_low_pfn = e820_end_of_low_ram_pfn();
  16.     else
  17.         max_low_pfn = max_pfn;

  18.     high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
  19. #endif
find_low_pfn_range()的定义如下:
  1. void __init find_low_pfn_range(void)
  2. {
  3.     /* it could update max_pfn */

  4.     if (max_pfn <= MAXMEM_PFN)
  5.         lowmem_pfn_init();//如果物理内存小于MAXMEM
  6.     else
  7.         highmem_pfn_init();//如果物理内存大于MAXMEM
  8. }
先来看MAXMEM_PFN的定义:
  1. #define MAXMEM_PFN     PFN_DOWN(MAXMEM)
  2. #define MAXMEM    (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
  3. #ifdef CONFIG_X86_PAE
  4. #define LAST_PKMAP 512
  5. #else
  6. #define LAST_PKMAP 1024
  7. #endif

  8. #define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1))    \
  9.          & PMD_MASK)

  10. #ifdef CONFIG_HIGHMEM
  11. # define VMALLOC_END    (PKMAP_BASE - 2 * PAGE_SIZE)
  12. #else
  13. # define VMALLOC_END    (FIXADDR_START - 2 * PAGE_SIZE)
  14. #endif
  15. unsigned int __VMALLOC_RESERVE = 128 << 20;
可以看到MAXMEM是由VMALLOC_END, PAGE_OFFSET和VMALLOC_RESERVE决定的。PAGE_OFFSET是0xC0000000。
从以上的定义可以看到,从0xFFFFFFFF的最高地址往下,依次有FIXADDR固定映射区、PKMAP区和VMALLOC区。在这三个区域以下,才是kernel可以通过线性映射直接访问的物理内存空间。
当max_pfn小于MAXMEM_PFN的时候,即实际具备的物理内存数小于kernel所能直接访问的物理内存数时,HighMemory存在与否取决于传给kernel的命令行参数highmem=x。如果没有传入该参数,HighMemory将不存在;如果传入了该参数,HighMemory将在lowmem_pfn_init里被设置。
  1. void __init lowmem_pfn_init(void)
  2. {
  3.     /* max_low_pfn is 0, we already have early_res support */
  4.     max_low_pfn = max_pfn;

  5.     if (highmem_pages == -1) //highmem_pages在parse_highmem()设置
  6.         highmem_pages = 0;
  7. #ifdef CONFIG_HIGHMEM
  8.     if (highmem_pages >= max_pfn) { //如果命令行要求的highmem大于实际内存数,就放弃设置highmem
  9.         printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
  10.             pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
  11.         highmem_pages = 0;
  12.     }
  13.     if (highmem_pages) {
  14.         if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {//如果需要的highmem小于实际内存数,但是物理内存数减去所需的highmem之后,给kernel留下的空间不足64M,则放弃设置highmem.
  15.             printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
  16.                 pages_to_mb(highmem_pages));
  17.             highmem_pages = 0;    
  18.         }
  19.         max_low_pfn -= highmem_pages;//根据highmem_pages所要求的数目调整max_low_pfn。
  20.     }
  21. #else
  22.     if (highmem_pages)
  23.         printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
  24. #endif
  25. }
当max_pfn大于MAXMEM_PFN,即实际存在的物理内存大于kernel能够直接访问的物理内存时,为了能够访问其余的内存,就需要设置HighMemory。highmem_pfn_init()进行了设置。
  1. /*
  2.  * We have more RAM than fits into lowmem - we try to put it into
  3.  * highmem, also taking the highmem=x boot parameter into account:
  4.  */
  5. void __init highmem_pfn_init(void)
  6. {
  7.     max_low_pfn = MAXMEM_PFN;

  8.     if (highmem_pages == -1)//如果命令行没有设置highmem_pages,那么默认highmem_pages就是实际物理内存数减去kernel可直接访问的最大物理内存。
  9.         highmem_pages = max_pfn - MAXMEM_PFN;

  10.     if (highmem_pages + MAXMEM_PFN < max_pfn)//如果命令行所要求的highmem+MAXMEM小于实际存在的物理内存数,那么实际可用的物理内存数目就要相应减少。
  11.         max_pfn = MAXMEM_PFN + highmem_pages;

  12.     if (highmem_pages + MAXMEM_PFN > max_pfn) {//如果所需的highmem+MAXMEM大于实际存在的物理内存数。则不配置highmem
  13.         printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
  14.             pages_to_mb(max_pfn - MAXMEM_PFN),
  15.             pages_to_mb(highmem_pages));
  16.         highmem_pages = 0;
  17.     }
  18. #ifndef CONFIG_HIGHMEM
  19.     /* Maximum memory usable is what is directly addressable */
  20.     printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
  21.     if (max_pfn > MAX_NONPAE_PFN)
  22.         printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
  23.     else
  24.         printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
  25.     max_pfn = MAXMEM_PFN;//如果编译kernel时没有定义CONFIG_HIGHMEM,就应该把max_pfn从实际物理内存数改变成为MAXMEM从而取消HIGHMEM
  26. #else /* !CONFIG_HIGHMEM */
  27. #ifndef CONFIG_HIGHMEM64G
  28.     if (max_pfn > MAX_NONPAE_PFN) {//如果编译kernel时没有打开CONFIG_HIGHMEM64G(即没有启用CPU的PAE寻址),那么最大可用内存不可超过4G.
  29.         max_pfn = MAX_NONPAE_PFN;
  30.         printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
  31.     }
  32. #endif /* !CONFIG_HIGHMEM64G */
  33. #endif /* !CONFIG_HIGHMEM */
  34. }
10. 继续分配和预留内存空间。
  1. /*
  2.      * Need to conclude brk, before memblock_x86_fill()
  3.      * it could use memblock_find_in_range, could overlap with
  4.      * brk area.
  5.      */
  6.     reserve_brk(); //将brk段保留在memblock.reserved里

  7.     memblock.current_limit = get_max_mapped(); //对于x86_32系统来说,max_pfn_mapped在head_32.S里面被初始化。 详见http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=154125
  8.     memblock_x86_fill();//将e820中的E820_RAM类型和E820_RESERVED_KERN类型的内存区域加入到memblock.memory中

  9.     /* preallocate 4k for mptable mpc */
  10.     early_reserve_e820_mpc_new();//在memblock中找到一块4K的空闲内存以供mptable使用。并更新memblock和e820反映这个变化。

  11. #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
  12.     setup_bios_corruption_check();
  13. #endif

  14.     printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
  15.             max_pfn_mapped<<PAGE_SHIFT);

  16.     reserve_trampoline_memory();

  17. #ifdef CONFIG_ACPI_SLEEP
  18.     /*
  19.      * Reserve low memory region for sleep support.
  20.      * even before init_memory_mapping
  21.      */
  22.     acpi_reserve_wakeup_memory();//在memblock中保留(wakeup_code_end-wakeup_code_start)大小的空间为ACPI S3代码预留空间
  23. #endif

11. 继续初始化页表。页表在head_32.S里面已经进行了一部分的初始化。但是在head_32.S中,页表的初始化并没有覆盖所有的内存空间,而是只覆盖到了_end+MAPPING_BEYOND_END为止。这里就对页表继续进行初始化
  1. init_gbpages();//这个函数的定义是空的。没有任何的操作。

  2.     /* max_pfn_mapped is updated here */
  3.     max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);//将0-max_low_pfn之间的页面映射到页表。
  4.     max_pfn_mapped = max_low_pfn_mapped;

  5. #ifdef CONFIG_X86_64
  6.     if (max_pfn > max_low_pfn) {
  7.         max_pfn_mapped = init_memory_mapping(1UL<<32,
  8.                          max_pfn<<PAGE_SHIFT);
  9.         /* can we preseve max_low_pfn ?*/
  10.         max_low_pfn = max_pfn;
  11.     }
  12. #endif
  13.     memblock.current_limit = get_max_mapped();
init_memory_mapping的定义如下。

  1. /*
  2.  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  3.  * This runs before bootmem is initialized and gets pages directly from
  4.  * the physical memory. To access them they are temporarily mapped.
  5.  */
  6. unsigned long __init_refok init_memory_mapping(unsigned long start,
  7.                      unsigned long end)
  8. {
  9.     unsigned long page_size_mask = 0;
  10.     unsigned long start_pfn, end_pfn;
  11.     unsigned long ret = 0;
  12.     unsigned long pos;

  13.     struct map_range mr[NR_RANGE_MR];
  14.     int nr_range, i;
  15.     int use_pse, use_gbpages;

  16.     printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);

  17. #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
  18.     /*
  19.      * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
  20.      * This will simplify cpa(), which otherwise needs to support splitting
  21.      * large pages into small in interrupt context, etc.
  22.      */
  23.     use_pse = use_gbpages = 0;
  24. #else
  25.     use_pse = cpu_has_pse;
  26.     use_gbpages = direct_gbpages;
  27. #endif

  28.     /* Enable PSE if available */
  29.     if (cpu_has_pse)//如果CPU支持PSE,就设置CR4中的PSE位
  30.         set_in_cr4(X86_CR4_PSE);

  31.     /* Enable PGE if available */
  32.     if (cpu_has_pge) {//如果CPU支持PGE,就设置CR4中的PGE位
  33.         set_in_cr4(X86_CR4_PGE);
  34.         __supported_pte_mask |= _PAGE_GLOBAL;
  35.     }

  36.     if (use_gbpages)//如果使用1GB大小的页面,就设置PG_LEVEL_1G的位在page_size_mask里
  37.         page_size_mask |= 1 << PG_LEVEL_1G;
  38.     if (use_pse)//如果使用PSE那么就设置PG_LEVEL_2M位在page_size_mask中
  39.         page_size_mask |= 1 << PG_LEVEL_2M;

  40.     memset(mr, 0, sizeof(mr)); //清空mr
  41.     nr_range = 0;

  42.     /* head if not big page alignment ? */
  43.     start_pfn = start >> PAGE_SHIFT;//按照start的地址计算start_pfn
  44.     pos = start_pfn << PAGE_SHIFT; //pos就是start_pfn的起始地址
  45. #ifdef CONFIG_X86_32
  46.     /*
  47.      * Don't use a large page for the first 2/4MB of memory
  48.      * because there are often fixed size MTRRs in there
  49.      * and overlapping MTRRs into large pages can cause
  50.      * slowdowns.
  51.      */
  52.     /*在只考虑32位寻址能力而不考虑PAE的X86系统上,PAGE_SHIFT=12, PMD_SHIFT=PUD_SHIFT=PGDIR_SHIFT=22,即名义上Linux使用4级页面转换机制,而实际上只使用2级页面转换机制*/
  53.     if (pos == 0)
  54.         end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);//end_pfn=(1<<PMD_SHIFT)>>PAGE_SHIFT。其含义是start_pfn和end_pfn代表了第一个PMD项所指向的页表所指向的内存区域0-4M。
  55.     else
  56.         end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
  57.                  << (PMD_SHIFT - PAGE_SHIFT);//如果start不是从第0个page作为起点,那么start_pfn和end_pfn就应该指向start_pfn为起始的那个PMD项所指向的整个页表所代表的4M空间。
  58. #else /* CONFIG_X86_64 */
  59.     end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
  60.             << (PMD_SHIFT - PAGE_SHIFT);
  61. #endif
  62.     if (end_pfn > (end >> PAGE_SHIFT))
  63.         end_pfn = end >> PAGE_SHIFT;
  64.     if (start_pfn < end_pfn) {
  65.         nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);//保存start_pfn和end_pfn在mr中
  66.         pos = end_pfn << PAGE_SHIFT;
  67.     }

  68.     /* big page (2M) range */
  69.     start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
  70.              << (PMD_SHIFT - PAGE_SHIFT); //start_pfn指向上个range的end_pfn。理想状态下pos+PMD_SIZE-1不会造成对于PMD_SIZE的进位,所以start_pfn==end_pfn
  71. #ifdef CONFIG_X86_32
  72.     end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);//对于X86_32来说,这个range的end_pfn就是指向end地址的PMD的页面号
  73. #else /* CONFIG_X86_64 */
  74.     end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
  75.              << (PUD_SHIFT - PAGE_SHIFT);
  76.     if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
  77.         end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
  78. #endif

  79.     if (start_pfn < end_pfn) {
  80.         nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
  81.                 page_size_mask & (1<<PG_LEVEL_2M));//将start_pfn,end_pfn加入mr,同时指出该区域如果可能的话使用2M的页
  82.         pos = end_pfn << PAGE_SHIFT;
  83.     }

  84. #ifdef CONFIG_X86_64
  85.     /* big page (1G) range */
  86.     start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
  87.              << (PUD_SHIFT - PAGE_SHIFT);
  88.     end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
  89.     if (start_pfn < end_pfn) {
  90.         nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
  91.                 page_size_mask &
  92.                  ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
  93.         pos = end_pfn << PAGE_SHIFT;
  94.     }

  95.     /* tail is not big page (1G) alignment */
  96.     start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
  97.              << (PMD_SHIFT - PAGE_SHIFT);
  98.     end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
  99.     if (start_pfn < end_pfn) {
  100.         nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
  101.                 page_size_mask & (1<<PG_LEVEL_2M));
  102.         pos = end_pfn << PAGE_SHIFT;
  103.     }
  104. #endif

  105.     /* tail is not big page (2M) alignment */
  106.     start_pfn = pos>>PAGE_SHIFT;
  107.     end_pfn = end>>PAGE_SHIFT;
  108.     nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);//建立第三个memory range,将末尾未按PMD大小对齐的剩余内存(即从(end>>PMD_SHIFT)<<(PMD_SHIFT-PAGE_SHIFT)到end>>PAGE_SHIFT之间的页面)以4K页面进行映射

  109.     /* try to merge same page size and continuous */ //如果相邻的memory range地址连续并且使用相同的页面大小(page_size_mask相同),那么就合并。
  110.     for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
  111.         unsigned long old_start;
  112.         if (mr[i].end != mr[i+1].start ||
  113.          mr[i].page_size_mask != mr[i+1].page_size_mask)
  114.             continue;
  115.         /* move it */
  116.         old_start = mr[i].start;
  117.         memmove(&mr[i], &mr[i+1],
  118.             (nr_range - 1 - i) * sizeof(struct map_range));
  119.         mr[i--].start = old_start;
  120.         nr_range--;
  121.     }

  122.     //OK now, there is no overlap between every range in mr
  123.     for (i = 0; i < nr_range; i++)
  124.         printk(KERN_DEBUG " %010lx - %010lx page %s\n",
  125.                 mr[i].start, mr[i].end,
  126.             (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
  127.              (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

  128.     /*
  129.      * Find space for the kernel direct mapping tables.
  130.      *
  131.      * Later we should allocate these tables in the local node of the
  132.      * memory mapped. Unfortunately this is done currently before the
  133.      * nodes are discovered.
  134.      */
  135.     if (!after_bootmem)
  136.         find_early_table_space(end, use_pse, use_gbpages); //寻找一块连续的空间能够存放所有PUD, PMD, PT表。全局变量e820_table_start用来描述这段内存的开始地址,e820_table_end用来描述这块内存已经使用了的最大地址。e820_table_top用来描述这段内存区域的结束地址。

  137.     for (i = 0; i < nr_range; i++) //真正建立pgd, pt的戏码在这里呢。
  138.         ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
  139.                          mr[i].page_size_mask);

  140. #ifdef CONFIG_X86_32
  141.     early_ioremap_page_table_range_init();//在新pgd, pmd, pt的结构下刷新固定内存映射。

  142.     load_cr3(swapper_pg_dir);//启用新的pgd, pmd, pt
  143. #endif

  144. #ifdef CONFIG_X86_64
  145.     if (!after_bootmem && !start) {
  146.         pud_t *pud;
  147.         pmd_t *pmd;

  148.         mmu_cr4_features = read_cr4();

  149.         /*
  150.          * _brk_end cannot change anymore, but it and _end may be
  151.          * located on different 2M pages. cleanup_highmap(), however,
  152.          * can only consider _end when it runs, so destroy any
  153.          * mappings beyond _brk_end here.
  154.          */
  155.         pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
  156.         pmd = pmd_offset(pud, _brk_end - 1);
  157.         while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
  158.             pmd_clear(pmd);
  159.     }
  160. #endif
  161.     __flush_tlb_all();

  162.     if (!after_bootmem && e820_table_end > e820_table_start)
  163.         memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
  164.                  e820_table_end << PAGE_SHIFT, "PGTABLE");//将pgd, pud, pmd, pt所占据的空间加入memblock.reserve中

  165.     if (!after_bootmem)
  166.         early_memtest(start, end);

  167.     return ret >> PAGE_SHIFT;
  168. }
再来看一下实际建立2级页表的过程。
  1. /*
  2.  * This maps the physical memory to kernel virtual address space, a total
  3.  * of max_low_pfn pages, by creating page tables starting from address
  4.  * PAGE_OFFSET:
  5.  */
  6. unsigned long __init
  7. kernel_physical_mapping_init(unsigned long start,
  8.              unsigned long end,
  9.              unsigned long page_size_mask)
  10. {
  11.     int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
  12.     unsigned long last_map_addr = end;
  13.     unsigned long start_pfn, end_pfn;
  14.     pgd_t *pgd_base = swapper_pg_dir;
  15.     int pgd_idx, pmd_idx, pte_ofs;
  16.     unsigned long pfn;
  17.     pgd_t *pgd;
  18.     pmd_t *pmd;
  19.     pte_t *pte;
  20.     unsigned pages_2m, pages_4k;
  21.     int mapping_iter;

  22.     start_pfn = start >> PAGE_SHIFT;
  23.     end_pfn = end >> PAGE_SHIFT;

  24.     /*
  25.      * First iteration will setup identity mapping using large/small pages
  26.      * based on use_pse, with other attributes same as set by
  27.      * the early code in head_32.S
  28.      *
  29.      * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
  30.      * as desired for the kernel identity mapping.
  31.      *
  32.      * This two pass mechanism conforms to the TLB app note which says:
  33.      *
  34.      * "Software should not write to a paging-structure entry in a way
  35.      * that would change, for any linear address, both the page size
  36.      * and either the page frame or attributes."
  37.      */
  38.     mapping_iter = 1;

  39.     if (!cpu_has_pse)
  40.         use_pse = 0;

  41. repeat:
  42.     pages_2m = pages_4k = 0;
  43.     pfn = start_pfn;
//pgd, pmd, pte实现的是0xC0000000以上的虚拟地址转换成0x00000000的物理地址,所以当pgd, pmd, pte建立起来之后将不需要使用pa, va来进行虚拟地址和物理地址的转换。而对于现在的mapping建立,由于pfn是基于物理地址的,所以pgd_idx在计算pgd entry的时候需要加上PAGE_OFFSET.
  1.     pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
  2.     pgd = pgd_base + pgd_idx;
  3. //以下的循环建立了所有的pgd entry
  4.     for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
  5.         pmd = one_md_table_init(pgd);//在e820_table中获得一个页面作为pmd table.如果已经对相应pgd存在一个则返回该pmd的地址。

  6.         if (pfn >= end_pfn)
  7.             continue;
  8. #ifdef CONFIG_X86_PAE
  9.         pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
  10.         pmd += pmd_idx;
  11. #else
  12.         pmd_idx = 0;
  13. #endif
  14.          //以下循环为每一个pgd entry即pmd中的pmd entry建立page table.
  15.         for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
  16.          pmd++, pmd_idx++) {
  17.             unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; //addr是给定pfn的起始地址

  18.             /*
  19.              * Map with big pages if possible, otherwise
  20.              * create normal page tables:
  21.              */
  22.             if (use_pse) { //如果使用了2M的page
  23.                 unsigned int addr2;
  24.                 pgprot_t prot = PAGE_KERNEL_LARGE;//该page的属性是PAGE_KERNEL_LARGE
  25.                 /*
  26.                  * first pass will use the same initial
  27.                  * identity mapping attribute + _PAGE_PSE.
  28.                  */
  29.                 pgprot_t init_prot =
  30.                     __pgprot(PTE_IDENT_ATTR |
  31.                          _PAGE_PSE);

  32.                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
  33.                     PAGE_OFFSET + PAGE_SIZE-1;
//只要addr或addr2其中之一落在kernel_text的内存空间里面,就把这个page设为PAGE_KERNEL_LARGE_EXEC。这里addr2是:假设该page是大页,该大页空间的结束地址就是addr2。
  1.                 if (is_kernel_text(addr) ||
  2.                  is_kernel_text(addr2))
  3.                     prot = PAGE_KERNEL_LARGE_EXEC;//只要开始地址或结束地址有一个在kernel_text之内,就设置页面属性为PAGE_KERNEL_LARGE_EXEC.

  4.                 pages_2m++; //pages_2m是对于使用2M page的计数
  5.               
  6.                 //将pmd和物理pfn相映射以设置pmd entry
  7.                 if (mapping_iter == 1)
  8.                     set_pmd(pmd, pfn_pmd(pfn, init_prot));
  9.                 else
  10.                     set_pmd(pmd, pfn_pmd(pfn, prot));

  11.                 pfn += PTRS_PER_PTE;
  12.                 continue; //在2M page的情况下我们就不需要page table了吗?地址转换就变成了pgd->pmd?
  13.             }
  14.             pte = one_page_table_init(pmd); //在e820_table里面获得一个页面作为page table

  15.             pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
  16.             pte += pte_ofs; //计算出给定pfn对应的page table entry的位置。
  17. //以下循环设置了page table中每一个page table entry.
  18.             for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
  19.              pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
  20.                 pgprot_t prot = PAGE_KERNEL;
  21.                 /*
  22.                  * first pass will use the same initial
  23.                  * identity mapping attribute.
  24.                  */
  25.                 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);

  26.                 if (is_kernel_text(addr))
  27.                     prot = PAGE_KERNEL_EXEC;

  28.                 pages_4k++;
                 //设置page table entry, 将pte与物理pfn相映射
  1.                 if (mapping_iter == 1) {
  2.                     set_pte(pte, pfn_pte(pfn, init_prot));
  3.                     last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
  4.                 } else
  5.                     set_pte(pte, pfn_pte(pfn, prot));
  6.             }
  7.         }
  8.     }
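//这是我所不能理解的,为什么会两次对pte的属性进行设置,而不是一次完成呢?(补充说明:按照内核源码中该函数自带的注释,第一遍沿用head_32.S早期建立identity mapping时的属性(PTE_IDENT_ATTR),只建立大页/小页的映射结构;刷新TLB之后,第二遍才写入最终需要的属性(如NX、GLOBAL)。这样做是为了遵循Intel TLB应用说明的要求:软件不应该在一次写入中,对同一线性地址同时改变页大小以及页帧或属性。)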
  1.     if (mapping_iter == 1) {
  2.         /*
  3.          * update direct mapping page count only in the first
  4.          * iteration.
  5.          */
  6.         update_page_count(PG_LEVEL_2M, pages_2m);
  7.         update_page_count(PG_LEVEL_4K, pages_4k);

  8.         /*
  9.          * local global flush tlb, which will flush the previous
  10.          * mappings present in both small and large page TLB's.
  11.          */
  12.         __flush_tlb_all();

  13.         /*
  14.          * Second iteration will set the actual desired PTE attributes.
  15.          */
  16.         mapping_iter = 2;
  17.         goto repeat;
  18.     }
  19.     return last_map_addr;
  20. }

12. 接下来是一些杂七杂八的初始化:
  1. reserve_initrd();//如果需要的话,将initrd移动位置。

  2.     reserve_crashkernel();//如果在启动命令行指定了crashkernel大小,就在memblock.memory中划分一块内存作为crashkernel。

  3.     vsmp_init(); //这个函数只在x64系统中使用。暂时不需要看。

  4.     io_delay_init(); //这个函数检查BIOS中的dmi,如果DMI_BOARD_VENDOR, DMI_BOARD_NAME匹配,就调用dmi_io_delay_0xed_port。只有HP的特定机型会匹配DMI_BOARD_VENDOR和DMI_BOARD_NAME

  5.     /*
  6.      * Parse the ACPI tables for possible boot-time SMP configuration.
  7.      */
  8.     acpi_boot_table_init();//初始化acpi table,最终的结果放在全局变量里initial_tables

  9.     early_acpi_boot_init(); //Process the Multiple APIC Description Table (MADT)
13. initmem_init(). initmem_init的作用就在于建立了mem_section的概念。
  1. #ifndef CONFIG_NEED_MULTIPLE_NODES
  2. void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
  3.                 int acpi, int k8)
  4. {
  5. #ifdef CONFIG_HIGHMEM
  6.     highstart_pfn = highend_pfn = max_pfn;
  7.     if (max_pfn > max_low_pfn)
  8.         highstart_pfn = max_low_pfn;
  9.     memblock_x86_register_active_regions(0, 0, highend_pfn);//建立early_node_map, 将0-highend_pfn的空间(0-最大实际物理内存页码)放入到early_node_map里,并且其nodeid是0。实际上内核按照当前memblock.memory的map状况将0-highend_pfn之间的memory range放入到early_node_map。
  10.     sparse_memory_present_with_active_regions(0);//将放入early_node_map中并且nodeid是0的memory region的page放入另外一个mem_section的数组中。这里引入了mem_section的概念,其引入这个概念的目的可能是内存分配更加高效。kernel中一个mem_section包含了物理地址相连的固定数目个page。在x86系统中,如果没有PAE的状况下, 一个mem_section可以包含16384个page。所以从一个给定地址,我们可以得到它所在的page,也能得到它所在的mem_section的下标。而sparse_memory_present_with_active_regions的作用就是为给定node所拥有的物理page建立mem_sections.
  11.     printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
  12.         pages_to_mb(highend_pfn - highstart_pfn));
  13.     num_physpages = highend_pfn;
  14.     high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; //high_memory就是指向highmemory起始的虚拟地址。
  15. #else
  16.     memblock_x86_register_active_regions(0, 0, max_low_pfn);
  17.     sparse_memory_present_with_active_regions(0);
  18.     num_physpages = max_low_pfn;
  19.     high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
  20. #endif
  21. #ifdef CONFIG_FLATMEM
  22.     max_mapnr = num_physpages;
  23. #endif
  24.     __vmalloc_start_set = true;

  25.     printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
  26.             pages_to_mb(max_low_pfn));

  27.     setup_bootmem_allocator();//将after_bootmem置1,标志我们已经完成了bootmem初始化.
  28. }
  29. #endif /* !CONFIG_NEED_MULTIPLE_NODES */
14. 预留DMA空间
  1. memblock_find_dma_reserve();// 0-16M的物理内存空间是DMA可以使用的空间,这个函数意义是找出0-16M空间中有多大的内存已经被memblock.reserved保留而无法分配给后续的DMA操作。
  2.     dma32_reserve_bootmem(); //在x86_32上是空函数
15. paging_init进行kmap初始化,sparse memory初始化和zone初始化。
  1. x86_init.paging.pagetable_setup_start(swapper_pg_dir); //call native_pagetable_setup_start.实际上是把max_low_pfn以上的可能存在的pte进行清空
  2.     paging_init();
  3.     x86_init.paging.pagetable_setup_done(swapper_pg_dir); //call native_pagetable_setup_done。实际是空函数。
在paging_init以前,已经建立的页表包括从虚拟地址0xC0000000(PAGE_OFFSET)开始、覆盖物理页0到max_low_pfn的直接映射,以及FIXMAP。而在paging_init里,对kmap进行了页表映射。

  1. void __init paging_init(void)
  2. {
  3.     pagetable_init();//这个函数相对简单,其作用是在swapper_pg_dir所指向的pgd中建立从PKMAP_BASE到PKMAP_BASE+PAGE_SIZE*LAST_PKMAP页映射所需要的pgd, pud, pmd, pte.PKMAP_BASE到PKMAP_BASE+PAGE_SIZE*LAST_PKMAP所指向的空间是在FIXMAP之下的LAST_PKMAP个页面空间。其为kmap建立一个固定的内存映射区域,为kernel访问高端内存(highmem,即max_low_pfn以上的物理空间)服务。

  4.     __flush_tlb_all();

  5.     kmap_init();//根据enum fixed_addresses中的FIX_KMAP_BEGIN和FIX_KMAP_END的定义在FIXMAP中为KMAP建立一个映射区域,该区域所能映射的page个数以及每个page所使用的目的在enum km_type中定义。每个CPU核心都有一套kmap page相对应。
  6. /*#ifdef CONFIG_X86_32
  7. FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
  8. FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
  9. #ifdef CONFIG_PCI_MMCONFIG

  10. enum km_type {
  11. KM_BOUNCE_READ,
  12. KM_SKB_SUNRPC_DATA,
  13. KM_SKB_DATA_SOFTIRQ,
  14. KM_USER0,
  15. KM_USER1,
  16. KM_BH_IRQ,
  17. KM_SOFTIRQ0,
  18. KM_SOFTIRQ1,
  19. KM_TYPE_NR
  20. };
  21. */

  22.     /*
  23.      * NOTE: at this point the bootmem allocator is fully available.
  24.      */
  25.     sparse_init(); //查了一下,sparse memory和热插拔memory相关,一般使用在server和SMP上,在这种环境下,memory dimm的地址是不连续的,并且在插拔dimm的时候也不会存在相同memory地址的变化,所以在kernel里面必须有能力处理这种非连续性的内存地址。sparse memory的作用就在这里。这部分先略过不看
  26.     zone_sizes_init();//初始化zone
  27. }
再看一下zone_sizes_init到底做了点什么:
  1. static void __init zone_sizes_init(void)
  2. {
  3.     unsigned long max_zone_pfns[MAX_NR_ZONES];
  4.     memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
  5.     max_zone_pfns[ZONE_DMA] =
  6.         virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; //dma zone is 0-16M
  7.     max_zone_pfns[ZONE_NORMAL] = max_low_pfn; //zone normal is 16M-max_low_pfn
  8. #ifdef CONFIG_HIGHMEM
  9.     max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; //HIGHMEM Zone is max_low_pfn -highend_pfn
  10. #endif

  11.     free_area_init_nodes(max_zone_pfns);//按照以上对于max_zone_pfns的设定进行zone的初始化
  12. }
好了,不得不再看看free_area_init_nodes在干些什么:
  1. /**
  2.  * free_area_init_nodes - Initialise all pg_data_t and zone data
  3.  * @max_zone_pfn: an array of max PFNs for each zone
  4.  *
  5.  * This will call free_area_init_node() for each active node in the system.
  6.  * Using the page ranges provided by add_active_range(), the size of each
  7.  * zone in each node and their holes is calculated. If the maximum PFN
  8.  * between two adjacent zones match, it is assumed that the zone is empty.
  9.  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
  10.  * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
  11.  * starts where the previous one ended. For example, ZONE_DMA32 starts
  12.  * at arch_max_dma_pfn.
  13.  */
  14. void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  15. {
  16.     unsigned long nid;
  17.     int i;

  18.     /* Sort early_node_map as initialisation assumes it is sorted */
  19.     sort_node_map();

  20.     /* Record where the zone boundaries are */
  21.     /*以下的代码对于全局变量arch_zone_lowest_possible_pfn[]
  22.       和arch_zone_highest_possible_pfn[]进行设置,以确定每个zone的lowest_pfn和highest_pfn.依次初始化设定ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM的lowest pfn和highest pfn.并且把ZONE_MOVABLE的lowest pfn和highest pfn都设为0.
  23.     */
  24.     memset(arch_zone_lowest_possible_pfn, 0,
  25.                 sizeof(arch_zone_lowest_possible_pfn));
  26.     memset(arch_zone_highest_possible_pfn, 0,
  27.                 sizeof(arch_zone_highest_possible_pfn));
  28.     arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();//find lowest pfn of physical memory can be used. here it is 0
  29.     arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; //it refer to MAX_DMA_ADDRESS
  30.     for (i = 1; i < MAX_NR_ZONES; i++) {
  31.         if (i == ZONE_MOVABLE)
  32.             continue;
  33.         arch_zone_lowest_possible_pfn[i] =
  34.             arch_zone_highest_possible_pfn[i-1]; //previous highest pfn is the lowest pfn in the next region
  35.         arch_zone_highest_possible_pfn[i] =
  36.             max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);//highest pfn is max_zone_pfn[i]
  37.     }
  38.     arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  39.     arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; //now arch_zone[ZONE_MOVABLE] is empty

  40.     /* Find the PFNs that ZONE_MOVABLE begins at in each node */
  41.     /* 为每一个node建立zone_movable_pfn。ZONE_MOVABLE是一个zone,其大小由kernel启动命令行参数(kernelcore=或movablecore=)决定,意义在于划出一块只放置可移动page的区域,其余的内存则称为kernel core,是不可以移动的。find_zone_movable_pfns_for_nodes的作用就是按照early_node_map根据每个node的不同内存分布计算出每一个node中ZONE_MOVABLE的起始pfn
  42.     */
  43.     memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
  44.     find_zone_movable_pfns_for_nodes(zone_movable_pfn); //

  45.     /* Print out the zone ranges */
  46.     printk("Zone PFN ranges:\n");
  47.     for (i = 0; i < MAX_NR_ZONES; i++) {
  48.         if (i == ZONE_MOVABLE)
  49.             continue;
  50.         printk(" %-8s ", zone_names[i]);
  51.         if (arch_zone_lowest_possible_pfn[i] ==
  52.                 arch_zone_highest_possible_pfn[i])
  53.             printk("empty\n");
  54.         else
  55.             printk("%0#10lx -> %0#10lx\n",
  56.                 arch_zone_lowest_possible_pfn[i],
  57.                 arch_zone_highest_possible_pfn[i]);
  58.     }

  59.     /* Print out the PFNs ZONE_MOVABLE begins at in each node */
  60.     printk("Movable zone start PFN for each node\n");
  61.     for (i = 0; i < MAX_NUMNODES; i++) {
  62.         if (zone_movable_pfn[i])
  63.             printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
  64.     }

  65.     /* Print out the early_node_map[] */
  66.     printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
  67.     for (i = 0; i < nr_nodemap_entries; i++)
  68.         printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
  69.                         early_node_map[i].start_pfn,
  70.                         early_node_map[i].end_pfn);

  71.     /* Initialise every node */
  72.     mminit_verify_pageflags_layout();
  73.     setup_nr_node_ids();
      /*以下的for是按照early_node_map和每一个zone的内存分配建立全局变量node_data[]
      */
  1.     for_each_online_node(nid) {
  2.         pg_data_t *pgdat = NODE_DATA(nid);//对于NUMA来说每一个node都有一个pg_data_t结构描述该node对应的内存分配状况,在非NUMA(UMA)的环境下,只有一个node存在
  3.         free_area_init_node(nid, NULL,
  4.                 find_min_pfn_for_node(nid), NULL);//对于每个node,初始化其memory分配。具体如何分配看以下的代码

  5.         /* Any memory on that node */
  6.         if (pgdat->node_present_pages)
  7.             node_set_state(nid, N_HIGH_MEMORY);
  8.         check_for_regular_memory(pgdat);
  9.     }
  10. }

  1. void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  2.         unsigned long node_start_pfn, unsigned long *zholes_size)
  3. {
  4.     pg_data_t *pgdat = NODE_DATA(nid);

  5.     pgdat->node_id = nid;
  6.     pgdat->node_start_pfn = node_start_pfn;//该node的开始页面号
  7.     calculate_node_totalpages(pgdat, zones_size, zholes_size);//计算对于该node来说有多少pages可用。分别设置了pgdat->node_spanned_pages为总共算上hole有多少page,pgdat->node_present_pages为不计hole有多少实际可用的page

  8.     alloc_node_mem_map(pgdat);//为node在memblock.memory或者bootmem中分配一块内存用来存放该node的struct page数组(mem_map),其大小约为pgdat->node_spanned_pages个struct page(即页数乘以sizeof(struct page),而不是node_spanned_pages字节),并把内存的起始地址放置在pgdat->node_mem_map中。该分配的原则是,如果kernel编译过程中没有使用bootmem,则在memblock.memory中找寻一块内存区域并且该内存区域应该归该node所有,即early_node_map[]的nid是该node.
  9. #ifdef CONFIG_FLAT_NODE_MEM_MAP
  10.     printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
  11.         nid, (unsigned long)pgdat,
  12.         (unsigned long)pgdat->node_mem_map);
  13. #endif

  14.     free_area_init_core(pgdat, zones_size, zholes_size);
  15. }



阅读(675) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~