暂时放下对内存管理的探讨而对setup_arch进行深入分析,目的在于找出与内存管理初始化相关的细节。在遍历代码的过程中,我跳过了一些和特殊平台相关的代码,例如EFI等。
1. 首先kernel将全局页表目录page global directory切换到swapper_pg_dir。方法是将initial_page_table的内容复制到swapper_pg_dir里面,并且重置cr3。initial_page_table的初始化可以参见http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=154125。
- /*
-
* copy kernel address range established so far and switch
-
* to the proper swapper page table
-
*/
-
clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-
initial_page_table + KERNEL_PGD_BOUNDARY,
-
KERNEL_PGD_PTRS);
-
-
load_cr3(swapper_pg_dir);
-
__flush_tlb_all();
2. 初始化中断向量和CPU
- early_trap_init();//初始化中断向量1的服务例程为debug所指向的中断服务例程,向量3指向int3所指向的服务例程,向量14则指向page_fault指向的服务例程。
- early_cpu_init(); //初始化cpu_devs[X86_VENDOR_NUM] 并以此侦测boot_cpu_data
3. 根据实模式下设置的各个启动参数boot_params设置全局变量
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
-
screen_info = boot_params.screen_info;
-
edid_info = boot_params.edid_info;
-
#ifdef CONFIG_X86_32
-
apm_info.bios = boot_params.apm_bios_info;
-
ist_info = boot_params.ist_info;
-
if (boot_params.sys_desc_table.length != 0) {
-
set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
-
machine_id = boot_params.sys_desc_table.table[0];
-
machine_submodel_id = boot_params.sys_desc_table.table[1];
-
BIOS_revision = boot_params.sys_desc_table.table[2];
-
}
-
#endif
-
saved_video_mode = boot_params.hdr.vid_mode;
-
bootloader_type = boot_params.hdr.type_of_loader;
-
if ((bootloader_type >> 4) == 0xe) {
-
bootloader_type &= 0xf;
-
bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
-
}
-
bootloader_version = bootloader_type & 0xf;
-
bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-
-
#ifdef CONFIG_BLK_DEV_RAM
-
rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-
rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
-
rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-
#endif
-
#ifdef CONFIG_EFI
-
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-
#ifdef CONFIG_X86_32
-
"EL32",
-
#else
-
"EL64",
-
#endif
-
4)) {
-
efi_enabled = 1;
-
efi_memblock_x86_reserve_range();
-
}
-
#endif
copy_edd();
if (!boot_params.hdr.root_flags)
root_mountflags &= ~MS_RDONLY;
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
#endif
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
parse_early_param();
memblock_x86_reserve_range_setup_data(); /* after early param, so could get panic from serial */
4. 初始化early_ioremap
5. 调用x86_init.oem.arch_setup()。x86_init结构的初始化值在x86_init.c中定义,其具体内容如下。所以arch_setup实际执行的是x86_init_noop(). 这是一个空函数,什么都没有做。x86平台的其他非普通PC如果需要进行一些特定的初始化,可以重置x86_init的结构。
- /*
-
* The platform setup functions are preset with the default functions
-
* for standard PC hardware.
-
*/
-
struct x86_init_ops x86_init __initdata = {
-
-
.resources = {
-
.probe_roms = x86_init_noop,
-
.reserve_resources = reserve_standard_io_resources,
-
.memory_setup = default_machine_specific_memory_setup,
-
},
-
-
.mpparse = {
-
.mpc_record = x86_init_uint_noop,
-
.setup_ioapic_ids = x86_init_noop,
-
.mpc_apic_id = default_mpc_apic_id,
-
.smp_read_mpc_oem = default_smp_read_mpc_oem,
-
.mpc_oem_bus_info = default_mpc_oem_bus_info,
-
.find_smp_config = default_find_smp_config,
-
.get_smp_config = default_get_smp_config,
-
},
-
-
.irqs = {
-
.pre_vector_init = init_ISA_irqs,
-
.intr_init = native_init_IRQ,
-
.trap_init = x86_init_noop,
-
},
-
-
.oem = {
-
.arch_setup = x86_init_noop,
-
.banner = default_banner,
-
},
-
-
.paging = {
-
.pagetable_setup_start = native_pagetable_setup_start,
-
.pagetable_setup_done = native_pagetable_setup_done,
-
},
-
-
.timers = {
-
.setup_percpu_clockev = setup_boot_APIC_clock,
-
.tsc_pre_init = x86_init_noop,
-
.timer_init = hpet_time_init,
-
},
-
-
.iommu = {
-
.iommu_init = iommu_init_noop,
-
},
-
-
.pci = {
-
.init = x86_default_pci_init,
-
.init_irq = x86_default_pci_init_irq,
-
.fixup_irqs = x86_default_pci_fixup_irqs,
-
},
-
};
6. 建立并解析e820内存映射(memory map)
- setup_memory_map();
-
parse_setup_data();
-
/* update the e820_saved too */
-
e820_reserve_setup_data();
- finish_e820_parsing();
7. 初始化dmi --- dmi_scan_machine();
- void __init dmi_scan_machine(void)
-
{
-
char __iomem *p, *q;
-
int rc;
-
-
if (efi_enabled) { //如果存在efi则在efi_smbios中寻找dmi
-
if (efi.smbios == EFI_INVALID_TABLE_ADDR)
-
goto error;
-
-
/* This is called as a core_initcall() because it isn't
-
* needed during early boot. This also means we can
-
* iounmap the space when we're done with it.
-
*/
-
p = dmi_ioremap(efi.smbios, 32);
-
if (p == NULL)
-
goto error;
-
-
rc = dmi_present(p + 0x10); /* offset of _DMI_ string */ // to decode DMI table to dmi_ident table
-
dmi_iounmap(p, 32);
-
if (!rc) {
-
dmi_available = 1;
-
goto out;
-
}
-
}
-
else {
-
/*
-
* no iounmap() for that ioremap(); it would be a no-op, but
-
* it's so early in setup that sucker gets confused into doing
-
* what it shouldn't if we actually call it.
-
*/
-
p = dmi_ioremap(0xF0000, 0x10000); // 在非efi的BIOS中,dmi应该从 memory range 0xF0000, size 0x10000处开始寻找
-
if (p == NULL)
-
goto error;
-
-
for (q = p; q < p + 0x10000; q += 16) { //每次步进16个字节循环搜索dmi
-
rc = dmi_present(q);//查找dmi的标记并提取 dmi_num, dmi_len, dmi_base三个变量
-
if (!rc) { //发现了dmi,那么就可以跳出循环了
-
dmi_available = 1;
-
dmi_iounmap(p, 0x10000);
-
goto out;
-
}
-
}
-
dmi_iounmap(p, 0x10000);
-
}
-
error:
-
printk(KERN_INFO "DMI not present or invalid.\n");
-
out:
-
dmi_initialized = 1;
-
}
8. iomem_resource. iomem_resource的定义如下, 其定义了一个从0到ffffffff的内存区间。
- struct resource iomem_resource = {
-
.name = "PCI mem",
-
.start = 0,
-
.end = -1,
-
.flags = IORESOURCE_MEM,
-
};
相关于iomem_resource在setup_arch里的操作如下:
- iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; //iomem_resource.end=0xffffffff
- code_resource.start = virt_to_phys(_text);
-
code_resource.end = virt_to_phys(_etext)-1;
-
data_resource.start = virt_to_phys(_etext);
-
data_resource.end = virt_to_phys(_edata)-1;
-
bss_resource.start = virt_to_phys(&__bss_start);
-
bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
-
-
x86_init.resources.probe_roms();// call x86_init_noop which does nothing
-
-
/* after parse_early_param, so could debug it */
- //将code_resource, data_resource和bss_resource加入iomem_resource,iomem_resource是一个超集,code_resource, data_resource 和bss_resource是iomem_resource中互不重叠的子集,这三个子集之间是平级关系,通过sibling域相互联通。
-
insert_resource(&iomem_resource, &code_resource);
-
insert_resource(&iomem_resource, &data_resource);
-
insert_resource(&iomem_resource, &bss_resource);
9. 计算max_pfn, max_low_pfn以确定HighMemory大小。
由于kernel的实现中0xC0000000到0xFFFFFFFF之间的区域使用线性映射的方式将3G-4G的逻辑地址映射到0-1G的物理地址,所以内核逻辑地址的上限是0xFFFFFFFF,而0-3G的逻辑地址空间则分配给应用程序使用。前面说过,从0xFFFFFFFF往下的128M空间被ioremap所占据,用于固定内存的映射,那么kernel实际能够线性映射的空间就是从0xC0000000开始的896M。当物理内存超过896M时,超出的物理内存将无法直接通过线性映射从逻辑地址转换到物理地址,所以就出现了HighMemory:超出896M的那部分物理内存被定义为HighMemory。当kernel需要使用这部分HighMemory的时候,需要先进行映射。
max_pfn的含义是最大的物理内存的页面号码。
max_low_pfn的含义是Kernel能够直接使用的页面号码。
HighMemory就是max_low_pfn到max_pfn之间的物理内存。
highmem_pages的含义是属于HighMemory的页面数。
max_pfn和max_low_pfn的计算方式如下:
- max_pfn = e820_end_of_ram_pfn();//已经知道e820里面是物理内存分配的情况。e820_end_of_ram_pfn通过遍历e820结构找出最大物理地址,并将其所处的页面作为max_pfn
-
-
/* update e820 for memory not covered by WB MTRRs */ /* MTRR是x86体系结构中的一组寄存器,用于指明物理内存中的哪些区域以何种方式被CPU缓存(Cache)以加快访问速度。*/
-
mtrr_bp_init();//initialize boot cpu mtrr. Actually it clean up the mtrr to make sure only Write Back cache memory range are included in mtrr memory range
-
if (mtrr_trim_uncached_memory(max_pfn))//update e820 to reserve mtrr memory range
-
max_pfn = e820_end_of_ram_pfn();//如果由于mtrr的原因导致e820被更新,那么重新计算max_pfn
-
-
#ifdef CONFIG_X86_32
-
/* max_low_pfn get updated here */
-
find_low_pfn_range(); //计算max_low_pfn,只看x86_32部分。
-
#else
-
num_physpages = max_pfn;
-
-
check_x2apic();
-
-
/* How many end-of-memory variables you have, */
-
/* need this before calling reserve_initrd */
-
if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
-
max_low_pfn = e820_end_of_low_ram_pfn();
-
else
-
max_low_pfn = max_pfn;
-
-
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
-
#endif
find_low_pfn_range()的定义如下:
- void __init find_low_pfn_range(void)
-
{
-
/* it could update max_pfn */
-
-
if (max_pfn <= MAXMEM_PFN)
-
lowmem_pfn_init();//如果物理内存小于MAXMEM
-
else
-
highmem_pfn_init();//如果物理内存大于MAXMEM
-
}
先来看MAXMEM_PFN的定义:
- #define MAXMEM_PFN PFN_DOWN(MAXMEM)
- #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
-
#ifdef CONFIG_X86_PAE
-
#define LAST_PKMAP 512
-
#else
-
#define LAST_PKMAP 1024
-
#endif
-
-
#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
-
& PMD_MASK)
-
-
#ifdef CONFIG_HIGHMEM
-
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
-
#else
-
# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
-
#endif
- unsigned int __VMALLOC_RESERVE = 128 << 20;
可以看到MAXMEM是由VMALLOC_END, PAGE_OFFSET和VMALLOC_RESERVE决定的。PAGE_OFFSET是0xC0000000。
从以上的定义可以看到,从0xFFFFFFFF的最高地址往下,依次有FIXADDR固定映射区、PKMAP区和VMALLOC区。在这三个区域以下才是kernel可以直接访问的物理内存的映射空间。
当max_pfn小于等于MAXMEM_PFN的时候,即实际具备的物理内存数不超过Kernel所能直接访问的物理内存数时,HighMemory存在与否取决于传给kernel的命令行参数highmem=x。如果没有传入该参数,HighMemory将不存在;如果传入了该参数,HighMemory将在lowmem_pfn_init里被设置。
- void __init lowmem_pfn_init(void)
-
{
-
/* max_low_pfn is 0, we already have early_res support */
-
max_low_pfn = max_pfn;
-
-
if (highmem_pages == -1) //highmem_pages在parse_highmem()设置
-
highmem_pages = 0;
-
#ifdef CONFIG_HIGHMEM
-
if (highmem_pages >= max_pfn) { //如果命令行要求的highmem大于实际内存数,就放弃设置highmem
-
printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
-
pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
-
highmem_pages = 0;
-
}
-
if (highmem_pages) {
-
if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {//如果需要的highmem小于实际内存数,但是物理内存数减去所需的highmem之后,给kernel留下的空间不足64M,则放弃设置highmem.
-
printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
-
pages_to_mb(highmem_pages));
-
highmem_pages = 0;
-
}
-
max_low_pfn -= highmem_pages;//根据highmem_pages所要求的数目调整max_low_pfn。
-
}
-
#else
-
if (highmem_pages)
-
printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-
#endif
-
}
当max_pfn大于MAXMEM_PFN,即实际存在的物理内存大于kernel能够直接访问的物理内存时,为了能够访问其余的内存,就需要设置HighMemory。highmem_pfn_init()进行了设置。
- /*
-
* We have more RAM than fits into lowmem - we try to put it into
-
* highmem, also taking the highmem=x boot parameter into account:
-
*/
-
void __init highmem_pfn_init(void)
-
{
-
max_low_pfn = MAXMEM_PFN;
-
-
if (highmem_pages == -1)//如果命令行没有设置highmem_pages,那么默认highmem_pages就是实际物理内存数减去kernel可直接访问的最大物理内存。
-
highmem_pages = max_pfn - MAXMEM_PFN;
-
-
if (highmem_pages + MAXMEM_PFN < max_pfn)//如果命令行所要求的highmem+MAXMEM小于实际存在的物理内存数,那么实际可用的物理内存数目就要相应减少。
-
max_pfn = MAXMEM_PFN + highmem_pages;
-
-
if (highmem_pages + MAXMEM_PFN > max_pfn) {//如果所需的highmem+MAXMEM大于实际存在的物理内存数。则不配置highmem
-
printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
-
pages_to_mb(max_pfn - MAXMEM_PFN),
-
pages_to_mb(highmem_pages));
-
highmem_pages = 0;
-
}
-
#ifndef CONFIG_HIGHMEM
-
/* Maximum memory usable is what is directly addressable */
-
printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
-
if (max_pfn > MAX_NONPAE_PFN)
-
printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
-
else
-
printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
-
max_pfn = MAXMEM_PFN;//如果编译kernel时没有定义CONFIG_HIGHMEM,就应该把max_pfn从实际物理内存数改变成为MAXMEM从而取消HIGHMEM
-
#else /* !CONFIG_HIGHMEM */
-
#ifndef CONFIG_HIGHMEM64G
-
if (max_pfn > MAX_NONPAE_PFN) {//如果编译kernel时没有打开PAE以打开CPU的PAE寻址,那么最大可用内存不可超过4G.
-
max_pfn = MAX_NONPAE_PFN;
-
printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
-
}
-
#endif /* !CONFIG_HIGHMEM64G */
-
#endif /* !CONFIG_HIGHMEM */
-
}
10. 继续分配和预留内存空间。
- /*
-
* Need to conclude brk, before memblock_x86_fill()
-
* it could use memblock_find_in_range, could overlap with
-
* brk area.
-
*/
-
reserve_brk(); //将brk段保留在memblock.reserved里
-
-
memblock.current_limit = get_max_mapped(); //对于x86_32系统来说,max_pfn_mapped在head_32.S里面被初始化。 详见http://blog.chinaunix.net/space.php?uid=1701789&do=blog&id=154125
-
memblock_x86_fill();//将e820中的E820_RAM类型和E820_RESERVED_KERN类型的内存区域加入到memblock.memory中
-
-
/* preallocate 4k for mptable mpc */
-
early_reserve_e820_mpc_new();//在memblock中找到一块4K的空闲内存以供mptable使用。并更新memblock和e820反映这个变化。
-
-
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
-
setup_bios_corruption_check();
-
#endif
-
-
printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
-
max_pfn_mapped<<PAGE_SHIFT);
-
-
reserve_trampoline_memory();
-
-
#ifdef CONFIG_ACPI_SLEEP
-
/*
-
* Reserve low memory region for sleep support.
-
* even before init_memory_mapping
-
*/
-
acpi_reserve_wakeup_memory();//在memblock中保留(wakeup_code_end-wakeup_code_start)大小的空间为ACPI S3代码预留空间
-
#endif
11. 继续初始化页表。页表在head_32.S里面已经进行了一部分的初始化。但是在head_32.S中,页表的初始化并没有覆盖所有的内存空间,而是只覆盖到_end+MAPPING_BEYOND_END为止。这里就对页表继续进行初始化。
- init_gbpages();//这个函数的定义是空的。没有任何的操作。
-
-
/* max_pfn_mapped is updated here */
-
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);//将0-max_low_pfn之间的页面映射到页表。
-
max_pfn_mapped = max_low_pfn_mapped;
-
-
#ifdef CONFIG_X86_64
-
if (max_pfn > max_low_pfn) {
-
max_pfn_mapped = init_memory_mapping(1UL<<32,
-
max_pfn<<PAGE_SHIFT);
-
/* can we preseve max_low_pfn ?*/
-
max_low_pfn = max_pfn;
-
}
-
#endif
-
memblock.current_limit = get_max_mapped();
init_memory_mapping的定义如下。
- /*
-
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
-
* This runs before bootmem is initialized and gets pages directly from
-
* the physical memory. To access them they are temporarily mapped.
-
*/
-
unsigned long __init_refok init_memory_mapping(unsigned long start,
-
unsigned long end)
-
{
-
unsigned long page_size_mask = 0;
-
unsigned long start_pfn, end_pfn;
-
unsigned long ret = 0;
-
unsigned long pos;
-
-
struct map_range mr[NR_RANGE_MR];
-
int nr_range, i;
-
int use_pse, use_gbpages;
-
-
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-
/*
-
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-
* This will simplify cpa(), which otherwise needs to support splitting
-
* large pages into small in interrupt context, etc.
-
*/
-
use_pse = use_gbpages = 0;
-
#else
-
use_pse = cpu_has_pse;
-
use_gbpages = direct_gbpages;
-
#endif
-
-
/* Enable PSE if available */
-
if (cpu_has_pse)//如果CPU支持PSE,就设置CR4中的PSE位
-
set_in_cr4(X86_CR4_PSE);
-
-
/* Enable PGE if available */
-
if (cpu_has_pge) {//如果CPU支持PGE,就设置CR4中的PGE位
-
set_in_cr4(X86_CR4_PGE);
-
__supported_pte_mask |= _PAGE_GLOBAL;
-
}
-
-
if (use_gbpages)//如果使用1GB大小的页面,就设置PG_LEVEL_1G的位在page_size_mask里
-
page_size_mask |= 1 << PG_LEVEL_1G;
-
if (use_pse)//如果使用PSE那么就设置PG_LEVEL_2M位在page_size_mask中
-
page_size_mask |= 1 << PG_LEVEL_2M;
-
-
memset(mr, 0, sizeof(mr)); //清空mr
-
nr_range = 0;
-
-
/* head if not big page alignment ? */
-
start_pfn = start >> PAGE_SHIFT;//按照start的地址计算start_pfn
-
pos = start_pfn << PAGE_SHIFT; //pos就是start_pfn的起始地址
-
#ifdef CONFIG_X86_32
-
/*
-
* Don't use a large page for the first 2/4MB of memory
-
* because there are often fixed size MTRRs in there
-
* and overlapping MTRRs into large pages can cause
-
* slowdowns.
-
*/
- /*在只考虑32位寻址能力而不考虑PAE的X86系统上,PAGE_SHIFT=12, PMD_SHIFT=PUD_SHIFT=PGDIR_SHIFT=22,即名义上Linux使用4级页面转换机制,而实际上只使用2级页面转换机制*/
-
if (pos == 0)
-
end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);//end_pfn=(1<<PMD_SHIFT)>>PAGE_SHIFT。其含义是start_pfn和end_pfn代表了第一个PMD项所指向的页表所指向的内存区域0-4M。
-
else
-
end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-
<< (PMD_SHIFT - PAGE_SHIFT);//如果start不是从第0个page作为起点,那么start_pfn和end_pfn就应该指向start_pfn为起始的那个PMD项所指向的整个页表所代表的4M空间。
-
#else /* CONFIG_X86_64 */
-
end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-
<< (PMD_SHIFT - PAGE_SHIFT);
-
#endif
-
if (end_pfn > (end >> PAGE_SHIFT))
-
end_pfn = end >> PAGE_SHIFT;
-
if (start_pfn < end_pfn) {
-
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);//保存start_pfn和end_pfn在mr中
-
pos = end_pfn << PAGE_SHIFT;
-
}
-
-
/* big page (2M) range */
-
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-
<< (PMD_SHIFT - PAGE_SHIFT); //start_pfn指向上个range的end_pfn。理想状态下pos+PMD_SIZE-1不会造成对于PMD_SIZE的进位,所以start_pfn==end_pfn
-
#ifdef CONFIG_X86_32
-
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);//对于X86_32来说,这个range的end_pfn就是指向end地址的PMD的页面号
-
#else /* CONFIG_X86_64 */
-
end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-
<< (PUD_SHIFT - PAGE_SHIFT);
-
if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-
end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-
#endif
-
-
if (start_pfn < end_pfn) {
-
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-
page_size_mask & (1<<PG_LEVEL_2M));//将start_pfn,end_pfn加入mr,同时指出该区域如果可能的话使用2M的页
-
pos = end_pfn << PAGE_SHIFT;
-
}
-
-
#ifdef CONFIG_X86_64
-
/* big page (1G) range */
-
start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-
<< (PUD_SHIFT - PAGE_SHIFT);
-
end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-
if (start_pfn < end_pfn) {
-
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-
page_size_mask &
-
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-
pos = end_pfn << PAGE_SHIFT;
-
}
-
-
/* tail is not big page (1G) alignment */
-
start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-
<< (PMD_SHIFT - PAGE_SHIFT);
-
end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-
if (start_pfn < end_pfn) {
-
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-
page_size_mask & (1<<PG_LEVEL_2M));
-
pos = end_pfn << PAGE_SHIFT;
-
}
-
#endif
-
-
/* tail is not big page (2M) alignment */
-
start_pfn = pos>>PAGE_SHIFT;
-
end_pfn = end>>PAGE_SHIFT;
-
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);//建立第三个memory range,将末尾未按PMD_SIZE对齐的剩余内存(即页面号(end>>PMD_SHIFT)<<(PMD_SHIFT-PAGE_SHIFT)之后的部分)以4K页面进行映射
-
-
/* try to merge same page size and continuous */ //如果相邻的两个memory range首尾相连并且page_size_mask相同,那么就合并。
-
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-
unsigned long old_start;
-
if (mr[i].end != mr[i+1].start ||
-
mr[i].page_size_mask != mr[i+1].page_size_mask)
-
continue;
-
/* move it */
-
old_start = mr[i].start;
-
memmove(&mr[i], &mr[i+1],
-
(nr_range - 1 - i) * sizeof(struct map_range));
-
mr[i--].start = old_start;
-
nr_range--;
-
}
-
-
//OK now, there is no overlap between every range in mr
-
for (i = 0; i < nr_range; i++)
-
printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-
mr[i].start, mr[i].end,
-
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-
/*
-
* Find space for the kernel direct mapping tables.
-
*
-
* Later we should allocate these tables in the local node of the
-
* memory mapped. Unfortunately this is done currently before the
-
* nodes are discovered.
-
*/
-
if (!after_bootmem)
-
find_early_table_space(end, use_pse, use_gbpages); //寻找一块连续的空间能够存放所有PUD, PMD, PT表。全局变量e820_table_start用来描述这段内存的开始地址,e820_table_end用来描述这块内存已经使用了的最大地址。e820_table_top用来描述这段内存区域的结束地址。
-
-
for (i = 0; i < nr_range; i++) //真正建立pgd, pt的戏码在这里呢。
-
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-
mr[i].page_size_mask);
-
-
#ifdef CONFIG_X86_32
-
early_ioremap_page_table_range_init();//在新pgd, pmd, pt的结构下刷新固定内存映射。
-
-
load_cr3(swapper_pg_dir);//启用新的pgd, pmd, pt
-
#endif
-
-
#ifdef CONFIG_X86_64
-
if (!after_bootmem && !start) {
-
pud_t *pud;
-
pmd_t *pmd;
-
-
mmu_cr4_features = read_cr4();
-
-
/*
-
* _brk_end cannot change anymore, but it and _end may be
-
* located on different 2M pages. cleanup_highmap(), however,
-
* can only consider _end when it runs, so destroy any
-
* mappings beyond _brk_end here.
-
*/
-
pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-
pmd = pmd_offset(pud, _brk_end - 1);
-
while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-
pmd_clear(pmd);
-
}
-
#endif
-
__flush_tlb_all();
-
-
if (!after_bootmem && e820_table_end > e820_table_start)
-
memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
-
e820_table_end << PAGE_SHIFT, "PGTABLE");//将pgd, pud, pmd, pt所占据的空间加入memblock.reserve中
-
-
if (!after_bootmem)
-
early_memtest(start, end);
-
-
return ret >> PAGE_SHIFT;
-
}
再来看一下实际建立2级页表的过程。
- /*
-
* This maps the physical memory to kernel virtual address space, a total
-
* of max_low_pfn pages, by creating page tables starting from address
-
* PAGE_OFFSET:
-
*/
-
unsigned long __init
-
kernel_physical_mapping_init(unsigned long start,
-
unsigned long end,
-
unsigned long page_size_mask)
-
{
-
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
-
unsigned long last_map_addr = end;
-
unsigned long start_pfn, end_pfn;
-
pgd_t *pgd_base = swapper_pg_dir;
-
int pgd_idx, pmd_idx, pte_ofs;
-
unsigned long pfn;
-
pgd_t *pgd;
-
pmd_t *pmd;
-
pte_t *pte;
-
unsigned pages_2m, pages_4k;
-
int mapping_iter;
-
-
start_pfn = start >> PAGE_SHIFT;
-
end_pfn = end >> PAGE_SHIFT;
-
-
/*
-
* First iteration will setup identity mapping using large/small pages
-
* based on use_pse, with other attributes same as set by
-
* the early code in head_32.S
-
*
-
* Second iteration will setup the appropriate attributes (NX, GLOBAL..)
-
* as desired for the kernel identity mapping.
-
*
-
* This two pass mechanism conforms to the TLB app note which says:
-
*
-
* "Software should not write to a paging-structure entry in a way
-
* that would change, for any linear address, both the page size
-
* and either the page frame or attributes."
-
*/
-
mapping_iter = 1;
-
-
if (!cpu_has_pse)
-
use_pse = 0;
-
-
repeat:
-
pages_2m = pages_4k = 0;
-
pfn = start_pfn;
//pgd, pmd, pte实现的是0xC0000000以上的虚拟地址转换成0x00000000的物理地址,所以当pgd, pmd, pte建立起来之后将不需要使用pa, va来进行虚拟地址和物理地址的转换。而对于现在的mapping建立,由于pfn是基于物理地址的,所以pgd_idx在计算pgd entry的时候需要加上PAGE_OFFSET.
-
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
-
pgd = pgd_base + pgd_idx;
- //以下的循环建立了所有的pgd entry
-
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
-
pmd = one_md_table_init(pgd);//在e820_table中获得一个页面作为pmd table.如果已经对相应pgd存在一个则返回该pmd的地址。
-
-
if (pfn >= end_pfn)
-
continue;
-
#ifdef CONFIG_X86_PAE
-
pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
-
pmd += pmd_idx;
-
#else
-
pmd_idx = 0;
-
#endif
- //以下循环为每一个pgd entry即pmd中的pmd entry建立page table.
- for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
-
pmd++, pmd_idx++) {
-
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; //addr是给定pfn的起始地址
-
-
/*
-
* Map with big pages if possible, otherwise
-
* create normal page tables:
-
*/
-
if (use_pse) { //如果使用了2M的page
-
unsigned int addr2;
-
pgprot_t prot = PAGE_KERNEL_LARGE;//该page的属性是PAGE_KERNEL_LARGE
-
/*
-
* first pass will use the same initial
-
* identity mapping attribute + _PAGE_PSE.
-
*/
-
pgprot_t init_prot =
-
__pgprot(PTE_IDENT_ATTR |
-
_PAGE_PSE);
-
-
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
-
PAGE_OFFSET + PAGE_SIZE-1;
//只要addr或addr2之中有一个落在kernel_text的内存空间里面,我们就把这个page设为PAGE_KERNEL_LARGE_EXEC.这里addr2是:假设该page是2M的,该2M空间的结束地址是addr2.
-
if (is_kernel_text(addr) ||
-
is_kernel_text(addr2))
-
prot = PAGE_KERNEL_LARGE_EXEC;//如果开始地址或结束地址在kernel_text之内,则设置页面属性为PAGE_KERNEL_LARGE_EXEC.
-
-
pages_2m++; //pages_2m是对于使用2M page的计数
-
- //将pmd和物理pfn相映射以设置pmd entry
-
if (mapping_iter == 1)
-
set_pmd(pmd, pfn_pmd(pfn, init_prot));
-
else
-
set_pmd(pmd, pfn_pmd(pfn, prot));
-
-
pfn += PTRS_PER_PTE;
-
continue; //使用大页的情况下,pmd entry通过set_pmd直接指向物理页面,因此跳过下面page table的建立,地址转换到pgd->pmd这一级就结束了。
-
}
-
pte = one_page_table_init(pmd); //在e820_table里面获得一个页面作为page table
-
-
pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
-
pte += pte_ofs; //计算出给定pfn对应的page table entry的位置。
- //以下循环设置了page table中每一个page table entry.
-
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
-
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
-
pgprot_t prot = PAGE_KERNEL;
-
/*
-
* first pass will use the same initial
-
* identity mapping attribute.
-
*/
-
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
-
-
if (is_kernel_text(addr))
-
prot = PAGE_KERNEL_EXEC;
-
-
pages_4k++;
//设置page table entry, 将pte与物理pfn相映射
-
if (mapping_iter == 1) {
-
set_pte(pte, pfn_pte(pfn, init_prot));
-
last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
-
} else
-
set_pte(pte, pfn_pte(pfn, prot));
-
}
-
}
-
}
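//这里为什么要分两次对pte的属性进行设置,而不是一次完成呢?函数开头的注释给出了原因:按照TLB app note的要求,软件不应该在一次写入中同时改变某个线性地址对应的页面大小和页帧/属性。所以第一遍先沿用head_32.S里的初始属性建立映射并刷新TLB,第二遍再设置最终需要的属性。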
-
if (mapping_iter == 1) {
-
/*
-
* update direct mapping page count only in the first
-
* iteration.
-
*/
-
update_page_count(PG_LEVEL_2M, pages_2m);
-
update_page_count(PG_LEVEL_4K, pages_4k);
-
-
/*
-
* local global flush tlb, which will flush the previous
-
* mappings present in both small and large page TLB's.
-
*/
-
__flush_tlb_all();
-
-
/*
-
* Second iteration will set the actual desired PTE attributes.
-
*/
-
mapping_iter = 2;
-
goto repeat;
-
}
-
return last_map_addr;
-
}
12. 接下来是一些杂七杂八的初始化:
- reserve_initrd();//如果需要的话,将initrd移动位置。
-
-
reserve_crashkernel();//如果在启动命令行指定了crashkernel大小,就在memblock.memory中划分一块内存作为crashkernel。
-
-
vsmp_init(); //这个函数只在x64系统中使用。暂时不需要看。
-
-
io_delay_init(); //这个函数检查BIOS中的dmi,如果DMI_BOARD_VENDOR, DMI_BOARD_NAME匹配,就调用dmi_io_delay_0xed_port。只有HP的特定机型会匹配DMI_BOARD_VENDOR和DMI_BOARD_NAME
-
-
/*
-
* Parse the ACPI tables for possible boot-time SMP configuration.
-
*/
-
acpi_boot_table_init();//初始化acpi table,最终的结果放在全局变量里initial_tables
-
-
early_acpi_boot_init(); //Process the Multiple APIC Description Table (MADT)
13. initmem_init(). initmem_init的作用就在于建立了mem_section的概念。
- #ifndef CONFIG_NEED_MULTIPLE_NODES
-
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-
int acpi, int k8)
-
{
-
#ifdef CONFIG_HIGHMEM
-
highstart_pfn = highend_pfn = max_pfn;
-
if (max_pfn > max_low_pfn)
-
highstart_pfn = max_low_pfn;
-
memblock_x86_register_active_regions(0, 0, highend_pfn);//建立early_node_map, 将0-highend_pfn的空间(0-最大实际物理内存页码)放入到early_node_map里,并且其nodeid是0。实际上内核按照当前memblock.memory的map状况将0-highend_pfn之间的memory range放入到early_node_map。
-
sparse_memory_present_with_active_regions(0);//将放入early_node_map中并且nodeid是0的memory region的page放入另外一个mem_section的数组中。这里引入了mem_section的概念,其引入这个概念的目的可能是内存分配更加高效。kernel中一个mem_section包含了物理地址相连的固定数目个page。在x86系统中,如果没有PAE的状况下, 一个mem_section可以包含16384个page。所以从一个给定地址,我们可以得到它所在的page,也能得到它所在的mem_section的下标。而sparse_memory_present_with_active_regions的作用就是为给定node所拥有的物理page建立mem_sections.
-
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-
pages_to_mb(highend_pfn - highstart_pfn));
-
num_physpages = highend_pfn;
-
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; //high_memory就是指向highmemory起始的虚拟地址。
-
#else
-
memblock_x86_register_active_regions(0, 0, max_low_pfn);
-
sparse_memory_present_with_active_regions(0);
-
num_physpages = max_low_pfn;
-
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-
#endif
-
#ifdef CONFIG_FLATMEM
-
max_mapnr = num_physpages;
-
#endif
-
__vmalloc_start_set = true;
-
-
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-
pages_to_mb(max_low_pfn));
-
-
setup_bootmem_allocator();//将after_bootmem置1,标志我们已经完成了bootmem初始化.
-
}
-
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
14. 预留DMA空间
- memblock_find_dma_reserve();// 0-16M的物理内存空间是DMA可以使用的空间,这个函数的意义是找出0-16M空间中有多大的内存已经被memblock.reserved保留而无法分配给后续的DMA操作。
- dma32_reserve_bootmem(); //在x86_32上是空函数
15. paging_init进行kmap初始化,sparse memory初始化和zone初始化。
- x86_init.paging.pagetable_setup_start(swapper_pg_dir); //call native_pagetable_setup_start.实际上是把max_low_pfn以上的可能存在的pte进行清空
-
paging_init();
-
x86_init.paging.pagetable_setup_done(swapper_pg_dir); //call native_pagetable_setup_done。实际是空函数。
在paging_init以前,已经建立的页表包括从0xC0000000开始、覆盖物理页面0到max_low_pfn的线性映射区,以及FIXMAP。而在paging_init里,对kmap进行了页表映射。
- void __init paging_init(void)
-
{
-
pagetable_init();//这个函数相对简单,其作用是在swapper_pg_dir所指向的pgd中建立从PKMAP_BASE到PKMAP_BASE+PAGE_SIZE*LAST_PKMAP页映射所需要的pgd, pmd, pud, pte.PKMAP_BASE到PKMAP_BASE+PAGE_SIZE*LAST_PKMAP所指向的空间是在FIXMAP之下的LAST_PKMAP个页面空间。其为kmap建立一个固定的内存映射区域,为kernel访问HighMemory(896M以上的物理内存)服务。
-
-
__flush_tlb_all();
-
-
kmap_init();//根据enum fixed_addresses中的FIX_KMAP_BEGIN和FIX_KMAP_END的定义在FIXMAP中为KMAP建立一个映射区域,该区域所能映射的page个数以及每个page所使用的目的在enum km_type中定义。每个CPU核心都有一套kmap page相对应。
- /*#ifdef CONFIG_X86_32
- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
- #ifdef CONFIG_PCI_MMCONFIG
- enum km_type {
- KM_BOUNCE_READ,
- KM_SKB_SUNRPC_DATA,
- KM_SKB_DATA_SOFTIRQ,
- KM_USER0,
- KM_USER1,
- KM_BH_IRQ,
- KM_SOFTIRQ0,
- KM_SOFTIRQ1,
- KM_TYPE_NR
- };
- */
-
-
/*
-
* NOTE: at this point the bootmem allocator is fully available.
-
*/
-
sparse_init(); //查了一下,sparse memory和热插拔memory相关,一般使用在server和SMP上,在这种环境下,memory dimm的地址是不连续的,并且在插拔dimm的时候也不会存在相同memory地址的变化,所以在kernel里面必须有能力处理这种非连续性的内存地址。sparse memory的作用就在这里。这部分先略过不看。
-
zone_sizes_init();//初始化zone
-
}
再看一下zone_sizes_init到底做了点什么:
- static void __init zone_sizes_init(void)
-
{
-
unsigned long max_zone_pfns[MAX_NR_ZONES];
-
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-
max_zone_pfns[ZONE_DMA] =
-
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; //dma zone is 0-16M
-
max_zone_pfns[ZONE_NORMAL] = max_low_pfn; //zone normal is 16M-max_low_pfn
-
#ifdef CONFIG_HIGHMEM
-
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; //HIGHMEM Zone is max_low_pfn -highend_pfn
-
#endif
-
-
free_area_init_nodes(max_zone_pfns);//按照以上对于max_zone_pfns的设定进行zone的初始化
-
}
好了,不得不再看看free_area_init_nodes在干些什么:
- /**
-
* free_area_init_nodes - Initialise all pg_data_t and zone data
-
* @max_zone_pfn: an array of max PFNs for each zone
-
*
-
* This will call free_area_init_node() for each active node in the system.
-
* Using the page ranges provided by add_active_range(), the size of each
-
* zone in each node and their holes is calculated. If the maximum PFN
-
* between two adjacent zones match, it is assumed that the zone is empty.
-
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
-
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
-
* starts where the previous one ended. For example, ZONE_DMA32 starts
-
* at arch_max_dma_pfn.
-
*/
-
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
-
{
-
unsigned long nid;
-
int i;
-
-
/* Sort early_node_map as initialisation assumes it is sorted */
-
sort_node_map();
-
-
/* Record where the zone boundaries are */
- /*以下的代码对于全局变量arch_zone_lowest_possible_pfn[]
- 和arch_zone_highest_possible_pfn[]进行设置,以确定每个zone的lowest_pfn和highest_pfn.依次初始化设定ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM的lowest pfn和highest pfn.并且把ZONE_MOVABLE的lowest pfn和highest pfn都设为0.
- */
-
memset(arch_zone_lowest_possible_pfn, 0,
-
sizeof(arch_zone_lowest_possible_pfn));
-
memset(arch_zone_highest_possible_pfn, 0,
-
sizeof(arch_zone_highest_possible_pfn));
-
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();//find lowest pfn of physical memory can be used. here it is 0
-
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; //it refer to MAX_DMA_ADDRESS
-
for (i = 1; i < MAX_NR_ZONES; i++) {
-
if (i == ZONE_MOVABLE)
-
continue;
-
arch_zone_lowest_possible_pfn[i] =
-
arch_zone_highest_possible_pfn[i-1]; //previous higest pfn is the lowest pfn in the next region
-
arch_zone_highest_possible_pfn[i] =
-
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);//highest pfn is max_zone_pfn[i]
-
}
-
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
-
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; //now arch_zone[ZONE_MOVABLE] is empty
-
-
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
- /* 为每一个cpu node建立zone_movable_pfn。ZONE_MOVABLE是在kernel启动时由命令行传入的参数,意义在于指明内核空间中哪些page是可以移动的,其他的内核page则称为kernel core,是不可以移动的。find_zone_movable_pfns_for_nodes的作用就是按照early_node_map根据每个node的不同内存分布计算出每一个node中movable page的数量
- */
-
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-
find_zone_movable_pfns_for_nodes(zone_movable_pfn); //
-
-
/* Print out the zone ranges */
-
printk("Zone PFN ranges:\n");
-
for (i = 0; i < MAX_NR_ZONES; i++) {
-
if (i == ZONE_MOVABLE)
-
continue;
-
printk(" %-8s ", zone_names[i]);
-
if (arch_zone_lowest_possible_pfn[i] ==
-
arch_zone_highest_possible_pfn[i])
-
printk("empty\n");
-
else
-
printk("%0#10lx -> %0#10lx\n",
-
arch_zone_lowest_possible_pfn[i],
-
arch_zone_highest_possible_pfn[i]);
-
}
-
-
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
-
printk("Movable zone start PFN for each node\n");
-
for (i = 0; i < MAX_NUMNODES; i++) {
-
if (zone_movable_pfn[i])
-
printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
-
}
-
-
/* Print out the early_node_map[] */
-
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
-
for (i = 0; i < nr_nodemap_entries; i++)
-
printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
-
early_node_map[i].start_pfn,
-
early_node_map[i].end_pfn);
-
-
/* Initialise every node */
-
mminit_verify_pageflags_layout();
-
setup_nr_node_ids();
/*以下的for是按照early_node_map和每一个zone的内存分配建立全局变量node_data[]
*/
-
for_each_online_node(nid) {
-
pg_data_t *pgdat = NODE_DATA(nid);//对于NUMA来说每一个node都有一个pg_data_t结构描述该node对应的内存分配状况,在单CPU的环境下,只有一个node存在
-
free_area_init_node(nid, NULL,
-
find_min_pfn_for_node(nid), NULL);//对于每个node,初始化其memory分配。具体如何分配看以下的代码
-
-
/* Any memory on that node */
-
if (pgdat->node_present_pages)
-
node_set_state(nid, N_HIGH_MEMORY);
-
check_for_regular_memory(pgdat);
-
}
-
}
- void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
-
unsigned long node_start_pfn, unsigned long *zholes_size)
-
{
-
pg_data_t *pgdat = NODE_DATA(nid);
-
-
pgdat->node_id = nid;
-
pgdat->node_start_pfn = node_start_pfn;//该node的开始页面号
-
calculate_node_totalpages(pgdat, zones_size, zholes_size);//计算对于该node来说有多少pages可用。分别设置了pgdat->node_spanned_pages为总共算上hole有多少page,pgdat->node_present_pages为不计hole有多少实际可用的page
-
-
alloc_node_mem_map(pgdat);//为node在memblock.memory或者bootmem中分配一块大小为pgdat->node_spanned_pages大小的内存,并把内存的起始地址放置在pgdat->node_mem_map中。该分配的原则是,如果kernel编译过程中没有使用bootmem,则在memblock.memory中找寻一块内存区域并且该内存区域应该归该node所有,即early_node_map[]的nid是该node.
-
#ifdef CONFIG_FLAT_NODE_MEM_MAP
-
printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
-
nid, (unsigned long)pgdat,
-
(unsigned long)pgdat->node_mem_map);
-
#endif
-
-
free_area_init_core(pgdat, zones_size, zholes_size);
-
}
阅读(8041) | 评论(0) | 转发(4) |