/*****************************************************************************************************************************************/
/* head-armv.S */
/*
* linux/arch/arm/kernel/head-armv.S
*
* Copyright (C) 1994-1999 Russell King
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* 32-bit kernel startup code for all architectures
*/
#include
#include
#include
#include
#include
#define K(a,b,c) ((a) << 24 | (b) << 12 | (c))
/*
* We place the page tables 16K below TEXTADDR. Therefore, we must make sure
* that TEXTADDR is correctly set. Currently, we expect the least significant
* "short" to be 0x8000, but we could probably relax this restriction to
* TEXTADDR > PAGE_OFFSET + 0x4000
*
* Note that swapper_pg_dir is the virtual address of the page tables, and
* pgtbl gives us a position-independent reference to these tables. We can
* do this because stext == TEXTADDR
*
* swapper_pg_dir, pgtbl and krnladr are all closely related.
*/
#if (TEXTADDR & 0xffff) != 0x8000
#error TEXTADDR must start at 0xXXXX8000
#endif
.globl SYMBOL_NAME(swapper_pg_dir)
.equ SYMBOL_NAME(swapper_pg_dir), TEXTADDR - 0x4000 /* the page tables live 16K below the kernel text, i.e. at 0xc0004000 */
.macro pgtbl, reg, rambase
adr \reg, stext /* stext is the kernel entry address, e.g. 0x30008000 */
sub \reg, \reg, #0x4000
.endm
/*
* Since the page table is closely related to the kernel start address, we
* can convert the page table base address to the base address of the section
* containing both.
*/
.macro krnladr, rd, pgtable, rambase
bic \rd, \pgtable, #0x000ff000
.endm
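/* To make the address arithmetic of the two macros concrete, here is a small
stand-alone C sketch (not kernel code). The example values match the
annotations below: the kernel loaded at physical 0x30008000, i.e. RAM starting
at 0x30000000. */
#include <stdio.h>

int main(void)
{
	unsigned long stext_phys = 0x30008000UL;           /* physical address of stext            */
	unsigned long pgtbl      = stext_phys - 0x4000;    /* pgtbl: 16K below stext -> 0x30004000 */
	unsigned long krnladr    = pgtbl & ~0x000ff000UL;  /* krnladr: clear bits [19:12] -> 0x30000000 */

	printf("page table base = %#lx\n", pgtbl);         /* physical swapper_pg_dir              */
	printf("kernel 1MB base = %#lx\n", krnladr);       /* section containing the kernel        */
	return 0;
}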
/*
* Kernel startup entry point.
* ---------------------------
*
* This is normally called from the decompressor code. The requirements
* are: MMU = off, D-cache = off, I-cache = don't care, r0 = 0,
* r1 = machine nr.
*
* This code is mostly position independent, so if you link the kernel at
* 0xc0008000, you call this at __pa(0xc0008000).
*
* See linux/arch/arm/tools/mach-types for the complete list of machine
* numbers for r1.
*
* We're trying to keep crap to a minimum; DO NOT add any machine specific
* crap here - that's what the boot loader (or in extreme, well justified
* circumstances, zImage) is for.
*/
.section ".text.init",#alloc,#execinstr
.type stext, #function
ENTRY(stext)
mov r12, r0
/*
* NOTE! Any code which is placed here should be done for one of
* the following reasons:
*
* 1. Compatibility with old production boot firmware (i.e., users
* actually have and are booting the kernel with the old firmware)
* and therefore will be eventually removed.
* 2. Cover the case when there is no boot firmware. This is not
* ideal, but in this case, it should ONLY set r0 and r1 to the
* appropriate value.
*/
#if defined(CONFIG_ARCH_NETWINDER)
/*
* Compatibility cruft for old NetWinder NeTTroms. This
* code is currently scheduled for destruction in 2.5.xx
*/
.rept 8
mov r0, r0
.endr
adr r2, 1f
ldmdb r2, {r7, r8}
and r3, r2, #0xc000
teq r3, #0x8000
beq __entry
bic r3, r2, #0xc000
orr r3, r3, #0x8000
mov r0, r3
mov r4, #64
sub r5, r8, r7
b 1f
.word _stext
.word __bss_start
1:
.rept 4
ldmia r2!, {r6, r7, r8, r9}
stmia r3!, {r6, r7, r8, r9}
.endr
subs r4, r4, #64
bcs 1b
movs r4, r5
mov r5, #0
movne pc, r0
mov r1, #MACH_TYPE_NETWINDER @ (will go in 2.5)
mov r12, #2 << 24 @ scheduled for removal in 2.5.xx
orr r12, r12, #5 << 12
__entry:
#endif
#if defined(CONFIG_ARCH_L7200)
/*
* FIXME - No bootloader, so manually set 'r1' with our architecture number.
*/
mov r1, #MACH_TYPE_L7200
#endif
mov r0, #F_BIT | I_BIT | MODE_SVC @ make sure svc mode
msr cpsr_c, r0 @ and all irqs disabled
bl __lookup_processor_type
teq r10, #0 @ invalid processor?
moveq r0, #'p' @ yes, error 'p'
beq __error
bl __lookup_architecture_type
teq r7, #0 @ invalid architecture?
moveq r0, #'a' @ yes, error 'a'
beq __error
bl __create_page_tables /* create the page tables */
adr lr, __ret @ return address
add pc, r10, #12 @ initialise processor
@ (return control reg)
.type __switch_data, %object
__switch_data: .long __mmap_switched
.long SYMBOL_NAME(__bss_start)
.long SYMBOL_NAME(_end)
.long SYMBOL_NAME(processor_id)
.long SYMBOL_NAME(__machine_arch_type)
.long SYMBOL_NAME(cr_alignment)
.long SYMBOL_NAME(init_task_union)+8192
/*
* Enable the MMU. This completely changes the structure of the visible
* memory space. You will not be able to trace execution through this.
* If you have an enquiry about this, *please* check the linux-arm-kernel
* mailing list archives BEFORE sending another post to the list.
*/
.type __ret, %function
__ret: ldr lr, __switch_data
mcr p15, 0, r0, c1, c0 /* write the control register: enable the MMU */
mrc p15, 0, r0, c1, c0, 0 @ read it back.
mov r0, r0
mov r0, r0
mov pc, lr
/*
* The following fragment of code is executed with the MMU on, and uses
* absolute addresses; this is not position independent.
*
* r0 = processor control register
* r1 = machine ID
* r9 = processor ID
*/
.align 5
__mmap_switched:
adr r3, __switch_data + 4
ldmia r3, {r4, r5, r6, r7, r8, sp}@ r2 = compat
@ sp = stack pointer
mov fp, #0 @ Clear BSS (and zero fp)
1: cmp r4, r5
strcc fp, [r4],#4
bcc 1b
str r9, [r6] @ Save processor ID
str r1, [r7] @ Save machine type
#ifdef CONFIG_ALIGNMENT_TRAP
orr r0, r0, #2 @ ...........A.
#endif
bic r2, r0, #2 @ Clear 'A' bit
stmia r8, {r0, r2} @ Save control register values
b SYMBOL_NAME(start_kernel)
/*
* Setup the initial page tables. We only set up the bare minimum
* required to get the kernel running, which generally means
* mapping in the kernel code.
*
* We only map in 4MB of RAM, which should be sufficient in
* all cases.
*
* r5 = physical address of start of RAM
* r6 = physical IO address
* r7 = byte offset into page tables for IO
* r8 = page table flags
*/
/* Map 4MB of physical memory.
r5 = physical start address of RAM, e.g. 0x30000000
r6 = physical IO address
r7 = byte offset of the IO space into the page tables
r8 = page table access permissions and attributes
*/
__create_page_tables: /* create the page tables */
/*
.macro pgtbl, reg, rambase
adr \reg, stext // stext is the kernel entry address, e.g. 0x30008000
sub \reg, \reg, #0x4000
.endm
*/
pgtbl r4, r5 @ page table address /* r4 = 0x30004000 = base address of the swapper_pg_dir[] array */
/*
pgtbl r4, r5
expands to:
adr r4, stext // r4 = 0x30008000
sub r4, r4, #0x4000 // r4 = 0x30004000 = base address of the swapper_pg_dir[] array
*/
/*
* Clear the 16K level 1 swapper page table
*/
mov r0, r4 /* r0 = r4 = 0x30004000 = swapper_pg_dir[] */
mov r3, #0 /* r3 = 0 */
add r2, r0, #0x4000 /* r2 = 0x30008000 */
/* zero the 16K from 0x30004000 to 0x30008000, i.e. clear the swapper_pg_dir[] array */
1: str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
teq r0, r2
bne 1b
/*
* Create identity mapping for first MB of kernel to
* cater for the MMU enable. This identity mapping
* will be removed by paging_init()
*/
/*
.macro krnladr, rd, pgtable, rambase
bic \rd, \pgtable, #0x000ff000
.endm
*/
krnladr r2, r4, r5 @ start of kernel
/*
krnladr r2, r4, r5
expands to:
bic r2, r4, #0x000ff000 // clear bits [19:12] of r4 = 0x30004000 (swapper_pg_dir[]) ====> r2 = 0x30000000
*/
/* Map the first 1MB at virtual address 0x30000000 to physical address 0x30000000, so that within this
megabyte virtual and physical accesses hit the same memory (an identity mapping) */
add r3, r8, r2 @ flags + kernel base /* r3=0x30000000+flags */
str r3, [r4, r2, lsr #18] @ identity mapping /* [0x30004000 + (0x30000000 >> 18)] = 0x30000000 + flags, i.e. a mapping in which the virtual and physical addresses are equal */
/*
* Now setup the pagetables for our kernel direct
* mapped region. We round TEXTADDR down to the
* nearest megabyte boundary.
*/
/* Map the 1MB starting at virtual address 0xc0000000 to the 1MB of physical RAM starting at 0x30000000 */
add r0, r4, #(TEXTADDR & 0xff000000) >> 18 @ start of kernel /* r0 = 0x30004000 + ((0xc0008000 & 0xff000000) >> 18) = 0x30004000 + (0xc0000000 >> 18) = 0x30007000 */
bic r2, r3, #0x00f00000 /* r2 = 0x30000000 + flags */
str r2, [r0] @ PAGE_OFFSET + 0MB /* [0x30007000] = 0x30000000 + flags ---> maps 0xc0000000 to 0x30000000 */
/* Map the megabyte containing TEXTADDR (for TEXTADDR = 0xc0008000 this is the same megabyte, 0xc0000000)
to 0x30000000, writing the entry and post-incrementing r0 to the next page table slot */
add r0, r0, #(TEXTADDR & 0x00f00000) >> 18 /* (0xc0008000 & 0x00f00000) >> 18 = 0, so r0 stays 0x30007000 */
str r3, [r0], #4 @ KERNEL + 0MB /* [0x30007000] = 0x30000000 + flags, then r0 = 0x30007004 */
/* map 0xc0000000 + 1MB to 0x30000000 + 1MB */
add r3, r3, #1 << 20 /* r3 = 0x30000000 + 1MB + flags */
str r3, [r0], #4 @ KERNEL + 1MB /* [0x30007004] = 0x30000000 + 1MB + flags, then r0 = 0x30007008 */
/* map 0xc0000000 + 2MB to 0x30000000 + 2MB */
add r3, r3, #1 << 20 /* r3 = 0x30000000 + 2MB + flags */
str r3, [r0], #4 @ KERNEL + 2MB /* [0x30007008] = 0x30000000 + 2MB + flags, then r0 = 0x3000700c */
/* map 0xc0000000 + 3MB to 0x30000000 + 3MB */
add r3, r3, #1 << 20 /* r3 = 0x30000000 + 3MB + flags */
str r3, [r0], #4 @ KERNEL + 3MB /* [0x3000700c] = 0x30000000 + 3MB + flags, then r0 = 0x30007010 */
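/* The stores above fill first-level "section" descriptors: one 32-bit entry per
1MB of virtual space, so the entry index is (virtual address >> 20); the
assembly shifts by 18 because it computes a byte offset into a table of 4-byte
entries. The C sketch below mirrors the same mappings with the example
addresses used in the annotations; the flags value is a placeholder for the
mmuflags in r8, and the code is illustrative, not kernel source. */
#include <stdint.h>

#define SECTION_SHIFT 20                     /* one first-level entry covers 1MB */

static uint32_t pgd[4096];                   /* stands in for the 16K swapper_pg_dir */

static void map_section(uint32_t virt, uint32_t phys, uint32_t flags)
{
	pgd[virt >> SECTION_SHIFT] = (phys & 0xfff00000u) | flags;
}

void map_kernel_example(void)
{
	uint32_t flags = 0xc1e;                  /* placeholder section flags (r8)   */
	uint32_t m;

	map_section(0x30000000u, 0x30000000u, flags);        /* identity map for MMU enable */
	for (m = 0; m < 4; m++)                               /* KERNEL + 0MB .. + 3MB       */
		map_section(0xc0000000u + (m << 20), 0x30000000u + (m << 20), flags);
}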
/*
* Ensure that the first section of RAM is present.
* we assume that:
* 1. the RAM is aligned to a 32MB boundary
* 2. the kernel is executing in the same 32MB chunk
* as the start of RAM.
*/
/* Make sure PAGE_OFFSET (0xc0000000) is mapped to the start of RAM (0x30000000) */
bic r0, r0, #0x01f00000 >> 18 @ round down /* r0=0x30007000=0x30004000+0xc0000000>> 18 */
and r2, r5, #0xfe000000 @ round down /* r2=0x30000000&0xfe000000=0x30000000 */
add r3, r8, r2 @ flags + rambase /* r3=0x30000000+flags */
str r3, [r0] /* 0x30004000+0xc0000000>> 18=0x30000000+flags */
bic r8, r8, #0x0c @ turn off cacheable
@ and bufferable bits
#ifdef CONFIG_DEBUG_LL /* low-level debug support */
/*
* Map in IO space for serial debugging.
* This allows debug messages to be output
* via a serial console before paging_init.
*/
add r0, r4, r7
rsb r3, r7, #0x4000 @ PTRS_PER_PGD*sizeof(long)
cmp r3, #0x0800
addge r2, r0, #0x0800
addlt r2, r0, r3
orr r3, r6, r8
1: str r3, [r0], #4
add r3, r3, #1 << 20
teq r0, r2
bne 1b
#if defined(CONFIG_ARCH_NETWINDER) || defined(CONFIG_ARCH_CATS)
/*
* If we're using the NetWinder, we need to map in
* the 16550-type serial port for the debug messages
*/
teq r1, #MACH_TYPE_NETWINDER
teqne r1, #MACH_TYPE_CATS
bne 1f
add r0, r4, #0x3fc0
mov r3, #0x7c000000
orr r3, r3, r8
str r3, [r0], #4
add r3, r3, #1 << 20
str r3, [r0], #4
1:
#endif
#endif
#ifdef CONFIG_ARCH_RPC
/*
* Map in screen at 0x02000000 & SCREEN2_BASE
* Similar reasons here - for debug. This is
* only for Acorn RiscPC architectures.
*/
add r0, r4, #0x80 @ 02000000
mov r3, #0x02000000
orr r3, r3, r8
str r3, [r0]
add r0, r4, #0x3600 @ d8000000
str r3, [r0]
#endif
mov pc, lr
/*
* Exception handling. Something went wrong and we can't
* proceed. We ought to tell the user, but since we
* don't have any guarantee that we're even running on
* the right architecture, we do virtually nothing.
* r0 = ascii error character:
* a = invalid architecture
* p = invalid processor
* i = invalid calling convention
*
* Generally, only serious errors cause this.
*/
__error:
#ifdef CONFIG_DEBUG_LL
mov r8, r0 @ preserve r0
adr r0, err_str
bl printascii
mov r0, r8
bl printch
#endif
#ifdef CONFIG_ARCH_RPC
/*
* Turn the screen red on a error - RiscPC only.
*/
mov r0, #0x02000000
mov r3, #0x11
orr r3, r3, r3, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
#endif
1: mov r0, r0
b 1b
#ifdef CONFIG_DEBUG_LL
err_str: .asciz "\nError: "
.align
#endif
/*
* Read processor ID register (CP#15, CR0), and look up in the linker-built
* supported processor list. Note that we can't use the absolute addresses
* for the __proc_info lists since we aren't running with the MMU on
* (and therefore, we are not in the correct address space). We have to
* calculate the offset.
*
* Returns:
* r5, r6, r7 corrupted
* r8 = page table flags
* r9 = processor ID
* r10 = pointer to processor structure
*/
__lookup_processor_type:
adr r5, 2f
ldmia r5, {r7, r9, r10}
sub r5, r5, r10 @ convert addresses
add r7, r7, r5 @ to our address space
add r10, r9, r5
mrc p15, 0, r9, c0, c0 @ get processor id
1: ldmia r10, {r5, r6, r8} @ value, mask, mmuflags
and r6, r6, r9 @ mask wanted bits
teq r5, r6
moveq pc, lr
add r10, r10, #36 @ sizeof(proc_info_list)
cmp r10, r7
blt 1b
mov r10, #0 @ unknown processor
mov pc, lr
/*
* Look in include/asm-arm/procinfo.h and arch/arm/kernel/arch.[ch] for
* more information about the __proc_info and __arch_info structures.
*/
2: .long __proc_info_end
.long __proc_info_begin
.long 2b
.long __arch_info_begin
.long __arch_info_end
/*
* Lookup machine architecture in the linker-built list of architectures.
* Note that we can't use the absolute addresses for the __arch_info
* lists since we aren't running with the MMU on (and therefore, we are
* not in the correct address space). We have to calculate the offset.
*
* r1 = machine architecture number
* Returns:
* r2, r3, r4 corrupted
* r5 = physical start address of RAM
* r6 = physical address of IO
* r7 = byte offset into page tables for IO
*/
__lookup_architecture_type:
adr r4, 2b
ldmia r4, {r2, r3, r5, r6, r7} @ throw away r2, r3
sub r5, r4, r5 @ convert addresses
add r4, r6, r5 @ to our address space
add r7, r7, r5
1: ldr r5, [r4] @ get machine type
teq r5, r1
beq 2f
add r4, r4, #SIZEOF_MACHINE_DESC
cmp r4, r7
blt 1b
mov r7, #0 @ unknown architecture
mov pc, lr
2: ldmib r4, {r5, r6, r7} @ found, get results
mov pc, lr
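/* Both lookup routines above are plain linear scans over linker-built tables.
The C sketch below shows the same logic for the processor lookup; the struct is
a simplified, illustrative stand-in for proc_info_list (the machine lookup over
the __arch_info table works the same way, matching r1 against each entry's
machine number). */
#include <stddef.h>

struct proc_info_entry {                 /* illustrative subset of proc_info_list */
	unsigned int cpu_val;            /* expected processor ID value            */
	unsigned int cpu_mask;           /* which ID bits matter                   */
	unsigned int mmu_flags;          /* section flags handed back in r8        */
};

const struct proc_info_entry *
lookup_processor(const struct proc_info_entry *begin,
                 const struct proc_info_entry *end,
                 unsigned int cpu_id)
{
	const struct proc_info_entry *p;

	for (p = begin; p < end; p++)
		if ((cpu_id & p->cpu_mask) == p->cpu_val)
			return p;        /* r10 = pointer to the matching entry */
	return NULL;                     /* r10 = 0: unknown processor          */
}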
/*****************************************************************************************************************************************/
/* mm-armv.c */
/*
* linux/arch/arm/mm/mm-armv.c
*
* Copyright (C) 1998-2000 Russell King
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Page table sludge for ARM v3 and v4 processor architectures.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
* These are useful for identifying cache coherency
* problems by allowing the cache or the cache and
* writebuffer to be turned off. (Note: the write
* buffer should not be on and the cache off).
*/
static int __init nocache_setup(char *__unused)
{
cr_alignment &= ~4;
cr_no_alignment &= ~4;
flush_cache_all();
set_cr(cr_alignment);
return 1;
}
static int __init nowrite_setup(char *__unused)
{
cr_alignment &= ~(8|4);
cr_no_alignment &= ~(8|4);
flush_cache_all();
set_cr(cr_alignment);
return 1;
}
static int __init noalign_setup(char *__unused)
{
cr_alignment &= ~2;
cr_no_alignment &= ~2;
set_cr(cr_alignment);
return 1;
}
__setup("noalign", noalign_setup);
__setup("nocache", nocache_setup);
__setup("nowb", nowrite_setup);
#define FIRST_KERNEL_PGD_NR (FIRST_USER_PGD_NR + USER_PTRS_PER_PGD)
#define clean_cache_area(start,size) \
cpu_cache_clean_invalidate_range((unsigned long)start, ((unsigned long)start) + size, 0);
/*
* need to get a 16k page for level 1
*/
/* Allocate a page global directory (PGD). If the quicklist cache has no spare
page directories, allocation falls back to the physical page allocator. */
pgd_t *get_pgd_slow(struct mm_struct *mm)
{
pgd_t *new_pgd, *init_pgd;
pmd_t *new_pmd, *init_pmd;
pte_t *new_pte, *init_pte;
new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 2);
if (!new_pgd)
goto no_pgd;
memzero(new_pgd, FIRST_KERNEL_PGD_NR * sizeof(pgd_t));
init_pgd = pgd_offset_k(0);
if (vectors_base() == 0) {
init_pmd = pmd_offset(init_pgd, 0);
init_pte = pte_offset(init_pmd, 0);
/*
* This lock is here just to satisfy pmd_alloc and pte_lock
*/
spin_lock(&mm->page_table_lock);
/*
* On ARM, first page must always be allocated since it
* contains the machine vectors.
*/
new_pmd = pmd_alloc(mm, new_pgd, 0);
if (!new_pmd)
goto no_pmd;
new_pte = pte_alloc(mm, new_pmd, 0);
if (!new_pte)
goto no_pte;
set_pte(new_pte, *init_pte);
spin_unlock(&mm->page_table_lock);
}
/*
* Copy over the kernel and IO PGD entries
*/
memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
(PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
/*
* FIXME: this should not be necessary
*/
clean_cache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
return new_pgd;
no_pte:
spin_unlock(&mm->page_table_lock);
pmd_free(new_pmd);
free_pages((unsigned long)new_pgd, 2);
return NULL;
no_pmd:
spin_unlock(&mm->page_table_lock);
free_pages((unsigned long)new_pgd, 2);
return NULL;
no_pgd:
return NULL;
}
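/* For orientation, the split between the user entries that get_pgd_slow()
zeroes and the kernel/IO entries it copies can be computed directly. The sketch
below uses the first-level layout implied by this code (4096 four-byte entries,
one per 1MB) and assumes FIRST_USER_PGD_NR = 0 and PAGE_OFFSET = 0xc0000000. */
#include <stdio.h>

#define PGDIR_SHIFT_EX	20		/* one first-level entry per 1MB   */
#define PTRS_PER_PGD_EX	4096		/* 16K table of 4-byte entries     */
#define PAGE_OFFSET_EX	0xc0000000UL	/* assumed 3G/1G user/kernel split */

int main(void)
{
	unsigned long first_kernel_pgd = PAGE_OFFSET_EX >> PGDIR_SHIFT_EX;  /* FIRST_KERNEL_PGD_NR */

	printf("user entries zeroed  : %lu\n", first_kernel_pgd);                    /* 3072 */
	printf("kernel entries copied: %lu\n", PTRS_PER_PGD_EX - first_kernel_pgd);  /* 1024 */
	return 0;
}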
/* Free a page global directory (PGD) */
void free_pgd_slow(pgd_t *pgd)
{
pmd_t *pmd;
pte_t *pte;
if (!pgd)
return;
/* pgd is always present and good */
pmd = (pmd_t *)pgd;
if (pmd_none(*pmd))
goto free;
if (pmd_bad(*pmd)) {
pmd_ERROR(*pmd);
pmd_clear(pmd);
goto free;
}
pte = pte_offset(pmd, 0);
pmd_clear(pmd);
pte_free(pte);
pmd_free(pmd);
free:
free_pages((unsigned long) pgd, 2);
}
/*
* Create a SECTION PGD between VIRT and PHYS in domain
* DOMAIN with protection PROT
*/
static inline void
alloc_init_section(unsigned long virt, unsigned long phys, int prot)
{
pmd_t pmd;
pmd_val(pmd) = phys | prot;
set_pmd(pmd_offset(pgd_offset_k(virt), virt), pmd);
}
/*
* Add a PAGE mapping between VIRT and PHYS in domain
* DOMAIN with protection PROT. Note that due to the
* way we map the PTEs, we must allocate two PTE_SIZE'd
* blocks - one for the Linux pte table, and one for
* the hardware pte table.
*/
static inline void
alloc_init_page(unsigned long virt, unsigned long phys, int domain, int prot)
{
pmd_t *pmdp;
pte_t *ptep;
pmdp = pmd_offset(pgd_offset_k(virt), virt);
if (pmd_none(*pmdp)) {
pte_t *ptep = alloc_bootmem_low_pages(2 * PTRS_PER_PTE *
sizeof(pte_t));
ptep += PTRS_PER_PTE;
set_pmd(pmdp, __mk_pmd(ptep, PMD_TYPE_TABLE | PMD_DOMAIN(domain)));
}
ptep = pte_offset(pmdp, virt);
set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, __pgprot(prot)));
}
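/* The "two PTE_SIZE'd blocks" mentioned above are laid out back to back in one
2KB allocation: the pmd (a level-1 "table" descriptor) is pointed at the second
kilobyte, so that half is the table the MMU actually walks, while the other
kilobyte keeps the Linux view of each pte with the extra software bits (young,
dirty, ...). A rough user-space model of that layout, purely illustrative: */
#include <stdint.h>
#include <stdlib.h>

#define PTRS_PER_PTE_EX	256	/* 256 entries x 4 bytes = 1KB per table view */

struct pte_block_example {
	uint32_t linux_ptes[PTRS_PER_PTE_EX];	/* software view of the entries */
	uint32_t hw_ptes[PTRS_PER_PTE_EX];	/* what the pmd is pointed at   */
};

uint32_t *alloc_pte_block_example(uint32_t **pmd_target)
{
	struct pte_block_example *blk = calloc(1, sizeof(*blk));

	if (!blk)
		return NULL;
	*pmd_target = blk->hw_ptes;	/* the "ptep += PTRS_PER_PTE" step above */
	return blk->linux_ptes;
}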
/*
* Clear any PGD mapping. On a two-level page table system,
* the clearance is done by the middle-level functions (pmd)
* rather than the top-level (pgd) functions.
*/
static inline void clear_mapping(unsigned long virt)
{
pmd_clear(pmd_offset(pgd_offset_k(virt), virt));
}
/*
* Create the page directory entries and any necessary
* page tables for the mapping specified by `md'. We
* are able to cope here with varying sizes and address
* offsets, and we take full advantage of sections.
*/
static void __init create_mapping(struct map_desc *md)
{
unsigned long virt, length;
int prot_sect, prot_pte;
long off;
if (md->prot_read && md->prot_write &&
!md->cacheable && !md->bufferable) {
printk(KERN_WARNING "Security risk: creating user "
"accessible mapping for 0x%08lx at 0x%08lx\n",
md->physical, md->virtual);
}
if (md->virtual != vectors_base() && md->virtual < PAGE_OFFSET) {
printk(KERN_WARNING "MM: not creating mapping for "
"0x%08lx at 0x%08lx in user region\n",
md->physical, md->virtual);
}
prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
(md->prot_read ? L_PTE_USER : 0) |
(md->prot_write ? L_PTE_WRITE : 0) |
(md->cacheable ? L_PTE_CACHEABLE : 0) |
(md->bufferable ? L_PTE_BUFFERABLE : 0);
prot_sect = PMD_TYPE_SECT | PMD_DOMAIN(md->domain) |
(md->prot_read ? PMD_SECT_AP_READ : 0) |
(md->prot_write ? PMD_SECT_AP_WRITE : 0) |
(md->cacheable ? PMD_SECT_CACHEABLE : 0) |
(md->bufferable ? PMD_SECT_BUFFERABLE : 0);
virt = md->virtual;
off = md->physical - virt;
length = md->length;
while ((virt & 0xfffff || (virt + off) & 0xfffff) && length >= PAGE_SIZE) {
alloc_init_page(virt, virt + off, md->domain, prot_pte);
virt += PAGE_SIZE;
length -= PAGE_SIZE;
}
while (length >= PGDIR_SIZE) {
alloc_init_section(virt, virt + off, prot_sect);
virt += PGDIR_SIZE;
length -= PGDIR_SIZE;
}
while (length >= PAGE_SIZE) {
alloc_init_page(virt, virt + off, md->domain, prot_pte);
virt += PAGE_SIZE;
length -= PAGE_SIZE;
}
}
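/* A worked example of the splitting logic (hypothetical values, not from the
kernel sources): a 64MB RAM bank at physical 0x30000000 mapped at 0xc0000000 is
megabyte-aligned at both ends, so the first and last while loops never run and
the whole range is covered by 64 one-megabyte sections with no second-level
tables. The helper name below is made up; it only shows how such a descriptor
would be filled in, assuming PAGE_OFFSET = 0xc0000000. */
static void __init map_ram_bank_example(struct map_desc *p)
{
	p->physical   = 0x30000000;
	p->virtual    = 0xc0000000;	/* __phys_to_virt(0x30000000)             */
	p->length     = 64 << 20;	/* 64MB, section-aligned at both ends     */
	p->domain     = DOMAIN_KERNEL;
	p->prot_read  = 0;
	p->prot_write = 1;
	p->cacheable  = 1;
	p->bufferable = 1;

	create_mapping(p);		/* 0 head pages + 64 sections + 0 tail pages */
}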
/*
* In order to soft-boot, we need to insert a 1:1 mapping in place of
* the user-mode pages. This will then ensure that we have predictable
* results when turning the mmu off
*/
void setup_mm_for_reboot(char mode)
{
pgd_t *pgd;
pmd_t pmd;
int i;
if (current->mm && current->mm->pgd)
pgd = current->mm->pgd;
else
pgd = init_mm.pgd;
for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++) {
pmd_val(pmd) = (i << PGDIR_SHIFT) |
PMD_SECT_AP_WRITE | PMD_SECT_AP_READ |
PMD_TYPE_SECT;
set_pmd(pmd_offset(pgd + i, i << PGDIR_SHIFT), pmd);
}
}
/*
* Setup initial mappings. We use the page we allocated for zero page to hold
* the mappings, which will get overwritten by the vectors in traps_init().
* The mappings must be in virtual address order.
*/
/* Page table initialisation: this function sets up all the page tables needed to map the physical memory belonging to ZONE_DMA and ZONE_NORMAL */
void __init memtable_init(struct meminfo *mi)
{
struct map_desc *init_maps, *p, *q;
unsigned long address = 0;
int i;
init_maps = p = alloc_bootmem_low_pages(PAGE_SIZE);
for (i = 0; i < mi->nr_banks; i++) {
if (mi->bank[i].size == 0)
continue;
p->physical = mi->bank[i].start;
p->virtual = __phys_to_virt(p->physical);
p->length = mi->bank[i].size;
p->domain = DOMAIN_KERNEL;
p->prot_read = 0;
p->prot_write = 1;
p->cacheable = 1;
p->bufferable = 1;
p ++;
}
#ifdef FLUSH_BASE
p->physical = FLUSH_BASE_PHYS;
p->virtual = FLUSH_BASE;
p->length = PGDIR_SIZE;
p->domain = DOMAIN_KERNEL;
p->prot_read = 1;
p->prot_write = 0;
p->cacheable = 1;
p->bufferable = 1;
p ++;
#endif
#ifdef FLUSH_BASE_MINICACHE
p->physical = FLUSH_BASE_PHYS + PGDIR_SIZE;
p->virtual = FLUSH_BASE_MINICACHE;
p->length = PGDIR_SIZE;
p->domain = DOMAIN_KERNEL;
p->prot_read = 1;
p->prot_write = 0;
p->cacheable = 1;
p->bufferable = 0;
p ++;
#endif
/*
* Go through the initial mappings, but clear out any
* pgdir entries that are not in the description.
*/
q = init_maps;
do {
if (address < q->virtual || q == p) {
clear_mapping(address);
address += PGDIR_SIZE;
} else {
create_mapping(q);
address = q->virtual + q->length;
address = (address + PGDIR_SIZE - 1) & PGDIR_MASK;
q ++;
}
} while (address != 0);
/*
* Create a mapping for the machine vectors at virtual address 0
* or 0xffff0000. We should always try the high mapping.
*/
init_maps->physical = virt_to_phys(init_maps);
init_maps->virtual = vectors_base();
init_maps->length = PAGE_SIZE;
init_maps->domain = DOMAIN_USER;
init_maps->prot_read = 0;
init_maps->prot_write = 0;
init_maps->cacheable = 1;
init_maps->bufferable = 0;
create_mapping(init_maps);
}
/*
* Create the architecture specific mappings
*/
void __init iotable_init(struct map_desc *io_desc)
{
int i;
for (i = 0; io_desc[i].last == 0; i++)
create_mapping(io_desc + i);
}
static inline void free_memmap(int node, unsigned long start, unsigned long end)
{
unsigned long pg, pgend;
start = __phys_to_virt(start);
end = __phys_to_virt(end);
pg = PAGE_ALIGN((unsigned long)(virt_to_page(start)));
pgend = ((unsigned long)(virt_to_page(end))) & PAGE_MASK;
start = __virt_to_phys(pg);
end = __virt_to_phys(pgend);
free_bootmem_node(NODE_DATA(node), start, end - start);
}
static inline void free_unused_memmap_node(int node, struct meminfo *mi)
{
unsigned long bank_start, prev_bank_end = 0;
unsigned int i;
/*
* [FIXME] This relies on each bank being in address order. This
* may not be the case, especially if the user has provided the
* information on the command line.
*/
for (i = 0; i < mi->nr_banks; i++) {
if (mi->bank[i].size == 0 || mi->bank[i].node != node)
continue;
bank_start = mi->bank[i].start & PAGE_MASK;
/*
* If we had a previous bank, and there is a space
* between the current bank and the previous, free it.
*/
if (prev_bank_end && prev_bank_end != bank_start)
free_memmap(node, prev_bank_end, bank_start);
prev_bank_end = PAGE_ALIGN(mi->bank[i].start +
mi->bank[i].size);
}
}
/*
* The mem_map array can get very big. Free
* the unused area of the memory map.
*/
void __init create_memmap_holes(struct meminfo *mi)
{
int node;
for (node = 0; node < numnodes; node++)
free_unused_memmap_node(node, mi);
}
/*
* PTE table allocation cache.
*
* This is a move away from our custom 2K page allocator. We now use the
* slab cache to keep track of these objects.
*
* With this, it is questionable as to whether the PGT cache gains us
* anything. We may be better off dropping the PTE stuff from our PGT
* cache implementation.
*/
kmem_cache_t *pte_cache;
/*
* The constructor gets called for each object within the cache when the
* cache page is created. Note that if slab tries to misalign the blocks,
* we BUG() loudly.
*/
static void pte_cache_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
{
unsigned long block = (unsigned long)pte;
if (block & 2047)
BUG();
memzero(pte, 2 * PTRS_PER_PTE * sizeof(pte_t));
cpu_cache_clean_invalidate_range(block, block +
PTRS_PER_PTE * sizeof(pte_t), 0);
}
void __init pgtable_cache_init(void)
{
pte_cache = kmem_cache_create("pte-cache",
2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0,
pte_cache_ctor, NULL);
if (!pte_cache)
BUG();
}
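/* For context, a hedged sketch of how a later pte allocation might draw on this
cache. The real helper lives in include/asm-arm/pgalloc.h; the function name
below is made up for illustration, and the constructor above has already zeroed
and cleaned each 2KB object before it is handed out. */
pte_t *pte_alloc_from_cache_example(void)
{
	pte_t *pte = kmem_cache_alloc(pte_cache, GFP_KERNEL);

	/* As in alloc_init_page(), the caller points the pmd at the second half
	 * of the object (pte + PTRS_PER_PTE) and uses the first half for the
	 * Linux view of the entries. */
	return pte;
}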
/*****************************************************************************************************************************************/
/* init.c */
/* Initialise the page tables: this function finishes off the page table setup. */
void __init paging_init(struct meminfo *mi, struct machine_desc *mdesc)
{
void *zero_page;
int node;
memcpy(&meminfo, mi, sizeof(meminfo));
/*
* allocate the zero page. Note that we count on this going ok.
*/
zero_page = alloc_bootmem_low_pages(PAGE_SIZE);
/*
* initialise the page tables.
*/
memtable_init(mi); /* page table initialisation: sets up the page tables for all physical memory in ZONE_DMA and ZONE_NORMAL */
if (mdesc->map_io)
mdesc->map_io();
flush_cache_all();
flush_tlb_all();
/*
* initialise the zones within each node
*/
for (node = 0; node < numnodes; node++) {
unsigned long zone_size[MAX_NR_ZONES];
unsigned long zhole_size[MAX_NR_ZONES];
struct bootmem_data *bdata;
pg_data_t *pgdat;
int i;
/*
* Initialise the zone size information.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
zone_size[i] = 0;
zhole_size[i] = 0;
}
pgdat = NODE_DATA(node);
bdata = pgdat->bdata;
/*
* The size of this node has already been determined.
* If we need to do anything fancy with the allocation
* of this memory to the zones, now is the time to do
* it.
*/
zone_size[0] = bdata->node_low_pfn -
(bdata->node_boot_start >> PAGE_SHIFT);
/*
* If this zone has zero size, skip it.
*/
if (!zone_size[0])
continue;
/*
* For each bank in this node, calculate the size of the
* holes. holes = node_size - sum(bank_sizes_in_node)
*/
zhole_size[0] = zone_size[0];
for (i = 0; i < mi->nr_banks; i++) {
if (mi->bank[i].node != node)
continue;
zhole_size[0] -= mi->bank[i].size >> PAGE_SHIFT;
}
/*
* Adjust the sizes according to any special
* requirements for this machine type.
*/
arch_adjust_zones(node, zone_size, zhole_size);
free_area_init_node(node, pgdat, 0, zone_size,
bdata->node_boot_start, zhole_size);
}
/*
* finish off the bad pages once
* the mem_map is initialised
*/
memzero(zero_page, PAGE_SIZE);
empty_zero_page = virt_to_page(zero_page);
flush_dcache_page(empty_zero_page);
}
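/* A worked example of the zone/hole arithmetic above, with made-up numbers:
one node spanning 64MB of address space but with only two 16MB banks actually
populated. This is a stand-alone sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long node_boot_start_pfn = 0x30000000UL >> 12;	/* PAGE_SHIFT = 12 */
	unsigned long node_low_pfn        = 0x34000000UL >> 12;	/* end of the node */
	unsigned long bank_pages[2]       = { (16UL << 20) >> 12, (16UL << 20) >> 12 };

	unsigned long zone_size  = node_low_pfn - node_boot_start_pfn;	/* 16384 pages = 64MB */
	unsigned long zhole_size = zone_size - bank_pages[0] - bank_pages[1];

	printf("zone  = %lu pages\n", zone_size);	/* 16384                      */
	printf("holes = %lu pages\n", zhole_size);	/* 8192: the unpopulated 32MB */
	return 0;
}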