BSD amd64 内存管理分析 (freebsd9.0) by chishanmingshen
http://chishanmingshen.blog.chinaunix.net
第一部分 基本流程
elf64_exec(struct preloaded_file *fp)
第一次设置页表:2M*512=1G空间的映射.
__exec((void *)VTOP(amd64_tramp), modulep, kernend);
amd64_tramp:
设置cr3.打开分页机制.此时是32bit模式
跳到64bit模式.(之前的entry_hi/entry_lo即btext地址)
ljmp $0x8, $VTOP(longmode)
locore.S
call hammer_time(其中会调用getmemsize(kmdp, physfree)->pmap_bootstrap()->create_pagetables().)
call mi_startup(module init)
0.pmap_bootstrap(vm_paddr_t *firstaddr)
create_pagetables(firstaddr)
virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)/*(511,510,0,0),即最后2G空间*/
virtual_end = VM_MAX_KERNEL_ADDRESS;
#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-1, NKPDE-1, NPTEPG-1)/*511,510,511*/最后留了2M
/*kernel_pmap记录PML4表基址的虚拟地址,从物理地址KPML4.*/
kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
1. SYS_INIT's vm_mem_init()
vm_set_page_size
virtual_avail = vm_page_startup(virtual_avail);/*初始化各个物理页面,然后加入到freelist中*/
遍历phys_avail[],得到段数:nblocks,总的空间大小:total.
vm_pageq_init()/*page queue*/
扣去umaslb得到new_end,将umaslb调pmap_map和uma_startup.(支持Dmap,所以不递增vaddr.)
#define PHYS_TO_DMAP(x) ((x) | DMAP_MIN_ADDRESS)
#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)/*510*/
计算可用物理页面总数为page_range个.
npages = (total - (page_range * sizeof(struct vm_page)) - (end - new_end)) / PAGE_SIZE;
vm_page_array指向pmap_map()映射后的vm_page[]空间,共npages个页面
phys_avail[biggestone + 1] = new_end;最后一段内存更正为到new_end结束,扣除了vm_page[].
vm_page_array_size = page_range;
vm_phys_init();/*初始化物理内存分配器*/
对所有段调用vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],VM_FREELIST_DEFAULT);
更新到vm_phys_segs[]中.
vm_phys_free_queues[vm_nfreelists][VM_NFREEPOOL]
/*static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;*/
遍历phys_avail[],对所有物理页调用vm_phys_add_page(pa).
vm_phys_add_page(pa/*vm_paddr 物理地址*/):初始化一个物理页面,同时将它加到free list中.
m = vm_phys_paddr_to_vm_page(vm_paddr_t pa):/*找到给定物理地址对应的vm_page*/
遍历vm_phys_segs[],找到对应的vm_page结构指针,并返回该指针.
return &(seg->first_page[atop(pa - seg->start)]);
pmap_page_init(m);
vm_phys_free_pages(m, 0);/*加到freelist中*/
return (vaddr);
/*最后将可以用的虚拟地址返回,其中vm_page[]的空间已经加进去了.
返回的virtual_avail,由外面使用,即普通物理页面空间*/
3.vm_object_init();
3.1
kernel_object_store (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
3.2
kmem_object_store (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
3.3
1个vm_object的zone: obj_zone
4.vm_map_startup();
创建1个vm_map的zone:mapzone callby vm_map_create()
创建2个vm_map_entry的zone:kmapentzone和mapentzone. 根据vm_map生成vm_map_entry时,具体用哪个zone由vm_map的system_map决定.
5.kmem_init(virtual_avail, virtual_end);
vm_map_t m;
根据给定的物理地址范围,如kernel_pmap, 在虚拟空间min和max内, 生成一个vm_map m.
/*vm kernel_pmap:-2G->-4M*/
m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS/*-2G*/, end/*VM_MAX_KERNEL_ADDRESS*/);
vm_map_t result = uma_zalloc(mapzone, M_WAITOK);/*从mapzone中分一个vm_map*/
_vm_map_init(result, min, max);给vm_map赋值各个字段(例如,result->pmap = pmap)
kernel_map = m;/*内核总空间*/
(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-1, NKPDE-1, NPTEPG-1) -2M
#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4I, KPDPI, 0, 0) -2G
6.pmap_init();
需要虚拟内存的模块可以调用了
建立一个管理255个页面(放置1M个页表项)的对象
kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
对每个vm_page初始化pv_list//m = &vm_page_array[i];
每个vm_page都有一个struct pv_entry.这些pv_entry由pvinit带头.
pvinit = (struct pv_entry *) kmem_alloc(kernel_map,initial_pvs * sizeof (struct pv_entry));
7.vm_pager_init()
初始化已知的页
pagertab[]
第二部分 初始化
SYSINIT's KMEM module first kmeminit()
vm_kmem_size = 2*物理内存
kmem_map = kmem_suballoc(kernel_map, &kmembase, &kmemlimit,vm_kmem_size);申请kmem_map空间
kmem_suballoc(parent/*kernel_map*/, min/*output*/, max/*output*/, size/*要分得的sub映射空间*/):
*min = (vm_offset_t) vm_map_min(parent);
*max = *min + size;
result = vm_map_create(vm_map_pmap(parent), *min, *max);
return result;/*将新分配的vm_map返回.即是kmem_map空间.*/
kmem_map->system_map = 1;
mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
静态数组kmemzones[]处理:根据kmemzones[indx].kz_zone申请uma_zone结构体(2^4,...,2^12).
uma_zcreate->uma_zalloc_internal返回zone
uma_zone_slab->slab_alloc->uma_small_alloc->kmem_malloc
SYSINIT's KMEM module second malloc_init(void *data)
struct malloc_type_internal *mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
第三部分 申请内存
void *
malloc(unsigned long size, struct malloc_type *mtp, int flags)
1.小内存
indx = kmemsize[size >> KMEM_ZSHIFT];/*根据申请内存大小得到index*/
zone = kmemzones[indx].kz_zone; /*由index得到对应的zone*/
va = uma_zalloc(zone, flags);
2.大内存
uma_large_malloc(size, flags)
slab = uma_zalloc_internal(slabzone, NULL, wait);
mem = page_alloc(NULL, size, &flags, wait);
slab->us_data = mem;
static uma_slab_t
slab_zalloc(uma_zone_t zone, int wait)
mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
&flags, wait);
m = vm_page_alloc()得到vm_page.
vm_phys_alloc_pages(pool, order);
fl = vm_phys_free_queues[flind][pool];
m = TAILQ_FIRST(&fl[oind].pl);
TAILQ_REMOVE(&fl[oind].pl, m, pageq);
vm_phys_split_pages(m, oind, fl, order);
buddy算法,找比order稍大的vm_page
1.如果对象非空, vm_page_insert(m, object, pindex);将vm_page加入到vm_object
2.如果空对象, 则m->pindex = pindex;
return m;
pa = m->phys_addr;
dump_add_page(pa);
va = (void *)PHYS_TO_DMAP(pa);/*内核申请的故放到DMAP*/
/*#ifdef UMA_MD_SMALL_ALLOC
keg->uk_allocf = uma_small_alloc;/************?
#else
keg->uk_allocf = page_alloc;
#endif*/
void *
uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
ret = ((void *)kmem_malloc(kmem_map, bytes, M_NOWAIT));
从指定的vm_map,即kmem_map中申请内存,大小为bytes.
return ret;
第四部分 页表相关
/*
 * Build the bootstrap page tables.
 *
 * firstaddr: cursor handed to allocpages() for every table allocation
 * (presumably the first free physical address, advanced by allocpages()
 * -- confirm against its definition).
 *
 * The tables are filled bottom-up: PT -> PD -> PDP for the kernel
 * mapping, then PD -> PDP for the direct map, and finally three PML4
 * slots (recursive, direct map, kernel VA).  The *phys values are cast
 * straight to pointers, which assumes they are directly addressable at
 * this point -- confirm against the boot path.
 */
static void
create_pagetables(vm_paddr_t *firstaddr)
{
	int i;

	/* Allocate pages */
	KPTphys = allocpages(firstaddr, NKPT);
	KPML4phys = allocpages(firstaddr, 1);
	KPDPphys = allocpages(firstaddr, NKPML4E);
	KPDphys = allocpages(firstaddr, NKPDPE);

	/* Direct-map PDP entry count: ptoa(Maxmem) rounded up in NBPDP units. */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	DMPDphys = allocpages(firstaddr, ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Fill in the underlying page table pages */
	/* Read-only from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	/*
	 * Level 1 (PT): one entry per page below *firstaddr.
	 * NOTE(review): the entries are written with PG_RW even though the
	 * comment above says "Read-only".
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G | PG_U;
	}

	/* Now map the page tables at their location within PTmap */
	/* Level 2 (PD): point each of the NKPT entries at its PT page. */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_U;
	}

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	/* Level 2 (PD): PG_PS entries map 2M directly, bypassing level 1. */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G | PG_U;
	}

	/* And connect up the PD to the PDP */
	/* Level 3 (PDP): entries start at index KPDPI. */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
	}

	/* Now set up the direct map space using 2MB pages */
	/* Level 2 (direct-map PD): PG_PS entries, bypassing level 1. */
	for (i = 0; i < NPDEPG * ndmpdp; i++) {
		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G | PG_U;
	}

	/* And the direct map space's PDP */
	/* Level 3 (direct-map PDP): one entry per direct-map PD page. */
	for (i = 0; i < ndmpdp; i++) {
		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
	}

	/* And recursively map PML4 to itself in order to get PTmap */
	/* Level 4 (PML4): recursive slot (index 256 per the author's note). */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the Direct Map slot up to the PML4 */
	/* Index 510 per the author's note. */
	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;

	/* Connect the KVA slot up to the PML4 */
	/* Index 511 per the author's note. */
	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}
uma调用和page_alloc:都实际从kmem_map子空间中分配空间.而不是kernel_map.
/*
 * Slab backend allocator that takes its memory from kmem_map.
 *
 * zone:  not used by this allocator.
 * bytes: size of the request, passed straight to kmem_malloc().
 * pflag: out parameter; always set to UMA_SLAB_KMEM.
 * wait:  flags forwarded unchanged to kmem_malloc().
 *
 * Returns whatever kmem_malloc() returns, converted to a pointer.
 */
static void *
page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
{
	/* Record where the memory comes from before allocating it. */
	*pflag = UMA_SLAB_KMEM;

	return ((void *) kmem_malloc(kmem_map, bytes, wait));
}
vm_offset_t
kmem_malloc(map/*从哪个空间,比如kmem_map*/, size/*申请内存的大小*/, flags)
/*被uma_small_alloc()和page_alloc()调用.作用就是从kmem_map子空间中申请空间,大小为size.*/
addr = vm_map_findspace()来找出addr这个起始虚拟地址.
offset = addr - VM_MIN_KERNEL_ADDRESS;
vm_object_reference(kmem_object);
/*插入新的vm_map_entry_t,代表空间size大小*/
vm_map_insert(map, kmem_object, offset, addr, addr + size,VM_PROT_ALL, VM_PROT_ALL, 0);
/*逐页调用vm_page_alloc()来为每页生成vm_page结构体*/
vm_page_t m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags);
/**************NOW, call pmap_enter!!!***************/
pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
return addr;
#define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
_vm_object_allocate分配2个对象:kmem_object,kernel_object.空间是一样大的:
(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)
/*
 * Physical-map descriptor as quoted in these notes.  pm_pml4 is the
 * kernel virtual address of the top-level (PML4) table; for kernel_pmap
 * it is set to KERNBASE + KPML4phys in pmap_bootstrap().
 */
struct pmap {
struct mtx pm_mtx; /* mutex; presumably guards this pmap -- confirm */
pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
u_int pm_active; /* active on cpus */
/* spare u_int here due to padding */
struct pmap_statistics pm_stats; /* pmap statistics */
};
void
pmap_growkernel(vm_offset_t addr)
/*
* Address of current and alternate address space page table maps
* and directories.
* XXX it might be saner to just direct map all of physical memory
* into the kernel using 2MB pages. We have enough space to do
* it (2^47 bits of KVM, while current max physical addressability
* is 2^40 physical bits). Then we can get rid of the evil hole
* in the page tables and the evil overlapping.
*/
内核可以有空间2^48/2,即内核和userland各一半.
目前内核仅仅用了2个表项,即1024G,
/*
 * Virtual addresses of the page tables as seen through PML4 slot
 * PML4PML4I.  Each further PML4PML4I passed to KVADDR selects the same
 * slot one level lower, so the window moves from the PT entries
 * (addr_PTmap) up to the PML4 itself (addr_PML4map).
 * NOTE(review): this relies on PML4 slot PML4PML4I pointing back at the
 * PML4, as create_pagetables() sets up.
 */
#ifdef _KERNEL
#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
/* Address of entry number PML4PML4I inside the PML4 window. */
#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
/* Typed pointers to the windows above. */
#define PTmap ((pt_entry_t *)(addr_PTmap))
#define PDmap ((pd_entry_t *)(addr_PDmap))
/* NOTE(review): the next three also cast to pd_entry_t * -- confirm intended. */
#define PDPmap ((pd_entry_t *)(addr_PDPmap))
#define PML4map ((pd_entry_t *)(addr_PML4map))
#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
#endif
在内核访问va的方法是通过PTmap
/*
 * Return the address, inside the PTmap window, of the page-table entry
 * for virtual address va.
 *
 * The page number (va >> PAGE_SHIFT) is truncated to the combined width
 * of the four table-index fields and used as an index into PTmap.
 */
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t pte_index;

	pte_index = va >> PAGE_SHIFT;
	pte_index &= (1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1;

	return (PTmap + pte_index);
}
第五部分
待续(by chishanmingshen)。。。