arm-linux(kernel-2.6.13)的启动过程(1.2/2)
回到setup_arch()
经过这样的处理,setup.c文件中的meminfo可就不再是
static struct meminfo meminfo __initdata = { 0, };
而是
static struct meminfo meminfo __initdata = { 1,{0x30000000,0x4000000,0},{}, };
表示当前有一个内存区域,物理地址是从0x30000000开始,大小是64M,节点是0
paging_init(&meminfo, mdesc);
这是个庞大的函数,里面有很多好东西。显然他有根据机器描述符填充meminfo(setup.c)的倾向。一点一点看,估计看完天都黑了。
先把它拷过来。
现在到了init.c了,这个文件里也有一个 meminfo,这个跟setup.c中的是两回事。
static struct meminfo meminfo __initdata = { 0, }; (在init.c中)
/*
* paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables.
*/
void __init paging_init(struct meminfo *mi, struct machine_desc *mdesc)
{
void *zero_page;
int node;
bootmem_init(mi);
先分析下这个函数。
/*
* Initialise the bootmem allocator for all nodes. This is called
* early during the architecture specific initialisation.
*/
static void __init bootmem_init(struct meminfo *mi)
{
struct node_info node_info[MAX_NUMNODES], *np = node_info;
unsigned int bootmap_pages, bootmap_pfn, map_pg;
int node, initrd_node;
现来看看node_info这个结构是什么。
struct node_info {
unsigned int start;
unsigned int end;
int bootmap_pages;
};
#define NODES_SHIFT 2 /* Normally, Max 4 Nodes */
#define MAX_NUMNODES (1 << NODES_SHIFT)
很简单的数据结构。Max 4 Nodes
bootmap_pages = find_memend_and_nodes(mi, np);
把find_memend_and_nodes函数贴过来,我们已经下潜到第n层函数调用了。^_^
/*
* Scan the memory info structure and pull out:
* - the end of memory
* - the number of nodes
* - the pfn range of each node
* - the number of bootmem bitmap pages
*/
static unsigned int __init
find_memend_and_nodes(struct meminfo *mi, struct node_info *np)
{
unsigned int i, bootmem_pages = 0, memend_pfn = 0;
for (i = 0; i < MAX_NUMNODES; i++) {
np[i].start = -1U;
np[i].end = 0;
np[i].bootmap_pages = 0;
}
初始化这四个节点信息结构,在上一层函数中定义的。
for (i = 0; i < mi->nr_banks; i++) { 已经有一个bank了哦,nr_banks=1。
unsigned long start, end;
int node;
if (mi->bank[i].size == 0) { 我们ram的大小是64M,这里不成立。
/*
* Mark this bank with an invalid node number
*/
mi->bank[i].node = -1;
continue;
}
node = mi->bank[i].node; 这里node=0
/*
* Make sure we haven't exceeded the maximum number of nodes
* that we have in this configuration. If we have, we're in
* trouble. (maybe we ought to limit, instead of bugging?)
*/
if (node >= MAX_NUMNODES)
BUG();
node_set_online(node);这是个宏定义
#define node_set_online(node) set_bit((node), node_online_map.bits)
nodemask_t node_online_map = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
#define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
#define BITS_TO_LONGS(bits) \
(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) (4+32-1)/32 = 1
BITS_PER_LONG = 32
经过处理后应该是
typedef struct {
unsigned long bits[1];
} nodemask_t;
#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p)
#define ATOMIC_BITOP_LE(name,nr,p) \
(__builtin_constant_p(nr) ? \
____atomic_##name(nr, p) : \
_##name##_le(nr,p))
扩展后应该是
#define ATOMIC_BITOP_LE(name,nr,p) \
(__builtin_constant_p(0) ? \
____atomic_set_bit(0, node_online_map.bits) : \
_set_bit_le(0,node_online_map.bits))
似乎他的意图是想把node_online_map.bits的bit0 写为1,但是本来就是1呀?如下。
c02abbc0 :
c02abbc0: 00000001 andeq r0, r0, r1
好了,回到find_memend_and_nodes
/*
* Get the start and end pfns for this bank
*/
start = mi->bank[i].start >> PAGE_SHIFT;
end = (mi->bank[i].start + mi->bank[i].size) >> PAGE_SHIFT;
得到起始和结束页号0x30000 ~ 0x34000
if (np[node].start > start)
np[node].start = start;
if (np[node].end < end)
np[node].end = end;
填写到节点信息中。
if (memend_pfn < end)
memend_pfn = end;
填写memend_pfn为最末页号
}
/*
* Calculate the number of pages we require to
* store the bootmem bitmaps.
*/
for_each_online_node(i) {
if (np[i].end == 0)
continue;
np[i].bootmap_pages = bootmem_bootmap_pages(np[i].end -
np[i].start);
bootmem_pages += np[i].bootmap_pages;
}
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
#define for_each_node_mask(node, mask) \
for ((node) = first_node(mask); \
(node) < MAX_NUMNODES; \
(node) = next_node((node), (mask)))
#define first_node(src) __first_node(&(src))
static inline int __first_node(const nodemask_t *srcp)
{
return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}
#define next_node(n, src) __next_node((n), &(src))
static inline int __next_node(int n, const nodemask_t *srcp)
{
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
#define find_first_bit(p,sz) _find_first_bit_le(p,sz)
#define find_next_bit(p,sz,off) _find_next_bit_le(p,sz,off)
/*
* Purpose : Find a 'one' bit
* Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
*/
ENTRY(_find_first_bit_le)
teq r1, #0
beq 3f
mov r2, #0
1: ldrb r3, [r0, r2, lsr #3]
movs r3, r3
bne .found @ any now set - found zero bit
add r2, r2, #8 @ next bit pointer
2: cmp r2, r1 @ any more?
blo 1b
3: mov r0, r1 @ no free bits
RETINSTR(mov,pc,lr)
/*
* Purpose : Find next 'one' bit
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_bit_le)
teq r1, #0
beq 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
ldrb r3, [r0, r2, lsr #3]
movs r3, r3, lsr ip @ shift off unused bits
bne .found
orr r2, r2, #7 @ if zero, then no bits here
add r2, r2, #1 @ align bit pointer
b 2b @ loop for next bit
看看bootmem_bootmap_pages作了什么?
/* return the number of _pages_ that will be allocated for the boot bitmap */
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
{
unsigned long mapsize;
mapsize = (pages+7)/8; 表示需要多少byte存放内存页面的bit映射表
mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 将这些byte页面对齐
mapsize >>= PAGE_SHIFT; 计算出这些byte需要多少页面。
return mapsize; 返回页面数。
}
我的内存64M,0x4000个页面,0x4000/8=2048bytes,也就需要一个页面了。
经过for_each_online_node(i)的处理,node_info[0]应该成了这个样子
node_info[0] = {0x30000,0x34000,1} 表示可用内存有0x30000 ~ 0x34000的物理页面,需要一个页的容量来存放页面位图
bootmem_pages = 1
返回find_memend_and_nodes。
high_memory = __va(memend_pfn << PAGE_SHIFT);
确定最高内存的地址,high_memory在memory.c中定义的全局指针。
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void * high_memory;
这段注释对high_memory的解释很模糊
/*
* This doesn't seem to be used by the Linux memory
* manager any more. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*
* Note: max_low_pfn and max_pfn reflect the number
* of _pages_ in the system, not the maximum PFN.
*/
max_low_pfn = memend_pfn - O_PFN_DOWN(PHYS_OFFSET); PHYS_OFFSET=0x30000000
max_pfn = memend_pfn - O_PFN_DOWN(PHYS_OFFSET);
return bootmem_pages;
}
在bootmem.c中的定义。
/*
* Access to this subsystem has to be serialized externally. (this is
* true for the boot process anyway)
*/
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
#define O_PFN_DOWN(x) ((x) >> PAGE_SHIFT)
这个函数的总体功能是找出页面位图需要的页数,也就是1页。
返回到bootmem_init
bootmap_pfn = find_bootmap_pfn(0, mi, bootmap_pages);
这段函数用来找到 内核镜像后面的那个没有用到的第一个页面的页号(物理的),关键代码 是 start_pfn = O_PFN_UP(__pa(&_end));
O_PFN_UP 使得内核镜像 与 内存使用bit位图 页面之间存在 hole。
initrd_node = check_initrd(mi);
没有用到initrd。check_initrd(mi)返回-2。
map_pg = bootmap_pfn;
/*
* Initialise the bootmem nodes.
*
* What we really want to do is:
*
* unmap_all_regions_except_kernel();
* for_each_node_in_reverse_order(node) {
* map_node(node);
* allocate_bootmem_map(node);
* init_bootmem_node(node);
* free_bootmem_node(node);
* }
*
* but this is a 2.5-type change. For now, we just set
* the nodes up in reverse order.
*
* (we could also do with rolling bootmem_init and paging_init
* into one generic "memory_init" type function).
*/
np += num_online_nodes() - 1;
#define num_online_nodes() nodes_weight(node_online_map)
for (node = num_online_nodes() - 1; node >= 0; node--, np--) {
/*
* If there are no pages in this node, ignore it.
* Note that node 0 must always have some pages.
*/
if (np->end == 0 || !node_online(node)) {
if (node == 0)
BUG();
continue;
}
/*
* Initialise the bootmem allocator.
*/
init_bootmem_node(NODE_DATA(node), map_pg, np->start, np->end);
在linux/mmzone.h中
#ifndef CONFIG_NEED_MULTIPLE_NODES 我的配置文件中,没有定义这个量
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid) (&contig_page_data) 所以应该取这个宏定义,
#define NODE_MEM_MAP(nid) mem_map
#define MAX_NODES_SHIFT 1
#define pfn_to_nid(pfn) (0)
#else /* CONFIG_NEED_MULTIPLE_NODES */
#include 否则,就会取这里的宏定义
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
也就是说NODE_DATA(node)返回节点的contig_page_data结构指针。我们只有节点0,所以总是返回这个指针。看看pglist_data结构是什么。
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[GFP_ZONETYPES];
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP 配置文件中定义了这个量。
struct page *node_mem_map;
#endif
struct bootmem_data *bdata; 这个结构这里显得比较重要,贴在了下面。
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
struct pglist_data *pgdat_next;
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
} pg_data_t;
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
*/
typedef struct bootmem_data {
unsigned long node_boot_start;
unsigned long node_low_pfn;
void *node_bootmem_map;
unsigned long last_offset;
unsigned long last_pos;
unsigned long last_success; /* Previous allocation point. To speed
* up searching */
} bootmem_data_t;
回来继续看
init_bootmem_node(&contig_page_data, map_pg, 0x30000, 0x34000);
他原封不动的调用了init_bootmem_core
unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn,unsigned long startpfn, unsigned long endpfn)
{
return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
}
看init_bootmem_core。
/*
* Called once to set up the allocator itself.
*/
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
bootmem_data_t *bdata = pgdat->bdata; 取得bdata.
unsigned long mapsize = ((end - start)+7)/8; 需要的字节数(整个物理内存的)
pgdat->pgdat_next = pgdat_list; //pgdat_list是全局指针,定义struct pglist_data *pgdat_list;
pgdat_list = pgdat; //这样就把pgdat插到了以pgdat_list为表头的单向链表头部(节点之间只靠pgdat_next一个指针相连)
mapsize = ALIGN(mapsize, sizeof(long)); //四字节对齐
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 得到那片即将作为bit位图的页面的虚拟地址
bdata->node_boot_start = (start << PAGE_SHIFT); 物理起始地址0x30000000
bdata->node_low_pfn = end; 结束页面号。
/*
* Initially all pages are reserved - setup_arch() has to
* register free RAM areas explicitly.
*/
memset(bdata->node_bootmem_map, 0xff, mapsize); 全部填充为0xff,表示0x30000000 ~ 0x34000000都不可用
return mapsize;
}
现在的物理内存都不可用了。
free_bootmem_node_bank(node, mi);
看看怎么释放物理内存的使用情况的。
/*
* Register all available RAM in this node with the bootmem allocator.
*/
static inline void free_bootmem_node_bank(int node, struct meminfo *mi)
{
pg_data_t *pgdat = NODE_DATA(node);
int bank;
for (bank = 0; bank < mi->nr_banks; bank++)
if (mi->bank[bank].node == node)
free_bootmem_node(pgdat, mi->bank[bank].start,
mi->bank[bank].size);
}
他的解释是注册这个节点的可用RAM,看free_bootmem_node
void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
free_bootmem_core(pgdat->bdata, physaddr, size);
}
free_bootmem_core(bootmem_data_t *bdata, 0x30000000, 0x4000000)
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
{
unsigned long i;
unsigned long start;
/*
* round down end of usable mem, partially free pages are
* considered reserved.
*/
unsigned long sidx;
unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
unsigned long end = (addr + size)/PAGE_SIZE;
BUG_ON(!size);
BUG_ON(end > bdata->node_low_pfn);
if (addr < bdata->last_success)
bdata->last_success = addr;
/*
* Round up the beginning of the address.
*/
start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
sidx = start - (bdata->node_boot_start/PAGE_SIZE);
for (i = sidx; i < eidx; i++) {
if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
BUG();
}
}
这段代码是init_bootmem_core的反过程,现在所有的物理内存都可用了。(0x30000000 ~ 0x34000000)
map_pg += np->bootmap_pages; 指向下一个要当作内存位图的页面,已经无意义。
/*
* If this is node 0, we need to reserve some areas ASAP -
* we may use bootmem on node 0 to setup the other nodes.
*/
if (node == 0)
reserve_node_zero(bootmap_pfn, bootmap_pages);
}
BUG_ON(map_pg != bootmap_pfn + bootmap_pages);
}
这个函数 reserve_node_zero()真正的把需要保留的内存页保留了下来。
bootmap_pfn : 内核镜像后面的那个没有用到的第一个页面的页号(物理的)
bootmap_pages : 1
/*
* Reserve the various regions of node 0
*/
static __init void reserve_node_zero(unsigned int bootmap_pfn, unsigned int bootmap_pages)
{
pg_data_t *pgdat = NODE_DATA(0);
unsigned long res_size = 0;
/*
* Register the kernel text and data with bootmem.
* Note that this can only be in node 0.
*/
reserve_bootmem_node(pgdat, __pa(&_stext), &_end - &_stext);
在0节点中保留内核镜像部分的内存,大概是0x30008000到以后的1.5M左右。
/*
* Reserve the page tables. These are already in use,
* and can only be in node 0.
*/
reserve_bootmem_node(pgdat, __pa(swapper_pg_dir),
PTRS_PER_PGD * sizeof(pgd_t));
需要的信息如下
.globl swapper_pg_dir
.equ swapper_pg_dir, TEXTADDR - 0x4000
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
#define PTRS_PER_PGD 2048
typedef struct { unsigned long pgd[2]; } pgd_t;
就是把0x30004000 ~ 0x30008000这段(16K)用于页目录的内容保留下来。
/*
* And don't forget to reserve the allocator bitmap,
* which will be freed later.
*/
reserve_bootmem_node(pgdat, bootmap_pfn << PAGE_SHIFT,
bootmap_pages << PAGE_SHIFT);
保留内核镜像后面的那些bit位图。我们好不容易创建的,可不能丟掉。
}
0x30000100不是还有参数吗?现在已经取得了参数了,不需要了。现在物理内存的布局大概是:
0x30000000 0x30004000 0x30008000 0x3xxxxxxx 0x34000000
| | | | |
|___________|___________|___________|______________________________________________|
| | | | | |
| | 页表 | image | 一页 (4k) | |
| | | | | |
|___________|___________|___________|_______________|______________________________|
可用 页目录(保留) kernel(保留) bit位图(保留)
bootmem_init到这里结束,回到paging_init中。
memcpy(&meminfo, mi, sizeof(meminfo));
拷贝到自己的meminfo中,以前我们用的meminfo保存在setup.c中,这次到了init.c中。
/*
* allocate the zero page. Note that we count on this going ok.
*/
zero_page = alloc_bootmem_low_pages(PAGE_SIZE);
分配一个0页,看看它在哪里分配的,什么作用?
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem((x), PAGE_SIZE, 0)
这样的参数
__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0)
void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list; //这个指针不陌生 contig_page_data
void *ptr;
for_each_pgdat(pgdat) //这个循环只有一次了
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 主体是这个函数了,希望他不要让我们失望,别返回NULL。
align, goal)))
return(ptr); 这样就可以退出历史舞台了
/*
* Whoops, we cannot satisfy the allocation request.
*/
printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
/*
* We 'merge' subsequent allocations to save space. We might 'lose' 把后续的分配合并到上一次分配剩余的页内空间里,以节省空间
* some fraction of a page if allocations cannot be satisfied due to
* size constraints on boxes where there is physical RAM space
* fragmentation - in these cases (mostly large memory boxes) this
* is not a problem.
*
* On low memory boxes we get it right in 100% of the cases.
*
* alignment has to be a power of 2 value.
*
* NOTE: This function is _not_ reentrant. 不可重入的函数
*/
看下参数
__alloc_bootmem_core(*bdata, PAGE_SIZE,PAGE_SIZE,0)
static void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal)
{
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx;
void *ret;
if(!size) {
printk("__alloc_bootmem_core(): zero-sized request\n");
BUG();
}
BUG_ON(align & (align-1));
eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
贴来需要的数据
bdata->node_boot_start = (start << PAGE_SHIFT); 物理起始地址0x30000000
bdata->node_low_pfn = end; 结束页面号。
eidx存放的是bit位图的结束索引(不包含在内),也就是这个节点的页面总数:0x34000 - 0x30000 = 0x4000,最后一个有效索引是eidx-1
offset = 0;
if (align &&
(bdata->node_boot_start & (align - 1UL)) != 0) 显然后面的条件不满足
offset = (align - (bdata->node_boot_start & (align - 1UL)));
offset >>= PAGE_SHIFT;
这样offset还是0
/*
* We try to allocate bootmem pages above 'goal' 首先尝试获得引导内存页面在goal上面
* first, then we try to allocate lower pages. 然后尝试获得底端内存页面
*/
if (goal && (goal >= bdata->node_boot_start) && 显然不成立
((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
preferred = goal - bdata->node_boot_start;
if (bdata->last_success >= preferred)
preferred = bdata->last_success;
} else
preferred = 0; 这里有效
preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
看看是怎么对齐的。
#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
还是0,这个是向上对齐的,如果x是1,那么就变成了align,这里就是page_size
preferred += offset;
areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; 页为单位的区域,这里是1个页
incr = align >> PAGE_SHIFT ? : 1; incr=1
restart_scan:
for (i = preferred; i < eidx; i += incr) { 从0扫描到结束的内存bit位图
unsigned long j;
i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); 查找可用内存,别忘了0x30000000~0x30004000这4页可用页
i = ALIGN(i, incr); //这个多余
if (test_bit(i, bdata->node_bootmem_map)) 第0页对应的bit是0(空闲),条件不成立,不会执行continue
continue;
for (j = i + 1; j < i + areasize; ++j) { //由于areasize是1,这个扫描显得多余了。
if (j >= eidx)
goto fail_block;
if (test_bit (j, bdata->node_bootmem_map))
goto fail_block;
}
start = i; 所以我们很快得到了一页的可用空间,它在物理内存的最开头(地址最低处),也就是0x30000000~0x30001000这个页面
goto found; 此时的i=0
fail_block:
i = ALIGN(j, incr);
}
if (preferred > offset) {
preferred = offset;
goto restart_scan;
}
return NULL;
found:
bdata->last_success = start << PAGE_SHIFT; 填0,很讽刺,最后成功的是第一个页面
BUG_ON(start >= eidx);
/*
* Is the next page of the previous allocation-end the start
* of this allocation's buffer? If yes then we can 'merge'
* the previous partial page with this allocation.
*/
if (align < PAGE_SIZE && //显然这里不通过
bdata->last_offset && bdata->last_pos+1 == start) {
offset = ALIGN(bdata->last_offset, align);
BUG_ON(offset > PAGE_SIZE);
remaining_size = PAGE_SIZE-offset;
if (size < remaining_size) {
areasize = 0;
/* last_pos unchanged */
bdata->last_offset = offset+size;
ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
bdata->node_boot_start);
} else {
remaining_size = size - remaining_size;
areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
bdata->node_boot_start);
bdata->last_pos = start+areasize-1;
bdata->last_offset = remaining_size;
}
bdata->last_offset &= ~PAGE_MASK;
} else {
bdata->last_pos = start + areasize - 1; 填0
bdata->last_offset = size & ~PAGE_MASK; 填0(size=PAGE_SIZE=0x1000,~PAGE_MASK=0xfff,相与结果为0)
ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); 返回找到的页面的虚拟地址,也就是0xc0000000
}
/*
* Reserve the area now:
*/
for (i = start; i < start+areasize; i++) 保留这个区间的页面,这里只有一个页面
if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
BUG();
memset(ret, 0, size); 把0xc0000000~0xc0001000这第一个页面填充0
return ret; 返回0xc0000000这个指针
}
回到paging_init中
/*
* initialise the page tables.
*/
memtable_init(mi);
在汇编部分只映射了4m空间的ram,现在用c建立完整的页表。
/*
* Setup initial mappings. We use the page we allocated for zero page to hold
* the mappings, which will get overwritten by the vectors in traps_init().
* The mappings must be in virtual address order.
*/
void __init memtable_init(struct meminfo *mi)
{
struct map_desc *init_maps, *p, *q;
unsigned long address = 0;
int i;
build_mem_type_table();
看看他作什么。
/*
* Adjust the PMD section entries according to the CPU in use.
*/
static void __init build_mem_type_table(void)
{
struct cachepolicy *cp;
unsigned int cr = get_cr(); //得到cr寄存器的数值,cp15的控制寄存器。
unsigned int user_pgprot;
int cpu_arch = cpu_architecture(); //得到cpu的版本,这里是3,#define CPU_ARCH_ARMv4T 3
int i;
if (cpu_arch < CPU_ARCH_ARMv5) { //CPU_ARCH_ARMv4T(3) < CPU_ARCH_ARMv5(4),条件成立
if (cachepolicy >= CPOLICY_WRITEALLOC) 不满足
cachepolicy = CPOLICY_WRITEBACK;
ecc_mask = 0;
}
需要的数据:
static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
static unsigned int ecc_mask __initdata = 0;
上面两个全局变量都是mm-armv.c中的私有数据。
#define CPOLICY_UNCACHED 0
#define CPOLICY_BUFFERED 1
#define CPOLICY_WRITETHROUGH 2
#define CPOLICY_WRITEBACK 3
#define CPOLICY_WRITEALLOC 4
dump中:
c001e8f8 :
c001e8f8: 00000000 andeq r0, r0, r0
c001e868 :
c001e868: 00000003 andeq r0, r0, r3
显然cachepolicy的初始化数据是CPOLICY_WRITEBACK=3
if (cpu_arch <= CPU_ARCH_ARMv5TEJ) {
for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
if (mem_types[i].prot_l1)
mem_types[i].prot_l1 |= PMD_BIT4;
if (mem_types[i].prot_sect)
mem_types[i].prot_sect |= PMD_BIT4;
}
}
需要的数据:
有8种内存类型。
#define MT_DEVICE 0
#define MT_CACHECLEAN 1
#define MT_MINICLEAN 2
#define MT_LOW_VECTORS 3
#define MT_HIGH_VECTORS 4
#define MT_MEMORY 5
#define MT_ROM 6
#define MT_IXP2000_DEVICE 7
struct mem_types {
unsigned int prot_pte;
unsigned int prot_l1;
unsigned int prot_sect;
unsigned int domain;
};
static struct mem_types mem_types[] __initdata = {
[MT_DEVICE] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_WRITE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_UNCACHED |
PMD_SECT_AP_WRITE,
.domain = DOMAIN_IO,
},
[MT_CACHECLEAN] = {
.prot_sect = PMD_TYPE_SECT,
.domain = DOMAIN_KERNEL,
},
[MT_MINICLEAN] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_MINICACHE,
.domain = DOMAIN_KERNEL,
},
[MT_LOW_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_EXEC,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_USER,
},
[MT_HIGH_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_USER | L_PTE_EXEC,
.prot_l1 = PMD_TYPE_TABLE,
.domain = DOMAIN_USER,
},
[MT_MEMORY] = {
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
[MT_ROM] = {
.prot_sect = PMD_TYPE_SECT,
.domain = DOMAIN_KERNEL,
},
[MT_IXP2000_DEVICE] = { /* IXP2400 requires XCB=101 for on-chip I/O */
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_WRITE,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_UNCACHED |
PMD_SECT_AP_WRITE | PMD_SECT_BUFFERABLE |
PMD_SECT_TEX(1),
.domain = DOMAIN_IO,
}
};
根据需要填充PMD_BIT4。
cp = &cache_policies[cachepolicy]; 指向cache_policies[3]
user_pgprot = cp->pte;
需要的数据:
struct cachepolicy {
const char policy[16];
unsigned int cr_mask;
unsigned int pmd;
unsigned int pte;
};
static struct cachepolicy cache_policies[] __initdata = {
{
.policy = "uncached",
.cr_mask = CR_W|CR_C,
.pmd = PMD_SECT_UNCACHED,
.pte = 0,
}, {
.policy = "buffered",
.cr_mask = CR_C,
.pmd = PMD_SECT_BUFFERED,
.pte = PTE_BUFFERABLE,
}, {
.policy = "writethrough",
.cr_mask = 0,
.pmd = PMD_SECT_WT,
.pte = PTE_CACHEABLE,
}, {
.policy = "writeback", //名字,写回的方案
.cr_mask = 0,
.pmd = PMD_SECT_WB, // 使用cache和buffer
.pte = PTE_BUFFERABLE|PTE_CACHEABLE, //2级页表(1 << 2)|(1 << 3)
}, {
.policy = "writealloc",
.cr_mask = 0,
.pmd = PMD_SECT_WBWA,
.pte = PTE_BUFFERABLE|PTE_CACHEABLE,
}
};
将cache的使用组成若干不同的策略,存放在cache_policies中,方便使用。
if (cpu_arch >= CPU_ARCH_ARMv5) {
mem_types[MT_LOW_VECTORS].prot_pte |= cp->pte & PTE_CACHEABLE;
mem_types[MT_HIGH_VECTORS].prot_pte |= cp->pte & PTE_CACHEABLE;
} else {
mem_types[MT_LOW_VECTORS].prot_pte |= cp->pte;
mem_types[MT_HIGH_VECTORS].prot_pte |= cp->pte;
mem_types[MT_MINICLEAN].prot_sect &= ~PMD_SECT_TEX(1);
}
根据arm版本 设置内存类型 中 的相关标志。
mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;
mem_types[MT_MEMORY].prot_sect |= ecc_mask | cp->pmd;
mem_types[MT_ROM].prot_sect |= cp->pmd;
通用的方案。
for (i = 0; i < 16; i++) {
unsigned long v = pgprot_val(protection_map[i]);
v &= (~(PTE_BUFFERABLE|PTE_CACHEABLE)) | user_pgprot;
protection_map[i] = __pgprot(v);
}
pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
L_PTE_DIRTY | L_PTE_WRITE |
L_PTE_EXEC | cp->pte);
switch (cp->pmd) {
case PMD_SECT_WT:
mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WT;
break;
case PMD_SECT_WB:
case PMD_SECT_WBWA:
mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB;
break;
}
printk("Memory policy: ECC %sabled, Data cache %s\n",
ecc_mask ? "en" : "dis", cp->policy);
}
不看了,不懂,就学过v4了。^_^
回到memtable_init。
init_maps = p = alloc_bootmem_low_pages(PAGE_SIZE);
这段函数已经分析过了,它得到第一个可用的内存页面,这里应该是第2个页面,物理地址0x30001000~0x30002000。
init_maps = p = 0xc0001000(alloc_bootmem_low_pages返回的是虚拟地址,对应物理地址0x30001000),这个样子。
这里的alloc_bootmem_low_pages分配了一个空页,跟kmalloc()相似。看看map_desc这个数据结构
struct map_desc {
unsigned long virtual;
unsigned long physical;
unsigned long length;
unsigned int type; //内存的类型,就是那8种之一
};
内核用这个结构来描述ram的类型,物理,虚拟地址,和长度。
for (i = 0; i < mi->nr_banks; i++) { //只有一个内存bank,所以扫描1次。
if (mi->bank[i].size == 0)
continue;
p->physical = mi->bank[i].start;
p->virtual = __phys_to_virt(p->physical);
p->length = mi->bank[i].size;
p->type = MT_MEMORY;
p ++;
}
我们的0x30000000~0x34000000这片内存得到了描述。
#ifdef FLUSH_BASE
p->physical = FLUSH_BASE_PHYS;
p->virtual = FLUSH_BASE;
p->length = PGDIR_SIZE;
p->type = MT_CACHECLEAN;
p ++;
#endif
#ifdef FLUSH_BASE_MINICACHE
p->physical = FLUSH_BASE_PHYS + PGDIR_SIZE;
p->virtual = FLUSH_BASE_MINICACHE;
p->length = PGDIR_SIZE;
p->type = MT_MINICLEAN;
p ++;
#endif
似乎没有FLUSH_BASE FLUSH_BASE_MINICACHE的定义。
/*
* Go through the initial mappings, but clear out any
* pgdir entries that are not in the description.
*/
q = init_maps;
do {
if (address < q->virtual || q == p) {
clear_mapping(address);
address += PGDIR_SIZE;
} else {
create_mapping(q);
address = q->virtual + q->length;
address = (address + PGDIR_SIZE - 1) & PGDIR_MASK;
q ++;
}
} while (address != 0);
分析下这个do while
address = 0,q指向了第一个map_desc,q经过++指向第2个map_desc描述符。
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
PGDIR_SHIFT determines what a third-level page table entry can map
#define PGDIR_SHIFT 21
看看清除映射函数 clear_mapping
/*
* Clear any PGD mapping. On a two-level page table system,
* the clearance is done by the middle-level functions (pmd)
* rather than the top-level (pgd) functions.
*/页目录的清除是由中间(pmd)目录函数完成的,而不是pgd的函数。
static inline void clear_mapping(unsigned long virt)
{
pmd_clear(pmd_off_k(virt));
}
看看pmd_clear的参数pmd_off_k(virt)是什么?
pmd_off_k(virt) = (pmd_t *)(init_mm.pgd + ((virt) >> 21))
init_mm是结构体变量而不是指针,init_mm.pgd的类型是pgd_t *,而 typedef unsigned long pgd_t[2];所以指针每加1就前进8个字节。
可见这个函数返回了virt这个虚拟地址在中间页目录中的索引值,他索引了2048个 8byte 长的pmd,从0xc0004000增长到0xc0008000-8,
#define INIT_MM(name) \
{ \
.mm_rb = RB_ROOT, \
.pgd = swapper_pg_dir, .pgd是页表所在的虚拟地址,在0xc0004000
...
}
#define pmd_clear(pmdp) \
do { \
pmdp[0] = __pmd(0); \
pmdp[1] = __pmd(0); \
clean_pmd_entry(pmdp); \
} while (0)
所以这个pmd_clear,可以擦除0xc0004000~0xc0008000之间的数据(页目录),当然它自己认为自己擦除了2048个8byte长度的pmd。
当addr到了0xc0000000这个虚拟地址的时候,转向else分支,创建这部分内存的页表。
看看create_mapping是怎么根据map_desc的内容来创建页表的?
/*
* Create the page directory entries and any necessary
* page tables for the mapping specified by `md'. We
* are able to cope here with varying sizes and address
* offsets, and we take full advantage of sections and
* supersections.
*/
static void __init create_mapping(struct map_desc *md)
{
unsigned long virt, length;
int prot_sect, prot_l1, domain;
pgprot_t prot_pte;
long off;
if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
printk(KERN_WARNING "BUG: not creating mapping for "
"0x%08lx at 0x%08lx in user region\n",
md->physical, md->virtual);
return;
}
只能给系统空间 或者 中断向量 所在的空间创建映射,绝对不可给用户虚拟空间创建映射。
if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
printk(KERN_WARNING "BUG: mapping for 0x%08lx at 0x%08lx "
"overlaps vmalloc space\n",
md->physical, md->virtual);
}
这里只是为了让我们检查一下,映射的范围有没有覆盖到VMALLOC_START VMALLOC_END-1。
这里其实没有覆盖到
domain = mem_types[md->type].domain;
prot_pte = __pgprot(mem_types[md->type].prot_pte);
prot_l1 = mem_types[md->type].prot_l1 | PMD_DOMAIN(domain);
prot_sect = mem_types[md->type].prot_sect | PMD_DOMAIN(domain);
virt = md->virtual;
off = md->physical - virt;
length = md->length;
if (mem_types[md->type].prot_l1 == 0 &&
(virt & 0xfffff || (virt + off) & 0xfffff || (virt + length) & 0xfffff)) {
printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not "
"be mapped using pages, ignoring.\n",
md->physical, md->virtual);
return;
}
不是合法的一级描述符,并且虚拟地址,偏移量,长度 有不是1M对齐的,就不能进行映射了(创建页表)。
while ((virt & 0xfffff || (virt + off) & 0xfffff) && length >= PAGE_SIZE) {
alloc_init_page(virt, virt + off, prot_l1, prot_pte);
virt += PAGE_SIZE;
length -= PAGE_SIZE;
}
/*
* A section mapping covers half a "pgdir" entry.
*/
while (length >= (PGDIR_SIZE / 2)) {
alloc_init_section(virt, virt + off, prot_sect);
virt += (PGDIR_SIZE / 2);
length -= (PGDIR_SIZE / 2);
}
在这里完成了ram 0xc0000000~0xc4000000的映射,就是为我的ram创建了页中间目录(pmd)。
这样,我们在汇编部分创建页表,就被覆盖了,好处是linux的视野变大了(他看到了所有物理ram:0x30000000~0x34000000)。
while (length >= PAGE_SIZE) {
alloc_init_page(virt, virt + off, prot_l1, prot_pte);
virt += PAGE_SIZE;
length -= PAGE_SIZE;
}
}
这几个 while 语句的含义为: 若虚拟地址或物理地址与 1M (2^20)没有对齐(即低 20 位不全为 0),
则建立二级页面映射;然后为1M的部分逐段建立段(section)映射;最后为PAGE_SIZE部分建立二级页表映射。
可见长度小于PAGE_SIZE的部分是不会被映射的。
看看alloc_init_section。
static inline void
alloc_init_section(unsigned long virt, unsigned long phys, int prot)
{
pmd_t *pmdp = pmd_off_k(virt); //把指针强制转换成unsigned long *类型的。
if (virt & (1 << 20)) //对于虚拟地址在1,3,5,7...M部分的内容,pmdp指向 8byte部分的后4byte。
pmdp++; //个人感觉,linux 的pmd让代码显得愚蠢。让读代码的人困惑。
*pmdp = __pmd(phys | prot); //填充这个pgd,对cpu来说,这个条目就是一个pgd描述符(段描述符)。
flush_pmd_entry(pmdp);
}
回到do while中,在if部分,把虚拟地址余下的部分0xc4000000~0xffffffff对应的pmd条目清除了。
到了这里就完成了对pmd的创建过程。
忘了alloc_init_page()这个函数了,分析下建立中断向量所在页面的页表的过程。
/*
* Add a PAGE mapping between VIRT and PHYS in domain
* DOMAIN with protection PROT. Note that due to the
* way we map the PTEs, we must allocate two PTE_SIZE'd
* blocks - one for the Linux pte table, and one for
* the hardware pte table.
*/
static inline void
alloc_init_page(unsigned long virt, unsigned long phys, unsigned int prot_l1, pgprot_t prot)
{
pmd_t *pmdp = pmd_off_k(virt); //得到这个虚拟地址的pmd索引(相对0xc0004000来说)
pte_t *ptep;
if (pmd_none(*pmdp)) { // 中断向量的pmd还没有动过(是0),所以这里是真。
unsigned long pmdval;
ptep = alloc_bootmem_low_pages(2 * PTRS_PER_PTE *
sizeof(pte_t));
//这里应该是第3个页面0x30002000~0x30003000。PTRS_PER_PTE = 512,sizeof(pte_t) = sizeof(unsigned long) = 4
//也是打算取得一个页面。
pmdval = __pa(ptep) | prot_l1;
pmdp[0] = __pmd(pmdval);
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
flush_pmd_entry(pmdp);
}
ptep = pte_offset_kernel(pmdp, virt);
set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
}
好多宏,我不看了,:-(
现在的页表不再是那个当初只有四个段描述符的页表了。
lzd> md 0x30007000
30007000: 3000041e 3010041e 3020041e 3030041e ...0...0.. 0..00
30007010: 3040041e 3050041e 3060041e 3070041e ..@0..P0..`0..p0
30007020: 3080041e 3090041e 30a0041e 30b0041e ...0...0...0...0
30007030: 30c0041e 30d0041e 30e0041e 30f0041e ...0...0...0...0
30007040: 3100041e 3110041e 3120041e 3130041e ...1...1.. 1..01
30007050: 3140041e 3150041e 3160041e 3170041e ..@1..P1..`1..p1
30007060: 3180041e 3190041e 31a0041e 31b0041e ...1...1...1...1
30007070: 31c0041e 31d0041e 31e0041e 31f0041e ...1...1...1...1
30007080: 3200041e 3210041e 3220041e 3230041e ...2...2.. 2..02
30007090: 3240041e 3250041e 3260041e 3270041e ..@2..P2..`2..p2
300070a0: 3280041e 3290041e 32a0041e 32b0041e ...2...2...2...2
300070b0: 32c0041e 32d0041e 32e0041e 32f0041e ...2...2...2...2
300070c0: 3300041e 3310041e 3320041e 3330041e ...3...3.. 3..03
300070d0: 3340041e 3350041e 3360041e 3370041e ..@3..P3..`3..p3
300070e0: 3380041e 3390041e 33a0041e 33b0041e ...3...3...3...3
300070f0: 33c0041e 33d0041e 33e0041e 33f0041e ...3...3...3...3
它已经映射了所有ram(0x30000000~0x34000000)区间,对应这虚拟地址0xc0000000~0xc4000000,对了还有
lzd> md 0x30007ff0
30007ff0: 00000000 00000000 30002031 30002431 ........1 .01$.0 //确实是第三个页面哦(30002xxx)!!!
给中断向量用的描述符。256 * sizeof(pte_t) = 0x400,显然linux在0xffff0000地址处寻找中断向量。
回到memtable_init()
/*
* Create a mapping for the machine vectors at the high-vectors
* location (0xffff0000). If we aren't using high-vectors, also
* create a mapping at the low-vectors virtual address.
*/
linux默认中断向量从0xffff0000开始,所以为倒数第16个页面创建映射,当然可能失败。
init_maps->physical = virt_to_phys(init_maps);
init_maps->virtual = 0xffff0000;
init_maps->length = PAGE_SIZE;
init_maps->type = MT_HIGH_VECTORS;
create_mapping(init_maps);
if (!vectors_high()) {
init_maps->virtual = 0;
init_maps->type = MT_LOW_VECTORS;
create_mapping(init_maps);
}
这里并不是"失败后的补救":如果CPU没有配置成使用高端向量(vectors_high()为假),就在虚拟地址0处为同一个物理页面再创建一份映射。
中断向量从虚拟地址0开始,就映射这个第1(或者0)个页面(4k)
flush_cache_all();
local_flush_tlb_all();
刷新cache(高速缓存)和TLB(页表缓存)
top_pmd = pmd_off_k(0xffff0000);
保存最高pmd索引到top_pmd(全局的)。
}
回到paging_init中...
阅读(3193) | 评论(0) | 转发(0) |