arm－linux（kernel-2.6.13）的启动过程（1.2/2）-mclovein-ChinaUnix博客

Suliven的小屋

首页　| 　博文目录　| 　关于我

mclovein

博客访问： 415628
博文数量： 62
博客积分： 1483
博客等级：上尉
技术积分： 779
用户组：普通用户
注册时间： 2009-02-24 12:25

文章分类

全部博文（62）

硬件（7）
qt入门/开发（9）
成长的脚印（19）
c语言（4）
心情不错（0）
驱动入门（13）
应用编程（8）
未分配的博文（2）

文章存档

2012年（2）

2011年（6）

2010年（6）

2009年（48）

我的朋友

myfaxmai

相关博文

arm－linux（kernel-2.6.13）的启动过程（1.2/2）

分类： LINUX

2009-10-09 13:02:08

arm－linux（kernel-2.6.13）的启动过程（1.2/2）

回到setup_arch()

经过这样的处理，setup.c文件中的meminfo可就不再是
static struct meminfo meminfo __initdata = { 0, };
而是
static struct meminfo meminfo __initdata = { 1,{0x30000000,0x4000000,0},{}, };
表示当前有一个内存区域，物理地址是从0x30000000开始，大小是64M，节点是0

   paging_init(&meminfo, mdesc);

这是个庞大的函数，里面有很多好东西。显然他有根据机器描述符填充meminfo（setup.c）的倾向。一点一点看，估计看完天都黑了。
先把它拷过来。

现在到了init.c了，这个文件里也有一个 meminfo，这个跟setup.c中的是两回事。

static struct meminfo meminfo __initdata = { 0, }; （在init.c中）
/*
* paging_init() sets up the page tables, initialises the zone memory
* maps, and sets up the zero page, bad page and bad page tables.
*/
void __init paging_init(struct meminfo *mi, struct machine_desc *mdesc)
{
   void *zero_page;
   int node;

   bootmem_init(mi);
先分析下这个函数。
/*
* Initialise the bootmem allocator for all nodes. This is called
* early during the architecture specific initialisation.
*/
static void __init bootmem_init(struct meminfo *mi)
{
   struct node_info node_info[MAX_NUMNODES], *np = node_info;
   unsigned int bootmap_pages, bootmap_pfn, map_pg;
   int node, initrd_node;

先来看看node_info这个结构是什么。
/*
 * Per-node summary that find_memend_and_nodes() fills in for
 * bootmem_init().
 */
struct node_info {
   /* Lowest start pfn of the node's banks; initialised to -1U. */
   unsigned int start;
   /* Highest end pfn of the node's banks; 0 means the node has no pages. */
   unsigned int end;
   /* Result of bootmem_bootmap_pages(end - start) for this node. */
   int bootmap_pages;
};
#define NODES_SHIFT   2   /* Normally, Max 4 Nodes */
#define MAX_NUMNODES    (1 << NODES_SHIFT)
很简单的数据结构。Max 4 Nodes

   bootmap_pages = find_memend_and_nodes(mi, np);

把find_memend_and_nodes函数贴过来，我们已经下潜到第n层函数调用了。^_^
/*
* Scan the memory info structure and pull out:
* - the end of memory
* - the number of nodes
* - the pfn range of each node
* - the number of bootmem bitmap pages
*/
static unsigned int __init
find_memend_and_nodes(struct meminfo *mi, struct node_info *np)
{
   unsigned int i, bootmem_pages = 0, memend_pfn = 0;

   for (i = 0; i < MAX_NUMNODES; i++) {
       np[i].start = -1U;
       np[i].end = 0;
       np[i].bootmap_pages = 0;
   }

初始化这四个节点信息结构，在上一层函数中定义的。

   for (i = 0; i < mi->nr_banks; i++) { 已经有一个bank了哦，nr_banks=1。
       unsigned long start, end;
       int node;

       if (mi->bank[i].size == 0) { 我们ram的大小是64M，这里不成立。
           /*
           * Mark this bank with an invalid node number
           */
           mi->bank[i].node = -1;
           continue;
       }

       node = mi->bank[i].node; 这里node=0

       /*
       * Make sure we haven't exceeded the maximum number of nodes
       * that we have in this configuration. If we have, we're in
       * trouble. (maybe we ought to limit, instead of bugging?)
       */
       if (node >= MAX_NUMNODES)
           BUG();
       node_set_online(node);这是个宏定义

#define node_set_online(node)       set_bit((node), node_online_map.bits)

nodemask_t node_online_map = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);

typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

#define DECLARE_BITMAP(name,bits) \
   unsigned long name[BITS_TO_LONGS(bits)]

#define BITS_TO_LONGS(bits) \
   (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) （4＋32－1）／32 ＝ 1

BITS_PER_LONG ＝ 32
经过处理后应该是
typedef struct {
   unsigned long bits[1];
} nodemask_t;

#define set_bit(nr,p)           ATOMIC_BITOP_LE(set_bit,nr,p)

#define   ATOMIC_BITOP_LE(name,nr,p)       \
   (__builtin_constant_p(nr) ?       \
   ____atomic_##name(nr, p) :       \
   _##name##_le(nr,p))

node_set_online(node) 扩展后（此时node的值为0）应该是

   (__builtin_constant_p(node) ?
   ____atomic_set_bit(node, node_online_map.bits) :
   _set_bit_le(node, node_online_map.bits))
似乎他的意图是想把node_online_map.bits的bit0 写为1，但是本来就是1呀？如下。

c02abbc0 <node_online_map>:
c02abbc0:   00000001    andeq   r0, r0, r1

好了，回到find_memend_and_nodes

       /*
       * Get the start and end pfns for this bank
       */
       start = mi->bank[i].start >> PAGE_SHIFT;
       end   = (mi->bank[i].start + mi->bank[i].size) >> PAGE_SHIFT;

得到起始和结束页号0x30000 ～ 0x34000
       if (np[node].start > start)
           np[node].start = start;

       if (np[node].end < end)
           np[node].end = end;

填写到节点信息中。

       if (memend_pfn < end)
           memend_pfn = end;

填写memend_pfn为最末页号

   }

   /*
   * Calculate the number of pages we require to
   * store the bootmem bitmaps.
   */
   for_each_online_node(i) {
       if (np[i].end == 0)
           continue;

       np[i].bootmap_pages = bootmem_bootmap_pages(np[i].end -
                                np[i].start);
       bootmem_pages += np[i].bootmap_pages;
   }

#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
#define for_each_node_mask(node, mask)           \
   for ((node) = first_node(mask);           \
       (node) < MAX_NUMNODES;           \
       (node) = next_node((node), (mask)))

#define first_node(src) __first_node(&(src))
/*
 * Return the result of find_first_bit() on srcp->bits, searched over
 * MAX_NUMNODES bits and clamped so it never exceeds MAX_NUMNODES.
 */
static inline int __first_node(const nodemask_t *srcp)
{
   return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
/*
 * Return the result of find_next_bit() on srcp->bits, with the search
 * starting at bit n+1, clamped so it never exceeds MAX_NUMNODES.
 */
static inline int __next_node(int n, const nodemask_t *srcp)
{
   return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
#define find_first_bit(p,sz)       _find_first_bit_le(p,sz)
#define find_next_bit(p,sz,off)       _find_next_bit_le(p,sz,off)

/*
* Purpose : Find a 'one' bit
* Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
*/
ENTRY(_find_first_bit_le)
       teq   r1, #0
       beq   3f
       mov   r2, #0
1:       ldrb   r3, [r0, r2, lsr #3]
       movs   r3, r3
       bne   .found           @ any now set - found zero bit
       add   r2, r2, #8       @ next bit pointer
2:       cmp   r2, r1           @ any more?
       blo   1b
3:       mov   r0, r1           @ no free bits
       RETINSTR(mov,pc,lr)

/*
* Purpose : Find next 'one' bit
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_bit_le)
       teq   r1, #0
       beq   3b
       ands   ip, r2, #7
       beq   1b           @ If new byte, goto old routine
       ldrb   r3, [r0, r2, lsr #3]
       movs   r3, r3, lsr ip       @ shift off unused bits
       bne   .found
       orr   r2, r2, #7       @ if zero, then no bits here
       add   r2, r2, #1       @ align bit pointer
       b   2b           @ loop for next bit

看看bootmem_bootmap_pages作了什么？
/* return the number of _pages_ that will be allocated for the boot bitmap */
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
{
   unsigned long mapsize;

   mapsize = (pages+7)/8; 表示需要多少byte存放内存页面的bit映射表
   mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 将这些byte页面对齐
   mapsize >>= PAGE_SHIFT; 计算出这些byte需要多少页面。

   return mapsize; 返回页面数。
}
我的内存64M，0x4000个页面，0x4000/8=2048bytes，也就需要一个页面了。

经过for_each_online_node(i)的处理，node_info[0]应该成了这个样子
node_info[0] = {0x30000,0x34000,1} 表示可用内存有0x30000 ~ 0x34000的物理页面，需要一个页的容量来存放页面位图
bootmem_pages = 1
返回find_memend_and_nodes。

   high_memory = __va(memend_pfn << PAGE_SHIFT);

确定最高内存的地址，high_memory在memory.c中定义的全局指针。
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void * high_memory;
这段注释的解释很模糊

   /*
   * This doesn't seem to be used by the Linux memory
   * manager any more. If we can get rid of it, we
   * also get rid of some of the stuff above as well.
   *
   * Note: max_low_pfn and max_pfn reflect the number
   * of _pages_ in the system, not the maximum PFN.
   */
   max_low_pfn = memend_pfn - O_PFN_DOWN(PHYS_OFFSET); PHYS_OFFSET=0x30000000
   max_pfn =    memend_pfn - O_PFN_DOWN(PHYS_OFFSET);

   return bootmem_pages;
}

在bootmem.c中的定义。
/*
* Access to this subsystem has to be serialized externally. (this is
* true for the boot process anyway)
*/
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
#define O_PFN_DOWN(x)   ((x) >> PAGE_SHIFT)
这个函数的总体功能是找出页面位图需要的页数，也就是1页。

返回到bootmem_init

   bootmap_pfn   = find_bootmap_pfn(0, mi, bootmap_pages);

这段函数用来找到内核镜像后面的那个没有用到的第一个页面的页号(物理的)，关键代码是 start_pfn   = O_PFN_UP(__pa(&_end));
O_PFN_UP 使得内核镜像与内存使用bit位图页面之间存在 hole。

   initrd_node   = check_initrd(mi);

没有用到initrd。check_initrd(mi)返回-2。

   map_pg = bootmap_pfn;

   /*
   * Initialise the bootmem nodes.
   *
   * What we really want to do is:
   *
   *   unmap_all_regions_except_kernel();
   *   for_each_node_in_reverse_order(node) {
   *     map_node(node);
   *     allocate_bootmem_map(node);
   *     init_bootmem_node(node);
   *     free_bootmem_node(node);
   *   }
   *
   * but this is a 2.5-type change. For now, we just set
   * the nodes up in reverse order.
   *
   * (we could also do with rolling bootmem_init and paging_init
   * into one generic "memory_init" type function).
   */

   np += num_online_nodes() - 1;

#define num_online_nodes()   nodes_weight(node_online_map)

   for (node = num_online_nodes() - 1; node >= 0; node--, np--) {
       /*
       * If there are no pages in this node, ignore it.
       * Note that node 0 must always have some pages.
       */
       if (np->end == 0 || !node_online(node)) {
           if (node == 0)
               BUG();
           continue;
       }

       /*
       * Initialise the bootmem allocator.
       */
       init_bootmem_node(NODE_DATA(node), map_pg, np->start, np->end);

在linux/mmzone.h中
#ifndef CONFIG_NEED_MULTIPLE_NODES 我的配置文件中，没有定义这个量
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)       (&contig_page_data) 所以应该取这个宏定义，
#define NODE_MEM_MAP(nid)   mem_map
#define MAX_NODES_SHIFT       1
#define pfn_to_nid(pfn)       (0)

#else /* CONFIG_NEED_MULTIPLE_NODES */

#include <asm/mmzone.h> 否则，就会取这里的宏定义

#endif /* !CONFIG_NEED_MULTIPLE_NODES */

也就是说NODE_DATA(node)返回节点的contig_page_data结构指针。我们只有节点0，所以总是返回这个指针。看看pglist_data结构是什么。

typedef struct pglist_data {
   struct zone node_zones[MAX_NR_ZONES];
   struct zonelist node_zonelists[GFP_ZONETYPES];
   int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP 配置文件中定义了这个量。
   struct page *node_mem_map;
#endif
   struct bootmem_data *bdata; 这个结构这里显得比较重要，贴在了下面。
   unsigned long node_start_pfn;
   unsigned long node_present_pages; /* total number of physical pages */
   unsigned long node_spanned_pages; /* total size of physical page
                         range, including holes */
   int node_id;
   struct pglist_data *pgdat_next;
   wait_queue_head_t kswapd_wait;
   struct task_struct *kswapd;
   int kswapd_max_order;
} pg_data_t;

/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
*/
typedef struct bootmem_data {
   unsigned long node_boot_start;
   unsigned long node_low_pfn;
   void *node_bootmem_map;
   unsigned long last_offset;
   unsigned long last_pos;
   unsigned long last_success;   /* Previous allocation point. To speed
                   * up searching */
} bootmem_data_t;
回来继续看
       init_bootmem_node(&contig_page_data, map_pg, 0x30000, 0x34000);
他原封不动的调用了init_bootmem_core
/*
 * Thin wrapper: passes pgdat, freepfn, startpfn and endpfn unchanged to
 * init_bootmem_core() and returns whatever that returns.
 */
unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn,unsigned long startpfn, unsigned long endpfn)
{
   return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
}
看init_bootmem_core。
/*
* Called once to set up the allocator itself.
*/
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
   unsigned long mapstart, unsigned long start, unsigned long end)
{
   bootmem_data_t *bdata = pgdat->bdata; 取得bdata.
   unsigned long mapsize = ((end - start)+7)/8; 需要的字节数(整个物理内存的)

   pgdat->pgdat_next = pgdat_list; //pgdat_list是全局指针，定义struct pglist_data *pgdat_list;
   pgdat_list = pgdat; //这样就组成了一个单向链表（每个节点只有pgdat_next一个指针）

   mapsize = ALIGN(mapsize, sizeof(long)); //四字节对齐
   bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 得到那片即将作为bit位图的页面的虚拟地址
   bdata->node_boot_start = (start << PAGE_SHIFT); 物理起始地址0x30000000
   bdata->node_low_pfn = end; 结束页面号。

   /*
   * Initially all pages are reserved - setup_arch() has to
   * register free RAM areas explicitly.
   */
   memset(bdata->node_bootmem_map, 0xff, mapsize); 全部填充为0xff，表示0x30000000 ～ 0x34000000都不可用

   return mapsize;
}

现在的物理内存都不可用了。

       free_bootmem_node_bank(node, mi);

看看怎么释放物理内存的使用情况的。
/*
* Register all available RAM in this node with the bootmem allocator.
*/
static inline void free_bootmem_node_bank(int node, struct meminfo *mi)
{
   pg_data_t *pgdat = NODE_DATA(node);
   int bank;

   /*
   * Walk every bank in mi and pass the start address and size of each
   * bank whose node number equals 'node' to free_bootmem_node().
   */
   for (bank = 0; bank < mi->nr_banks; bank++)
       if (mi->bank[bank].node == node)
           free_bootmem_node(pgdat, mi->bank[bank].start,
                      mi->bank[bank].size);
}
他的解释是注册这个节点的可用RAM，看free_bootmem_node
/*
 * Thin wrapper: calls free_bootmem_core() on pgdat->bdata with the
 * given physical address and size.
 */
void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
{
   free_bootmem_core(pgdat->bdata, physaddr, size);
}
free_bootmem_core(bootmem_data_t *bdata, 0x30000000, 0x4000000)
/*
 * Clear the bits in bdata->node_bootmem_map for the whole pages inside
 * [addr, addr + size).  Bit indices are relative to
 * bdata->node_boot_start.  The first page is rounded up and the last
 * page is rounded down, so partially covered pages keep their bit set.
 *
 * BUG()s when size is zero, when the range ends above
 * bdata->node_low_pfn, or when a bit in the range is already clear.
 */
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
{
   unsigned long i;
   unsigned long start;
   /*
   * round down end of usable mem, partially free pages are
   * considered reserved.
   */
   unsigned long sidx;
   unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
   unsigned long end = (addr + size)/PAGE_SIZE;

   BUG_ON(!size);
   BUG_ON(end > bdata->node_low_pfn);

   /* Lower last_success when the freed range starts below it. */
   if (addr < bdata->last_success)
       bdata->last_success = addr;

   /*
   * Round up the beginning of the address.
   */
   start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
   sidx = start - (bdata->node_boot_start/PAGE_SIZE);

   /* Clear each bit in [sidx, eidx); a bit that was already clear is a bug. */
   for (i = sidx; i < eidx; i++) {
       if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
           BUG();
   }
}
这段代码是init_bootmem_core的反过程，现在所有的物理内存都可用了。（0x30000000 ～ 0x34000000）

       map_pg += np->bootmap_pages; 指向下一个要当作内存位图的页面，已经无意义。

       /*
       * If this is node 0, we need to reserve some areas ASAP -
       * we may use bootmem on node 0 to setup the other nodes.
       */
       if (node == 0)
           reserve_node_zero(bootmap_pfn, bootmap_pages);
   }

   BUG_ON(map_pg != bootmap_pfn + bootmap_pages);
}

这个函数 reserve_node_zero()真正的把需要保留的内存页保留了下来。
bootmap_pfn ：内核镜像后面的那个没有用到的第一个页面的页号(物理的)
bootmap_pages ：　１

/*
* Reserve the various regions of node 0
*/
static __init void reserve_node_zero(unsigned int bootmap_pfn, unsigned int bootmap_pages)
{
   pg_data_t *pgdat = NODE_DATA(0);
   unsigned long res_size = 0;

   /*
   * Register the kernel text and data with bootmem.
   * Note that this can only be in node 0.
   */
   reserve_bootmem_node(pgdat, __pa(&_stext), &_end - &_stext);　
在０节点中保留内核镜像部分的内存，大概是0x30008000到以后的1.5M左右。

   /*
   * Reserve the page tables. These are already in use,
   * and can only be in node 0.
   */
   reserve_bootmem_node(pgdat, __pa(swapper_pg_dir),
                 PTRS_PER_PGD * sizeof(pgd_t));

需要的信息如下
   .globl   swapper_pg_dir
   .equ   swapper_pg_dir, TEXTADDR - 0x4000
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
#define PTRS_PER_PGD       2048
typedef struct { unsigned long pgd[2]; } pgd_t;
就是把0x30004000　～　0x30008000这段(16K)用于页目录的内容保留下来。

   /*
   * And don't forget to reserve the allocator bitmap,
   * which will be freed later.
   */
   reserve_bootmem_node(pgdat, bootmap_pfn << PAGE_SHIFT,
                 bootmap_pages << PAGE_SHIFT);
保留内核镜像后面的那些bit位图。我们好不容易创建的，可不能丟掉。
}

0x30000100不是还有参数吗？现在已经取得了参数了，不需要了。现在物理内存的布局大概是：

0x30000000 0x30004000 0x30008000    0x3xxxxxxx                                       0x34000000
   |           |           |           |                                                  |
   |___________|___________|___________|______________________________________________|
   |           |           |           |               |                               |
   |             |   页表       |   image   |   一页   (4k)   |                               |
   |           |           |           |               |                               |
   |___________|___________|___________|_______________|______________________________|
       可用       　页目录(保留) kernel(保留)　　bit位图(保留)

bootmem_init到这里结束，回到paging_init中。

   memcpy(&meminfo, mi, sizeof(meminfo));

拷贝到自己的meminfo中，以前我们用的meminfo保存在setup.c中，这次到了init.c中。

   /*
   * allocate the zero page. Note that we count on this going ok.
   */
   zero_page = alloc_bootmem_low_pages(PAGE_SIZE);

分配一个0页，看看他在那里分配的，什么作用？
#define alloc_bootmem_low_pages(x) \
   __alloc_bootmem((x), PAGE_SIZE, 0)
这样的参数
__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0)

void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
{
   pg_data_t *pgdat = pgdat_list;　//这个指针不陌生　contig_page_data
   void *ptr;

   for_each_pgdat(pgdat)　//这个循环只有一次了
       if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,　主体是这个函数了，希望他不要让我们失望，别返回NULL。
                       align, goal)))
           return(ptr);　这样就可以退出历史舞台了

   /*
   * Whoops, we cannot satisfy the allocation request.
   */
   printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
   panic("Out of memory");
   return NULL;
}

/*
* We 'merge' subsequent allocations to save space. We might 'lose'　合并后续的分配以节省空间
* some fraction of a page if allocations cannot be satisfied due to
* size constraints on boxes where there is physical RAM space
* fragmentation - in these cases (mostly large memory boxes) this
* is not a problem.
*
* On low memory boxes we get it right in 100% of the cases.
*
* alignment has to be a power of 2 value.
*
* NOTE: This function is _not_ reentrant.　不可重入的函数
*/
看下参数
__alloc_bootmem_core(*bdata, PAGE_SIZE,PAGE_SIZE,0)

static void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
       unsigned long align, unsigned long goal)
{
   unsigned long offset, remaining_size, areasize, preferred;
   unsigned long i, start = 0, incr, eidx;
   void *ret;

   if(!size) {
       printk("__alloc_bootmem_core(): zero-sized request\n");
       BUG();
   }
   BUG_ON(align & (align-1));

   eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);

贴来需要的数据
bdata->node_boot_start = (start << PAGE_SHIFT); 物理起始地址0x30000000
bdata->node_low_pfn = end; 结束页面号。
eidx存放的是bit位图的结束索引（最后一个有效索引加1，也就是该节点的页面总数0x4000）

   offset = 0;
   if (align &&
        (bdata->node_boot_start & (align - 1UL)) != 0)　显然后面的条件不满足
       offset = (align - (bdata->node_boot_start & (align - 1UL)));
   offset >>= PAGE_SHIFT;

这样offset还是０

   /*
   * We try to allocate bootmem pages above 'goal'　首先尝试获得引导内存页面在goal上面
   * first, then we try to allocate lower pages.　　然后尝试获得底端内存页面
   */
   if (goal && (goal >= bdata->node_boot_start) && 　显然不成立
        ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
       preferred = goal - bdata->node_boot_start;

       if (bdata->last_success >= preferred)
           preferred = bdata->last_success;
   } else
       preferred = 0;　这里有效

   preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
看看是怎么对齐的。
#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
还是0，这个是向上对齐的，如果x是１，那么就变成了align，这里就是page_size

   preferred += offset;
   areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;　页为单位的区域，这里是１个页
   incr = align >> PAGE_SHIFT ? : 1;　incr=1

restart_scan:
   for (i = preferred; i < eidx; i += incr) {　从0扫描到结束的内存bit位图
       unsigned long j;
       i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);　查找可用内存，别忘了0x30000000～0x30004000这4页可用页
       i = ALIGN(i, incr);　//这个多余
       if (test_bit(i, bdata->node_bootmem_map))　这里显然通过了
           continue;
       for (j = i + 1; j < i + areasize; ++j) {　//由于areasize是１，这个扫描显得多余了。
           if (j >= eidx)
               goto fail_block;
           if (test_bit (j, bdata->node_bootmem_map))
               goto fail_block;
       }
       start = i;　　所以我们很快得到了一页的可用空间，它在内存的最低端，也就是0x30000000～0x30001000这个页面
       goto found;　此时的i=0
   fail_block:
       i = ALIGN(j, incr);
   }

   if (preferred > offset) {
       preferred = offset;
       goto restart_scan;
   }
   return NULL;

found:
   bdata->last_success = start << PAGE_SHIFT;　填0，很讽刺，最后成功的是第一个页面
   BUG_ON(start >= eidx);

   /*
   * Is the next page of the previous allocation-end the start
   * of this allocation's buffer? If yes then we can 'merge'
   * the previous partial page with this allocation.
   */
   if (align < PAGE_SIZE &&　//显然这里不通过
        bdata->last_offset && bdata->last_pos+1 == start) {
       offset = ALIGN(bdata->last_offset, align);
       BUG_ON(offset > PAGE_SIZE);
       remaining_size = PAGE_SIZE-offset;
       if (size < remaining_size) {
           areasize = 0;
           /* last_pos unchanged */
           bdata->last_offset = offset+size;
           ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
                       bdata->node_boot_start);
       } else {
           remaining_size = size - remaining_size;
           areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
           ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
                       bdata->node_boot_start);
           bdata->last_pos = start+areasize-1;
           bdata->last_offset = remaining_size;
       }
       bdata->last_offset &= ~PAGE_MASK;
   } else {
       bdata->last_pos = start + areasize - 1;　填０
       bdata->last_offset = size & ~PAGE_MASK;　填０（size=PAGE_SIZE，低12位全为0）
       ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);　返回找到的页面的虚拟地址，也就是0xc0000000
   }

   /*
   * Reserve the area now:
   */
   for (i = start; i < start+areasize; i++)　保留这个区间的页面，这里只有一个页面
       if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
           BUG();
   memset(ret, 0, size);　把0xc0000000~0xc0001000这第一个页面填充0
   return ret;　返回0xc0000000这个指针
}

回到paging_init中

   /*
   * initialise the page tables.
   */
   memtable_init(mi);
在汇编部分只映射了4m空间的ram，现在用c建立完整的页表。
/*
* Setup initial mappings. We use the page we allocated for zero page to hold
* the mappings, which will get overwritten by the vectors in traps_init().
* The mappings must be in virtual address order.
*/
void __init memtable_init(struct meminfo *mi)
{
   struct map_desc *init_maps, *p, *q;
   unsigned long address = 0;
   int i;

   build_mem_type_table();

看看他作什么。
/*
* Adjust the PMD section entries according to the CPU in use.
*/
static void __init build_mem_type_table(void)
{
   struct cachepolicy *cp;
   unsigned int cr = get_cr();　//得到cr寄存器的数值，cp15的控制寄存器。
   unsigned int user_pgprot;
   int cpu_arch = cpu_architecture();　//得到cpu的版本，这里是3，#define CPU_ARCH_ARMv4T       3
   int i;

   if (cpu_arch < CPU_ARCH_ARMv5) {　//3<4，成立（CPU_ARCH_ARMv4T=3，CPU_ARCH_ARMv5=4）
       if (cachepolicy >= CPOLICY_WRITEALLOC)　不满足
           cachepolicy = CPOLICY_WRITEBACK;
       ecc_mask = 0;
   }
需要的数据：
static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
static unsigned int ecc_mask __initdata = 0;
上面两个全局变量都是mm-armv.c中的私有数据。
#define CPOLICY_UNCACHED   0
#define CPOLICY_BUFFERED   1
#define CPOLICY_WRITETHROUGH   2
#define CPOLICY_WRITEBACK   3
#define CPOLICY_WRITEALLOC   4
dump中：
c001e8f8 <ecc_mask>:
c001e8f8:   00000000    andeq   r0, r0, r0
c001e868 <cachepolicy>:
c001e868:   00000003    andeq   r0, r0, r3
显然cachepolicy的初始化数据是CPOLICY_WRITEBACK=3

   if (cpu_arch <= CPU_ARCH_ARMv5TEJ) {
       for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
           if (mem_types[i].prot_l1)
               mem_types[i].prot_l1 |= PMD_BIT4;
           if (mem_types[i].prot_sect)
               mem_types[i].prot_sect |= PMD_BIT4;
       }
   }

需要的数据：
有８种内存类型。
#define MT_DEVICE       0
#define MT_CACHECLEAN       1
#define MT_MINICLEAN       2
#define MT_LOW_VECTORS       3
#define MT_HIGH_VECTORS       4
#define MT_MEMORY       5
#define MT_ROM           6
#define MT_IXP2000_DEVICE   7
struct mem_types {
   unsigned int   prot_pte;
   unsigned int   prot_l1;
   unsigned int   prot_sect;
   unsigned int   domain;
};

static struct mem_types mem_types[] __initdata = {
   [MT_DEVICE] = {
       .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
               L_PTE_WRITE,
       .prot_l1   = PMD_TYPE_TABLE,
       .prot_sect = PMD_TYPE_SECT | PMD_SECT_UNCACHED |
               PMD_SECT_AP_WRITE,
       .domain    = DOMAIN_IO,
   },
   [MT_CACHECLEAN] = {
       .prot_sect = PMD_TYPE_SECT,
       .domain    = DOMAIN_KERNEL,
   },
   [MT_MINICLEAN] = {
       .prot_sect = PMD_TYPE_SECT | PMD_SECT_MINICACHE,
       .domain    = DOMAIN_KERNEL,
   },
   [MT_LOW_VECTORS] = {
       .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
               L_PTE_EXEC,
       .prot_l1   = PMD_TYPE_TABLE,
       .domain    = DOMAIN_USER,
   },
   [MT_HIGH_VECTORS] = {
       .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
               L_PTE_USER | L_PTE_EXEC,
       .prot_l1   = PMD_TYPE_TABLE,
       .domain    = DOMAIN_USER,
   },
   [MT_MEMORY] = {
       .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
       .domain    = DOMAIN_KERNEL,
   },
   [MT_ROM] = {
       .prot_sect = PMD_TYPE_SECT,
       .domain    = DOMAIN_KERNEL,
   },
   [MT_IXP2000_DEVICE] = { /* IXP2400 requires XCB=101 for on-chip I/O */
       .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
               L_PTE_WRITE,
       .prot_l1   = PMD_TYPE_TABLE,
       .prot_sect = PMD_TYPE_SECT | PMD_SECT_UNCACHED |
               PMD_SECT_AP_WRITE | PMD_SECT_BUFFERABLE |
               PMD_SECT_TEX(1),
       .domain    = DOMAIN_IO,
   }
};
根据需要填充PMD_BIT4。

   cp = &cache_policies[cachepolicy];　指向cache_policies[3]
   user_pgprot = cp->pte;

需要的数据：
struct cachepolicy {
   const char   policy[16];
   unsigned int   cr_mask;
   unsigned int   pmd;
   unsigned int   pte;
};

static struct cachepolicy cache_policies[] __initdata = {
   {
       .policy       = "uncached",
       .cr_mask   = CR_W|CR_C,
       .pmd       = PMD_SECT_UNCACHED,
       .pte       = 0,
   }, {
       .policy       = "buffered",
       .cr_mask   = CR_C,
       .pmd       = PMD_SECT_BUFFERED,
       .pte       = PTE_BUFFERABLE,
   }, {
       .policy       = "writethrough",
       .cr_mask   = 0,
       .pmd       = PMD_SECT_WT,
       .pte       = PTE_CACHEABLE,
   }, {
       .policy       = "writeback",　//名字,写回的方案
       .cr_mask   = 0,
       .pmd       = PMD_SECT_WB,　//　使用cache和buffer
       .pte       = PTE_BUFFERABLE|PTE_CACHEABLE,　//２级页表(1 << 2)|(1 << 3)
   }, {
       .policy       = "writealloc",
       .cr_mask   = 0,
       .pmd       = PMD_SECT_WBWA,
       .pte       = PTE_BUFFERABLE|PTE_CACHEABLE,
   }
};
将cache的使用组成若干不同的策略，存放在cache_policies中，方便使用。

if (cpu_arch >= CPU_ARCH_ARMv5) {
       mem_types[MT_LOW_VECTORS].prot_pte |= cp->pte & PTE_CACHEABLE;
       mem_types[MT_HIGH_VECTORS].prot_pte |= cp->pte & PTE_CACHEABLE;
   } else {
       mem_types[MT_LOW_VECTORS].prot_pte |= cp->pte;
       mem_types[MT_HIGH_VECTORS].prot_pte |= cp->pte;
       mem_types[MT_MINICLEAN].prot_sect &= ~PMD_SECT_TEX(1);
   }
根据arm版本　设置内存类型　中　的相关标志。

   mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
   mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;
   mem_types[MT_MEMORY].prot_sect |= ecc_mask | cp->pmd;
   mem_types[MT_ROM].prot_sect |= cp->pmd;
通用的方案。

   for (i = 0; i < 16; i++) {
       unsigned long v = pgprot_val(protection_map[i]);
       v &= (~(PTE_BUFFERABLE|PTE_CACHEABLE)) | user_pgprot;
       protection_map[i] = __pgprot(v);
   }

   pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
               L_PTE_DIRTY | L_PTE_WRITE |
               L_PTE_EXEC | cp->pte);

   switch (cp->pmd) {
   case PMD_SECT_WT:
       mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WT;
       break;
   case PMD_SECT_WB:
   case PMD_SECT_WBWA:
       mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB;
       break;
   }
   printk("Memory policy: ECC %sabled, Data cache %s\n",
       ecc_mask ? "en" : "dis", cp->policy);
}
不看了，不懂，就学过v4了。^_^
回到memtable_init。

   init_maps = p = alloc_bootmem_low_pages(PAGE_SIZE);
这段函数已经分析过了，它得到第一个可用的内存页面，这里应该是第２个页面，物理地址0x30001000~0x30002000。
返回的是虚拟地址，所以init_maps = p = 0xc0001000（对应物理地址0x30001000），这个样子。
这里的alloc_bootmem_low_pages分配了一个空页，跟kmalloc()相似。看看map_desc这个数据结构

struct map_desc {
   unsigned long virtual;
   unsigned long physical;
   unsigned long length;
   unsigned int type;　//内存的类型，就是那８种之一
};
内核用这个结构来描述ram的类型，物理，虚拟地址，和长度。

   for (i = 0; i < mi->nr_banks; i++) {　//只有一个内存bank，所以扫描１次。
       if (mi->bank[i].size == 0)
           continue;

       p->physical   = mi->bank[i].start;
       p->virtual    = __phys_to_virt(p->physical);
       p->length     = mi->bank[i].size;
       p->type       = MT_MEMORY;
       p ++;
   }
我们的0x30000000~0x34000000这片内存得到了描述。

#ifdef FLUSH_BASE
   p->physical   = FLUSH_BASE_PHYS;
   p->virtual    = FLUSH_BASE;
   p->length     = PGDIR_SIZE;
   p->type       = MT_CACHECLEAN;
   p ++;
#endif

#ifdef FLUSH_BASE_MINICACHE
   p->physical   = FLUSH_BASE_PHYS + PGDIR_SIZE;
   p->virtual    = FLUSH_BASE_MINICACHE;
   p->length     = PGDIR_SIZE;
   p->type       = MT_MINICLEAN;
   p ++;
#endif
似乎没有FLUSH_BASE　FLUSH_BASE_MINICACHE的定义。

   /*
   * Go through the initial mappings, but clear out any
   * pgdir entries that are not in the description.
   */
   q = init_maps;
   do {
       if (address < q->virtual || q == p) {
           clear_mapping(address);
           address += PGDIR_SIZE;
       } else {
           create_mapping(q);

           address = q->virtual + q->length;
           address = (address + PGDIR_SIZE - 1) & PGDIR_MASK;

           q ++;
       }
   } while (address != 0);

分析下这个do　while
address = 0,q指向了第一个map_desc,q经过++指向第２个map_desc描述符。
#define PGDIR_SIZE       (1UL << PGDIR_SHIFT)
PGDIR_SHIFT determines what a third-level page table entry can map
#define PGDIR_SHIFT       21
看看清除映射函数 clear_mapping
/*
* Clear any PGD mapping. On a two-level page table system,
* the clearance is done by the middle-level functions (pmd)
* rather than the top-level (pgd) functions.
*/页目录的清除是由中间(pmd)目录函数完成的，而不是pgd的函数。
static inline void clear_mapping(unsigned long virt)
{
   /* Look up the entry for virt with pmd_off_k() and clear it with pmd_clear(). */
   pmd_clear(pmd_off_k(virt));
}
看看pmd_clear的参数pmd_off_k(virt)是什么？

pmd_off_k(virt) = init_mm.pgd + ((virt) >> 21)
init_mm.pgd的类型是pgd_t *，而 typedef unsigned long pgd_t[2];
可见这个函数返回的是virt这个虚拟地址对应的中间页目录项的指针，他索引了2048个 8byte 长的pmd，从0xc0004000增长到0xc0008000-8，

#define INIT_MM(name) \
{                              \
   .mm_rb       = RB_ROOT,               \
   .pgd       = swapper_pg_dir, .pgd是页表所在的虚拟地址，在0xc0004000
...
}

#define pmd_clear(pmdp)           \
   do {               \
       pmdp[0] = __pmd(0);   \
       pmdp[1] = __pmd(0);   \
       clean_pmd_entry(pmdp);   \
   } while (0)
所以这个pmd_clear，可以擦除0xc0004000~0xc0008000之间的数据(页目录)，当然它自己认为自己擦除了2048个8byte长度的pmd。
当addr到了0xc0000000这个虚拟地址的时候，转向else分支，创建这部分内存的页表。
看看create_mapping是怎么根据map_desc的内容来创建页表的？

/*
* Create the page directory entries and any necessary
* page tables for the mapping specified by `md'. We
* are able to cope here with varying sizes and address
* offsets, and we take full advantage of sections and
* supersections.
*/
static void __init create_mapping(struct map_desc *md)
{
   unsigned long virt, length;
   int prot_sect, prot_l1, domain;
   pgprot_t prot_pte;
   long off;

   if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
       printk(KERN_WARNING "BUG: not creating mapping for "
               "0x%08lx at 0x%08lx in user region\n",
               md->physical, md->virtual);
       return;
   }
只能给系统空间或者中断向量所在的空间创建映射，绝对不可给用户虚拟空间创建映射。

   if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
        md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
       printk(KERN_WARNING "BUG: mapping for 0x%08lx at 0x%08lx "
               "overlaps vmalloc space\n",
               md->physical, md->virtual);
   }
这里只是为了让我们检查一下：设备(MT_DEVICE)或ROM(MT_ROM)类型的映射，其虚拟地址有没有落在 PAGE_OFFSET ~ VMALLOC_END-1 之间(也就是有没有和内核直接映射区、vmalloc空间重叠)。注意这里只是打印警告，并不返回。
这里其实没有覆盖到

   domain      = mem_types[md->type].domain;
   prot_pte = __pgprot(mem_types[md->type].prot_pte);
   prot_l1   = mem_types[md->type].prot_l1 | PMD_DOMAIN(domain);
   prot_sect = mem_types[md->type].prot_sect | PMD_DOMAIN(domain);

   virt   = md->virtual;
   off    = md->physical - virt;
   length = md->length;

   if (mem_types[md->type].prot_l1 == 0 &&
        (virt & 0xfffff || (virt + off) & 0xfffff || (virt + length) & 0xfffff)) {
       printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not "
               "be mapped using pages, ignoring.\n",
               md->physical, md->virtual);
       return;
   }
如果这种内存类型不能使用二级页表(prot_l1 == 0，没有合法的一级页表描述符)，并且虚拟地址(virt)、物理地址(virt + off)、结束地址(virt + length)中有任何一个不是1M对齐的，就不能进行映射了(创建页表)。

   while ((virt & 0xfffff || (virt + off) & 0xfffff) && length >= PAGE_SIZE) {
       alloc_init_page(virt, virt + off, prot_l1, prot_pte);

       virt   += PAGE_SIZE;
       length -= PAGE_SIZE;
   }

   /*
   * A section mapping covers half a "pgdir" entry.
   */
   while (length >= (PGDIR_SIZE / 2)) {
       alloc_init_section(virt, virt + off, prot_sect);

       virt   += (PGDIR_SIZE / 2);
       length -= (PGDIR_SIZE / 2);
   }

在这里完成了ram 0xc0000000~0xc4000000的映射，就是为我的ram创建了页中间目录(pmd)条目。
这样，我们在汇编部分创建的页表就被覆盖了，好处是linux的视野变大了(他看到了所有物理ram:0x30000000~0x34000000)。

   while (length >= PAGE_SIZE) {
       alloc_init_page(virt, virt + off, prot_l1, prot_pte);

       virt   += PAGE_SIZE;
       length -= PAGE_SIZE;
   }
}

这几个 while 语句的含义为:   若虚拟地址或物理地址与 1M (2^20)没有对齐(即低 20 位不全为 0) ,
则先逐页建立二级页表映射，直到对齐为止;然后为剩余部分中每个1M逐段建立段(section)映射;最后为不足1M的剩余部分按PAGE_SIZE逐页建立二级页表映射。
可见长度小于PAGE_SIZE的部分是不会被映射的。

看看alloc_init_section。
/*
 * Write a single section entry for `virt`.
 *
 * virt: virtual address being mapped; bit 20 selects which half of the
 *       8-byte pmd entry is written.
 * phys: physical address, OR'ed with `prot` to form the entry value.
 * prot: bits OR'ed into the entry alongside `phys`.
 *
 * The entry is stored with __pmd() and then handed to flush_pmd_entry().
 */
static inline void
alloc_init_section(unsigned long virt, unsigned long phys, int prot)
{
   pmd_t *pmdp = pmd_off_k(virt); // author's note: the pointer is cast to unsigned long *

   if (virt & (1 << 20)) // virt lies in an odd megabyte (1, 3, 5, 7... MB): pmdp must point at the second 4 bytes of the 8-byte entry
       pmdp++;              // author's opinion: Linux's pmd layer makes this code look clumsy and confuses readers

   *pmdp = __pmd(phys | prot); // fill in the entry; author's note: to the CPU this entry is a first-level (section) descriptor
   flush_pmd_entry(pmdp);
}

回到do while中，在if部分，把虚拟地址余下的部分0xc4000000～0xffffffff对应的pmd条目清除了。
到了这里就完成了对pmd的创建过程。

忘了alloc_init_page()这个函数了，分析下建立中断向量所在页面的页表的过程。

/*
 * Add a PAGE mapping between VIRT and PHYS in domain
 * DOMAIN with protection PROT. Note that due to the
 * way we map the PTEs, we must allocate two PTE_SIZE'd
 * blocks - one for the Linux pte table, and one for
 * the hardware pte table.
 *
 * virt:    virtual address of the page to map.
 * phys:    physical address of the page; phys >> PAGE_SHIFT is the pfn
 *          written into the pte.
 * prot_l1: bits OR'ed into the pmd entries that point at a newly
 *          allocated pte table.
 * prot:    page protection passed to pfn_pte() for the pte itself.
 *
 * A pte table is allocated only when the pmd entry for `virt` is still
 * empty (pmd_none); otherwise the existing table is reused.
 */
static inline void
alloc_init_page(unsigned long virt, unsigned long phys, unsigned int prot_l1, pgprot_t prot)
{
   pmd_t *pmdp = pmd_off_k(virt); // pmd entry for this virtual address (author's note: relative to the table at 0xc0004000)
   pte_t *ptep;

   if (pmd_none(*pmdp)) { // author's note: for the vectors page the pmd has not been touched yet (it is 0), so this is true
       unsigned long pmdval;
       ptep = alloc_bootmem_low_pages(2 * PTRS_PER_PTE *
                           sizeof(pte_t));
       // Author's note: on the author's board this should be the third page,
       // 0x30002000~0x30003000. With PTRS_PER_PTE = 512 and
       // sizeof(pte_t) = sizeof(unsigned long) = 4, the request is
       // 2 * 512 * 4 bytes, i.e. exactly one page.
       pmdval = __pa(ptep) | prot_l1;
       // The two halves of the pmd entry point into the new table,
       // 256 * sizeof(pte_t) bytes apart.
       pmdp[0] = __pmd(pmdval);
       pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
       flush_pmd_entry(pmdp);
   }
   // Locate the pte slot for virt inside the table referenced by pmdp.
   ptep = pte_offset_kernel(pmdp, virt);

   set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
}
好多宏，我不看了，:-(

现在的页表不再是那个当初只有四个段描述符的页表了。
lzd> md 0x30007000
30007000: 3000041e 3010041e 3020041e 3030041e    ...0...0.. 0..00
30007010: 3040041e 3050041e 3060041e 3070041e    ..@0..P0..`0..p0
30007020: 3080041e 3090041e 30a0041e 30b0041e    ...0...0...0...0
30007030: 30c0041e 30d0041e 30e0041e 30f0041e    ...0...0...0...0
30007040: 3100041e 3110041e 3120041e 3130041e    ...1...1.. 1..01
30007050: 3140041e 3150041e 3160041e 3170041e    ..@1..P1..`1..p1
30007060: 3180041e 3190041e 31a0041e 31b0041e    ...1...1...1...1
30007070: 31c0041e 31d0041e 31e0041e 31f0041e    ...1...1...1...1
30007080: 3200041e 3210041e 3220041e 3230041e    ...2...2.. 2..02
30007090: 3240041e 3250041e 3260041e 3270041e    ..@2..P2..`2..p2
300070a0: 3280041e 3290041e 32a0041e 32b0041e    ...2...2...2...2
300070b0: 32c0041e 32d0041e 32e0041e 32f0041e    ...2...2...2...2
300070c0: 3300041e 3310041e 3320041e 3330041e    ...3...3.. 3..03
300070d0: 3340041e 3350041e 3360041e 3370041e    ..@3..P3..`3..p3
300070e0: 3380041e 3390041e 33a0041e 33b0041e    ...3...3...3...3
300070f0: 33c0041e 33d0041e 33e0041e 33f0041e    ...3...3...3...3
它已经映射了所有ram(0x30000000~0x34000000)区间，对应着虚拟地址0xc0000000~0xc4000000，对了还有
lzd> md 0x30007ff0
30007ff0: 00000000 00000000 30002031 30002431    ........1 .01$.0 //确实是第三个页面哦(30002xxx)!!!
给中断向量用的描述符。256 * sizeof(pte_t) ＝ 0x400，显然linux在0xffff0000地址处寻找中断向量。

回到memtable_init()
   /*
   * Create a mapping for the machine vectors at the high-vectors
   * location (0xffff0000). If we aren't using high-vectors, also
   * create a mapping at the low-vectors virtual address.
   */
linux默认中断向量从0xffff0000开始(高端向量)，所以先无条件地为倒数第16个页面创建映射；CPU是否真的使用高端向量，由后面的vectors_high()来判断。
   init_maps->physical   = virt_to_phys(init_maps);
   init_maps->virtual    = 0xffff0000;
   init_maps->length     = PAGE_SIZE;
   init_maps->type       = MT_HIGH_VECTORS;
   create_mapping(init_maps);

   if (!vectors_high()) {
       init_maps->virtual = 0;
       init_maps->type = MT_LOW_VECTORS;
       create_mapping(init_maps);
   }
注意这个if并不是在上面的映射"失败"时才执行，而是当CPU没有使用高端向量(vectors_high()为假)时，再额外在低端向量地址创建一份映射。
此时中断向量从虚拟地址0开始，就映射这个第1(或者说第0)个页面(4k)
   flush_cache_all();
   local_flush_tlb_all();
刷新高速缓存(cache)和页表缓存(TLB)

   top_pmd = pmd_off_k(0xffff0000);
把虚拟地址0xffff0000对应的pmd条目的指针保存到top_pmd(全局的)。
}

回到paging_init中...

阅读(3257) | 评论(0) | 转发(0) |

上一篇：arm－linux（kernel-2.6.13）的启动过程（1.1/2）

下一篇：arm－linux（kernel-2.6.13）的启动过程（1.3/2）

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6