head.S分析.-piaoyizu-ChinaUnix博客

随风...embed.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

piaoyizu

博客访问： 2159448
博文数量： 288
博客积分： 10594
博客等级：上将
技术积分： 3469
用户组：普通用户
注册时间： 2006-10-27 19:27

文章分类

全部博文（288）

Camera（2）
Embeddeb（145）

GStreamer（1）

Android（13）

Kernel study（9）

嵌入式环境（24）

Wince Developmen（2）

Driver study（55）

Basic study（41）
工作相关记录（49）

管理类资料（7）

技术类资料（42）
心情随笔（11）
English（8）
Bash脚本相关（21）
Linux C开发（52）
未分配的博文（0）

文章存档

2012年（4）

2011年（30）

2010年（40）

2009年（32）

2008年（71）

2007年（79）

2006年（32）

我的朋友

相关博文

head.S分析.

分类： C/C++

2008-04-08 17:45:07

linux/arch/arm/boot/compressed/head.S

这是ARM-Linux运行的第一个文件，这些代码是一个比较独立的代码包裹器。其作用就是解压Linux内核，并将PC指针跳到内核（vmlinux）的第一条指令。
Bootloader传入到Linux中的参数总共有三个，Linux中用到的是第二个和第三个。第二个参数是architecture id，第三个是taglist的地址。每一种ARM平台的architecture id在Linux中必须是唯一的。Taglist是bootloader向Linux传入的参数列表（详细的解释请参考《booting arm linux.pdf》）。
//程序的入口点
  /*
   * zImage entry point: run through the padding instructions, branch
   * over the loader header words, then copy the boot arguments that
   * arrive in r1 and r2 into r7 and r8 so r1/r2 can be reused as scratch.
   */
  .section ".start", #alloc, #execinstr
/*
* sort out different calling conventions
*/
  .align
start:
  .type start,#function
  .rept 8// repeat the following instruction 8 times (the article's author reads this as leaving room for the exception vector table)
  mov r0, r0// effectively a nop
  .endr

  b 1f  @ skip the three data words below
  .word 0x016f2818  @ Magic numbers to help the loader
  .word start   @ absolute load/run zImage address
  .word _edata   @ zImage end address
1:  mov r7, r1   @ save architecture ID
  mov r8, r2   @ save atags pointer

#ifndef __ARM_ARCH_2__
  /*
   * Booting from Angel - need to enter SVC mode and disable
   * FIQs/IRQs (numeric definitions from angel arm.h source).
   * We only do this if we were in user mode on entry.
   *
   * The user-mode test looks at the two low bits of the CPSR: if
   * they are both zero the SWI is issued, otherwise it is skipped.
   * r0 and r2 are used as scratch here.
   */
  mrs r2, cpsr  @ get current mode
  tst r2, #3   @ not user?
  bne not_angel  @ low bits non-zero: skip the Angel SWI
  mov r0, #0x17  @ angel_SWIreason_EnterSVC
  swi 0x123456  @ angel_SWI_ARM
not_angel:
  mrs r2, cpsr  @ turn off interrupts to
  orr r2, r2, #0xc0  @ prevent angel from running
  msr cpsr_c, r2  @ write back only the control field of the CPSR
#else
  teqp pc, #0x0c000003  @ turn off interrupts
#endif

如果进入时CPU处于用户模式（CPSR最低两位为0），就通过Angel调试监控程序的SWI调用（angel_SWIreason_EnterSVC）切换到SVC模式；如果已经处于其他模式，则跳过这个swi（我自己没有处理过这个swi）。然后再关闭irq和fiq。

  /*
   * Note that some cache flushing and other stuff may
   * be needed here - is there an Angel SWI call for this?
   */

  /*
   * some architecture specific code can be inserted
   * by the linker here, but it should preserve r7, r8, and r9.
   */

读入地址表。因为我们的代码可以在任何地址执行，也就是位置无关代码（PIC），所以我们需要加上一个偏移量。下面有每一个列表项的具体意义。
GOT表的初值是连接器指定的，当时程序并不知道代码在哪个地址执行。如果当前运行的地址已经和表上的地址不一样，还要修正GOT表。
  /*
   * Load the link-time address table at LC0 and work out how far the
   * image has been moved from its link address. The eight words are
   * loaded into r1, r2, r3, r4, r5, r6, ip and sp in table order.
   * r0 ends up holding (run-time address of LC0) - (link-time address
   * of LC0); when it is zero no fix-up is required.
   */
  .text
  adr r0, LC0  @ r0 = run-time address of LC0 (PC-relative)
  ldmia r0, {r1, r2, r3, r4, r5, r6, ip, sp}
  subs r0, r0, r1  @ calculate the delta offset

      @ if delta is zero, we are
  beq not_relocated  @ running at the address we
      @ were linked at.

  /*
   * We're running at a different address. We need to fix
   * up various pointers:
   *   r5 - zImage base address
   *   r6 - GOT start
   *   ip - GOT end
   */
  add r5, r5, r0
  add r6, r6, r0
  add ip, ip, r0

  /*
   * If we're running fully PIC === CONFIG_ZBOOT_ROM = n,
   * we need to fix up pointers into the BSS region.
   *   r2 - BSS start
   *   r3 - BSS end
   *   sp - stack pointer
   */
  add r2, r2, r0
  add r3, r3, r0
  add sp, sp, r0

修改GOT（全局偏移表）表。根据当前的运行地址，修正该表。
  /*
   * Relocate all entries in the GOT table.
   *
   * On entry: r0 = delta offset, r6 = first GOT entry, ip = GOT end.
   * Each word in [r6, ip) has r0 added to it; r1 is scratch and r6
   * is advanced to the end of the table.
   */
1:  ldr r1, [r6, #0]  @ relocate entries in the GOT
  add r1, r1, r0  @ table. This fixes up the
  str r1, [r6], #4  @ C references.
  cmp r6, ip
  blo 1b  @ unsigned compare: loop while r6 < ip

清BSS段，所有的arm程序都需要做这些的。

/*
 * Zero the BSS: r2 = BSS start, r3 = BSS end (both already fixed up).
 * Four words are stored per iteration before the bound is tested, so
 * the region is cleared in 16-byte steps.
 * NOTE(review): if the BSS size is not a multiple of 16 the last pass
 * writes past r3 - presumably the linker script leaves room; confirm.
 */
not_relocated: mov r0, #0
1:  str r0, [r2], #4  @ clear bss
  str r0, [r2], #4
  str r0, [r2], #4
  str r0, [r2], #4
  cmp r2, r3
  blo 1b  @ loop while r2 < r3 (unsigned)

正如下面的注释所说，C环境我们已经设置好了。下面我们要打开cache和mmu。为什么要这样做呢？这只是一个解压程序呀？为了速度。那为什么要开mmu呢，而且只是做一个平坦的（虚拟地址等于物理地址的）映射？还是为了速度。如果不开mmu的话，就只能打开icache。因为不开mmu的话就无法实现内存管理，而io区是决不能开dcache的。

  /*
   * The C runtime environment should now be setup
   * sufficiently. Turn the cache on, set up some
   * pointers, and start decompressing.
   */
  bl cache_on  @ cache_on sets r3 = 8 and dispatches through call_cache_fn
是不是要跟读进去呢？对于只是对流程感兴趣的人只是知道打开cache就行了。不过跟进去是很有乐趣的，这就是为什么虽然Linux如此庞大，但仍有人会孜孜不倦的研究它的每一行代码的原因吧。反过来说，对于Linux内核的整体把握更加重要，要不然就成盲人摸象了。还有，想做ARM高手的人可以读Linux下的每一个汇编文件，因为Linux内核用ARM的东西还是比较全的。

@ Describe a 64KB scratch area starting at the current stack pointer:
@ r1 = its start, r2 = its end. Both registers are left untouched until
@ decompress_kernel is called, so they presumably act as its 2nd and
@ 3rd arguments - confirm against the C prototype.
mov r1, sp @ malloc space above stack
add r2, sp, #0x10000 @ 64k max

对下面这些地址的理解其实还是很麻烦，但有篇文档写得很清楚《About TEXTADDR, ZTEXTADDR, PAGE_OFFSET etc...》。下面程序的意义就是保证解压地址和当前程序的地址不重叠。上面分配了64KB的空间来做解压时的数据缓存。
/*
* Check to see if we will overwrite ourselves.
*   r4 = final kernel address (the final physical address the kernel executes at)
*   r5 = start of this image (the first address of this program)
*   r2 = end of malloc space (and therefore this image)
* We basically want:
*   r4 >= r2 -> OK
*   r4 + image length <= r5 -> OK
* The image length is not known yet, so 4MB is used as an upper bound.
*/
  cmp r4, r2
  bhs wont_overwrite  @ kernel lands at or above the end of our image
  add r0, r4, #4096*1024 @ 4MB largest kernel size
  cmp r0, r5
  bls wont_overwrite  @ kernel (at most 4MB) ends at or below our start

如果空间不够了，只好解压到缓冲区地址后面。调用decompress_kernel进行解压缩，这段代码是用c实现的，和架构无关。

  @ Overlap case: decompress to the address just past the malloc space
  @ instead of the final kernel address in r4. r5 is repurposed to hold
  @ the start of the decompressed kernel.
  mov r5, r2   @ decompress after malloc space
  mov r0, r5  @ r0 = output address passed to decompress_kernel
  mov r3, r7  @ r3 = architecture ID saved at entry
  bl decompress_kernel

完成了解压缩之后，由于空间不够，内核也没有解压到正确的地址，必须通过代码搬移来搬到指定的地址。搬运过程中有可能会覆盖掉现在运行的这段代码，所以必须将有可能会执行到的代码搬运到安全的地方，这里用的是解压缩了的代码的后面。

  @ r0 comes back from decompress_kernel; round it up to a multiple of 128.
  add r0, r0, #127
  bic r0, r0, #127  @ align the kernel length
/*
* r0     = decompressed kernel length
* r1-r3 = unused
* r4     = kernel execution address
* r5     = decompressed kernel start
* r6     = processor ID
* r7     = architecture ID
* r8     = atags pointer
* r9-r14 = corrupted
*
* Copy the code between reloc_start and reloc_end to the address
* immediately after the decompressed kernel (r5 + r0). r9-r14, which
* include sp and lr, are used as the copy buffer.
*/
  add r1, r5, r0  @ end of decompressed kernel
  adr r2, reloc_start  @ r2 = run-time address of reloc_start (source cursor)
  ldr r3, LC1  @ r3 = reloc_end - reloc_start
  add r3, r2, r3  @ r3 = run-time address of reloc_end
1:  ldmia r2!, {r9 - r14}  @ copy relocation code
  stmia r1!, {r9 - r14}
  ldmia r2!, {r9 - r14}  @ second group: 12 words (48 bytes) per pass
  stmia r1!, {r9 - r14}
  cmp r2, r3
  blo 1b  @ loop while source cursor < reloc_end

bl cache_clean_flush// code has just been copied, so the cache must be cleaned and flushed first
add pc, r5, r0 @ call relocation code

decompress_kernel共有4个参数：解压后内核的存放地址（r0）、缓存区首地址（r1）、缓存区尾地址（r2）和architecture ID（r3，来自r7），返回值是解压后内核的长度。

/*
* We're not in danger of overwriting ourselves. Do this the simple way.
*
* r4     = kernel execution address
* r7     = architecture ID
*
* The kernel is decompressed directly to r4, so no copy is needed
* afterwards and control goes straight to call_kernel.
*/
wont_overwrite: mov r0, r4
  mov r3, r7  @ r3 = architecture ID
  bl decompress_kernel
  b call_kernel

针对于不会出现代码覆盖的情况，就简单了。直接解压缩内核并且跳转到首地址运行。call_kernel这个函数我们会在下面分析它。

  /*
   * Table of link-time addresses. The startup code loads the eight
   * words of LC0 with a single ldmia; the trailing comment on each
   * word names the register it lands in. The first word is LC0's own
   * link-time address, which is subtracted from its run-time address
   * to obtain the relocation delta.
   * LC1 holds the size in bytes of the relocatable code region.
   */
  .type LC0, #object
LC0:  .word LC0   @ r1
  .word __bss_start  @ r2
  .word _end   @ r3
  .word zreladdr  @ r4
  .word _start   @ r5
  .word _got_start  @ r6
  .word _got_end  @ ip
  .word user_stack+4096  @ sp
LC1:  .word reloc_end - reloc_start
  .size LC0, . - LC0

上面这个就是刚才我们说过的地址表，里面有几个符号的地址定义。LC0是在这里定义的。zreladdr是在当前目录下的Makefile里定义的。其他的符号是在lds里定义的。

下面我们来分析一下有关cache和mmu的代码。通过这些代码我们可以看到Linux的高手们是如何通过汇编来实现各个ARM处理器的识别，以达到通用的目的。
/*
* Turn on the cache. We need to setup some page tables so that we
* can have both the I and D caches on.
*
* We place the page tables 16k down from the kernel execution address,
* and we hope that nothing else is using it. If we're using it, we
* will go pop!
*
* On entry,
* r4 = kernel execution address
* r6 = processor ID
* r7 = architecture number
* r8 = atags pointer
* r9 = run-time address of "start" (???)
* On exit,
* r1, r2, r3, r9, r10, r12 corrupted
* This routine must preserve:
* r4, r5, r6, r7, r8
*
* The work is delegated to call_cache_fn. r3 = 8 is the byte offset of
* the 'cache on' instruction inside a proc_types entry (it follows the
* two 4-byte match/mask words).
*/
  .align 5
cache_on: mov r3, #8   @ cache_on function
  b call_cache_fn

这里涉及到了很多MMU、cache、writebuffer、TLB的操作和协处理器的编程。具体编程的东西，我就不想多说了，可以对着ARM的手册逐行的理解。至于为什么要这样做，熟悉了他们的工作原理后也就不难理解了（《ARM嵌入式系统开发》这本书就有个比较好的说明）。因为这里包含了太多的代码搬运、解压等费时的操作，所以打开cache是有必要的。由于要用到数据cache所以需要对mmu进行配置。为了简单这里制作了一级映射，而且是物理地址和虚拟地址相同的1:1映射。

/*
 * Build a 16KB first-level table of 1:1 entries just below the kernel
 * execution address.
 *   In:  r4 = kernel execution address
 *   Out: r3 = table base (r4 - 16KB, with the low 14 bits cleared)
 *   Scratch: r0, r1, r2, r9, r10
 * The loop writes 16384 / 4 = 4096 entries, advancing the mapped
 * address by 1MB per entry.
 */
__setup_mmu: sub r3, r4, #16384  @ Page directory size
  bic r3, r3, #0xff  @ Align the pointer
  bic r3, r3, #0x3f00  @ together with the line above: clear bits 0-13
/*
* Initialise the page tables, turning on the cacheable and bufferable
* bits for the RAM area only.
*/
  mov r0, r3  @ r0 = write cursor into the table
  mov r9, r0, lsr #18
  mov r9, r9, lsl #18  @ start of RAM
  add r10, r9, #0x10000000 @ a reasonable RAM size
  mov r1, #0x12  @ r1 = entry value for address 0 (flag bits only)
  orr r1, r1, #3 << 10
  add r2, r3, #16384  @ r2 = end of the table
1:  cmp r1, r9   @ if virt > start of RAM
  orrhs r1, r1, #0x0c  @ set cacheable, bufferable
  cmp r1, r10   @ if virt > end of RAM
  bichs r1, r1, #0x0c  @ clear cacheable, bufferable
  str r1, [r0], #4  @ 1:1 mapping
  add r1, r1, #1048576  @ next 1MB
  teq r0, r2
  bne 1b  @ until the cursor reaches the end of the table

参考下面的注释，如果当前在flash中运行，我们再映射2MB。就算是当前在RAM中执行其实也没关系，只不过是做了重复工作。

/*
* If ever we are running from Flash, then we surely want the cache
* to be enabled also for our execution instance... We map 2MB of it
* so there is no map overlap problem for up to 1 MB compressed kernel.
* If the execution is in RAM then we would only be duplicating the above.
*
* r3 = table base. The two entries covering the megabyte that contains
* the current pc and the megabyte after it are rewritten with the
* 0x0c (cacheable, bufferable) bits set.
*/
  mov r1, #0x1e  @ 0x12 with the 0x0c bits already set
  orr r1, r1, #3 << 10
  mov r2, pc, lsr #20  @ r2 = megabyte index of the running code
  orr r1, r1, r2, lsl #20  @ entry value = that megabyte's base | flags
  add r0, r3, r2, lsl #2  @ r0 = address of that megabyte's table entry
  str r1, [r0], #4
  add r1, r1, #1048576  @ following megabyte
  str r1, [r0]
  mov pc, lr  @ return to caller

/*
 * 'cache on' method used by the ARMv4-and-later entries of proc_types.
 * Builds the page table via __setup_mmu, then updates the CP15 control
 * register through __common_cache_on.
 * lr is kept in r12 because the nested bl instructions overwrite it.
 */
__armv4_cache_on:
  mov r12, lr  @ preserve the return address across the calls below
  bl __setup_mmu  @ leaves the table base in r3
  mov r0, #0
  mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
  mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
  mrc p15, 0, r0, c1, c0, 0 @ read control reg
  orr r0, r0, #0x5000  @ I-cache enable, RR cache replacement
  orr r0, r0, #0x0030
  bl __common_cache_on  @ writes r0 to the control register
  mov r0, #0
  mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
  mov pc, r12  @ return through the saved lr

/*
 * Common tail of the 'cache on' methods.
 *   In:  r0 = control register value prepared by the caller
 *        r3 = page table base
 *   Scratch: r1
 * Unless DEBUG is defined, the 0x000d bits are also set in r0 before
 * it is written to the control register.
 */
__common_cache_on:
#ifndef DEBUG
  orr r0, r0, #0x000d  @ Write buffer, mmu
#endif
  mov r1, #-1  @ all bits set
  mcr p15, 0, r3, c2, c0, 0 @ load page table pointer
  mcr p15, 0, r1, c3, c0, 0 @ load domain access control
  mcr p15, 0, r0, c1, c0, 0 @ load control register
  mov pc, lr

/*
* All code following this line is relocatable. It is relocated by
* the above code to the end of the decompressed kernel image and
* executed there. During this time, we have no stacks.
*
* r0     = decompressed kernel length
* r1-r3 = unused
* r4     = kernel execution address
* r5     = decompressed kernel start
* r6     = processor ID
* r7     = architecture ID
* r8     = atags pointer
* r9-r14 = corrupted
*/

下面这段代码是在解压空间不够的情况下需要重新定位的，具体原因上面已经说明。

  /*
   * Move the decompressed kernel from r5 to its execution address r4.
   *   r0 = kernel length, r5 = source start, r4 = destination start
   * Each ldmia/stmia pair moves 8 registers (32 bytes) and the pair is
   * repeated 4 times, so one pass of the loop copies 128 bytes.
   * debug_reloc_start / debug_reloc_end are not defined in this
   * excerpt - presumably debug macros.
   */
  .align 5
reloc_start: add r9, r5, r0  @ r9 = end of the source region
  debug_reloc_start
  mov r1, r4  @ r1 = destination cursor
1:
  .rept 4
  ldmia r5!, {r0, r2, r3, r10 - r14} @ relocate kernel
  stmia r1!, {r0, r2, r3, r10 - r14}
  .endr

  cmp r5, r9
  blo 1b  @ loop while source cursor < end
  debug_reloc_end

这是最后一个函数了，这个时候一切实质性的工作已经做完。关闭cache，并跳转到真正的内核入口。

/*
 * Final hand-off: flush the cache and turn it off, reload the boot
 * arguments (r0 = 0, r1 = architecture number from r7, r2 = atags
 * pointer from r8) and jump to the kernel execution address in r4.
 * Control does not come back here.
 */
call_kernel: bl cache_clean_flush
  bl cache_off
  mov r0, #0   @ must be zero
  mov r1, r7   @ restore architecture number
  mov r2, r8   @ restore atags pointer
  mov pc, r4   @ call kernel

/*
* Here follow the relocatable cache support functions for the
* various processors. This is a generic hook for locating an
* entry and jumping to an instruction at the specified offset
* from the start of the block. Please note this is all position
* independent code.
*
* r1 = corrupted
* r2 = corrupted
* r3 = block offset
* r6 = corrupted
* r12 = corrupted
*/

下面这个函数通过proc_types结构体数组找到当前的处理器型号，并根据R3中的偏移量跳转到相应的函数中。里面涉及到协处理器CP15中c0的操作，如果有疑问，可以参考ARM相关手册。

/*
 * Walk proc_types until an entry satisfies ((id ^ match) & mask) == 0,
 * then jump to the instruction at byte offset r3 inside that entry.
 *   In:  r3 = offset of the wanted method within an entry
 *   Out: r6 = processor ID read from CP15 c0
 *   Scratch: r1, r2, r12
 * lr is not touched, so the selected method returns to the original
 * caller. Entries are 4*5 = 20 bytes apart.
 */
call_cache_fn: adr r12, proc_types
  mrc p15, 0, r6, c0, c0 @ get processor ID
1:  ldr r1, [r12, #0]  @ get value
  ldr r2, [r12, #4]  @ get mask
  eor r1, r1, r6  @ (real ^ match)
  tst r1, r2   @       & mask
  addeq pc, r12, r3  @ call cache function
  add r12, r12, #4*5  @ no match: advance to the next entry
  b 1b

/*
* Table for cache operations. This is basically:
*   - CPU ID match
*   - CPU ID mask
*   - 'cache on' method instruction
*   - 'cache off' method instruction
*   - 'cache flush' method instruction
*
* We match an entry using: ((real_id ^ match) & mask) == 0
*
* Writethrough caches generally only need 'on' and 'off'
* methods. Writeback caches _must_ have the flush method
* defined.
*/
  /*
   * Each entry is five 32-bit words: match value, mask, then one
   * instruction each for the 'on', 'off' and 'flush' methods (byte
   * offsets 8, 12 and 16). call_cache_fn jumps directly to one of
   * those instructions; a bare "mov pc, lr" makes the method a no-op
   * that returns to the caller.
   */
  .type proc_types,#object
proc_types:
  .word 0x41560600  @ ARM6/610
  .word 0xffffffe0
  b __arm6_cache_off @ works, but slow
  b __arm6_cache_off
  mov pc, lr
@  b __arm6_cache_on  @ untested
@  b __arm6_cache_off
@  b __armv3_cache_flush

  .word 0x00000000  @ old ARM ID
  .word 0x0000f000
  mov pc, lr
  mov pc, lr
  mov pc, lr

  .word 0x41007000  @ ARM7/710
  .word 0xfff8fe00
  b __arm7_cache_off
  b __arm7_cache_off
  mov pc, lr

  .word 0x41807200  @ ARM720T (writethrough)
  .word 0xffffff00
  b __armv4_cache_on
  b __armv4_cache_off
  mov pc, lr

  .word 0x00007000  @ ARM7 IDs
  .word 0x0000f000
  mov pc, lr
  mov pc, lr
  mov pc, lr

@ Everything from here on will be the new ID system.

  .word 0x4401a100  @ sa110 / sa1100
  .word 0xffffffe0
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv4_cache_flush

  .word 0x6901b110  @ sa1110
  .word 0xfffffff0
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv4_cache_flush

@ These match on the architecture ID

  .word 0x00020000  @ ARMv4T
  .word 0x000f0000
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv4_cache_flush

  .word 0x00050000  @ ARMv5TE
  .word 0x000f0000
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv4_cache_flush

  .word 0x00060000  @ ARMv5TEJ
  .word 0x000f0000
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv4_cache_flush

  .word 0x00070000  @ ARMv6
  .word 0x000f0000
  b __armv4_cache_on
  b __armv4_cache_off
  b __armv6_cache_flush

  .word 0   @ unrecognised type
  .word 0  @ mask 0 matches every ID, so the lookup always stops here
  mov pc, lr
  mov pc, lr
  mov pc, lr

.size proc_types, . - proc_types

/*
* Turn off the Cache and MMU. ARMv3 does not support
* reading the control register, but ARMv4 does.
*
* On entry, r6 = processor ID
* On exit,   r0, r1, r2, r3, r12 corrupted
* This routine must preserve: r4, r6, r7
*
* r3 = 12 is the byte offset of the 'cache off' instruction inside a
* proc_types entry; call_cache_fn performs the lookup and the jump.
*/
  .align 5
cache_off: mov r3, #12   @ cache_off function
  b call_cache_fn

//代码略

这里分配了4K的空间用来做堆栈。

@ End marker of the relocatable code; LC1 stores reloc_end - reloc_start.
reloc_end:

.align
.section ".stack", "w"
@ 4096 bytes reserved for the stack; LC0 supplies user_stack+4096 as the initial sp.
user_stack: .space 4096

阅读(3071) | 评论(1) | 转发(0) |

上一篇：Linux内核汉字显示技术

下一篇：GNU ARM汇编快速入门

给主人留下些什么吧！~~

chinaunix网友2008-06-24 17:26:29

我认为写这种东西很要紧的一件事是先指明是哪个内核版本，要不然很容易在阅读时糊涂

回复 | 举报

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6