Chinaunix首页 | 论坛 | 博客
  • 博客访问: 48782
  • 博文数量: 34
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 10
  • 用 户 组: 普通用户
  • 注册时间: 2014-08-18 21:23
文章分类

全部博文(34)

文章存档

2014年(34)

我的朋友

分类: LINUX

2014-08-18 21:30:39

kernel 3.10内核源码分析--缺页异常(page fault)处理流程
基本原理
1、page fault由硬件产生,是一种“异常”。产生条件为:当CPU访问某线性地址,而该线性地址还没有对应的有效页表项(即还没有分配相应的物理内存并进行映射),或者虽已映射但本次访问违反了页表项的访问权限(例如对只读页执行写操作)时,CPU自动产生该异常。
2、page fault基本流程:
从cr2中获取发生异常的地址
  缺页地址位于内核态
    位于vmalloc区?->从主内核页表同步数据到进程页表
    非vmalloc区 ->不应该产生page fault->oops
  缺页地址位于用户态
     缺页上下文发生在内核态
        exception table中有相应的处理项? ->不是内核错误,继续获取mmap_sem并按正常缺页流程处理
        没有 ->oops
     查找vma
        找到?-> 是否expand stack?->堆栈扩展
                       不是->正常的缺页处理:handle_mm_fault                    
        没找到->bad_area    

点击(此处)折叠或打开

  1. /*
  2.   * Main handler for page-fault exceptions.
  3.   * regs: register state saved at the time of the exception.
  4.   * error_code: error code pushed onto the stack by the hardware when the exception occurred.
  5.   *             Bit 0 clear: the fault was caused by a not-present page; set: by an access-permission violation.
  6.   *             Bit 1 clear: the fault was caused by a read or instruction fetch; set: by a write.
  7.   *             Bit 2 clear: the fault occurred in kernel mode; set: in user mode.
  8.   */
  9. static void __kprobes
  10. __do_page_fault(struct pt_regs *regs, unsigned long error_code)
  11. {
  12.     struct vm_area_struct *vma;
  13.     struct task_struct *tsk;
  14.     unsigned long address;
  15.     struct mm_struct *mm;
  16.     int fault;
  17.     int write = error_code & PF_WRITE;
  18.     unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
  19.                     (write ? FAULT_FLAG_WRITE : 0);

  20.     tsk = current;
  21.     mm = tsk->mm;

  22.     /* Get the faulting address: */
  23.     // On x86 the hardware places the faulting address in the CR2 register.
  24.     address = read_cr2();

  25.     /*
  26.      * Detect and handle instructions that would cause a page fault for
  27.      * both a tracked kernel page and a userspace page.
  28.      */
  29.     if (kmemcheck_active(regs))
  30.         kmemcheck_hide(regs);
  31.     prefetchw(&mm->mmap_sem);

  32.     // kmmio_fault() gets first look; a nonzero result ends handling here. NOTE(review): presumably the MMIO tracing hook - confirm.
  33.     if (unlikely(kmmio_fault(regs, address)))
  34.         return;

  35.     /*
  36.      * We fault-in kernel-space virtual memory on-demand. The
  37.      * 'reference' page table is init_mm.pgd.
  38.      *
  39.      * We MUST NOT take any locks for this case. We may
  40.      * be in an interrupt or a critical region, and should
  41.      * only copy the information from the master page table,
  42.      * nothing more.
  43.      *
  44.      * This verifies that the fault happens in kernel space
  45.      * (error_code & 4) == 0, and that the fault was not a
  46.      * protection error (error_code & 9) == 0.
  47.      */
  48.      /*
  49.      * The faulting address lies in kernel space. That does not mean the fault was
  50.      * raised from kernel mode: user mode may have touched a kernel-space address.
  51.      */
  52.     if (unlikely(fault_in_kernel_space(address))) {
  53.         if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
  54.             /*
  55.              * Check whether the faulting address is in the vmalloc area and, if so, handle it:
  56.              * this mainly syncs entries from the kernel master page table into the process page table.
  57.              */
  58.             if (vmalloc_fault(address) >= 0)
  59.                 return;

  60.             if (kmemcheck_fault(regs, address, error_code))
  61.                 return;
  62.         }

  63.         /* Can handle a stale RO->RW TLB: */
  64.         /*
  65.          * Check whether this is a spurious page fault caused by a stale TLB entry (TLB flushes
  66.          * are deferred because flushing eagerly is comparatively expensive).
  67.          */
  68.         if (spurious_fault(error_code, address))
  69.             return;

  70.         /* kprobes don't want to hook the spurious faults: */
  71.         if (notify_page_fault(regs))
  72.             return;
  73.         /*
  74.          * Don't take the mm semaphore here. If we fixup a prefetch
  75.          * fault we could otherwise deadlock:
  76.          */
  77.         /*
  78.          * Something is wrong: the faulting address is in kernel space and the vmalloc
  79.          * case was already handled above. A legitimate kernel-space page fault can only
  80.          * come from the vmalloc area, so anything reaching this point is reported as an error.
  81.          */
  82.         bad_area_nosemaphore(regs, error_code, address);

  83.         return;
  84.     }

  85.     // Reaching this point means the faulting address is in user space.
  86.     /* kprobes don't want to hook the spurious faults: */
  87.     if (unlikely(notify_page_fault(regs)))
  88.         return;
  89.     /*
  90.      * It's safe to allow irq's after cr2 has been saved and the
  91.      * vmalloc fault has been handled.
  92.      *
  93.      * User-mode registers count as a user access even for any
  94.      * potential system fault or CPU buglet:
  95.      */
  96.     /*
  97.      * Re-enable interrupts: it is safe at this point and shortens the time interrupts stay off because of the fault.
  98.      * For a fault taken from kernel mode they are re-enabled only if X86_EFLAGS_IF is set in the saved flags.
  99.      */
  100.     if (user_mode_vm(regs)) {
  101.         local_irq_enable();
  102.         error_code |= PF_USER;
  103.     } else {
  104.         if (regs->flags & X86_EFLAGS_IF)
  105.             local_irq_enable();
  106.     }

  107.     if (unlikely(error_code & PF_RSVD))
  108.         pgtable_bad(regs, error_code, address);

  109.     if (static_cpu_has(X86_FEATURE_SMAP)) {
  110.         if (unlikely(smap_violation(error_code, regs))) {
  111.             bad_area_nosemaphore(regs, error_code, address);
  112.             return;
  113.         }
  114.     }

  115.     perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

  116.     /*
  117.      * If we're in an interrupt, have no user context or are running
  118.      * in an atomic region then we must not take the fault:
  119.      */
  120.     /*
  121.      * A fault taken in interrupt or other atomic context, or by a task with no mm, is treated
  122.      * as an error: page faults must not be serviced in those situations.
  123.      */
  124.     if (unlikely(in_atomic() || !mm)) {
  125.         bad_area_nosemaphore(regs, error_code, address);
  126.         return;
  127.     }

  128.     /*
  129.      * When running in the kernel we expect faults to occur only to
  130.      * addresses in user space. All other faults represent errors in
  131.      * the kernel and should generate an OOPS. Unfortunately, in the
  132.      * case of an erroneous fault occurring in a code path which already
  133.      * holds mmap_sem we will deadlock attempting to validate the fault
  134.      * against the address space. Luckily the kernel only validly
  135.      * references user space from well defined areas of code, which are
  136.      * listed in the exceptions table.
  137.      *
  138.      * As the vast majority of faults will be valid we will only perform
  139.      * the source reference check when there is a possibility of a
  140.      * deadlock. Attempt to lock the address space, if we cannot we then
  141.      * validate the source. If this is invalid we can skip the address
  142.      * space check, thus avoiding the deadlock:
  143.      */
  144.     if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  145.         /*
  146.          * If the fault was raised in kernel context (PF_USER clear), the faulting address can
  147.          * only be a user-space one, and the faulting instruction must be listed in the exceptions
  148.          * table. If the table has no entry for regs->ip, this is an error and is handed to
  149.          * bad_area_nosemaphore() (the oops path) without taking mmap_sem.
  150.          */
  151.         if ((error_code & PF_USER) == 0 &&
  152.          !search_exception_tables(regs->ip)) {
  153.             bad_area_nosemaphore(regs, error_code, address);
  154.             return;
  155.         }
  156. retry:
  157.         // User-mode fault or exception-table hit, so not a kernel bug: block for mmap_sem (also the target of goto retry below).
  158.         down_read(&mm->mmap_sem);
  159.     } else {
  160.         /*
  161.          * The above down_read_trylock() might have succeeded in
  162.          * which case we'll have missed the might_sleep() from
  163.          * down_read():
  164.          */
  165.         might_sleep();
  166.     }

  167.     // Look up the VMA for the faulting address in the address space of the current process.
  168.     vma = find_vma(mm, address);
  169.     // No VMA found: report through bad_area(). NOTE(review): bad_area() presumably drops mmap_sem before reporting - confirm.
  170.     if (unlikely(!vma)) {
  171.         bad_area(regs, error_code, address);
  172.         return;
  173.     }
  174.     /* A VMA was found and the faulting address is at or above its start: this is an
  175.      * ordinary page fault, so go service it (demand paging, physical page allocation). */
  176.     if (likely(vma->vm_start <= address))
  177.         goto good_area;
  178.     /* The address is below the start of the VMA that was found. Unless that VMA is a
  179.      * grows-down (stack) region, the process touched an invalid address: go to bad_area.
  180.      */
  181.     if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
  182.         bad_area(regs, error_code, address);
  183.         return;
  184.     }
  185.     if (error_code & PF_USER) {
  186.         /*
  187.          * Accessing the stack below %sp is always a bug.
  188.          * The large cushion allows instructions like enter
  189.          * and pusha to work. ("enter $65535, $31" pushes
  190.          * 32 pointers and then decrements %sp by 65535.)
  191.          */
  192.         /*
  193.          * A stack-growing instruction may touch an address at most
  194.          * 65536 + 32 * sizeof(unsigned long) bytes below the stack pointer; as the comment
  195.          * above explains, the cushion is sized for instructions such as enter and pusha.
  196.          * An access further below regs->sp than that is treated as an invalid address.
  197.          */
  198.         if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
  199.             bad_area(regs, error_code, address);
  200.             return;
  201.         }
  202.     }
  203.     
  204.     /*
  205.     * Reaching this point means VM_GROWSDOWN is set: the faulting address belongs to the stack
  206.     * region and the stack must be extended. Note: the stack virtual address range is grown
  207.     * on demand rather than being fully allocated up front.
  208.     */
  209.     if (unlikely(expand_stack(vma, address))) {
  210.         bad_area(regs, error_code, address);
  211.         return;
  212.     }

  213.     /*
  214.      * Ok, we have a good vm_area for this memory access, so
  215.      * we can handle it..
  216.      */
  217.     /*
  218.      * Reaching this point means the address is covered by a VMA; once the access-rights check passes, the fault is serviced.
  219.      */
  220. good_area:
  221.     if (unlikely(access_error(error_code, vma))) {
  222.         bad_area_access_error(regs, error_code, address);
  223.         return;
  224.     }

  225.     /*
  226.      * If for any reason at all we couldn't handle the fault,
  227.      * make sure we exit gracefully rather than endlessly redo
  228.      * the fault:
  229.      */
  230.     /*
  231.      * handle_mm_fault() is the main routine that services a legitimate fault.
  232.      * Typical cases (not visible in this function): 1. demand paging / on-demand allocation;
  233.      * 2. copy-on-write; 3. the missing page lives in swap and must be swapped in.
  234.      */
  235.     fault = handle_mm_fault(mm, vma, address, flags);

  236.     if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
  237.         if (mm_fault_error(regs, error_code, address, fault))
  238.             return;
  239.     }

  240.     /*
  241.      * Major/minor page fault accounting is only done on the
  242.      * initial attempt. If we go through a retry, it is extremely
  243.      * likely that the page will be found in page cache at that point.
  244.      */
  245.     if (flags & FAULT_FLAG_ALLOW_RETRY) {
  246.         if (fault & VM_FAULT_MAJOR) {
  247.             tsk->maj_flt++;
  248.             perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
  249.                  regs, address);
  250.         } else {
  251.             tsk->min_flt++;
  252.             perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
  253.                  regs, address);
  254.         }
  255.         if (fault & VM_FAULT_RETRY) {
  256.             /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
  257.              * of starvation. */
  258.             flags &= ~FAULT_FLAG_ALLOW_RETRY;
  259.             flags |= FAULT_FLAG_TRIED;
  260.             goto retry;
  261.         }
  262.     }

  263.     // VM86-mode (legacy compatibility) related check.
  264.     check_v8086_mode(regs, address, tsk);

  265.     up_read(&mm->mmap_sem);
  266. }

阅读(4417) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~