kernel 3.10内核源码分析--缺页异常(page fault)处理流程-鬼鬼一哈-ChinaUnix博客

鬼鬼一哈的ChinaUnix博客

首页　| 　博文目录　| 　关于我

鬼鬼一哈

博客访问： 219504
博文数量： 76
博客积分： 106
博客等级：民兵
技术积分： 170
用户组：普通用户
注册时间： 2010-11-04 16:55

文章分类

全部博文（76）

存储（40）
linux基础（1）
未分配的博文（35）

文章存档

2014年（47）

2013年（2）

2012年（20）

2011年（7）

我的朋友

相关博文

kernel 3.10内核源码分析--缺页异常(page fault)处理流程

分类： LINUX

2014-12-30 21:41:22

原文地址：kernel 3.10内核源码分析--缺页异常(page fault)处理流程作者：humjb_1983

kernel 3.10内核源码分析--缺页异常(page fault)处理流程

基本原理
1、page fault由硬件产生，是一种“异常”。产生条件为：当CPU访问某线性地址，而该线性地址还没有对应的有效页表项（即还没有分配相应的物理内存并进行映射），或者本次访问违反了页表项规定的访问权限（例如写只读页）时，硬件自动产生异常。
2、page fault基本流程：
从cr2中获取发生异常的地址
缺页地址位于内核空间
    位于vmalloc区？->从主内核页表同步数据到进程页表
    非vmalloc区 ->不应该产生page fault->oops
缺页地址位于用户空间
     缺页上下文发生在内核态
        exception table中有相应的处理项？ ->进行修正
        没有 ->oops
     查找vma
        找到？-> 是否expand stack？->堆栈扩展
                       不是->正常的缺页处理：handle_mm_fault
        没找到->bad_area

点击(此处)折叠或打开

/*
 * Main page-fault handler.
 *
 * regs:       register state saved at the time of the exception.
 * error_code: error code pushed onto the stack by the hardware when the
 *             exception was raised:
 *   bit 0 clear - the fault was caused by a not-present page;
 *         set   - it was caused by an access-permission violation.
 *   bit 1 clear - the fault was caused by a read or an instruction fetch;
 *         set   - it was caused by a write.
 *   bit 2 clear - the fault occurred in kernel mode;
 *         set   - it occurred in user mode.
 *
 * Returns nothing; every path either resolves the fault or hands it to one
 * of the bad_area*() / mm_fault_error() / pgtable_bad() reporters.
 */
static void __kprobes
__do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
unsigned long address;
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
tsk = current;
mm = tsk->mm;
/* Get the faulting address: */
// On x86 the hardware stores the faulting address in the CR2 register.
address = read_cr2();
/*
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
if (kmemcheck_active(regs))
kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
// Nothing more to do if kmmio_fault() reports a non-zero result.
// NOTE(review): the original annotation says MMIO is normally ioremap'ed
// into the vmalloc area and should not fault - confirm against kmmio_fault().
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
/*
* The faulting ADDRESS lies in kernel space. That does not mean the
* fault was raised while running in kernel mode: user-mode code may
* have touched a kernel-space address.
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
/*
* Check whether the faulting address is in the vmalloc area and, if
* so, handle it there - mainly by copying entries from the master
* kernel page table into the current page table.
*/
if (vmalloc_fault(address) >= 0)
return;
if (kmemcheck_fault(regs, address, error_code))
return;
}
/* Can handle a stale RO->RW TLB: */
/*
* Check whether this is a spurious fault caused by a stale TLB entry
* (TLB flushes are deferred because flushing eagerly is expensive).
*/
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (notify_page_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
/*
* Something is wrong: the address is in kernel space and the vmalloc
* case has already been tried above. A legitimate fault on a
* kernel-space address can only be a vmalloc fault, so anything that
* reaches this point is reported as a bad access.
*/
bad_area_nosemaphore(regs, error_code, address);
return;
}
// Reaching this point means the faulting address is in user space.
/* kprobes don't want to hook the spurious faults: */
if (unlikely(notify_page_fault(regs)))
return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
/*
* Re-enable interrupts: safe at this point, and it shortens the time
* interrupts stay disabled because of the fault. For a kernel-mode
* fault they are only re-enabled if the saved flags had IF set.
* NOTE(review): the original annotation claims 2.6.11 did not do this -
* unverified, confirm before relying on it.
*/
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
if (static_cpu_has(X86_FEATURE_SMAP)) {
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
*/
/*
* A fault taken in interrupt or other atomic context, or with no mm,
* is not handled here; it is reported as a bad access instead.
*/
if (unlikely(in_atomic() || !mm)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
/*
* The trylock failed. If the fault was raised in kernel context, the
* only legitimate case is an access to a user-space address from an
* instruction listed in the exception tables. If regs->ip has no
* exception-table entry, this is an error and is reported without
* taking the semaphore.
*/
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
// The fault came from user mode, or the faulting instruction has an
// exception-table entry, so this is not treated as a kernel error:
// block until mmap_sem is available. The retry path (goto retry below)
// also re-enters here.
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
// Look up the VMA for the faulting address in the current address space.
vma = find_vma(mm, address);
// No VMA found: report through bad_area().
// NOTE(review): presumably bad_area() drops mmap_sem (taken above) before
// reporting - verify against bad_area().
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
}
/* A VMA was found and the faulting address is at or above its start, so
 * this is an ordinary fault: go and fault the page in. */
if (likely(vma->vm_start <= address))
goto good_area;
/* The address is below the VMA that was found. Unless that VMA is marked
 * VM_GROWSDOWN (a region that may grow downward, i.e. a stack), the
 * process touched an unmapped address: report through bad_area().
 */
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
return;
}
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
/*
* The furthest below the stack pointer that an access is tolerated is
* 65536 + 32 * sizeof(unsigned long) bytes. (The original annotation
* attributes this to pusha, noting that older pusha touched at most
* 32 bytes, i.e. 8 registers.) Anything further below regs->sp is
* treated as an illegal access.
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
return;
}
}
/*
* Reaching this point means VM_GROWSDOWN is set: the faulting address
* belongs to the stack region and the stack has to be extended. Note that
* the stack's virtual address range is allocated and grown on demand, not
* reserved in full up front.
*/
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
/*
* From here on this is a regular page fault: check access rights, then
* fault the page in.
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
/*
* handle_mm_fault() is the main routine for servicing a normal fault.
* Per the original annotation the possible cases are: 1) demand paging /
* on-demand allocation; 2) copy-on-write; 3) the page is in swap and has
* to be swapped in.
*/
fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
if (mm_fault_error(regs, error_code, address, fault))
return;
}
/*
* Major/minor page fault accounting is only done on the
* initial attempt. If we go through a retry, it is extremely
* likely that the page will be found in page cache at that point.
*/
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
// VM86-mode (legacy compatibility) check.
check_v8086_mode(regs, address, tsk);
up_read(&mm->mmap_sem);
}

阅读(1022) | 评论(0) | 转发(0) |

上一篇：深入理解缓存cache（2）

下一篇：再谈Linux内核中的RCU机制

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6