分类: LINUX
2013-04-24 09:49:59
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
entry = *pte;
if (!pte_present(entry)) {//如果页不在主存中
if (pte_none(entry)) {//页表项内容为0,表明进程未访问过该页
/*如果vm_ops字段和fault字段都不为空,则说明这是一个基于文件的映射*/
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
/*否则分配匿名页*/
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
/*属于非线性文件映射且已被换出*/
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
/*页不在主存中,但是页表项保存了相关信息,则表明该页被内核换出,则要进行换入操作*/
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
...
...
}
首先要确定的一点就是pte对应的页是否驻留在主存中,因为pte有可能之前映射了页,但是该页被换出了。上面的代码给出了pte对应的页没有驻留在主存中的情况。如果pte对应的页没有驻留在主存中,且没有映射任何页,即pte_present()返回0,pte_none()返回0,则要判断要分配一个匿名页还是一个映射页。在Linux虚拟内存中,如果页对应的vma映射的是文件,则称为映射页,如果不是映射的文件,则称为匿名页。两者最大的区别体现在页和vma的组织上,因为在页框回收处理时要通过页来逆向搜索映射了该页的vma。对于匿名页的逆映射,vma都是通过vma结构体中的vma_anon_node(链表节点)和anon_vma(链表头)组织起来,再把该链表头的信息保存在页描述符中;而映射页和vma的组织是通过vma中的优先树节点和页描述符中的mapping->i_mmap优先树树根进行组织的,具体可以参看ULK3。
来看基于文件的映射的处理:
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);//如果page_table之前用来建立了临时内核映射,则释放该映射
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
关键函数__do_fault():
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
pte_t entry;
int anon = 0;
int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);//调用定义好的fault函数,确保将所需的文件数据读入到映射页
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
return VM_FAULT_HWPOISON;
}
/*
* For consistency in subsequent calls, make the faulted page always
* locked.
*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
VM_BUG_ON(!PageLocked(vmf.page));
/*
* Should we do an early C-O-W break?
*/
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {//写访问
if (!(vma->vm_flags & VM_SHARED)) {//私有映射,则要创建一个副本进行写时复制
anon = 1;// 标记为一个匿名映射
if (unlikely(anon_vma_prepare(vma))) {//创建一个anon_vma实例给vma
ret = VM_FAULT_OOM;
goto out;
}
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,//分配一个页
vma, address);
if (!page) {
ret = VM_FAULT_OOM;
goto out;
}
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
page_cache_release(page);
goto out;
}
charged = 1;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED)
clear_page_mlock(vmf.page);
/*创建数据的副本,将数据拷贝到新分配的页*/
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
/*
* If the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable
*/
if (vma->vm_ops->page_mkwrite) {
int tmp;
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if FAULT_FLAG_WRITE is set, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {//确定没有竞争,也就是页表项中的内容和之前是一样的
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);//页表项指向对应的物理页
/*如果是写操作,则将页的访问权限置为RW*/
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*如果之前生成的页是匿名的,则将其集成到逆向映射当中*/
if (anon) {
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);//建立匿名页与第一个vma的逆向映射
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);//建立页与vma的普通映射
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
get_page(dirty_page);
}
}
set_pte_at(mm, address, page_table, entry);//修改page_table使其指向entry对应的页框
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
anon = 1; /* no anon but release faulted_page */
}
pte_unmap_unlock(page_table, ptl);
out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
if (set_page_dirty(dirty_page))
page_mkwrite = 1;
unlock_page(dirty_page);
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
if (anon)
page_cache_release(vmf.page);
}
return ret;
unwritable_page:
page_cache_release(page);
return ret;
}