mmap实现分析
本文不是介绍mmap函数的使用方法,而是分析其内核实现,相关使用方法网上已经有很多资料。mmap的本质其实就是:为当前进程分配(或找到)一个合适的vma,然后为该vma设置对应的缺页处理函数。
我们知道mmap按照flag可以分为匿名映射和非匿名映射,又可分为shared映射和private映射。这样从两个维度,我们就得到了四种映射。
(1) 匿名shared映射:fd为-1,可用于父子进程通信。
(2) 匿名private映射:例如malloc大块的内存(大于128k)。
(3) 非匿名shared映射:常见的用于进程通信方式。
(4) 非匿名private映射:例如程序在启动时加载so时,就是用的这种方式,相当于“写时拷贝”。
下面我们就看下内核中几种方式的区别。
内核中mmap主要由sys_mmap_pgoff函数负责实现,该函数定义在mm/mmap.c中。
-
/*
 * mmap_pgoff system call entry point (excerpt).
 *
 * When MAP_ANONYMOUS is not set, fd is resolved to a struct file with
 * fget(); if that lookup fails the initial -EBADF is returned. The mapping
 * itself is delegated to vm_mmap_pgoff(), and a file obtained here is
 * released with fput() before returning.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
-
unsigned long, prot, unsigned long, flags,
-
unsigned long, fd, unsigned long, pgoff)
-
{
-
struct file *file = NULL;
-
unsigned long retval = -EBADF;
-
if (!(flags & MAP_ANONYMOUS)) { /* NOT an anonymous mapping: file-backed, fd is used */
-
audit_mmap_fd(fd, flags);
-
/* MAP_HUGETLB combined with a file-backed request is rejected */
if (unlikely(flags & MAP_HUGETLB))
-
return -EINVAL;
-
file = fget(fd); /* look up the struct file that belongs to fd */
-
if (!file)
-
goto out;
-
/* for a hugepage file, round len up to that file's huge page size */
if (is_file_hugepages(file))
-
len = ALIGN(len, huge_page_size(hstate_file(file)));
-
} else if (flags & MAP_HUGETLB) {
-
/*......*/
-
}
-
/* clear MAP_EXECUTABLE and MAP_DENYWRITE before passing flags on */
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-
if (file)
-
fput(file);
-
out:
-
return retval;
-
}
该函数主要功能由vm_mmap_pgoff来实现,而vm_mmap_pgoff主要逻辑就是调用了do_mmap_pgoff。下面我们看do_mmap_pgoff的实现。
● do_mmap_pgoff
-
/*
 * do_mmap_pgoff (excerpt): picks the address for the mapping with
 * get_unmapped_area() and then hands off to mmap_region(). The steps
 * replaced by the elision markers are not shown here; in particular the
 * computation of vm_flags is among the elided parts.
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
-
unsigned long len, unsigned long prot,
-
unsigned long flags, unsigned long pgoff,
-
unsigned long *populate)
-
{
-
struct mm_struct * mm = current->mm;
-
struct inode *inode;
-
/*......*/
-
/* Obtain the address to map to. we verify (or select) it and ensure
-
* that it represents a valid section of the address space.
-
*/
-
addr = get_unmapped_area(file, addr, len, pgoff, flags);
-
/*
 * A result that is not page aligned is returned unchanged.
 * NOTE(review): presumably this is an error code carried in the
 * return value -- confirm against get_unmapped_area().
 */
if (addr & ~PAGE_MASK)
-
return addr;
-
/*......*/
-
addr = mmap_region(file, addr, len, vm_flags, pgoff);
-
/*......*/
-
return addr;
-
}
这个函数首先通过get_unmapped_area获取一段合适的、尚未映射的虚拟地址区间(返回的是地址而不是vma),然后调用mmap_region为该区间创建(或找到)vma并进行设置。我们具体看下mmap_region的实现。
● mmap_region
-
/*
 * mmap_region (excerpt): type-specific setup of the vma.
 *
 *   - file != NULL          : file-backed mapping, calls file->f_op->mmap()
 *   - VM_SHARED without file: shared anonymous mapping, calls shmem_zero_setup()
 *   - otherwise             : private anonymous mapping, no setup in this excerpt
 *
 * The code that obtains vma, the free_vma / unmap_and_free_vma labels and
 * the return path are hidden behind the elision markers and not shown here.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
-
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
-
{
-
struct mm_struct *mm = current->mm;
-
struct vm_area_struct *vma, *prev;
-
int correct_wcount = 0;
-
int error;
-
struct rb_node **rb_link, *rb_parent;
-
unsigned long charged = 0;
-
struct inode *inode = file ? file_inode(file) : NULL;
-
/*......*/
-
if (file) { /* file-backed (non-anonymous) mapping */
-
/* growing mappings are refused for file-backed vmas */
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
-
goto free_vma;
-
if (vm_flags & VM_DENYWRITE) {
-
error = deny_write_access(file);
-
if (error)
-
goto free_vma;
-
/* records that deny_write_access() succeeded */
correct_wcount = 1;
-
}
-
vma->vm_file = get_file(file);
-
error = file->f_op->mmap(file, vma); /* invoke the mmap handler provided by the file's f_op */
-
if (error)
-
goto unmap_and_free_vma;
-
/*
 * Reload addr, pgoff and vm_flags from the vma after the handler ran.
 * NOTE(review): presumably because the handler may have changed
 * them -- confirm.
 */
addr = vma->vm_start;
-
pgoff = vma->vm_pgoff;
-
vm_flags = vma->vm_flags;
-
} else if (vm_flags & VM_SHARED) { /* shared anonymous mapping */
-
if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
-
goto free_vma;
-
error = shmem_zero_setup(vma);
-
if (error)
-
goto free_vma;
-
} /* private anonymous mapping: neither branch above is taken */
-
file = vma->vm_file;
-
/*......*/
-
}
如果传入了fd,则调用对应文件系统的mmap函数。以ext4文件系统为例。其mmap函数为 ext4_file_mmap。
● ext4_file_mmap
-
/*
 * mmap handler for ext4 files.
 *
 * Returns -ENOEXEC when the file's address_space operations have no
 * readpage callback. Otherwise it calls file_accessed(), installs
 * ext4_file_vm_ops as vma->vm_ops and returns 0.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
-
{
-
struct address_space *mapping = file->f_mapping;
-
if (!mapping->a_ops->readpage)
-
return -ENOEXEC;
-
file_accessed(file);
-
/* the only vma field this handler sets is vm_ops */
vma->vm_ops = &ext4_file_vm_ops;
-
return 0;
-
}
可以看到这个函数只是设置vma->vm_ops为当前文件系统的处理函数。
-
/*
 * vm operations table for ext4 file mappings: .fault is filemap_fault,
 * .page_mkwrite is ext4_page_mkwrite, .remap_pages is
 * generic_file_remap_pages.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
-
.fault = filemap_fault,
-
.page_mkwrite = ext4_page_mkwrite,
-
.remap_pages = generic_file_remap_pages,
-
};
如果是匿名映射(不传入fd),且传入了shared flag,则调用shmem_zero_setup。
● shmem_zero_setup
-
/*
 * Gives a vma a shmem-backed file.
 *
 * Creates a file with shmem_file_setup() whose size equals the length of
 * the vma (vm_end - vm_start), stores it in vma->vm_file (releasing any
 * file already there with fput()) and sets vma->vm_ops to shmem_vm_ops.
 *
 * Returns 0 on success, or the error taken from shmem_file_setup() via
 * PTR_ERR().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
-
{
-
struct file *file;
-
loff_t size = vma->vm_end - vma->vm_start;
-
file = shmem_file_setup("dev/zero", size, vma->vm_flags);
-
if (IS_ERR(file))
-
return PTR_ERR(file);
-
/* drop the previously attached file, if any, before replacing it */
if (vma->vm_file)
-
fput(vma->vm_file);
-
vma->vm_file = file;
-
vma->vm_ops = &shmem_vm_ops;
-
return 0;
-
}
可以看到这里将vma->vm_ops设置为tmpfs文件系统的shmem_vm_ops。
-
/*
 * vm operations table for shmem mappings: .fault is shmem_fault and
 * .remap_pages is generic_file_remap_pages. The set_policy / get_policy
 * callbacks are present only when CONFIG_NUMA is defined.
 */
static const struct vm_operations_struct shmem_vm_ops = {
-
.fault = shmem_fault,
-
#ifdef CONFIG_NUMA
-
.set_policy = shmem_set_policy,
-
.get_policy = shmem_get_policy,
-
#endif
-
.remap_pages = generic_file_remap_pages,
-
};
整个mmap函数的处理过程如下:
我们知道mmap函数只是为进程分配了虚拟内存空间,并没有真的建立虚拟内存和物理内存的映射。这个建立映射的过程是到缺页中断的函数中进行的。
缺页中断的处理过程大体如下:
-
/*
 * handle_pte_fault (simplified excerpt).
 *
 * For a pte that is neither present nor populated (pte_none), the fault is
 * dispatched on vma->vm_ops: a vma with vm_ops goes to do_linear_fault(),
 * a vma without vm_ops goes to do_anonymous_page(). Every other case is
 * elided in this excerpt and falls through to return 0.
 */
int handle_pte_fault(struct mm_struct *mm,
-
struct vm_area_struct *vma, unsigned long address,
-
pte_t *pte, pmd_t *pmd, unsigned int flags)
-
{
-
pte_t entry;
-
spinlock_t *ptl;
-
/*......*/
-
entry = *pte;
-
if (!pte_present(entry)) {
-
if (pte_none(entry)) {
-
/* vm_ops set: file-backed or shared anonymous mapping */
if (vma->vm_ops)
-
return do_linear_fault(mm, vma, address,
-
pte, pmd, flags, entry);
-
/* vm_ops is NULL: private anonymous mapping */
-
return do_anonymous_page(mm, vma, address,
-
pte, pmd, flags);
-
}
-
}
-
-
return 0;
-
}
我们看到只有当vma->vm_ops为空时才会调用do_anonymous_page,否则走do_linear_fault。这里需要注意,有人看到函数名就以为这是所有匿名映射的处理逻辑,但是根据前面的代码分析,匿名shared映射的时候也是会设置vma->vm_ops的。只有一种情况不会设置,那就是匿名private映射。
所以综上,有以下结论:
(1)非匿名shared映射:调用文件各自文件系统的缺页函数;
(2)非匿名private映射:调用文件各自文件系统的缺页函数;
(3)匿名shared映射:调用tmpfs文件系统的缺页函数;
(4)匿名private映射:do_anonymous_page处理缺页,也是目前唯一支持THP(透明大页)的方式。
另外补充:其实我们常用的posix和systemV共享内存底层都是通过tmpfs实现的(原文此处的参考链接已缺失)。但注意其实内核是有两个tmpfs文件系统的,一个是内核启动时自行挂载的,用于共享匿名映射和systemV共享内存;而另一个通过mount挂载,其大小默认为系统内存的1/2,用于posix共享内存。
阅读(9362) | 评论(1) | 转发(1) |