Chinaunix首页 | 论坛 | 博客
  • 博客访问: 83877
  • 博文数量: 13
  • 博客积分: 173
  • 博客等级: 入伍新兵
  • 技术积分: 100
  • 用 户 组: 普通用户
  • 注册时间: 2011-04-21 12:26
文章分类

全部博文(13)

文章存档

2012年(6)

2011年(7)

我的朋友

分类:

2011-12-04 15:22:19

unmap_region是整个收缩过程中的核心,它主要完成相应页表项的修改,以及所映射页框的释放。

代码如下:

/*
 * unmap_region - core of the shrink path: drop the mappings that cover
 * [start, end) and then release the page tables left without mappings.
 *
 * mm:    address space being modified
 * vma:   first VMA of the list being unmapped
 * prev:  VMA handed on to the page-table freeing routine
 * start: start address of the region
 * end:   end address of the region
 */
static void unmap_region(struct mm_struct *mm,
                         struct vm_area_struct *vma,
                         struct vm_area_struct *prev,
                         unsigned long start,
                         unsigned long end)
{
    unsigned long accounted_pages = 0;
    struct mmu_gather *gather;

    lru_add_drain();
    gather = tlb_gather_mmu(mm, 0);

    /* Disconnect the actual VMA mappings, then give back the pages
     * that unmap_vmas() counted as accounted. */
    unmap_vmas(&gather, mm, vma, start, end, &accounted_pages, NULL);
    vm_unacct_memory(accounted_pages);

    if (!is_hugepage_only_range(start, end - start)) {
        /* Removing mappings can leave page tables unused; reclaim the
         * space those page-table entries occupy. */
        free_pgtables(gather, prev, start, end);
    } else {
        /* Original note: on x86 is_hugepage_only_range() is always
         * zero, so this branch is not taken there. */
        hugetlb_free_pgtables(gather, prev, start, end);
    }

    tlb_finish_mmu(gather, start, end);
}

unmap_vmas用来释放pte所映射的页面。代码如下:

// unmap_vmas - release the pages mapped by the ptes of a run of VMAs.
//
// Parameters:
//   tlbp:         in/out pointer to the current mmu_gather; it is replaced
//                 whenever the loop finishes one gather and starts another
//   mm:           the address space (the original note calls it the
//                 process descriptor)
//   vma:          first VMA of the list to unmap
//   start_addr:   start of the linear address range to unmap
//   end_addr:     end of the linear address range to unmap
//   nr_accounted: incremented by the number of pages unmapped from VMAs
//                 that carry VM_ACCOUNT
//   details:      optional zap_details; unmap_region() passes NULL
//
// Returns the number of VMAs that overlapped [start_addr, end_addr).

int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,

         struct vm_area_struct *vma, unsigned long start_addr,

         unsigned long end_addr, unsigned long *nr_accounted,

         struct zap_details *details)

{

     //Remaining byte budget before the budget is refilled

     unsigned long zap_bytes = ZAP_BLOCK_SIZE;

     unsigned long tlb_start = 0;     /* For tlb_finish_mmu */

     int tlb_start_valid = 0;

     int ret = 0;

     //Set only when details is supplied and asks for it; suppresses the reschedule point below

     int atomic = details && details->atomic;

 

     //Walk the list of VMAs to remove, stopping at the first one that starts at or after end_addr

     for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {

         unsigned long start;

         unsigned long end;

        

         //Clamp [start, end) to the part of this VMA inside the requested range;

         //a VMA with no overlap is skipped

         start = max(vma->vm_start, start_addr);

         if (start >= vma->vm_end)

              continue;

         end = min(vma->vm_end, end_addr);

         if (end <= vma->vm_start)

              continue;

 

         //Count the pages of accounted VMAs for the caller

         if (vma->vm_flags & VM_ACCOUNT)

              *nr_accounted += (end - start) >> PAGE_SHIFT;

 

         ret++;

         //The while loop unmaps every mapped page frame from start to end,
         //releasing up to zap_bytes per step when enough of the range remains

         while (start != end) {

              unsigned long block;

 

              //Remember where the current gather began; it is handed to tlb_finish_mmu below

              if (!tlb_start_valid) {

                   tlb_start = start;

                   tlb_start_valid = 1;

              }

             

              //Original note: is_vm_hugetlb_page() is empty under conditional
              //compilation (presumably when huge pages are not configured)

              if (is_vm_hugetlb_page(vma)) {

                   block = end - start;

                   unmap_hugepage_range(vma, start, end);

              } else {

                   //block: size of the linear range to release in this step

                   block = min(zap_bytes, end - start);

                   //Drop the mappings between start and start + block

                   unmap_page_range(*tlbp, vma, start,

                            start + block, details);

              }

              //Advance the start address and charge the chunk against the budget

              start += block;

              zap_bytes -= block;

              //Unless atomic, honour a pending reschedule: finish the gather for
              //[tlb_start, start), call cond_resched_lock() on page_table_lock,
              //then start a new gather that keeps the same fullmm flag

              if (!atomic && need_resched()) {

                   int fullmm = tlb_is_full_mm(*tlbp);

                   tlb_finish_mmu(*tlbp, tlb_start, start);

                   cond_resched_lock(&mm->page_table_lock);

                   *tlbp = tlb_gather_mmu(mm, fullmm);

                   tlb_start_valid = 0;

              }

              //Budget left: keep going. The signed cast also treats a wrapped
              //value (the hugetlb path may subtract more than the budget) as exhausted

              if ((long)zap_bytes > 0)

                   continue;

              //Budget used up: refill it

              zap_bytes = ZAP_BLOCK_SIZE;

         }

     }

     return ret;

}

跟进unmap_page_range():

static void unmap_page_range(struct mmu_gather *tlb,

         struct vm_area_struct *vma, unsigned long address,

         unsigned long end, struct zap_details *details)

{

     pgd_t * dir;

 

     BUG_ON(address >= end);

     //取得页目录

     dir = pgd_offset(vma->vm_mm, address);

     tlb_start_vma(tlb, vma);

     //断开pgd项对应的pmd

     do {

         zap_pmd_range(tlb, dir, address, end - address, details);

         //加上一个pgd大小,并对应PGD_SIZE

         address = (address + PGDIR_SIZE) & PGDIR_MASK;

         dir++;

     } while (address && (address < end));

     //x86为空函数,忽略

     tlb_end_vma(tlb, vma);

}

转入zap_pmd_range():

static void zap_pmd_range(struct mmu_gather *tlb,

         pgd_t * dir, unsigned long address,

         unsigned long size, struct zap_details *details)

{

     pmd_t * pmd;

     unsigned long end, pgd_boundary;

 

     //页目录没有映射

     if (pgd_none(*dir))

         return;

     //无效

     if (unlikely(pgd_bad(*dir))) {

         pgd_ERROR(*dir);

         pgd_clear(dir);

         return;

     }

     //找到起始的pmd

     pmd = pmd_offset(dir, address);

     end = address + size;

     pgd_boundary = ((address + PGDIR_SIZE) & PGDIR_MASK);

     if (pgd_boundary && (end > pgd_boundary))

         end = pgd_boundary;

     do {

         //根据pmd找到pte

          (tlb, pmd, address, end - address, details);

         address = (address + PMD_SIZE) & PMD_MASK;

         pmd++;

     } while (address && (address < end));

}

继续跟进zap_pte_range():

// zap_pte_range - clear the ptes under one pmd entry for the given range
// and hand the pages they mapped to the gather structure for release.
//
//   tlb:     gather structure; its freed counter is bumped per released page
//   pmd:     the pmd entry covering address
//   address: first address to unmap
//   size:    length of the range in bytes; clipped to this pmd's span
//   details: optional zap_details used to filter which pages are unmapped

static void zap_pte_range(struct mmu_gather *tlb,

         pmd_t *pmd, unsigned long address,

         unsigned long size, struct zap_details *details)

{

     unsigned long offset;

     pte_t *ptep;

 

     //The pmd maps nothing

     if (pmd_none(*pmd))

         return;

     //Invalid pmd: report it, clear it and return

     if (unlikely(pmd_bad(*pmd))) {

         pmd_ERROR(*pmd);

         pmd_clear(pmd);

         return;

     }

     ptep = pte_offset_map(pmd, address);

     //Clip size so the range does not run past the end of this pmd's span

     offset = address & ~PMD_MASK;

     if (offset + size > PMD_SIZE)

         size = PMD_SIZE - offset;

     size &= PAGE_MASK;

     //A details with neither check_mapping nor nonlinear_vma set filters nothing; treat it as absent

     if (details && !details->check_mapping && !details->nonlinear_vma)

         details = NULL;

     //From here on offset is the byte offset from address, advanced one page per pte

     for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {

         pte_t pte = *ptep;

         //This pte maps nothing

         if (pte_none(pte))

              continue;

         //The page is present in main memory

         if (pte_present(pte)) {

              struct page *page = NULL;

              //Convert the physical address mapped by the pte into a page frame number

              unsigned long pfn = pte_pfn(pte);

              //If the pfn is valid, convert it to its struct page; if the page is
              //reserved (must not be released here), page stays NULL

              if (pfn_valid(pfn)) {

                   page = pfn_to_page(pfn);

                   if (PageReserved(page))

                       //Reserved: kept for the kernel or not in use (original note)

                       page = NULL;

              }

              //On the unmap_region() path details is NULL, so this block is skipped

              if (unlikely(details) && page) {

                   /*

                    * unmap_shared_mapping_pages() wants to

                    * invalidate cache without truncating:

                    * unmap shared but keep private pages.

                    */

                   if (details->check_mapping &&

                       details->check_mapping != page->mapping)

                       continue;

                   /*

                    * Each page->index must be checked when

                    * invalidating or truncating nonlinear.

                    */

                   if (details->nonlinear_vma &&

                       (page->index < details->first_index ||

                        page->index > details->last_index))

                       continue;

              }

              //Clear the pte and keep its previous value

              pte = ptep_get_and_clear(ptep);

              tlb_remove_tlb_entry(tlb, ptep, address+offset);

              //page == NULL means there is no page to release

              if (unlikely(!page))

                   continue;

              //Nonlinear VMA whose page index differs from the linear index of this
              //address: store the page's file offset in the pte

              if (unlikely(details) && details->nonlinear_vma

                  && linear_page_index(details->nonlinear_vma,

                       address+offset) != page->index)

                   set_pte(ptep, pgoff_to_pte(page->index));

              //If the pte was dirty, mark the page dirty

              if (pte_dirty(pte))

                   set_page_dirty(page);

              if (pte_young(pte) && !PageAnon(page))

                   mark_page_accessed(page);

              tlb->freed++;

              page_remove_rmap(page);

              //Original note: tlb_remove_page() checks the page's reference count and,
              //once nothing references it, frees it via free_page_and_swap_cache()
              //(not visible in this listing)

              tlb_remove_page(tlb, page);

              continue;

         }

        

         //Non-present ptes are left alone when a details filter is in effect

         if (unlikely(details))

              continue;

     //If the data mapped by this pte was swapped out to disk, release the related swap data

         if (!pte_file(pte))

              free_swap_and_cache(pte_to_swp_entry(pte));

         //Clear the pte mapping

         pte_clear(ptep);

     }

     //ptep was advanced by the loop; ptep-1 is the last entry it visited

     pte_unmap(ptep-1);

}

通过上面的分析可以看到,内核是如何通过线性地址从pgd找到pte再释放相关页面的。到这一步,注意到,只是释放了pte所映射的页框,所以,可能会造成有很多pte项没有映射的状态,这部份pte所占的空间其实是可以回收的。

它是在free_pgtables()函数中完成的。代码如下:

// free_pgtables - release the page tables that no longer map anything
// after the range [start, end) has been unmapped.
//
//   tlb:   gather structure; tlb->mm supplies the address space
//   prev:  VMA used as the starting point for finding the neighbours of the
//          hole; when NULL the search starts from mm->mmap
//   start: start address of the unmapped range
//   end:   end address of the unmapped range

static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,

     unsigned long start, unsigned long end)

{

     //Round start down to a PGDIR_SIZE boundary

     unsigned long first = start & PGDIR_MASK;

     //Push end up by PGDIR_SIZE - 1; last is adjusted further below

     unsigned long last = end + PGDIR_SIZE - 1;

     unsigned long start_index, end_index;

     struct mm_struct *mm = tlb->mm;

 

     //Adjust first and last according to the neighbouring VMAs

     if (!prev) {

         prev = mm->mmap;

         //No VMAs at all: use first/last as computed

         if (!prev)

              goto no_mmaps;

         //The first VMA ends above start: only cap last at its start

         if (prev->vm_end > start) {

              if (last > prev->vm_start)

                   last = prev->vm_start;

              goto no_mmaps;

         }

     }

     for (;;) {

         struct vm_area_struct *next = prev->vm_next;

 

         if (next) {

              //Skip forward while the next VMA still starts below start

              if (next->vm_start < start) {

                   prev = next;

                   continue;

              }

              //Do not free past the start of the following VMA

              if (last > next->vm_start)

                   last = next->vm_start;

         }

         //Do not free page tables still needed by the preceding VMA

         if (prev->vm_end > first)

              first = prev->vm_end + PGDIR_SIZE - 1;

         break;

     }

no_mmaps:

     //Nothing to free: bail out

     if (last < first) 

         return;

     //Page-directory index that contains first

     start_index = pgd_index(first);

     //Never go below the first user page-directory entry

     if (start_index < FIRST_USER_PGD_NR)

         start_index = FIRST_USER_PGD_NR;

     //Page-directory index that contains last

     end_index = pgd_index(last);

     if (end_index > start_index) {

         //Release the space taken by the ptes mapped by page-directory entries start_index up to end_index

         clear_page_tables(tlb, start_index, end_index - start_index);

         flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);

     }

}

在研究代码之前,我们不妨先来思考几个问题:

1:一次要释放的多长的地址区间才合适呢?

以i386的二级映射关系为例来说明一下:

虽然pte在线性地址中只占有10位,但是实际上为pte分配内存的时候,却分配了一个页。也就是说,pgd中每一项所指向的pte表占一个页面,即2^10个pte项占一个页面。而每个pte项本身映射2^12大小的线性地址。所以,要释放一个pte页框所需的地址长度为2^10*2^12 = 2^22 = PGD_SIZE(即代码中的PGDIR_SIZE)。

i386的三级映射也类似。

2:prev指向的是什么?

调用这个函数的时候,prev指向的是什么区域的vma呢?

刚开始的时候:

 

detach_vmas_to_be_unmapped后:

看上面可以看出: clear_page_tables中,要操作的线性地址即为prev,prev->next之间的空洞线性地址。理解了这点之后,上面的代码就变得很简单了^_^

三:用户空间的伸展

先回顾一下sys_brk的代码:

//sys_brk (excerpt): only the expansion half is shown. The lines made of
//…… mark code elided by the article's author (declarations and the
//shrinking path), so this listing is not complete on its own.

asmlinkage unsigned long sys_brk(unsigned long brk)

{

         ……

         ……

         //The part above (elided) handles shrinking the user address space

         /* Check against rlimit.. */

         //Must not exceed the data segment limit

         rlim = current->rlim[RLIMIT_DATA].rlim_cur;

         if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)

                   goto out;

 

         /* Check against existing mmap mappings. */

         //The space to grow into is already mapped

         if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))

                   goto out;

 

         /* Ok, looks good - let it rip. */

         //Perform the actual expansion; do_brk() returns its addr argument on success

         if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)

                   goto out;

set_brk:

         //Record the new boundary

         mm->brk = brk;

out:

         //The return value is mm->brk: the new value on success, the unchanged one on failure

         retval = mm->brk;

         up_write(&mm->mmap_sem);

         return retval;

}

在这有一个值得注意的地方:

find_vma_intersection()的实现如下:

//判断进程的地址空间是否与给定的地址区间相交叉

/*
 * find_vma_intersection - look for a VMA of mm that intersects the
 * address interval [start_addr, end_addr).
 *
 * Returns the VMA found by find_vma(mm, start_addr) when it starts below
 * end_addr, otherwise NULL. Per the original note, find_vma() yields the
 * first VMA whose end address is greater than the address given.
 */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
    struct vm_area_struct *candidate = find_vma(mm, start_addr);

    /* No VMA at or above start_addr. */
    if (!candidate)
        return NULL;

    /* The candidate begins at or beyond end_addr: no overlap. */
    if (candidate->vm_start >= end_addr)
        return NULL;

    return candidate;
}

那么,在sys_brk的

find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)调用中,newbrk为什么要加上PAGE_SIZE呢?

这是因为newbrk和oldbrk已经是经过页框对齐后的地址,如下:

         newbrk = PAGE_ALIGN(brk);

         oldbrk = PAGE_ALIGN(mm->brk);

而且,每个vma的起始地址跟长度都是与页框对齐的(参考ULK3)。注意到find_vma_intersection()判断是否交叉的时候带有一个'='(即 end_addr <= vma->vm_start),也就是判断newbrk的下一个页框是否在进程的线性区中。

接着往下看,经过判断之后,就会进入到do_brk()

//do_brk - extend the current process's address space by len bytes
//starting at addr.
//
//  addr: start address of the new area
//  len:  length in bytes; rounded up to a page multiple
//
//Returns addr on success (also when len is 0). On failure it returns a
//negative errno value (-EINVAL, -EAGAIN or -ENOMEM) converted to unsigned long.

unsigned long do_brk(unsigned long addr, unsigned long len)

{

         struct mm_struct * mm = current->mm;

         struct vm_area_struct * vma, * prev;

         unsigned long flags;

         struct rb_node ** rb_link, * rb_parent;

         pgoff_t pgoff = addr >> PAGE_SHIFT;

         //Align the length to a page frame; on the sys_brk path this step is not needed,

         //because start and end are both page aligned, so end - start is page aligned too

         len = PAGE_ALIGN(len);

         if (!len)

                   return addr;

         //Validity check: the area must stay within TASK_SIZE and must not wrap around

         if ((addr + len) > TASK_SIZE || (addr + len) < addr)

                   return -EINVAL;

 

         //VM_LOCKED: the pages are locked and cannot be swapped out

         if (mm->def_flags & VM_LOCKED) {

                   unsigned long locked, lock_limit;

                   locked = mm->locked_vm << PAGE_SHIFT;

                   lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;

                   locked += len;

                   //Over the RLIMIT_MEMLOCK limit and lacking CAP_IPC_LOCK

                   if (locked > lock_limit && !capable(CAP_IPC_LOCK))

                            return -EAGAIN;

         }

 

         /*

          * Clear old maps.  this also does some error checking for us

          */

 munmap_back:

         //Author's open question: does the sys_brk path ever enter this if?

         vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);

         //An existing VMA overlaps the new area: unmap it and look again

         if (vma && vma->vm_start < addr + len) {

                   if (do_munmap(mm, addr, len))

                            return -ENOMEM;

                   goto munmap_back;

         }

 

         //Check the address-space size limit (RLIMIT_AS)

         if ((mm->total_vm << PAGE_SHIFT) + len

             > current->rlim[RLIMIT_AS].rlim_cur)

                   return -ENOMEM;

 

         //Check the limit on the number of mappings

         if (mm->map_count > sysctl_max_map_count)

                   return -ENOMEM;

         //Check whether the system has enough memory; a non-zero result is treated as failure

         if (security_vm_enough_memory(len >> PAGE_SHIFT))

                   return -ENOMEM;

 

         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

 

         //Check whether the new area can be merged with a neighbour;

         //if so, it is merged into a single VMA and no new VMA is needed

         if (vma_merge(mm, prev, addr, addr + len, flags,

                                               NULL, NULL, pgoff, NULL))

                   goto out;

 

         //Cannot merge: create a new VMA

         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

         if (!vma) {

                   //Presumably undoes the charge made by security_vm_enough_memory() above

                   vm_unacct_memory(len >> PAGE_SHIFT);

                   return -ENOMEM;

         }

         memset(vma, 0, sizeof(*vma));

 

         //Fill in the fields of the VMA

         vma->vm_mm = mm;

         vma->vm_start = addr;

         vma->vm_end = addr + len;

         vma->vm_pgoff = pgoff;

         vma->vm_flags = flags;

         vma->vm_page_prot = protection_map[flags & 0x0f];

         //Insert the newly allocated VMA into the process's VMA list

         vma_link(mm, vma, prev, rb_link, rb_parent);

out:

         mm->total_vm += len >> PAGE_SHIFT;

         if (flags & VM_LOCKED) {

                   mm->locked_vm += len >> PAGE_SHIFT;

                   //VM_LOCKED is set, so allocate memory for the area right away

                   make_pages_present(addr, addr + len);

         }

         return addr;

}

make_pages_present()其实就是为每一个线性区模拟了一个缺页异常,然后再由缺页异常程序为之分配内存。

当vm flag没有带VM_LOCKED的时候,它只是为进程分配了一个可以使用的线性地址,以后要访问这个地址的时候,就会产生缺页异常。具体关于缺页异常的处理,我们在下一节接着分析。

四:总结

我们在前面分析过了vfree()的实现。还记得vfree()只是释放了内存页表项所映射的物理内存,而在进程管理的时候,sys_brk收缩线性区的时候,它不仅释放了页表所映射的物理内存,还把空闲的页表项、PMD所占的内存释放掉了。内核这样处理是为了效率考虑的。

另外,sys_brk在扩展线性区的时候,仅分配了一个允许进程使用的合法的线性地址,等到真正要使用的时候再给其映射具体的内存,这在操作系统设计里也叫请求调页。等到下节分析缺页异常的时候,再来详细讨论

 
阅读(1602) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~