之前介绍了内存模型的node,接着介绍zone——内存域,内存域分不同的类型,内核用以下常量来枚举系统中的所有内存域:
- enum zone_type {
- #ifdef CONFIG_ZONE_DMA
- /*
- * ZONE_DMA is used when there are devices that are not able
- * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
- * carve out the portion of memory that is needed for these devices.
- * The range is arch specific.
- *
- * Some examples
- *
- * Architecture Limit
- * ---------------------------
- * parisc, ia64, sparc <4G
- * s390 <2G
- * arm Various
- * alpha Unlimited or 0-16MB.
- *
- * i386, x86_64 and multiple other arches
- * <16M.
- */
- ZONE_DMA,
- #endif
- #ifdef CONFIG_ZONE_DMA32
- /*
- * x86_64 needs two ZONE_DMAs because it supports devices that are
- * only able to do DMA to the lower 16M but also 32 bit devices that
- * can only do DMA areas below 4G.
- */
- ZONE_DMA32,
- #endif
- /*
- * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
- * performed on pages in ZONE_NORMAL if the DMA devices support
- * transfers to all addressable memory.
- */
- ZONE_NORMAL,
- #ifdef CONFIG_HIGHMEM
- /*
- * A memory area that is only addressable by the kernel through
- * mapping portions into its own address space. This is for example
- * used by i386 to allow the kernel to address the memory beyond
- * 900MB. The kernel will set up special mappings (page
- * table entries on i386) for each page that the kernel needs to
- * access.
- */
- ZONE_HIGHMEM,
- #endif
- ZONE_MOVABLE,
- __MAX_NR_ZONES
- };
- ZONE_DMA:标记适合DMA的内存域
- ZONE_DMA32:标记了使用32位地址字可寻址、适合DMA的内存域
- ZONE_NORMAL:标记了可以直接映射到内核段的普通内存
- ZONE_HIGHMEM:标记了超出内核段的物理内存
- ZONE_MOVABLE:供防止物理内存碎片的机制使用,是一个伪内存域
- __MAX_NR_ZONES:表示结束标记,在迭代系统中的所有内存域时,会使用该常量
表示内存域的代码如下:
- struct zone {
- /* Fields commonly accessed by the page allocator */
- /* zone watermarks, access with *_wmark_pages(zone) macros */
- unsigned long watermark[NR_WMARK];
- /*
- * When free pages are below this point, additional steps are taken
- * when reading the number of free pages to avoid per-cpu counter
- * drift allowing watermarks to be breached
- */
- unsigned long percpu_drift_mark;
- /*
- * We don't know if the memory that we're going to allocate will be freeable
- * or/and it will be released eventually, so to avoid totally wasting several
- * GB of ram we must reserve some of the lower zone memory (otherwise we risk
- * to run OOM on the lower zones despite there's tons of freeable ram
- * on the higher zones). This array is recalculated at runtime if the
- * sysctl_lowmem_reserve_ratio sysctl changes.
- */
- unsigned long lowmem_reserve[MAX_NR_ZONES];
- #ifdef CONFIG_NUMA
- int node;
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
- #endif
- struct per_cpu_pageset __percpu *pageset;
- /*
- * free areas of different sizes
- */
- spinlock_t lock;
- int all_unreclaimable; /* All pages pinned */
- #ifdef CONFIG_MEMORY_HOTPLUG
- /* see spanned/present_pages for more description */
- seqlock_t span_seqlock;
- #endif
- struct free_area free_area[MAX_ORDER];
- #ifndef CONFIG_SPARSEMEM
- /*
- * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
- * In SPARSEMEM, this map is stored in struct mem_section
- */
- unsigned long *pageblock_flags;
- #endif /* CONFIG_SPARSEMEM */
- #ifdef CONFIG_COMPACTION
- /*
- * On compaction failure, 1<<compact_defer_shift compactions
- * are skipped before trying again. The number attempted since
- * last failure is tracked with compact_considered.
- */
- unsigned int compact_considered;
- unsigned int compact_defer_shift;
- #endif
- ZONE_PADDING(_pad1_)
- /* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct zone_lru {
- struct list_head list;
- } lru[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
- unsigned long pages_scanned; /* since last reclaim */
- unsigned long flags; /* zone flags, see below */
- /* Zone statistics */
- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
- /*
- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
- * this zone's LRU. Maintained by the pageout code.
- */
- unsigned int inactive_ratio;
- ZONE_PADDING(_pad2_)
- /* Rarely used or read-mostly fields */
- /*
- * wait_table -- the array holding the hash table
- * wait_table_hash_nr_entries -- the size of the hash table array
- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
- *
- * The purpose of all these is to keep track of the people
- * waiting for a page to become available and make them
- * runnable again when possible. The trouble is that this
- * consumes a lot of space, especially when so few things
- * wait on pages at a given time. So instead of using
- * per-page waitqueues, we use a waitqueue hash table.
- *
- * The bucket discipline is to sleep on the same queue when
- * colliding and wake all in that wait queue when removing.
- * When something wakes, it must check to be sure its page is
- * truly available, a la thundering herd. The cost of a
- * collision is great, but given the expected load of the
- * table, they should be so rare as to be outweighed by the
- * benefits from the saved space.
- *
- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
- * primary users of these fields, and in mm/page_alloc.c
- * free_area_init_core() performs the initialization of them.
- */
- wait_queue_head_t * wait_table;
- unsigned long wait_table_hash_nr_entries;
- unsigned long wait_table_bits;
- /*
- * Discontig memory support fields.
- */
- struct pglist_data *zone_pgdat;
- /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
- unsigned long zone_start_pfn;
- /*
- * zone_start_pfn, spanned_pages and present_pages are all
- * protected by span_seqlock. It is a seqlock because it has
- * to be read outside of zone->lock, and it is done in the main
- * allocator path. But, it is written quite infrequently.
- *
- * The lock is declared along with zone->lock because it is
- * frequently read in proximity to zone->lock. It's good to
- * give them a chance of being in the same cacheline.
- */
- unsigned long spanned_pages; /* total size, including holes */
- unsigned long present_pages; /* amount of memory (excluding holes) */
- /*
- * rarely used fields:
- */
- const char *name;
- } ____cacheline_internodealigned_in_smp;
这个结构比较大,简单介绍一下:
以下是被页分配器(page allocator)访问的字段:
- unsigned long watermark[NR_WMARK]:代表页换出时使用的水印(即pages_min、pages_low、pages_high),NR_WMARK定义在枚举类型zone_watermarks里
- enum zone_watermarks {
- WMARK_MIN,
- WMARK_LOW,
- WMARK_HIGH,
- NR_WMARK
- };
这三个成员会影响交换守护进程的行为:
- WMARK_HIGH:如果空闲页数多于watermark[WMARK_HIGH],则内存域的状态是理想的
- WMARK_LOW:如果空闲页数少于watermark[WMARK_LOW],则内核开始将内存里的页换出到硬盘
- WMARK_MIN:如果空闲页数少于watermark[WMARK_MIN],则内核中急需空闲页,此时页回收的压力比较大
- unsigned long percpu_drift_mark:根据注释,当空闲页数低于percpu_drift_mark时,内核在读取空闲页数时会采取额外的步骤,以避免per-CPU计数器的漂移导致水印被突破。
- unsigned long lowmem_reserve[MAX_NR_ZONES]:这个是为各种内存域预留的页,用于一些不能失败的关键性内存分配
- struct per_cpu_pageset __percpu *pageset:用于实现每个CPU的热/冷页帧的列表
- struct free_area free_area[MAX_ORDER]:是用于伙伴系统的,每个数组元素对应一个分配阶(order),管理该阶的空闲内存块
以下是供页帧回收扫描器(page reclaim scanner)访问的字段,scanner会根据页帧的活动情况对内存域中使用的页进行编目。如果页帧被频繁访问,则是活动的,相反则是不活动的,在需要换出页帧时,这样的信息是很重要的:
- spinlock_t lru_lock:
- unsigned long pages_scanned:指自上次回收页以来扫描了多少页
- flags:描述当前内存域的状态
- typedef enum {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 防止并发回收
- ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */内存域在OOM killer的zonelist中
- ZONE_CONGESTED, /* zone has many dirty pages backed by
- * a congested BDI
- */内存域里有很多由拥塞的后备设备(BDI)支持的脏页
- } zone_flags_t;
- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] :内存域的统计信息
- unsigned int inactive_ratio :该内存域LRU上活动匿名页(ACTIVE_ANON)与不活动匿名页(INACTIVE_ANON)的目标比例,由页换出代码维护
接着是一些很少使用或者大部分情况下是只读的字段:
- wait_table、wait_table_hash_nr_entries、wait_table_bits:形成等待队列,可以等待某一页可供进程使用
- struct pglist_data *zone_pgdat 指向节点的指针
- unsigned long zone_start_pfn: zone_start_pfn == zone_start_paddr >> PAGE_SHIFT,指内存域的第一个页帧
- unsigned long spanned_pages;总页数,包含空洞
- unsigned long present_pages; 可用页数,不包含空洞
- const char *name:内存域的惯用名称,例如Normal、DMA、HighMem
阅读(4108) | 评论(0) | 转发(0) |