Chinaunix首页 | 论坛 | 博客
  • 博客访问: 206810
  • 博文数量: 33
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 1277
  • 用 户 组: 普通用户
  • 注册时间: 2013-03-03 10:03
个人简介

现于杭州电子科技大学攻读硕士学位

文章分类

全部博文(33)

文章存档

2013年(33)

我的朋友

分类: LINUX

2013-10-11 22:05:41

/***************************************************************************************************************/
/* pagemap.h */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H


/*
 * Page-mapping primitive inline functions
 *
 * Copyright 1995 Linus Torvalds
 */


#include
#include
#include


#include
#include
#include


/*
 * The page cache can be done in larger chunks than
 * one page, because it allows for more efficient
 * throughput (it can then be mapped into user
 * space in smaller chunks for same flexibility).
 *
 * Or rather, it _will_ be done in larger chunks.
 */
/* Page-cache granularity: currently exactly one page per cache entry. */
#define PAGE_CACHE_SHIFT PAGE_SHIFT
#define PAGE_CACHE_SIZE PAGE_SIZE
#define PAGE_CACHE_MASK PAGE_MASK
/* Round addr up to the next page-cache boundary. */
#define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)


/* Take an extra reference on a page that is already in the page cache. */
#define page_cache_get(x) get_page(x)


/* Drop a page-cache reference; the page is freed when the count drops to zero. */
#define page_cache_release(x) __free_page(x)


/* Allocate one page for the page cache, using the mapping's gfp allocation mask. */
static inline struct page *page_cache_alloc(struct address_space *x)
{
return alloc_pages(x->gfp_mask, 0);
}


/*
 * From a kernel address, get the "struct page *"
 */
#define page_cache_entry(x) virt_to_page(x)


/* Number of significant bits in a page-hash index; set at boot by page_cache_init(). */
extern unsigned int page_hash_bits;
#define PAGE_HASH_BITS (page_hash_bits)
#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)


extern atomic_t page_cache_size; /* # of pages currently in the hash table */
extern struct page **page_hash_table;


extern void page_cache_init(unsigned long);


/*
 * We use a power-of-two hash table to avoid a modulus,
 * and get a reasonable hash by knowing roughly how the
 * inode pointer and indexes are distributed (ie, we
 * roughly know which bits are "significant")
 *
 * For the time being it will work for struct address_space too (most of
 * them sitting inside the inodes). We might want to change it later.
 */
/* Hash (mapping, index) to a bucket number in [0, PAGE_HASH_SIZE). */
static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index)
{
/* i: the mapping pointer scaled down by the largest power of two dividing sizeof(struct inode) */
#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1)))
/* s: fold the bits above PAGE_HASH_BITS back into the low bits */
#define s(x) ((x)+((x)>>PAGE_HASH_BITS))
return s(i+index) & (PAGE_HASH_SIZE-1);
#undef i
#undef s
}


/* Address of the page_hash_table bucket for (mapping, index). */
#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))


extern struct page * __find_get_page(struct address_space *mapping,
unsigned long index, struct page **hash);
/* Look up a page in the cache and take a reference on it (NULL if absent). */
#define find_get_page(mapping, index) \
__find_get_page(mapping, index, page_hash(mapping, index))
extern struct page * __find_lock_page (struct address_space * mapping,
unsigned long index, struct page **hash);
extern struct page * find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask);


extern void FASTCALL(lock_page(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));
/* As find_get_page(), but also returns the page locked. */
#define find_lock_page(mapping, index) \
__find_lock_page(mapping, index, page_hash(mapping, index))
extern struct page *find_trylock_page(struct address_space *, unsigned long);


extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index);
extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);


extern void ___wait_on_page(struct page *);
/* Block the caller on the page's wait queue until the page is unlocked
 * (i.e. until any I/O holding PG_locked has completed). */
static inline void wait_on_page(struct page * page)
{
if (PageLocked(page))
___wait_on_page(page);
}


extern void FASTCALL(wakeup_page_waiters(struct page * page));


/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
static inline struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
{
return find_or_create_page(mapping, index, mapping->gfp_mask);
}




extern struct page * grab_cache_page_nowait (struct address_space *, unsigned long);


/* Callback used by read_cache_page() to fill a newly allocated page with data. */
typedef int filler_t(void *, struct page*);


extern struct page *read_cache_page(struct address_space *, unsigned long,
filler_t *, void *);
#endif
/***************************************************************************************************************/
/* swap.h */
 /* Link the page onto the head of active_list and mark it PG_active. */
#define add_page_to_active_list(page) \
do { \
DEBUG_LRU_PAGE(page); \
SetPageActive(page); \
list_add(&(page)->lru, &active_list); \
nr_active_pages++; \
} while (0)


 /* Link the page onto the head of inactive_list. */
#define add_page_to_inactive_list(page) \
do { \
DEBUG_LRU_PAGE(page); \
list_add(&(page)->lru, &inactive_list); \
nr_inactive_pages++; \
} while (0)


/* Unlink the page from active_list and clear its PG_active flag. */
#define del_page_from_active_list(page) \
do { \
list_del(&(page)->lru); \
ClearPageActive(page); \
nr_active_pages--; \
} while (0)


/* Unlink the page from inactive_list. */
#define del_page_from_inactive_list(page) \
do { \
list_del(&(page)->lru); \
nr_inactive_pages--; \
} while (0)
/***************************************************************************************************************/
/* page_alloc.c */
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */


#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>


int nr_swap_pages; /* number of pages available on swap */
int nr_active_pages; /* number of pages on the active_list */
int nr_inactive_pages; /* number of pages on the inactive_list */
LIST_HEAD(inactive_list);  /* pages that are candidates for reclaim */
LIST_HEAD(active_list);/* pages currently in use by processes */
pg_data_t *pgdat_list; /* every memory node in the system is linked on this list;
                          each node is initialised by init_bootmem_core() */


/*
 *
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];/* 管理区表 */


EXPORT_SYMBOL(zone_table);


static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };


/*
 * Temporary debugging check.
 */
/* True when "page" does not lie inside "zone"'s mem_map range or is
 * attributed to a different zone. */
#define BAD_RANGE(zone, page) \
( \
(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
|| (((page) - mem_map) < (zone)->zone_start_mapnr) \
|| ((zone) != page_zone(page)) \
)


/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were   
 * free, the remainder of the region must be split into blocks.   
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.            
 *
 * -- wli
 */


static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
/* 释放页面的核心函数,它不能被直接调用 */
static void __free_pages_ok (struct page *page, unsigned int order)
{
unsigned long index, page_idx, mask, flags;
free_area_t *area;
struct page *base;
zone_t *zone;


/*
* Yes, think what happens when other parts of the kernel take 
* a reference to a page in order to pin it for io. -ben
*/
if (PageLRU(page)) {
if (unlikely(in_interrupt()))
BUG();
lru_cache_del(page);
}


if (page->buffers)
BUG();
if (page->mapping)
BUG();
if (!VALID_PAGE(page))
BUG();
if (PageLocked(page))
BUG();
if (PageActive(page))
BUG();
page->flags &= ~((1<

if (current->flags & PF_FREE_PAGES)
goto local_freelist;
 back_local_freelist:


zone = page_zone(page);


mask = (~0UL) << order;
base = zone->zone_mem_map;
page_idx = page - base;
if (page_idx & ~mask)
BUG();
index = page_idx >> (1 + order);


area = zone->free_area + order;


spin_lock_irqsave(&zone->lock, flags);


zone->free_pages -= mask;


while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;


if (area >= zone->free_area + MAX_ORDER)
BUG();
if (!__test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
*/
break;
/*
* Move the buddy up one level.
* This code is taking advantage of the identity:
* -mask = 1+~mask
*/
buddy1 = base + (page_idx ^ -mask);
buddy2 = base + page_idx;
if (BAD_RANGE(zone,buddy1))
BUG();
if (BAD_RANGE(zone,buddy2))
BUG();


list_del(&buddy1->list);
mask <<= 1;
area++;
index >>= 1;
page_idx &= mask;
}
list_add(&(base + page_idx)->list, &area->free_list);


spin_unlock_irqrestore(&zone->lock, flags);
return;


 local_freelist:
if (current->nr_local_pages)
goto back_local_freelist;
if (in_interrupt())
goto back_local_freelist;


list_add(&page->list, ¤t->local_pages);
page->index = order;
current->nr_local_pages++;
}


/* Toggle the buddy-state bit that records whether a 2^(order+1) pair is split.
    index: page index within the global mem_map array */
#define MARK_USED(index, order, area) \
__change_bit((index) >> (1+(order)), (area)->map)


/* Split a 2^high block down to 2^low pages: at each level the first half is
 * returned to the (smaller) free list and the second half is split further.
 * Returns the final 2^low block for the caller. */
static inline struct page * expand (zone_t *zone, struct page *page,
unsigned long index, int low, int high, free_area_t * area)
{
unsigned long size = 1 << high;


while (high > low) {
if (BAD_RANGE(zone,page))
BUG();
area--;
high--;
size >>= 1;
/* first half goes back on the order-"high" free list */
list_add(&(page)->list, &(area)->free_list);
MARK_USED(index, high, area);
/* continue splitting the second half */
index += size;
page += size;
}
if (BAD_RANGE(zone,page))
BUG();
return page;
}


static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
/* Remove a 2^order block from "zone"'s free lists, splitting a larger block
 * via expand() if necessary. Returns NULL when no block of sufficient order
 * is free. Takes the zone lock internally. */
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
free_area_t * area = zone->free_area + order;
unsigned int curr_order = order;
struct list_head *head, *curr;
unsigned long flags;
struct page *page;


spin_lock_irqsave(&zone->lock, flags);
do {
head = &area->free_list;
curr = head->next;


if (curr != head) {
unsigned int index;


page = list_entry(curr, struct page, list);
if (BAD_RANGE(zone,page))
BUG();
list_del(curr);
index = page - zone->zone_mem_map;
/* top order keeps no buddy bitmap */
if (curr_order != MAX_ORDER-1)
MARK_USED(index, curr_order, area);
zone->free_pages -= 1UL << order;


/* split the block down to the requested order */
page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);


set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
if (PageLRU(page))
BUG();
if (PageActive(page))
BUG();
return page;
}
/* nothing at this order - try the next larger one */
curr_order++;
area++;
} while (curr_order < MAX_ORDER);
spin_unlock_irqrestore(&zone->lock, flags);


return NULL;
}


#ifndef CONFIG_DISCONTIGMEM
/* Allocate 2^order physical pages (UMA build: single node contig_page_data),
 * selecting the zonelist from the zone modifier bits of gfp_mask. */
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
return __alloc_pages(gfp_mask, order,
contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif


static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
/*
 * Synchronous reclaim for "classzone": the caller frees pages itself via
 * try_to_free_pages_zone(), with PF_FREE_PAGES making __free_pages_ok()
 * divert freed pages onto current->local_pages so this task can reuse one
 * directly. Returns a page of the requested order if one was reclaimed,
 * and stores the number of freed pages in *freed.
 *
 * NOTE(review): "&current->local_pages" below was garbled to "¤t..."
 * in the transcription and has been restored.
 */
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed = 0;

	/* Only sleepable allocations may reclaim; never from interrupts. */
	if (!(gfp_mask & __GFP_WAIT))
		goto out;
	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				/* take the first block of the right order in the right zone */
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					/* the page must be fully disconnected */
					if (page->buffers)
						BUG();
					if (page->mapping)
						BUG();
					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageLRU(page))
						BUG();
					if (PageActive(page))
						BUG();
					if (PageDirty(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}
 out:
	*freed = __freed;
	return page;
}


/*
 * This is the 'heart' of the zoned buddy allocator:
 */
 /* Allocate 2^order physical pages from the given fallback zonelist.
  * Fast path: try each zone against its pages_low watermark. Slow path:
  * wake kswapd, retry against pages_min, then reclaim synchronously via
  * balance_classzone(), yielding and looping for small orders. */
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
unsigned long min;
zone_t **zone, * classzone;
struct page * page;
int freed;


zone = zonelist->zones;
classzone = *zone;
if (classzone == NULL)
return NULL;
min = 1UL << order;
/* first pass: only take zones comfortably above pages_low */
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;


min += z->pages_low;
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
return page;
}
}


/* memory is getting low: ask kswapd to balance the classzone */
classzone->need_balance = 1;
mb();
if (waitqueue_active(&kswapd_wait))
wake_up_interruptible(&kswapd_wait);


zone = zonelist->zones;
min = 1UL << order;
/* second pass: dip down toward pages_min (deeper for atomic callers) */
for (;;) {
unsigned long local_min;
zone_t *z = *(zone++);
if (!z)
break;


local_min = z->pages_min;
if (!(gfp_mask & __GFP_WAIT))
local_min >>= 2;
min += local_min;
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
return page;
}
}


/* here we're in the low on memory slow path */


rebalance:
/* memory-allocator / OOM-killed tasks may take any remaining page */
if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;


page = rmqueue(z, order);
if (page)
return page;
}
return NULL;
}


/* Atomic allocations - we can't balance anything */
if (!(gfp_mask & __GFP_WAIT))
return NULL;


/* reclaim pages synchronously on behalf of this allocation */
page = balance_classzone(classzone, gfp_mask, order, &freed);
if (page)
return page;


zone = zonelist->zones;
min = 1UL << order;
/* retry after the synchronous reclaim */
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;


min += z->pages_min;
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
return page;
}
}


/* Don't let big-order allocations loop */
if (order > 3)
return NULL;


/* Yield for kswapd, and try again */
yield();
goto rebalance;
}


/*
 * Common helper functions.
 */
 /* Allocate 2^order pages and return their kernel virtual address,
  * or 0 on allocation failure. */
unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
struct page * page;


page = alloc_pages(gfp_mask, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}


/*
 * Allocate a single page, zero it, and return its kernel virtual
 * address. Returns 0 when no page could be allocated.
 */
unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page *page = alloc_pages(gfp_mask, 0);
	void *address;

	if (!page)
		return 0;

	address = page_address(page);
	clear_page(address);
	return (unsigned long) address;
}


/* Drop a reference to the 2^order block starting at "page"; the block is
 * actually freed when the count reaches zero. Note that the && ordering
 * matters: put_page_testzero() decrements the count as a side effect. */
void __free_pages(struct page *page, unsigned int order)
{
if (!PageReserved(page) && put_page_testzero(page))
__free_pages_ok(page, order);
}


/*
 * Free the 2^order block at kernel virtual address "addr".
 * A zero address is silently ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;
	__free_pages(virt_to_page(addr), order);
}


/*
 * Total amount of free (allocatable) RAM:
 */
/* Sum of free_pages over every zone of every node. */
unsigned int nr_free_pages (void)
{
unsigned int sum = 0;
zone_t *zone;


for_each_zone(zone)
sum += zone->free_pages;


return sum;
}


/*
 * Amount of free RAM allocatable as buffer memory:
 */
/* For each node, walk the GFP_USER zonelist and count the pages each
 * zone holds above its pages_high watermark. */
unsigned int nr_free_buffer_pages (void)
{
pg_data_t *pgdat;
unsigned int sum = 0;


for_each_pgdat(pgdat) {
zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
zone_t **zonep = zonelist->zones;
zone_t *zone;


for (zone = *zonep++; zone; zone = *zonep++) {
unsigned long size = zone->size;
unsigned long high = zone->pages_high;
if (size > high)
sum += size - high;
}
}


return sum;
}


#if CONFIG_HIGHMEM
/* Total free pages in the ZONE_HIGHMEM zone of every node. */
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
unsigned int pages = 0;


for_each_pgdat(pgdat)
pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;


return pages;
}
#endif


/* Convert a page count to kilobytes. */
#define K(x) ((x) << (PAGE_SHIFT-10))


/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
  unsigned int order;
unsigned type;
pg_data_t *tmpdat = pgdat;


printk("Free pages:      %6dkB (%6dkB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));


/* per-zone watermark summary over every node */
while (tmpdat) {
zone_t *zone;
for (zone = tmpdat->node_zones;
      zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " 
      "high:%6lukB\n", 
zone->name,
K(zone->free_pages),
K(zone->pages_min),
K(zone->pages_low),
K(zone->pages_high));

tmpdat = tmpdat->node_next;
}


printk("( Active: %d, inactive: %d, free: %d )\n",
      nr_active_pages,
      nr_inactive_pages,
      nr_free_pages());


/* per-order free-list breakdown for each zone of the first node only */
for (type = 0; type < MAX_NR_ZONES; type++) {
struct list_head *head, *curr;
zone_t *zone = pgdat->node_zones + type;
  unsigned long nr, total, flags;


total = 0;
if (zone->size) {
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
head = &(zone->free_area + order)->free_list;
curr = head;
nr = 0;
/* count the blocks on this order's free list */
for (;;) {
if ((curr = curr->next) == head)
break;
nr++;
}
total += nr * (1 << order);
printk("%lu*%lukB ", nr, K(1UL) << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
}
printk("= %lukB)\n", K(total));
}


#ifdef SWAP_CACHE_INFO
show_swap_cache_info();
#endif
}


/* Dump free-memory statistics starting from the first node on pgdat_list. */
void show_free_areas(void)
{
show_free_areas_core(pgdat_list);
}


/*
 * Builds allocation fallback zone lists.
 *
 * For every possible gfp_mask zone-modifier combination, record the
 * highest zone that combination may use followed by its lower-zone
 * fallbacks (HIGHMEM -> NORMAL -> DMA), terminated by NULL.
 *
 * NOTE(review): the transcription dropped the function's closing brace,
 * which left the rest of the file nested inside this function; restored.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		/* Each case deliberately falls through to the lower zones. */
		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}


/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE 256


/* Round pages/PAGES_PER_WAITQUEUE up to a power of two, capped at 4096. */
static inline unsigned long wait_table_size(unsigned long pages)
{
unsigned long size = 1;


pages /= PAGES_PER_WAITQUEUE;


while (size < pages)
size <<= 1;


/*
* Once we have dozens or even hundreds of threads sleeping
* on IO we've got bigger problems than wait queue collision.
* Limit the size of the wait table to a reasonable size.
*/
size = min(size, 4096UL);


return size;
}


/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
/* log2(size) for a power-of-two size: position of the lowest zero bit in ~size. */
static inline unsigned long wait_table_bits(unsigned long size)
{
return ffz(~size);
}


/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))


/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
 /* NUMA-capable zone initialisation: fills in every zone_t of the node and
      allocates the node's mem_map array.
      nid: logical identifier of the node being initialised
      pgdat: the node's pg_data_t being initialised; contig_page_data for UMA
      gmap: set by this function to the node-local lmem_map array. On NUMA,
                mem_map is treated as a virtual array starting at PAGE_OFFSET;
                on UMA this pointer simply ends up as the global mem_map.
      zones_size: array holding the size of each zone, in pages
      zone_start_paddr: physical address where the first zone starts
      zholes_size: array holding the total size of memory holes in each zone
      */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_paddr, 
unsigned long *zholes_size, struct page *lmem_map)
{
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);


if (zone_start_paddr & ~PAGE_MASK)
BUG();


/* total pages of the node, then subtract the holes */
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
unsigned long size = zones_size[i];
totalpages += size;
}
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];

printk("On node %d totalpages: %lu\n", nid, realtotalpages);


/*
* Some architectures (with lots of mem and discontinous memory
* maps) have to search for a good mem_map area:
* For discontigmem, the conceptual mem map array starts from 
* PAGE_OFFSET, we need to align the actual array onto a mem map 
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page);
if (lmem_map == (struct page *)0) {
lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET + 
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
}
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_paddr = zone_start_paddr;
pgdat->node_start_mapnr = (lmem_map - mem_map);
pgdat->nr_zones = 0;


offset = lmem_map - mem_map;
/* initialise each zone of this node in turn */
for (j = 0; j < MAX_NR_ZONES; j++) {
zone_t *zone = pgdat->node_zones + j;
unsigned long mask;
unsigned long size, realsize;


zone_table[nid * MAX_NR_ZONES + j] = zone; /* nid is the node ID, j the zone index */
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];


printk("zone(%lu): %lu pages.\n", j, size);
zone->size = size;
zone->name = zone_names[j];
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
zone->need_balance = 0;
if (!size)
continue;


/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
zone->wait_table_size = wait_table_size(size);
zone->wait_table_shift =
BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, zone->wait_table_size
* sizeof(wait_queue_head_t));


for(i = 0; i < zone->wait_table_size; ++i)
init_waitqueue_head(zone->wait_table + i);


pgdat->nr_zones = j+1;


/* derive the pages_min/low/high watermarks from the zone size */
mask = (realsize / zone_balance_ratio[j]);
if (mask < zone_balance_min[j])
mask = zone_balance_min[j];
else if (mask > zone_balance_max[j])
mask = zone_balance_max[j];
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;


zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;


if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
printk("BUG: wrong zone alignment, it will crash\n");


/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
for (i = 0; i < size; i++) {
struct page *page = mem_map + offset + i;
set_page_zone(page, nid * MAX_NR_ZONES + j);
set_page_count(page, 0);
SetPageReserved(page);
INIT_LIST_HEAD(&page->list);
if (j != ZONE_HIGHMEM)
set_page_address(page, __va(zone_start_paddr));
zone_start_paddr += PAGE_SIZE;
}


/* set up the buddy free lists and their bitmaps */
offset += size;
for (i = 0; ; i++) {
unsigned long bitmap_size;


INIT_LIST_HEAD(&zone->free_area[i].free_list);
if (i == MAX_ORDER-1) {
zone->free_area[i].map = NULL;
break;
}


/*
* Page buddy system uses "index >> (i+1)",
* where "index" is at most "size-1".
*
* The extra "+3" is to round down to byte
* size (8 bits per byte assumption). Thus
* we get "(size-1) >> (i+4)" as the last byte
* we can access.
*
* The "+1" is because we want to round the
* byte allocation up rather than down. So
* we should have had a "+7" before we shifted
* down by three. Also, we have to add one as
* we actually _use_ the last bit (it's [0,n]
* inclusive, not [0,n[).
*
* So we actually had +7+1 before we shift
* down by 3. But (n+8) >> 3 == (n >> 3) + 1
* (modulo overflows, which we do not have).
*
* Finally, we LONG_ALIGN because all bitmap
* operations are on longs.
*/
bitmap_size = (size-1) >> (i+4);
bitmap_size = LONG_ALIGN(bitmap_size+1);
zone->free_area[i].map = 
 (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
}
/* UMA zone initialisation: a single node (0) rooted at contig_page_data. */
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}


/*
 * Parse the "memfrac=" boot option: a comma-separated list of per-zone
 * balance ratios stored into zone_balance_ratio[].
 *
 * Fix: bound the parse loop at MAX_NR_ZONES so an over-long option list
 * cannot write past the end of zone_balance_ratio[].
 */
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (j < MAX_NR_ZONES && get_option(&str, &zone_balance_ratio[j++]) == 2)
		;
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}


__setup("memfrac=", setup_mem_frac);
/***************************************************************************************************************/
/* swap.c */
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */


/*
 * This file contains the default values for the opereation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * linux/Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */


#include
#include
#include
#include
#include
#include


#include
#include /* for copy_to/from_user */
#include


/* How many pages do we try to swap or page in/out together? */
int page_cluster;


/* Tunables controlling how aggressively the pageout daemon works. */
pager_daemon_t pager_daemon = {
512, /* base number for calculating the number of tries */
SWAP_CLUSTER_MAX, /* minimum number of tries */
8, /* do swap I/O in clusters of this size */
};


/*
 * Move an inactive page to the active list.
 */
 /* Caller must hold pagemap_lru_lock; pages not on the LRU, or already
  * active, are left untouched. */
static inline void activate_page_nolock(struct page * page)
{
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(page);
add_page_to_active_list(page);
}
}


/* Move a page from inactive_list to active_list, taking the LRU lock. */
void activate_page(struct page * page)
{
spin_lock(&pagemap_lru_lock);
activate_page_nolock(page);
spin_unlock(&pagemap_lru_lock);
}


/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
 /* New pages enter the LRU on the inactive list. */
void lru_cache_add(struct page * page)
{
if (!PageLRU(page)) {
spin_lock(&pagemap_lru_lock);
if (!TestSetPageLRU(page))
add_page_to_inactive_list(page); /* link onto inactive_list */
spin_unlock(&pagemap_lru_lock);
}
}


/**
 * __lru_cache_del: remove a page from the page lists
 * @page: the page to remove
 *
 * This function is for when the caller already holds
 * the pagemap_lru_lock.
 */
void __lru_cache_del(struct page * page)
{
	/* Only pages actually carrying the LRU flag are unlinked. */
	if (!TestClearPageLRU(page))
		return;

	if (PageActive(page))
		del_page_from_active_list(page);
	else
		del_page_from_inactive_list(page);
}


/**
 * lru_cache_del: remove a page from the page lists
 * @page: the page to remove
 */
 /* Locked variant: takes pagemap_lru_lock around __lru_cache_del(). */
void lru_cache_del(struct page * page)
{
spin_lock(&pagemap_lru_lock);
__lru_cache_del(page);
spin_unlock(&pagemap_lru_lock);
}


/*
 * Perform any setup for the swap system.
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/*
	 * Use a smaller swap cluster on small-memory machines.
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more than this.
	 */
	page_cluster = (megs < 16) ? 2 : 3;
}
/***************************************************************************************************************/
/* vmscan.c */

 /* Move up to nr_pages pages from the tail of active_list to inactive_list. */
static void refill_inactive(int nr_pages)
{
struct list_head * entry;


spin_lock(&pagemap_lru_lock);
entry = active_list.prev;
while (nr_pages && entry != &active_list) { /* while pages remain to move and active_list is non-empty */
struct page * page;


page = list_entry(entry, struct page, lru);
entry = entry->prev;
if (PageTestandClearReferenced(page)) {  /* recently referenced: clear the flag and give it another round */
list_del(&page->lru); /* unlink ... */
list_add(&page->lru, &active_list); /* ... and rotate to the head of active_list */
continue;
}
             /* not referenced since the last scan: demote it */
nr_pages--;


del_page_from_active_list(page); /* unlink from active_list */
add_page_to_inactive_list(page); /* link onto inactive_list */
SetPageReferenced(page); /* one reference rescues it from reclaim */
}
spin_unlock(&pagemap_lru_lock);
}


static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));


/* Shrink the various caches: slab first, then refill the inactive list
 * (its size computed here as a ratio) and reclaim from it, finally the
 * dentry/inode (and quota) caches.
 * nr_pages: number of pages we are asked to free.
 * Returns the number of pages still to be freed (0 when the goal was met). */
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
{
int chunk_size = nr_pages;
unsigned long ratio;


nr_pages -= kmem_cache_reap(gfp_mask); /* reap a slab cache first */
if (nr_pages <= 0)
return 0;


nr_pages = chunk_size;
/* try to keep the active list 2/3 of the size of the cache */
ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
refill_inactive(ratio);


nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
if (nr_pages <= 0)
return 0;


shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(priority, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif


return nr_pages;
}

 /* Kernel daemon responsible for reclaiming pages when memory runs low. */
int kswapd(void *unused)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);


daemonize();
strcpy(tsk->comm, "kswapd");
sigfillset(&tsk->blocked);

/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC;


/*
* Kswapd main loop.
*/
for (;;) {
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&kswapd_wait, &wait);


mb();
if (kswapd_can_sleep()) /* walk every zone: we may only sleep when no zone has need_balance set */
schedule();


__set_current_state(TASK_RUNNING);
remove_wait_queue(&kswapd_wait, &wait); /* woken (or not allowed to sleep): leave kswapd_wait */


/*
* If we actually get into a low-memory situation,
* the processes needing more memory will wake us
* up on a more timely basis.
*/
kswapd_balance(); /* walk all zones; for each with need_balance set, free pages
                                   via try_to_free_pages_zone() until pages_high is reached */
run_task_queue(&tq_disk); /* run the tq_disk task queue to flush queued pages to disk */
}
}


/* Boot-time initialisation: set up swap tunables and start the kswapd thread. */
static int __init kswapd_init(void)
{
printk("Starting kswapd\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}


module_init(kswapd_init)








阅读(2646) | 评论(0) | 转发(0) |
0

上一篇:linux内存管理之高端内存管理

下一篇:没有了

给主人留下些什么吧!~~