Category: LINUX

2010-10-18 15:42:20

get_page_from_freelist() is another important helper function used by the buddy system.

The fallback list zonelist is passed to the function as a parameter. This list determines the order in which the other zones (or nodes) of the system are scanned if no pages are free in the desired zone.
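
For context, this is roughly how the fast path in __alloc_pages_nodemask() invokes it in kernels of this generation (a sketch; exact flags and the slow-path signature vary by version): the first attempt uses the low watermark and honours the caller's cpuset, and only if that scan fails does the allocator enter the slow path.

    /* Sketch of the first allocation attempt in __alloc_pages_nodemask()
     * (approximate for this kernel generation; check your version). */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
            preferred_zone, migratetype);
    if (unlikely(!page))
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);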


/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */

static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;//Comment 1
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;        /* set if using zonelist_cache */
    int did_zlc_setup = 0;        /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);//see Comment 2
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
     */

    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        if (NUMA_BUILD && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

        //see below Comment 3

        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))
                goto try_next_zone;

        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            unsigned long mark;
            int ret;

            //get watermark
            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

            //see below comment 4 

            if (zone_watermark_ok(zone, order, mark,
                 classzone_idx, alloc_flags))
                goto try_this_zone;// the watermark is satisfied, so allocate from this zone

            if (zone_reclaim_mode == 0)
                goto this_zone_full;
            //see Comment 5
            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                goto try_next_zone;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                goto this_zone_full;
            default:
                /* did we reclaim enough */
                if (!zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto this_zone_full;
            }
        }

        //see below comment 6
try_this_zone:
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (NUMA_BUILD)
            zlc_mark_zone_full(zonelist, z);
try_next_zone:
        if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
            /*
             * we do zlc_setup after the first zone is tried but only
             * if there are multiple nodes make it worthwhile
             */

            allowednodes = zlc_setup(zonelist, alloc_flags);
            zlc_active = 1;
            did_zlc_setup = 1;
        }
    }

    if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }
    return page;
}
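
A note on the zlc_* helpers used above: on NUMA builds each zonelist carries a small "zonelist cache" that remembers which zones were recently found full, so repeated scans can skip them cheaply. Roughly (as in kernels of this generation; field details may differ), it is defined as:

struct zonelist_cache {
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];      /* zone->nid */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);  /* zone full? */
    unsigned long last_full_zap;                        /* when last zap'd (jiffies) */
};

zlc_mark_zone_full() sets the corresponding bit in fullzones, and zlc_zone_worth_trying() consults that bitmap together with the allowed-nodes mask returned by zlc_setup(); if the whole scan still fails, the function clears zlc_active and rescans once without the cache.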



Comment 1:

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */

struct zoneref {
    struct zone *zone;    /* Pointer to actual zone */
    int zone_idx;        /* zone_idx(zoneref->zone) */
};
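
The scan in get_page_from_freelist() walks an array of these zoneref entries through for_each_zone_zonelist_nodemask(). A sketch of the accessors and the iteration macro, roughly as they appear in include/linux/mmzone.h of this era:

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
    return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;
}

/*
 * Iterate over the zones in a zonelist that are at or below highidx and
 * whose node is set in nodemask (sketch; details vary by version).
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
    for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
        zone;                                                       \
        z = next_zones_zonelist(++z, highidx, nodemask, &zone))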



Comment 2:

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */

#define zone_idx(zone)        ((zone) - (zone)->zone_pgdat->node_zones)
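
Since node_zones[] is an array embedded in the per-node pg_data_t, the macro is plain pointer arithmetic. A small illustration (hypothetical usage, not part of the original code):

    struct pglist_data *pgdat = NODE_DATA(0);                /* node 0 */
    struct zone *normal = &pgdat->node_zones[ZONE_NORMAL];

    int idx = zone_idx(normal);  /* (normal - pgdat->node_zones) == ZONE_NORMAL */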



Comment 3:
cpuset_zone_allowed_softwall() is another helper function; it checks whether the given zone lies on a node from which the current process's cpuset allows allocations.

Comment 4:
zone_watermark_ok() checks whether the zone still has enough free pages above the given watermark to satisfy an allocation of the requested order; it only performs the check and does not allocate anything itself.
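
For reference, the core of that check looks roughly like this in kernels of this generation (a simplified sketch, renamed to mark it as an illustration; see mm/page_alloc.c of the actual version): the free-page count must stay above the watermark plus the lowmem reserve, and for higher orders enough free blocks must remain at every lower order.

/* Simplified sketch of the watermark check (approximate for this era). */
static int zone_watermark_ok_sketch(struct zone *z, int order, unsigned long mark,
            int classzone_idx, int alloc_flags)
{
    long min = mark;
    long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
    int o;

    if (alloc_flags & ALLOC_HIGH)        /* __GFP_HIGH may dip below the mark */
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)      /* rt tasks / retries may dig deeper */
        min -= min / 4;

    if (free_pages <= min + z->lowmem_reserve[classzone_idx])
        return 0;

    for (o = 0; o < order; o++) {
        /* At the next order, this order's pages become unavailable */
        free_pages -= z->free_area[o].nr_free << o;

        /* Require fewer higher-order pages to be free */
        min >>= 1;

        if (free_pages <= min)
            return 0;
    }
    return 1;
}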

Comment 5:
zone_reclaim() tries to free pages inside the zone itself (unmapped page-cache pages and reclaimable slab) before the allocator gives up on it and falls back to the next zone.

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
    int node_id;
    int ret;

    /*
     * Zone reclaim reclaims unmapped file backed pages and
     * slab pages if we are over the defined limits.
     *
     * A small portion of unmapped file backed pages is needed for
     * file I/O otherwise pages read by file I/O will be immediately
     * thrown out if the zone is overallocated. So we do not reclaim
     * if less than a specified percentage of the zone is used by
     * unmapped file backed pages.
     */

    if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
     zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
        return ZONE_RECLAIM_FULL;

    if (zone->all_unreclaimable)
        return ZONE_RECLAIM_FULL;

    /*
     * Do not scan if the allocation should not be delayed.
     */

    if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
        return ZONE_RECLAIM_NOSCAN;

    /*
     * Only run zone reclaim on the local zone or on zones that do not
     * have associated processors. This will favor the local processor
     * over remote processors and spread off node memory allocations
     * as wide as possible.
     */

    node_id = zone_to_nid(zone);
    if (node_state(node_id, N_CPU) && node_id != numa_node_id())
        return ZONE_RECLAIM_NOSCAN;

    if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
        return ZONE_RECLAIM_NOSCAN;

    ret = __zone_reclaim(zone, gfp_mask, order);
    zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

    if (!ret)
        count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

    return ret;
}



/*
 * Try to free up some pages from this zone through reclaim.
 */

static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
    /* Minimum pages needed in order to stay on node */
    const unsigned long nr_pages = 1 << order;
    struct task_struct *p = current;
    struct reclaim_state reclaim_state;
    int priority;
    struct scan_control sc = {
        .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
        .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
        .may_swap = 1,
        .nr_to_reclaim = max_t(unsigned long, nr_pages,
                 SWAP_CLUSTER_MAX),
        .gfp_mask = gfp_mask,
        .swappiness = vm_swappiness,
        .order = order,
    };
    unsigned long slab_reclaimable;

    disable_swap_token();
    cond_resched();
    /*
     * We need to be able to allocate from the reserves for RECLAIM_SWAP
     * and we also need to be able to write out pages for RECLAIM_WRITE
     * and RECLAIM_SWAP.
     */

    p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
    lockdep_set_current_reclaim_state(gfp_mask);
    reclaim_state.reclaimed_slab = 0;
    p->reclaim_state = &reclaim_state;

    if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
        /*
         * Free memory by calling shrink zone with increasing
         * priorities until we have enough memory freed.
         */

        priority = ZONE_RECLAIM_PRIORITY;
        do {
            note_zone_scanning_priority(zone, priority);
            shrink_zone(priority, zone, &sc);
            priority--;
        } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
    }

    slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    if (slab_reclaimable > zone->min_slab_pages) {
        /*
         * shrink_slab() does not currently allow us to determine how
         * many pages were freed in this zone. So we take the current
         * number of slab pages and shake the slab until it is reduced
         * by the same nr_pages that we used for reclaiming unmapped
         * pages.
         *
         * Note that shrink_slab will free memory on all zones and may
         * take a long time.
         */

        while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
                slab_reclaimable - nr_pages)
            ;

        /*
         * Update nr_reclaimed by the number of slab pages we
         * reclaimed from this zone.
         */

        sc.nr_reclaimed += slab_reclaimable -
            zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    }

    p->reclaim_state = NULL;
    current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
    lockdep_clear_current_reclaim_state();
    return sc.nr_reclaimed >= nr_pages;
}



Comment 6:
If the zone is suitable for the current allocation, buffered_rmqueue() tries to remove the desired number of pages from it. Order-0 requests are served from the per-CPU page lists; higher-order requests go straight to the buddy free lists under the zone lock via __rmqueue().
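
The per-CPU cache used by the order-0 path is described by a structure roughly like the following (as in kernels of this generation, one list per migrate type; field details vary by version):

struct per_cpu_pages {
    int count;        /* number of pages in the lists */
    int high;         /* high watermark, emptying needed */
    int batch;        /* chunk size for buddy add/remove */

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};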

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */

static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);

again:
    if (likely(order == 0)) {
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];
        if (list_empty(list)) {
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */

            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
    }

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}

