Category: LINUX

2010-10-27 14:18:25


static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
    return alloc_pages_current(gfp_mask, order);
}


/**
 *    alloc_pages_current - Allocate pages.
 *
 *    @gfp:
 *        %GFP_USER    user allocation,
 *        %GFP_KERNEL  kernel allocation,
 *        %GFP_HIGHMEM highmem allocation,
 *        %GFP_FS      don't call back into a file system.
 *        %GFP_ATOMIC  don't sleep.
 *    @order: Power of two of allocation size in pages. 0 is a single page.
 *
 *    Allocate a page from the kernel page pool. When not in
 *    interrupt context, apply the current process's NUMA policy.
 *    Returns NULL when no page can be allocated.
 *
 *    Don't call cpuset_update_task_memory_state() unless
 *    1) it's ok to take cpuset_sem (can WAIT), and
 *    2) allocating for current task (not interrupt).
 */

struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
    struct mempolicy *pol = current->mempolicy;
    struct page *page;

    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
        pol = &default_policy;

    get_mems_allowed();
    /*
     * No reference counting needed for current->mempolicy
     * nor system default_policy
     */

    if (pol->mode == MPOL_INTERLEAVE)
        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    else
        page = __alloc_pages_nodemask(gfp, order,
            policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
    put_mems_allowed();
    return page;
}
EXPORT_SYMBOL(alloc_pages_current);
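
As a side note, here is a minimal, hypothetical caller sketch (not from the kernel tree; the demo_* names are made up for illustration) showing how pages obtained through this interface are typically used and released. alloc_pages(), page_address() and __free_pages() are the standard kernel APIs involved.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages;
static void *demo_buf;

static int demo_alloc(void)
{
    /* May sleep; on NUMA builds this lands in alloc_pages_current() above */
    demo_pages = alloc_pages(GFP_KERNEL, 2);    /* 2^2 = 4 contiguous pages */
    if (!demo_pages)
        return -ENOMEM;

    /* Lowmem pages have a permanent kernel mapping */
    demo_buf = page_address(demo_pages);
    return 0;
}

static void demo_free(void)
{
    if (demo_pages)
        __free_pages(demo_pages, 2);    /* order must match the allocation */
}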


/*
 * This is the 'heart' of the zoned buddy allocator.
 */

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zone *preferred_zone;
    struct page *page;
    int migratetype = allocflags_to_migratetype(gfp_mask);

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */

    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    get_mems_allowed();
    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
    if (!preferred_zone) {
        put_mems_allowed();
        return NULL;
    }

    /* First allocation attempt */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
            preferred_zone, migratetype);
    if (unlikely(!page))
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    put_mems_allowed();

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);
    return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
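
For context, the zonelist argument normally comes from the per-node zonelists. The two wrappers below are paraphrased from include/linux/gfp.h of this kernel generation (details may differ between versions) and show how alloc_pages_node() reaches __alloc_pages_nodemask() with a NULL nodemask:

/* Paraphrased wrappers; see include/linux/gfp.h for the exact code */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
{
    return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    /* A negative node id means the current node */
    if (nid < 0)
        nid = numa_node_id();

    return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}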


/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */

static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;        /* set if using zonelist_cache */
    int did_zlc_setup = 0;        /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
     */

    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        if (NUMA_BUILD && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;
        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))
                goto try_next_zone;

        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            unsigned long mark;
            int ret;

            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
            if (zone_watermark_ok(zone, order, mark,
                 classzone_idx, alloc_flags))
                goto try_this_zone;

            if (zone_reclaim_mode == 0)
                goto this_zone_full;

            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                goto try_next_zone;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                goto this_zone_full;
            default:
                /* did we reclaim enough */
                if (!zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto this_zone_full;
            }
        }

try_this_zone:
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (NUMA_BUILD)
            zlc_mark_zone_full(zonelist, z);
try_next_zone:
        if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
            /*
             * We do zlc_setup() after the first zone is tried, but only
             * if there are multiple online nodes to make it worthwhile.
             */

            allowednodes = zlc_setup(zonelist, alloc_flags);
            zlc_active = 1;
            did_zlc_setup = 1;
        }
    }

    if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }
    return page;
}
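
The decision driving this loop is the watermark test. The function below is a simplified sketch of the kind of check zone_watermark_ok() performs, not the exact kernel code (the real version also adds lowmem_reserve[classzone_idx] to the mark and relaxes it for ALLOC_HIGH/ALLOC_HARDER):

/*
 * Simplified, illustrative sketch of the watermark check; names and
 * details differ from the real zone_watermark_ok() in mm/page_alloc.c.
 */
static int watermark_ok_sketch(struct zone *z, unsigned int order,
                unsigned long mark)
{
    long free = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
    unsigned int o;

    if (free <= (long)mark)
        return 0;

    /*
     * For a higher-order request, also make sure enough of the remaining
     * free memory sits in blocks of at least the requested order.
     */
    for (o = 0; o < order; o++) {
        free -= z->free_area[o].nr_free << o;
        mark >>= 1;
        if (free <= (long)mark)
            return 0;
    }
    return 1;
}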


/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */

static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);

again:
    if (likely(order == 0)) {
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];
        if (list_empty(list)) {
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */

            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
    }

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}
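
The order-0 fast path above pulls pages from a per-CPU cache rather than from the buddy lists directly. Its rough shape, paraphrased from include/linux/mmzone.h of this kernel generation (the real layout may differ in detail), is:

/* Paraphrased; one instance of this exists per CPU per zone */
struct per_cpu_pages {
    int count;    /* number of pages on the lists */
    int high;     /* high watermark: drain back to buddy when exceeded */
    int batch;    /* chunk size for refilling from / draining to buddy */

    /* one free list per migrate type kept on the pcp lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};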


1. rmqueue_bulk()

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */

static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, int cold)
{
    int i;
    
    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) {
        struct page *page = __rmqueue(zone, order, migratetype);
        if (unlikely(page == NULL))
            break;

        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the caller's
         * list and the list head then moves forward. From the caller's
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */

        if (likely(cold == 0))
            list_add(&page->lru, list);
        else
            list_add_tail(&page->lru, list);
        set_page_private(page, migratetype);
        list = &page->lru;
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    spin_unlock(&zone->lock);
    return i;
}


2. __rmqueue()

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */

static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    page = __rmqueue_smallest(zone, order, migratetype);

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
        page = __rmqueue_fallback(zone, order, migratetype);

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */

        if (!page) {
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve;
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}


/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */

static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area * area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype]))
            continue;

        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page);
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);
        return page;
    }

    return NULL;
}
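
The lists walked here live in the zone's free_area[] array, one entry per order. Roughly, paraphrased from include/linux/mmzone.h of this kernel generation:

struct free_area {
    struct list_head    free_list[MIGRATE_TYPES];    /* one list per migrate type */
    unsigned long       nr_free;                     /* free blocks of this order */
};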


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */

static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;
        high--;
        size >>= 1;
        VM_BUG_ON(bad_range(zone, &page[size]));
        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}
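
A worked example: asking for order 1 when the smallest available block is order 3, the loop above returns page[4] to the order-2 free list and page[2] to the order-1 free list, leaving page[0..1] for the caller. The standalone userspace sketch below (not kernel code) reproduces just that split arithmetic:

#include <stdio.h>

/* Print which buddy halves go back onto the free lists */
static void expand_sketch(unsigned int low, unsigned int high)
{
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        printf("free buddy at page offset %lu, order %u\n", size, high);
    }
    printf("caller gets pages [0..%lu)\n", 1UL << low);
}

int main(void)
{
    expand_sketch(1, 3);    /* an order-1 request cut from an order-3 block */
    return 0;
}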


/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area * area;
    int current_order;
    struct page *page;
    int migratetype, i;

    /* Find the largest possible block of pages in the other list */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) {
        for (i = 0; i < MIGRATE_TYPES - 1; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE)
                continue;

            area = &(zone->free_area[current_order]);
            if (list_empty(&area->free_list[migratetype]))
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru);
            area->nr_free--;

            /*
             * If breaking a large block of pages, move all free
             * pages to the preferred allocation list. If falling
             * back for a reclaimable kernel allocation, be more
             * aggressive about taking ownership of free pages.
             */

            if (unlikely(current_order >= (pageblock_order >> 1)) ||
                    start_migratetype == MIGRATE_RECLAIMABLE ||
                    page_group_by_mobility_disabled) {
                unsigned long pages;
                pages = move_freepages_block(zone, page,
                                start_migratetype);

                /* Claim the whole block if over half of it is free */
                if (pages >= (1 << (pageblock_order-1)) ||
                        page_group_by_mobility_disabled)
                    set_pageblock_migratetype(page,
                                start_migratetype);

                migratetype = start_migratetype;
            }

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page);

            /* Take ownership for orders >= pageblock_order */
            if (current_order >= pageblock_order)
                change_pageblock_range(page, current_order,
                            start_migratetype);

            expand(zone, page, order, current_order, area, migratetype);

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype);

            return page;
        }
    }

    return NULL;
}
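
The fallbacks[] table consulted at the top of the loop encodes the preferred fallback order for each migrate type. Its rough shape in kernels of this era is shown below (paraphrased from mm/page_alloc.c; the exact contents vary between versions):

/*
 * Paraphrased fallback order: each row lists the migrate types tried,
 * in order, when the free lists of the row's own type are empty.
 * MIGRATE_RESERVE is the last resort and is handled separately above.
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* never reached */
};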

