Chinaunix首页 | 论坛 | 博客
  • 博客访问: 439522
  • 博文数量: 123
  • 博客积分: 2686
  • 博客等级: 少校
  • 技术积分: 1349
  • 用 户 组: 普通用户
  • 注册时间: 2009-12-23 22:11
文章分类
文章存档

2012年(3)

2011年(10)

2010年(100)

2009年(10)

我的朋友

分类: LINUX

2010-10-19 11:31:48

[_alloc_pages() > get_page_from_freelist() > buffered_rmqueue()]

it simply removes pages from the normal buddy system and adds them to the cache

If more than one page is to be allocated (as handled in the else branch), the kernel calls __rmqueue to select a suitable page block from the zone’s buddy lists. If necessary, the function automatically breaks down larger blocks and puts unused parts back in the lists (how this is done is described below).

Caution:
It can be the case that there are enough free pages in the zone to satisfy the allocation request, but that the pages are not contiguous. In this case, __rmqueue fails, and a NULL pointer is returned.


/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */

static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);//see Commnent 0

again://see Comment 1
    if (likely(order == 0)) {
        struct per_cpu_pages *pcp;//see below Comment 2
        struct list_head *list;

        local_irq_save(flags);

      
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];

        //if list is empty, refill the cache
        if (list_empty(list)) {

            //see Comment 3
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */

            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
    }

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));

    //see comment 4
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}



Comment 0:
If GFP_COLD is set in the allocation flags, then a cache-cold page must be taken from the per-CPU allocator if any exists. The double negation ensures that cold is either 0 or 1.

Comment 1:
When only a single page is desired, the kernel tries to speed up the request with the help of the per-CPU cache. If the cache is empty, the kernel takes the opportunity to check the cache fill level.







Comment 2:
struct per_cpu_pages {
    int count;        /* number of pages in the list */
    int high;        /* high watermark, emptying needed */
    int batch;        /* chunk size for buddy add/remove */

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};

Comment 3:

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */

static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, int cold)
{
    int i;
    
    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) {

        //see Comment 3.1 about __rmqueue()
        struct page *page = __rmqueue(zone, order, migratetype);
        if (unlikely(page == NULL))
            break;

        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */

        if (likely(cold == 0))
            list_add(&page->lru, list);
        else
            list_add_tail(&page->lru, list);
        set_page_private(page, migratetype);
        list = &page->lru;
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    spin_unlock(&zone->lock);
    return i;
}

 

Comment 3.1:


/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */

static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:

    //see comment 3.1.1
    page = __rmqueue_smallest(zone, order, migratetype);

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
        page = __rmqueue_fallback(zone, order, migratetype);

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */

        if (!page) {
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve;
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}



Comment 3.1.1:

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */

static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area * area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype]))
            continue;

        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page);//see following function definition
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);//see following function definition
        return page;
    }

    return NULL;
}


static inline void rmv_page_order(struct page *page)
{
    __ClearPageBuddy(page);
    set_page_private(page, 0);
}


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */

static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;
        high--;
        size >>= 1;
        VM_BUG_ON(bad_range(zone, &page[size]));
        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}



Comment 4:
prep_new_page performs several checks on the pages to ensure that they leave the allocator in a perfect state — this means, in particular, that the page must not be in use in existing mappings and no incorrect flags like PG_locked or PG_buddy may be set because this would imply that the page is in use somewhere else and should not be on the free list. Normally, however, no error should occur because this would imply a kernel error elsewhere.


static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
    int i;

    for (i = 0; i < (1 << order); i++) {
        struct page *p = page + i;
        if (unlikely(check_new_page(p)))
            return 1;
    }

    set_page_private(page, 0);
    set_page_refcounted(page);

    arch_alloc_page(page, order);
    kernel_map_pages(page, 1 << order, 1);

    if (gfp_flags & __GFP_ZERO)
        prep_zero_page(page, order, gfp_flags);

    if (order && (gfp_flags & __GFP_COMP))
        prep_compound_page(page, order);

    return 0;
}


阅读(861) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~