warn_alloc():page allocation failure問題分析

關鍵詞:warn_alloc()、__GFP_XXX、order、CMA等等。

 

在內存申請的時候經常會遇到類似「xxx: page allocation failure: order:10...」類型的問題,這是warn_alloc()的輸出。

warn_alloc()被以下函數調用:__alloc_pages_slowpath()、__vmalloc_area_node()、__vmalloc_node_range()。

下面分三部分了解這種問題的前因後果:

  • 什麼情況會導致warn_alloc()被調用?
  • warn_alloc()都做了哪些事情?
  • 結合實際問題分析問題原因。

 

1.觸發warn_alloc()狀況

要了解什麼情況下會導致warn_alloc(),就需要分析它在何種情況下會被調用。

__alloc_pages_slowpath()表示頁面申請進入了slowpath,那相對就有fastpath。

從__alloc_pages_nodemask()中可知,這個fastpath就是get_page_from_freelist();__alloc_pages_slowpath()則是fastpath失敗之後分配頁面的後備選擇。

/*
 * Slow path of the page allocator, entered after the first
 * get_page_from_freelist() attempt (the fast path) has failed.
 *
 * It wakes kswapd, retries the freelist, then tries direct reclaim, direct
 * compaction and finally the OOM killer, looping through "retry" while
 * progress is being made.  Every exit that reaches "nopage" without a
 * cpuset retry ends in warn_alloc(), which prints the
 * "page allocation failure: order:N" message analysed in this article.
 *
 * Returns the allocated page, or NULL on failure.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                        struct alloc_context *ac)
{
    bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
    struct page *page = NULL;
    unsigned int alloc_flags;
    unsigned long did_some_progress;
    enum compact_priority compact_priority;
    enum compact_result compact_result;
    int compaction_retries;
    int no_progress_loops;
    unsigned long alloc_start = jiffies;
    unsigned int stall_timeout = 10 * HZ;
    unsigned int cpuset_mems_cookie;

    /* Orders at or above MAX_ORDER can never be satisfied; fail without the usual report. */
    if (order >= MAX_ORDER) {
        WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
        return NULL;
    }

    /* __GFP_ATOMIC together with __GFP_DIRECT_RECLAIM is contradictory: warn once and drop __GFP_ATOMIC. */
    if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
        gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:
    compaction_retries = 0;
    no_progress_loops = 0;
    compact_priority = DEF_COMPACT_PRIORITY;
    cpuset_mems_cookie = read_mems_allowed_begin();

    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->high_zoneidx, ac->nodemask);
    /* No suitable zone could be found: go straight to the nopage handling. */
    if (!ac->preferred_zoneref->zone)
        goto nopage;

    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac);

    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;

    /*
     * Costly request (order > PAGE_ALLOC_COSTLY_ORDER; the article takes
     * this as order > 3, i.e. more than 8 pages) from a caller that may
     * direct-reclaim and for which gfp_pfmemalloc_allowed() is false.
     */
    if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
        !gfp_pfmemalloc_allowed(gfp_mask)) {
        /* For such large requests, try direct compaction first. */
        page = __alloc_pages_direct_compact(gfp_mask, order,
                        alloc_flags, ac,
                        INIT_COMPACT_PRIORITY,
                        &compact_result);
        if (page)
            goto got_pg;

        /* Caller asked for no retries. */
        if (gfp_mask & __GFP_NORETRY) {
            /* Compaction was deferred, so it did not help: go to nopage. */
            if (compact_result == COMPACT_DEFERRED)
                goto nopage;

            compact_priority = INIT_COMPACT_PRIORITY;
        }
    }

retry:
    /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac);

    if (gfp_pfmemalloc_allowed(gfp_mask))
        alloc_flags = ALLOC_NO_WATERMARKS;

    if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
        ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
        ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->high_zoneidx, ac->nodemask);
    }

    /* Attempt with potentially adjusted zonelist and alloc_flags */
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;

    /* Caller is not willing to reclaim, we can't balance anything */
    if (!can_direct_reclaim) {
        /*
         * The freelist yielded nothing and direct reclaim/compaction is
         * not permitted, so the allocation fails here.  This is the exit
         * taken by GFP_ATOMIC requests such as the one in the article.
         */
        WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
        goto nopage;
    }

    /* Avoid recursion of direct reclaim */
    if (current->flags & PF_MEMALLOC) {

        if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
            cond_resched();
            goto retry;
        }
        goto nopage;
    }

    /* Avoid allocations with no watermarks from looping endlessly */
    if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
        goto nopage;


    /* Try direct reclaim and then allocating */
    page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
                            &did_some_progress);
    if (page)
        goto got_pg;

    /* Try direct compaction and then allocating */
    page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
                    compact_priority, &compact_result);
    if (page)
        goto got_pg;

    /* Do not loop if specifically requested */
    if (gfp_mask & __GFP_NORETRY)
        goto nopage;

    /*
     * Do not retry costly high order allocations unless they are
     * __GFP_REPEAT
     */
    if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
        goto nopage;

    /* Make sure we know about allocations which stall for too long */
    if (time_after(jiffies, alloc_start + stall_timeout)) {
        /* First warning after 10 seconds; the threshold then grows by 10 seconds per warning. */
        warn_alloc(gfp_mask,
            "page allocation stalls for %ums, order:%u",
            jiffies_to_msecs(jiffies-alloc_start), order);
        stall_timeout += 10 * HZ;
    }

    if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                 did_some_progress > 0, &no_progress_loops))
        goto retry;

    if (did_some_progress > 0 &&
            should_compact_retry(ac, order, alloc_flags,
                compact_result, &compact_priority,
                &compaction_retries))
        goto retry;

    if (read_mems_allowed_retry(cpuset_mems_cookie))
        goto retry_cpuset;

    /* Reclaim has failed us, start killing things */
    /*
     * Tries to allocate and decides whether the OOM killer is needed;
     * a non-zero did_some_progress triggers another retry below.
     * NOTE(review): the article's original annotation said orders below 3
     * never reach the OOM killer; upstream 4.9 appears to skip OOM for
     * costly orders (order > PAGE_ALLOC_COSTLY_ORDER) instead -- confirm
     * against __alloc_pages_may_oom().
     */
    page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    if (page)
        goto got_pg;

    /* Retry as long as the OOM killer is making progress */
    if (did_some_progress) {
        no_progress_loops = 0;
        goto retry;
    }

nopage:

    /* The allowed memory nodes changed while we were allocating: start over. */
    if (read_mems_allowed_retry(cpuset_mems_cookie))
        goto retry_cpuset;

    /* A block of the requested order could not be provided: report it. */
    warn_alloc(gfp_mask,
            "page allocation failure: order:%u", order);
got_pg:
    return page;
}

下面兩個函數都是vmalloc相關,__vmalloc_area_node()在分配失敗之後進入fail,調用warn_alloc()輸出log。

/*
 * Allocate the physical pages backing a vmalloc area and map them.
 *
 * A page-pointer array is allocated first, then the pages are allocated one
 * at a time (order 0) and finally mapped with map_vm_area().
 *
 * Returns area->addr on success, NULL on failure.  A failure after the
 * page-pointer array exists is reported through warn_alloc(); a failure to
 * allocate the array itself returns NULL without any message.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, int node)
{
    struct page **pages;
    unsigned int nr_pages, array_size, i;
    /* Flags used for the page-pointer array allocation. */
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    /* Per-page allocations carry __GFP_NOWARN; the failure is reported once at "fail" below. */
    const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;

    nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
    array_size = (nr_pages * sizeof(struct page *));

    area->nr_pages = nr_pages;
    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {
        pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
                PAGE_KERNEL, node, area->caller);
    } else {
        pages = kmalloc_node(array_size, nested_gfp, node);
    }
    area->pages = pages;
    if (!area->pages) {
        /* No page array: tear the area down here; this path does not call warn_alloc(). */
        remove_vm_area(area->addr);
        kfree(area);
        return NULL;
    }

    for (i = 0; i < area->nr_pages; i++) {
        struct page *page;

        if (node == NUMA_NO_NODE)
            page = alloc_page(alloc_mask);
        else
            page = alloc_pages_node(node, alloc_mask, 0);

        if (unlikely(!page)) {
            /* Successfully allocated i pages, free them in __vunmap() */
            area->nr_pages = i;
            goto fail;
        }
        area->pages[i] = page;
        if (gfpflags_allow_blocking(gfp_mask))
            cond_resched();
    }

    if (map_vm_area(area, prot, pages))
        goto fail;
    return area->addr;

fail:
    /* Warn with the caller's gfp_mask (not alloc_mask), so a caller-supplied __GFP_NOWARN is still honoured. */
    warn_alloc(gfp_mask,
              "vmalloc: allocation failure, allocated %ld of %ld bytes",
              (area->nr_pages*PAGE_SIZE), area->size);
    vfree(area->addr);
    return NULL;
}

 

 

/*
 * Allocate a virtually contiguous region of 'size' bytes within
 * [start, end) and back it with pages.
 *
 * Returns the mapped address, or NULL on failure.  warn_alloc() is called
 * here only when the size is invalid or no vm area can be obtained; when
 * __vmalloc_area_node() fails this function returns NULL without a second
 * message.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
            unsigned long start, unsigned long end, gfp_t gfp_mask,
            pgprot_t prot, unsigned long vm_flags, int node,
            const void *caller)
{
    struct vm_struct *area;
    void *addr;
    /* Keep the caller's original size for the failure message and kmemleak. */
    unsigned long real_size = size;

    size = PAGE_ALIGN(size);
    /* Reject empty requests and requests for more pages than totalram_pages. */
    if (!size || (size >> PAGE_SHIFT) > totalram_pages)
        goto fail;

    area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
                vm_flags, start, end, node, gfp_mask, caller);
    if (!area)
        goto fail;

    addr = __vmalloc_area_node(area, gfp_mask, prot, node);
    if (!addr)
        return NULL;

    clear_vm_uninitialized_flag(area);

    kmemleak_alloc(addr, real_size, 2, gfp_mask);

    return addr;

fail:
    warn_alloc(gfp_mask,
              "vmalloc: allocation failure: %lu bytes", real_size);
    return NULL;
}

 

 

2. warn_alloc()解析

warn_alloc()首先顯示相關進程和內存分配gfp_mask信息,然後打印棧信息,最後調用show_mem()輸出內存信息。

/*
 * Report a failed (or stalled) page allocation.
 *
 * Prints "<comm>: <formatted message>, mode:<gfp_mask>", then a stack dump
 * and, unless should_suppress_show_mem() says otherwise, the memory state
 * via show_mem().
 *
 * Nothing is printed when the caller passed __GFP_NOWARN, when the
 * nopage_rs rate limit is exceeded, or when debug_guardpage_minorder() > 0.
 *
 * @gfp_mask: allocation flags of the failed request
 * @fmt:      printf-style message, followed by its arguments
 */
void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{
    unsigned int filter = SHOW_MEM_FILTER_NODES;
    struct va_format vaf;
    va_list args;

    if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
        debug_guardpage_minorder() > 0)
        return;

    /*
     * Clear SHOW_MEM_FILTER_NODES from the filter passed to show_mem()
     * for TIF_MEMDIE, PF_MEMALLOC or PF_EXITING tasks (unless
     * __GFP_NOMEMALLOC is set), in interrupt context, and for requests
     * without __GFP_DIRECT_RECLAIM.
     */
    if (!(gfp_mask & __GFP_NOMEMALLOC))
        if (test_thread_flag(TIF_MEMDIE) ||
            (current->flags & (PF_MEMALLOC | PF_EXITING)))
            filter &= ~SHOW_MEM_FILTER_NODES;
    if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
        filter &= ~SHOW_MEM_FILTER_NODES;

    /* Name of the current process. */
    pr_warn("%s: ", current->comm);

    /* The message and arguments handed to warn_alloc(). */
    va_start(args, fmt);
    vaf.fmt = fmt;
    vaf.va = &args;
    pr_cont("%pV", &vaf);
    va_end(args);

    /* The gfp_mask, both as a number and decoded flag names. */
    pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);

    /* Call stack of the failing allocation. */
    dump_stack();
    /* Memory statistics -- the most important part of the report. */
    if (!should_suppress_show_mem())
        show_mem(filter);
}

show_mem()顯示詳細的內存信息。

/*
 * Print the "Mem-Info:" section of an allocation-failure report.
 *
 * First delegates to show_free_areas() for the detailed free-page
 * statistics, then prints platform-wide page counts: total RAM, highmem,
 * reserved and, depending on the kernel configuration, CMA-reserved,
 * quicklist and hwpoisoned pages.
 *
 * @filter: passed through unchanged to show_free_areas()
 */
void show_mem(unsigned int filter)
{
    pg_data_t *pgdat;
    unsigned long total = 0, reserved = 0, highmem = 0;

    printk("Mem-Info:\n");
    show_free_areas(filter);

    for_each_online_pgdat(pgdat) {
        unsigned long flags;
        int zoneid;

        pgdat_resize_lock(pgdat, &flags);
        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
            struct zone *zone = &pgdat->node_zones[zoneid];
            if (!populated_zone(zone))
                continue;

            total += zone->present_pages;
            /* Pages that are present but not managed count as reserved. */
            reserved += zone->present_pages - zone->managed_pages;

            if (is_highmem_idx(zoneid))
                highmem += zone->present_pages;
        }
        pgdat_resize_unlock(pgdat, &flags);
    }

    /* Platform-wide page statistics: total, highmem, reserved, cma, ... */
    printk("%lu pages RAM\n", total);
    printk("%lu pages HighMem/MovableOnly\n", highmem);
    printk("%lu pages reserved\n", reserved);
#ifdef CONFIG_CMA
    printk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_QUICKLIST
    printk("%lu pages in pagetable cache\n",
        quicklist_total_size());
#endif
#ifdef CONFIG_MEMORY_FAILURE
    printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
}

show_free_areas()依次從所有node彙總、各個node、各個zone、同一zone下不同order四個層面顯示空閒頁面信息。

void show_free_areas(unsigned int filter)
{
    unsigned long free_pcp = 0;
    int cpu;
    struct zone *zone;
    pg_data_t *pgdat;

    for_each_populated_zone(zone) {
        if (skip_free_areas_node(filter, zone_to_nid(zone)))
            continue;

        for_each_online_cpu(cpu)
            free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
    }

    printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"-----------------顯示全部node的統計信息。
        " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
        " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
        " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
        " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
        " free:%lu free_pcp:%lu free_cma:%lu\n",
        global_node_page_state(NR_ACTIVE_ANON),
        global_node_page_state(NR_INACTIVE_ANON),
        global_node_page_state(NR_ISOLATED_ANON),
        global_node_page_state(NR_ACTIVE_FILE),
        global_node_page_state(NR_INACTIVE_FILE),
        global_node_page_state(NR_ISOLATED_FILE),
        global_node_page_state(NR_UNEVICTABLE),
        global_node_page_state(NR_FILE_DIRTY),
        global_node_page_state(NR_WRITEBACK),
        global_node_page_state(NR_UNSTABLE_NFS),
        global_page_state(NR_SLAB_RECLAIMABLE),
        global_page_state(NR_SLAB_UNRECLAIMABLE),
        global_node_page_state(NR_FILE_MAPPED),
        global_node_page_state(NR_SHMEM),
        global_page_state(NR_PAGETABLE),
        global_page_state(NR_BOUNCE),
        global_page_state(NR_FREE_PAGES),
        free_pcp,
        global_page_state(NR_FREE_CMA_PAGES));

    for_each_online_pgdat(pgdat) {-------------------------------------------------分別顯示不一樣node的統計信息。
        printk("Node %d"
            " active_anon:%lukB"
            " inactive_anon:%lukB"
            " active_file:%lukB"
            " inactive_file:%lukB"
            " unevictable:%lukB"
            " isolated(anon):%lukB"
            " isolated(file):%lukB"
            " mapped:%lukB"
            " dirty:%lukB"
            " writeback:%lukB"
            " shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
            " shmem_thp: %lukB"
            " shmem_pmdmapped: %lukB"
            " anon_thp: %lukB"
#endif
            " writeback_tmp:%lukB"
            " unstable:%lukB"
            " pages_scanned:%lu"
            " all_unreclaimable? %s"
            "\n",
            pgdat->node_id,
            K(node_page_state(pgdat, NR_ACTIVE_ANON)),
            K(node_page_state(pgdat, NR_INACTIVE_ANON)),
            K(node_page_state(pgdat, NR_ACTIVE_FILE)),
            K(node_page_state(pgdat, NR_INACTIVE_FILE)),
            K(node_page_state(pgdat, NR_UNEVICTABLE)),
            K(node_page_state(pgdat, NR_ISOLATED_ANON)),
            K(node_page_state(pgdat, NR_ISOLATED_FILE)),
            K(node_page_state(pgdat, NR_FILE_MAPPED)),
            K(node_page_state(pgdat, NR_FILE_DIRTY)),
            K(node_page_state(pgdat, NR_WRITEBACK)),
            K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
            K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
            K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
                    * HPAGE_PMD_NR),
            K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
            K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
            K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
            node_page_state(pgdat, NR_PAGES_SCANNED),
            !pgdat_reclaimable(pgdat) ? "yes" : "no");
    }

    for_each_populated_zone(zone) {----------------------------------------------分別顯示全部zone的統計信息。 int i;

        if (skip_free_areas_node(filter, zone_to_nid(zone)))
            continue;

        free_pcp = 0;
        for_each_online_cpu(cpu)
            free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;

        show_node(zone);
        printk(KERN_CONT
            "%s"
            " free:%lukB"
            " min:%lukB"
            " low:%lukB"
            " high:%lukB"
            " active_anon:%lukB"
            " inactive_anon:%lukB"
            " active_file:%lukB"
            " inactive_file:%lukB"
            " unevictable:%lukB"
            " writepending:%lukB"
            " present:%lukB"
            " managed:%lukB"
            " mlocked:%lukB"
            " slab_reclaimable:%lukB"
            " slab_unreclaimable:%lukB"
            " kernel_stack:%lukB"
            " pagetables:%lukB"
            " bounce:%lukB"
            " free_pcp:%lukB"
            " local_pcp:%ukB"
            " free_cma:%lukB"
            "\n",
            zone->name,
            K(zone_page_state(zone, NR_FREE_PAGES)),
            K(min_wmark_pages(zone)),
            K(low_wmark_pages(zone)),
            K(high_wmark_pages(zone)),
            K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
            K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
            K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
            K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
            K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
            K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
            K(zone->present_pages),
            K(zone->managed_pages),
            K(zone_page_state(zone, NR_MLOCK)),
            K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
            K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
            zone_page_state(zone, NR_KERNEL_STACK_KB),
            K(zone_page_state(zone, NR_PAGETABLE)),
            K(zone_page_state(zone, NR_BOUNCE)),
            K(free_pcp),
            K(this_cpu_read(zone->pageset->pcp.count)),
            K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
        printk("lowmem_reserve[]:");
        for (i = 0; i < MAX_NR_ZONES; i++)
            printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
        printk(KERN_CONT "\n");
    }

    for_each_populated_zone(zone) {-------------------------------------------顯示全部zone下不一樣order空閒數目統計信息。
        unsigned int order;
        unsigned long nr[MAX_ORDER], flags, total = 0;
        unsigned char types[MAX_ORDER];

        if (skip_free_areas_node(filter, zone_to_nid(zone)))
            continue;
        show_node(zone);
        printk(KERN_CONT "%s: ", zone->name);

        spin_lock_irqsave(&zone->lock, flags);
        for (order = 0; order < MAX_ORDER; order++) {-------------------------遍歷當前zone的不一樣order,不一樣order區域數目存在nr[]中,total是總的頁面數目。 struct free_area *area = &zone->free_area[order];
            int type;

            nr[order] = area->nr_free;
            total += nr[order] << order;

            types[order] = 0;
            for (type = 0; type < MIGRATE_TYPES; type++) {
                if (!list_empty(&area->free_list[type]))
                    types[order] |= 1 << type;--------------------------------記錄order區域中頁面類型。
            }
        }
        spin_unlock_irqrestore(&zone->lock, flags);
        for (order = 0; order < MAX_ORDER; order++) {
            printk(KERN_CONT "%lu*%lukB ",
                   nr[order], K(1UL) << order);-------------------------------輸出不一樣order區域數量和區域大小。 if (nr[order])
                show_migration_types(types[order]);---------------------------輸出頁面類型。
        }
        printk(KERN_CONT "= %lukB\n", K(total));------------------------------顯示總大小。
    }

    hugetlb_show_meminfo();---------------------------------------------------顯示huge page統計信息。

    printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));---總的文件緩存頁面數量。

    show_swap_cache_info();----------------------------------------------------顯示swap cache統計信息。
}

不同的頁面有不同的遷移類型,warn_alloc()輸出中括號內的字母對應了頁面的遷移類型,主要有U、M、E、C(另外還有H和I)。

/*
 * Print the migrate types encoded in 'type' as a parenthesised string of
 * letters, e.g. "(UEC) ".
 *
 * @type: bitmask in which bit i is set when migrate type i is present
 *        (built in show_free_areas() as 1 << migratetype)
 */
static void show_migration_types(unsigned char type)
{
    static const char types[MIGRATE_TYPES] = {
        [MIGRATE_UNMOVABLE]    = 'U',    /* unmovable */
        [MIGRATE_MOVABLE]    = 'M',      /* movable */
        [MIGRATE_RECLAIMABLE]    = 'E',  /* reclaimable */
        /* per the article, the same enum value as MIGRATE_PCPTYPES */
        [MIGRATE_HIGHATOMIC]    = 'H',
#ifdef CONFIG_CMA
        [MIGRATE_CMA]        = 'C',      /* page in a CMA region */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        [MIGRATE_ISOLATE]    = 'I',
#endif
    };
    char tmp[MIGRATE_TYPES + 1];
    char *p = tmp;
    int i;

    /* Append one letter per set bit, in migrate-type order. */
    for (i = 0; i < MIGRATE_TYPES; i++) {
        if (type & (1 << i))
            *p++ = types[i];
    }

    *p = '\0';
    printk(KERN_CONT "(%s) ", tmp);
}

通過上面的分析,基本上明白了每一行輸出的來源。具體每一個字段表示的內存含義,還需要結合代碼閱讀。

3. 實例解析

下面結合實際問題log輸出來分析問題,進而解決問題。

表示進程xxxx在分配order爲10(即2^10=1024個連續物理頁面,頁大小4KB時共4MB)的內存時失敗,mode表示內存分配的頁模式,具體在include/linux/gfp.h中定義。

內存碎片會致使page分配失敗,即便還有不少空閒page。當order=0時,表示系統當前已經徹底OOM。

[ 2161.623563] xxxx: page allocation failure: order:10, mode:0x2084020(GFP_ATOMIC|__GFP_COMP)-----------------warn_alloc(),從這裏能夠知道是哪一個進程頁面分配失敗,而且有對應的gfp_mask。
[ 2161.632085] CPU: 0 PID: 179 Comm: AiApp Not tainted 4.9.56 #53---------------------------------------------dump_stack(),棧信息指出了更詳細的調用路徑。
[ 2161.637947] 
Call Trace:
[<802f63f2>] dump_stack+0x1e/0x3c
[<800f6cf4>] warn_alloc+0x100/0x148
[<800f709c>] __alloc_pages_nodemask+0x2bc/0xb5c
[<801120fe>] kmalloc_order+0x26/0x48
[<80112158>] kmalloc_order_trace+0x38/0x98
[<8012c5d8>] __kmalloc+0xf4/0x12c
[<8048ac78>] alloc_ep_req+0x5c/0x98
[<8048f232>] source_sink_recv+0x2a/0xe0
[<8048f35e>] usb_sourcesink_bulk_read+0x76/0x1c8
[<8048f770>] usb_sourcesink_read+0xfc/0x2c8
[<80134d58>] __vfs_read+0x30/0x108
[<80135c14>] vfs_read+0x94/0x128
[<80136d12>] SyS_read+0x52/0xd4
[<8004a246>] csky_systemcall+0x96/0xe0
[ 2161.689204] Mem-Info:--------------------------------------------------------------show_mem()
[ 2161.691518] active_anon:3268 inactive_anon:2 isolated_anon:0-----------------------全部node統計信息。
[ 2161.691518]  active_file:1271 inactive_file:89286 isolated_file:0
[ 2161.691518]  unevictable:0 dirty:343 writeback:0 unstable:0
[ 2161.691518]  slab_reclaimable:2019 slab_unreclaimable:644
[ 2161.691518]  mapped:4282 shmem:4 pagetables:59 bounce:0
[ 2161.691518]  free:62086 free_pcp:199 free_cma:60234
--------------------------------------------------------------------------------------只有一個node,輸出node 0統計信息。
[ 2161.724334] Node 0 active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:17128kB dirty:1372kB writeback:0kB shmem:16kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no
--------------------------------------------------------------------------------------輸出Normal zone統計信息。
[ 2161.748626] Normal free:248344kB min:2444kB low:3052kB high:3660kB active_anon:13072kB inactive_anon:8kB active_file:5084kB inactive_file:357144kB unevictable:0kB writepending:1372kB present:1048572kB managed:734568kB mlocked:0kB slab_reclaimable:8076kB slab_unreclaimable:2576kB kernel_stack:608kB pagetables:236kB bounce:0kB free_pcp:796kB local_pcp:796kB free_cma:240936kB
[ 2161.781670] lowmem_reserve[]: 0 0 0
---------------------------------------------------------------------------------------輸出Normal zone下不同order的空閒情況,包括其中頁面屬性。
[ 2161.785225] Normal: 4*4kB (UEC) 3*8kB (EC) 3*16kB (UEC) 2*32kB (UE) 2*64kB (UE) 2*128kB (UE) 2*256kB (EC) 1*512kB (E) 3*1024kB (UEC) 3*2048kB (UEC) 58*4096kB (C) = 248344kB
90573 total pagecache pages
---------------------------------------------------------------------------------------整個平臺頁面統計信息。
[ 2161.803526] 262143 pages RAM
[ 2161.806410] 0 pages HighMem/MovableOnly
[ 2161.810264] 78501 pages reserved
[ 2161.813509] 90112 pages cma reserved

從stack信息能夠得知,alloc_ep_req()是分配內存的起點。

/*
 * Allocate a USB request for endpoint 'ep' together with a data buffer.
 *
 * For OUT endpoints the buffer length is 'len' adjusted by usb_ep_align();
 * otherwise it is 'len' itself.  Both the request and the buffer are
 * allocated with GFP_ATOMIC.
 *
 * Returns the request, or NULL when either allocation fails (a request
 * whose buffer could not be allocated is freed again).
 */
struct usb_request *alloc_ep_req(struct usb_ep *ep, size_t len)
{
    struct usb_request *request = usb_ep_alloc_request(ep, GFP_ATOMIC);

    if (!request)
        return NULL;

    if (usb_endpoint_dir_out(ep->desc))
        request->length = usb_ep_align(ep, len);
    else
        request->length = len;

    /* The kmalloc() call seen in the article's stack trace. */
    request->buf = kmalloc(request->length, GFP_ATOMIC);
    if (request->buf)
        return request;

    usb_ep_free_request(ep, request);
    return NULL;
}

 

3.1 GFP_ATOMIC和__GFP_COMP:頁面分配標誌

從代碼可知此時gfp_mask爲GFP_ATOMIC,這種情況下不帶__GFP_DIRECT_RECLAIM,即不允許頁面直接回收。

/* Atomic allocation: high priority, atomic context, wakes kswapd; no __GFP_DIRECT_RECLAIM. */
#define GFP_ATOMIC    (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
/* Marks a higher-priority request. */
#define __GFP_HIGH    ((__force gfp_t)___GFP_HIGH)
/* Caller cannot reclaim pages or sleep, and is high priority; typically used in interrupt handling. */
#define __GFP_ATOMIC    ((__force gfp_t)___GFP_ATOMIC)
/* Wake the kswapd thread when allocating memory. */
#define __GFP_KSWAPD_RECLAIM    ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
/* Compound-page flag: treat two or more pages as a single page. */
#define __GFP_COMP    ((__force gfp_t)___GFP_COMP)

GFP位掩碼定義以下:

/*
 * Raw bit values behind the __GFP_* flags.
 *
 * Decoding the mode from the log in this article:
 *   GFP_ATOMIC = ___GFP_HIGH | ___GFP_ATOMIC | ___GFP_KSWAPD_RECLAIM
 *              = 0x20 | 0x80000 | 0x2000000 = 0x2080020
 *   0x2080020 | ___GFP_COMP (0x4000) = 0x2084020
 * which matches "mode:0x2084020(GFP_ATOMIC|__GFP_COMP)".
 */
#define ___GFP_DMA        0x01u
#define ___GFP_HIGHMEM        0x02u
#define ___GFP_DMA32        0x04u
#define ___GFP_MOVABLE        0x08u
#define ___GFP_RECLAIMABLE    0x10u
#define ___GFP_HIGH        0x20u
#define ___GFP_IO        0x40u
#define ___GFP_FS        0x80u
#define ___GFP_COLD        0x100u
#define ___GFP_NOWARN        0x200u
#define ___GFP_REPEAT        0x400u
#define ___GFP_NOFAIL        0x800u
#define ___GFP_NORETRY        0x1000u
#define ___GFP_MEMALLOC        0x2000u
#define ___GFP_COMP        0x4000u
#define ___GFP_ZERO        0x8000u
#define ___GFP_NOMEMALLOC    0x10000u
#define ___GFP_HARDWALL        0x20000u
#define ___GFP_THISNODE        0x40000u
#define ___GFP_ATOMIC        0x80000u
#define ___GFP_ACCOUNT        0x100000u
#define ___GFP_NOTRACK        0x200000u
#define ___GFP_DIRECT_RECLAIM    0x400000u
#define ___GFP_OTHER_NODE    0x800000u
#define ___GFP_WRITE        0x1000000u
#define ___GFP_KSWAPD_RECLAIM    0x2000000u

 

3.2 gfp和migrate轉換,進而得到alloc_flags:爲什麼不能使用CMA區域?

gfp_mask決定了申請頁面的migratetype,而後在CMA存在的狀況下根據migratetype決定是否可用CMA區域。

/*
 * Translate a gfp_mask into the allocator-internal ALLOC_* flags.
 *
 * Starts from ALLOC_WMARK_MIN | ALLOC_CPUSET, copies __GFP_HIGH across as
 * ALLOC_HIGH, lets atomic requests and real-time tasks try harder, and --
 * with CONFIG_CMA -- permits CMA pages only for movable requests.
 */
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
    unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

    /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
    BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);

    /* __GFP_HIGH maps directly onto ALLOC_HIGH. */
    alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);

    if (gfp_mask & __GFP_ATOMIC) {

        if (!(gfp_mask & __GFP_NOMEMALLOC))
            alloc_flags |= ALLOC_HARDER;

        /* Atomic requests are not restricted by cpuset. */
        alloc_flags &= ~ALLOC_CPUSET;
    } else if (unlikely(rt_task(current)) && !in_interrupt())
        alloc_flags |= ALLOC_HARDER;

#ifdef CONFIG_CMA
    /*
     * Convert gfp_mask to a migrate type; only MIGRATE_MOVABLE requests may
     * allocate from the CMA region.  In other words the gfp flags must
     * contain __GFP_MOVABLE for CMA pages to be usable.
     */
    if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    return alloc_flags;
}

/* The two mobility bits: ___GFP_MOVABLE is 0x08, ___GFP_RECLAIMABLE is 0x10. */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

/*
 * Derive the migrate type of an allocation from its gfp flags.
 *
 * Returns MIGRATE_UNMOVABLE when grouping by mobility is disabled;
 * otherwise the two mobility bits shifted down to a migrate-type value.
 */
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
    /* Setting both mobility bits at once is not a valid request. */
    VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
    BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
    BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

    if (unlikely(page_group_by_mobility_disabled))
        return MIGRATE_UNMOVABLE;

    /* Group based on mobility */
    /*
     * Mask with __GFP_RECLAIMABLE|__GFP_MOVABLE and shift right by 3 bits:
     * ___GFP_MOVABLE becomes MIGRATE_MOVABLE and __GFP_RECLAIMABLE becomes
     * MIGRATE_RECLAIMABLE.
     */
    return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}

由這次申請的gfp_mask可知其中沒有___GFP_MOVABLE,所以alloc_flags不會包括ALLOC_CMA。反之,如果要複用CMA進行內存申請,需要在gfp_mask中包括__GFP_MOVABLE。

從Normal區域空閒頁面可以看出,有58個4MB的空閒塊,但是它們都屬於CMA區域,所以申請不成功。

 

3.3 問題的根源

結合warn_alloc()和實例概括以下:

1. 雖然存在很多空閒內存,但是alloc_ep_req()無法使用

因爲alloc_ep_req()申請內存的gfp_mask爲GFP_ATOMIC|__GFP_COMP。

由於不具備__GFP_MOVABLE,所以即使存在很多空閒的4MB連續頁面,也無法使用,因爲這些4MB頁面都是CMA的。

[ 2161.785225] Normal: 4*4kB (UEC) 3*8kB (EC) 3*16kB (UEC) 2*32kB (UE) 2*64kB (UE) 2*128kB (UE) 2*256kB (EC) 1*512kB (E) 3*1024kB (UEC) 3*2048kB (UEC) 58*4096kB (C) = 248344kB-----光4MB的CMA塊就達到了232MB(58×4MB),其餘合計只有約10.5MB(248344kB-237568kB=10776kB)。

2. 爲何剩下的內存絕大部分是CMA?

從Normal區域空閒頁面情況看,絕大部分都是CMA的。但是初始化的時候存在很多其他類型的頁面。

通過cat /proc/pagetypeinfo查看前後對比,可以發現Movable類型的頁面基本被申請完。

所以這裏懷疑是內存泄漏,通過下面腳本跟蹤MemFree。

while true; do cat /proc/meminfo | grep MemFree; sleep 10; done

發現空閒內存在不停地下降,降到260M左右的時候出現warn_alloc()。

因此問題的根源在內存泄漏。

3. 如何下降內存碎片?

對內存碎片,可以通過內存規整(compaction)來解決。請參考《Linux內存管理 (16)內存規整》。

4. 調整/proc/sys/vm/min_free_kbytes

相關文章
相關標籤/搜索