專題:Linux內存管理專題
關鍵詞:內存規整、頁面遷移、pageblock、MIGRATE_TYPES。
內存碎片的產生:夥伴系統以頁爲單位進行管理,經過大量申請釋放,形成大量離散且不連續的頁面。這時就產生了很多碎片。
內存規整也即內存碎片整理,內存碎片也是以頁面爲單位的。實現基礎是內存頁面按照可移動性進行分組。內存規整的實現基礎是頁面遷移。
Linux內核以pageblock爲單位來管理頁的遷移屬性。
爲什麼需要內存規整?
有些情況下,物理設備需要大段連續物理內存。雖然此時空閒內存足夠,但是由於無法找到連續的物理內存,仍然會造成內存分配失敗。
下面是內存頁面分配,以及分配失敗以後採取的措施,以便促成分配成功。
可以看出採取的措施越來越重。首先採用kswapd來進行頁面回收,然後嘗試頁面規整、直接頁面回收,最後是OOM殺死進程來獲取更多內存空間。
alloc_pages-------------------------------------頁面分配的入口 ->__alloc_pages_nodemask ->get_page_from_freelist--------------------直接從zonelist的空閒列表中分配頁面 ->__alloc_pages_slowpath--------------------在初次嘗試分配失敗後,進入slowpath路徑分配頁面 ->wake_all_kswapds------------------------喚醒kswapd內核線程進行頁面回收 ->get_page_from_freelist------------------kswapd頁面回收後再次進行頁面分配 ->__alloc_pages_direct_compact------------進行頁面規整,而後進行頁面分配 ->__alloc_pages_direct_reclaim------------直接頁面回收,而後進行頁面分配 ->__alloc_pages_may_oom-------------------嘗試觸發OOM
另外一條路徑是在kswapd的balance_pgdat中會判斷是否需要進行內存規整。
kswapd ->balance_pgdat-------------------------------遍歷內存節點的zone,判斷是否處於平衡狀態即WMARK_HIGH。 ->compact_pgdat-----------------------------針對整個內存節點進行內存規整
其中compact_pgdat->__compact_pgdat->compact_zone,最終的實現和__alloc_pages_direct_compact調用compact_zone一樣。
內存規整相關有兩個節點,compact_memory用於觸發內存規整;extfrag_threshold影響內核決策是採用內存規整仍是直接回收來知足大內存分配。
節點入口代碼:
static struct ctl_table vm_table[] = { ... #ifdef CONFIG_COMPACTION { .procname = "compact_memory", .data = &sysctl_compact_memory, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, }, { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_extfrag_handler, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, #endif /* CONFIG_COMPACTION */
... { } }
打開compaction Tracepoint:echo 1 > /sys/kernel/debug/tracing/events/compaction/enable
觸發內存規整:sysctl -w vm.compact_memory=1
查看Tracepoint:cat /sys/kernel/debug/tracing/trace
在compact_zone中調用函數compaction_suitable->__compaction_suitable進行判斷是否進行內存規整。
和extfrag_threshold相關部分以下,若是當前fragindex不超過sysctl_extfrag_threshold,則不會繼續進行內存規整。
因此這個參數越小越傾向於進行內存規整,越大越不容易進行內存規整。
static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { ... fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; }
設置extfrag_threshold:sysctl -w vm.extfrag_threshold=500
/sys/kernel/debug/extfrag/extfrag_index
/sys/kernel/debug/extfrag/unusable_index
在進入細節前,先看看內存規整函數框架。
__alloc_pages_direct_compact ->try_to_compact_pages-----------------直接內存規整來知足高階分配需求 ->compact_zone_order-----------------遍歷zonelist對每一個zone進行規整 ->compact_zone---------------------對zone進行規整
->compaction_suitable------------檢查是否繼續規整,COMPACT_PARTIAL/COMPACT_SKIPPED都跳過。 ->compact_finished---------------在while中判斷是否能夠中止內存規整 ->isolate_migratepages-----------查找能夠遷移頁面
->migrate_pages------------------進行頁面遷移操做 ->get_page_from_freelist------在規整完成後進行頁面分配操做
__alloc_pages_direct_compact首先執行規整操做,而後進行頁面分配。
static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { unsigned long compact_result; struct page *page; if (!order)-----------------------------------------------------------------order爲0狀況,不用進行內存規整。 return NULL; current->flags |= PF_MEMALLOC; compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,-----進行內存規整,當前進程會置PF_MEMALLOC,避免進程遷移時發生死鎖。 mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { case COMPACT_DEFERRED: *deferred_compaction = true; /* fall-through */ case COMPACT_SKIPPED: return NULL; default: break; } ... page = get_page_from_freelist(gfp_mask, order,-----------------------------進行內存分配 alloc_flags & ~ALLOC_NO_WATERMARKS, ac); ... count_vm_event(COMPACTFAIL); cond_resched(); return NULL; }
try_to_compact_pages執行內存規整,以pageblock爲單位,選擇pageblock中可遷移頁面。
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; int rc = COMPACT_DEFERRED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,-----------根據掩碼遍歷特定zone ac->nodemask) { int status; int zone_contended; if (compaction_deferred(zone, order)) continue; status = compact_zone_order(zone, order, gfp_mask, mode,-----------------------針對特定zone進行規整 &zone_contended, alloc_flags, ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended * to clear all_zones_contended. */ all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), ac->classzone_idx, alloc_flags)) {--------------------------------當前zoen水位是否高於WMARK_LOW,若是是則退出當前循環。 /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller * will repeat this with true if allocation indeed * succeeds in this zone. */ compaction_defer_reset(zone, order, false); /* * It is possible that async compaction aborted due to * need_resched() and the watermarks were ok thanks to * somebody else freeing memory. The allocation can * however still fail so we better signal the * need_resched() contention anyway (this will not * prevent the allocation attempt). */ if (zone_contended == COMPACT_CONTENDED_SCHED) *contended = COMPACT_CONTENDED_SCHED; goto break_loop; } ... 
continue; break_loop: /* * We might not have tried all the zones, so be conservative * and assume they are not all lock contended. */ all_zones_contended = 0; break; } /* * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ if (rc > COMPACT_SKIPPED && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; }
compact_zone_order調用compact_zone,最主要的就是將參數填入struct compact_control結構體,而後和zone一塊兒做爲參數傳遞給compact_zone。
struct compact_control數據結構記錄了被遷移的頁面,以及規整過程當中遷移到的頁面列表。
static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, .order = order,------------------------------------------須要規整的頁面階數 .gfp_mask = gfp_mask,------------------------------------頁面規整的頁面掩碼 .zone = zone, .mode = mode,--------------------------------------------頁面規整模式-同步、異步 .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, }; INIT_LIST_HEAD(&cc.freepages);-------------------------------初始化遷移目的地的鏈表 INIT_LIST_HEAD(&cc.migratepages);----------------------------初始化將要遷移頁面鏈表 ret = compact_zone(zone, &cc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); *contended = cc.contended; return ret; } static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; unsigned long last_migrated_pfn = 0; ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx);-------------------------------根據當前zone水位來判斷是否須要進行內存規整,COMPACT_CONTINUE表示能夠作內存規整。 switch (ret) { case COMPACT_PARTIAL: case COMPACT_SKIPPED: /* Compaction is likely to fail */ return ret; case COMPACT_CONTINUE: /* Fall through to compaction */ ; } /* * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. kswapd does not do * this reset as it'll reset the cached information when going to sleep. */ if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) __reset_isolation_suitable(zone); /* * Setup to move all movable pages to the end of the zone. 
Used cached * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];-----------------表示從zone的開始頁面開始掃描和查找哪些頁面能夠被遷移。 cc->free_pfn = zone->compact_cached_free_pfn;-----------------------------從zone末端開始掃描和查找哪些空閒的頁面能夠用做遷移頁面的目的地。 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {-----------------下面對free_pfn和migrate_pfn進行範圍限制。 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); migrate_prep_local(); while ((ret = compact_finished(zone, cc, migratetype)) == COMPACT_CONTINUE) {-----------------------------------while中從zone開頭掃描查找合適的遷移頁面,而後嘗試遷移到zone末端空閒頁面中,直到zone處於低水位WMARK_LOW之上。 int err; unsigned long isolate_start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) {-----------------------------用於掃描和查找合適遷移的頁,從zone頭部開始找起,查找步長以pageblock_nr_pages爲單位。 case ISOLATE_ABORT: ret = COMPACT_PARTIAL; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: /* * We haven't isolated and migrated anything, but * there might still be unflushed migrations from * previous cc->order aligned block. 
*/ goto check_drain; case ISOLATE_SUCCESS: ; } err = migrate_pages(&cc->migratepages, compaction_alloc,--------------migrate_pages是頁面遷移核心函數,從cc->migratepages中摘取頁,而後嘗試去遷移。 compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; if (err) {------------------------------------------------------------沒處理成功的頁面會放回到合適的LRU鏈表中。 putback_movable_pages(&cc->migratepages); /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it */ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { ret = COMPACT_PARTIAL; goto out; } } ... } out: ... trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); return ret; }
compaction_suitable根據當前zone水位決定是否須要繼續內存規整,主要工做由__compaction_suitable進行處理。
主要依據zone低水位和extfrag_threshold兩個參數進行判斷。
unsigned long compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { unsigned long ret; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; return ret; } static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { int fragindex; unsigned long watermark; /* * order == -1 is expected when compacting via * /proc/sys/vm/compact_memory */ if (order == -1) return COMPACT_CONTINUE; watermark = low_wmark_pages(zone); /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ if (zone_watermark_ok(zone, order, watermark, classzone_idx, alloc_flags))--------------------------------------COMPACT_PARTIAL:若是知足低水位,則不須要進行內存規整。 return COMPACT_PARTIAL; /* * Watermarks for order-0 must be met for compaction. Note the 2UL. * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ watermark += (2UL << order);---------------------------------------------------增長水位高度爲watermark+2<<order。 if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))--------COMPACT_SKIPPED:若是達不到新水位,說明當前zone中空閒頁面不多,不適合做內存規整,跳過此zone。 return COMPACT_SKIPPED; /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * * index of -1000 would imply allocations might succeed depending on * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * * Only compact if a failure would be due to fragmentation. 
*/ fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)-----------------由extfrag_threshold控制的內存規整流程 return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; }
compact_finished判斷內存規整流程是否能夠結束,結束的條件有兩個:
一是cc->migrate_pfn和cc->free_pfn兩個指針相遇;二是以order爲條件判斷當前zone的水位在低水位之上。
static int compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { int ret; ret = __compact_finished(zone, cc, migratetype); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; return ret; } static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) {-----------------------------------------掃描可遷移頁面和空閒頁面,從zone的頭尾向中間運行。當二者相遇,能夠中止規整。 /* Let the next compaction start anew. */ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kswapd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ if (!current_is_kswapd()) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE;--------------------------------------------------中止內存規整 } /* * order == -1 is expected when compacting via * /proc/sys/vm/compact_memory */ if (cc->order == -1)----------------------------------------------------------order爲-1表示強制執行內存規整,繼續內存規整 return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ watermark = low_wmark_pages(zone); if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, cc->alloc_flags))--------------------------------------不知足低水位條件,繼續內存規整。 return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? 
*/ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype]))----------------------------對應遷移類型的空閒鏈表非空,說明已經有合適的空閒頁面可供分配,中止內存規整。 return COMPACT_PARTIAL; /* Job done if allocation would set block type */ if (order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; } return COMPACT_NO_SUITABLE_PAGE; }
isolate_migratepages掃描並尋找zone中可遷移頁面,結果會添加到cc->migratepages鏈表中。
掃描的一個重要參數是頁的遷移屬性,參考MIGRATE_TYPES有詳細解釋。
Linux內核以pageblock爲單位來管理頁的遷移屬性。pageblock的大小由pageblock_order決定,取決於內核配置;在4KB頁面且pageblock_order爲MAX_ORDER-1(即10)的配置下,一個pageblock大小爲4MB,即2^10個頁面。
pageblock_nr_pages即爲1024個頁面。
static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; struct page *page; const isolate_mode_t isolate_mode = (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* * Start at where we last stopped, or beginning of the zone as * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; /* Only scan within a pageblock boundary */ end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ for (; end_pfn <= cc->free_pfn;---------------------------------------從cc->migrate_pfn開始以pageblock_nr_pages爲步長向zone尾部進行掃描。 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with * many pageblocks unsuitable, so periodically check if we * need to schedule, or even abort async compaction. */ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) && compact_should_abort(cc)) break; page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); if (!page) continue; /* If isolation recently failed, do not retry */ if (!isolation_suitable(cc, page)) continue; /* * For async compaction, also only scan in MOVABLE blocks. * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ if (cc->mode == MIGRATE_ASYNC && !migrate_async_suitable(get_pageblock_migratetype(page)))----migrate_async_suitable判斷pageblock是不是MIGRATE_MOVABLE和MIGRATE_CMA兩種類型,這兩種類型能夠遷移。 continue; /* Perform the isolation */ low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, isolate_mode);---------------------------掃描和分離pageblock中的頁面是不是和遷移。 if (!low_pfn || cc->contended) { acct_isolated(zone, cc); return ISOLATE_ABORT; } /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. */ break; } acct_isolated(zone, cc); /* * Record where migration scanner will be restarted. 
If we end up in * the same pageblock as the free scanner, make the scanners fully * meet so that compact_finished() terminates compaction. */ cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; }
compaction_alloc()從zone的末尾開始查找空閒頁面,並把空閒頁面添加到cc->freepages鏈表中。而後從cc->freepages中摘除頁面,返回給migrate_pages做爲遷移使用。
compaction_free是規整失敗的處理函數,將空閒頁面返回給cc->freepages。
static struct page *compaction_alloc(struct page *migratepage, unsigned long data, int **result) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; /* * Isolate free pages if necessary, and if we are not aborting due to * contention. */ if (list_empty(&cc->freepages)) { if (!cc->contended) isolate_freepages(cc);--------------------------------------查找能夠用來做爲遷移目的頁面 if (list_empty(&cc->freepages))---------------------------------若是沒有頁面可被用來做爲遷移目的頁面,返回NULL。 return NULL; } freepage = list_entry(cc->freepages.next, struct page, lru); list_del(&freepage->lru);-------------------------------------------將空閒頁面從cc->freepages中摘除。 cc->nr_freepages--; return freepage;----------------------------------------------------找到能夠被用做遷移目的的頁面 } static void compaction_free(struct page *page, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; list_add(&page->lru, &cc->freepages);-------------------------------失敗狀況下,將頁面放回cc->freepages。 cc->nr_freepages++; }