夥伴系統中用於分配頁的函數如下:
alloc_pages(mask, order)分配2^order頁,並返回一個struct page的實例,表示分配的內存塊的起始頁。alloc_page(mask)是前者在order=0情況下的簡化形式,只分配一頁。
get_zeroed_page(mask)分配一頁並返回一個page實例,頁對應的內存填充0(其他所有函數分配之後的內容是未定義的)。
__get_free_pages(mask, order)和__get_free_page(mask)的工作方式與上述函數相同,但返回分配內存塊的虛擬地址,而不是page實例。
get_dma_pages(gfp_mask, order)用來獲得適用於DMA的頁。
在空閒內存無法滿足請求以致分配失敗的情況下,所有上述函數都返回空指針(alloc_pages和alloc_page)或者0(get_zeroed_page、__get_free_pages和__get_free_page)。因此內核在每次分配之後都必須檢查返回的結果。這種慣例與設計良好的用戶層應用程序沒有什麼不同,但在內核中忽略檢查將導致嚴重得多的故障。
前述所有函數中使用的mask參數的語義是什麼?Linux將內存劃分為內存域,內核提供了所謂的內存域修飾符,用來指定從哪個內存域分配所需的頁。
- #define __GFP_DMA ((__force gfp_t)0x01u) /* allocate from ZONE_DMA */
- #define __GFP_HIGHMEM ((__force gfp_t)0x02u) /* allocate from ZONE_HIGHMEM */
- #define __GFP_DMA32 ((__force gfp_t)0x04u) /* allocate from ZONE_DMA32 */
除了內存域修飾符之外,掩碼中還可以設置一些標誌。這些額外的標誌並不限制從哪個物理內存段分配內存,但確實可以改變分配器的行為。
- #define __GFP_WAIT ((__force gfp_t)0x10u) // The request may sleep: the scheduler may run other processes meanwhile, and the request may be interrupted by a more important event.
- #define __GFP_HIGH ((__force gfp_t)0x20u) // The request is very important, i.e. the kernel urgently needs the memory; usually set when an allocation failure would have grave consequences.
- #define __GFP_IO ((__force gfp_t)0x40u) // The kernel may perform I/O while searching for free memory; pages chosen for swap-out may only be written to disk if this flag is set.
- #define __GFP_FS ((__force gfp_t)0x80u) // The kernel is allowed to perform VFS operations.
- #define __GFP_COLD ((__force gfp_t)0x100u) // A "cold" page that is not resident in the CPU cache is wanted.
- #define __GFP_NOWARN ((__force gfp_t)0x200u) // Suppress the kernel's failure warning when the allocation fails.
- #define __GFP_REPEAT ((__force gfp_t)0x400u) // Retry automatically after a failure, but give up after a few attempts.
- #define __GFP_NOFAIL ((__force gfp_t)0x800u) // Keep retrying after a failure until the allocation succeeds.
- #define __GFP_NORETRY ((__force gfp_t)0x1000u)// Do not retry; the allocation may simply fail.
- #define __GFP_COMP ((__force gfp_t)0x4000u)// Add compound-page metadata.
- #define __GFP_ZERO ((__force gfp_t)0x8000u)// On success, return a page filled with zero bytes.
- #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) // Do not fall back to the emergency reserve lists.
- #define __GFP_HARDWALL ((__force gfp_t)0x20000u) // NUMA only: restrict the allocation to the nodes of the CPUs the current process may run on; meaningless if the process may run on every CPU (the default). Only relevant when the process's CPUs are restricted.
- #define __GFP_THISNODE ((__force gfp_t)0x40000u)// NUMA only: no fallback to other nodes on failure; the allocation must succeed on the current (or explicitly given) node.
- #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) // Mark the allocated memory as reclaimable.
- #define __GFP_MOVABLE ((__force gfp_t)0x100000u) // Mark the allocated memory as movable.
- 
- #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
- #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
- 
- /* This equals 0, but use constants in case they ever change */
- #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
由於這些標誌總是組合使用,內核做了一些分組,包含了用於各種標準情形的適當標誌。
- #define GFP_ATOMIC (__GFP_HIGH)// atomic allocation: must never sleep; may dip into the emergency reserves
- #define GFP_NOIO (__GFP_WAIT)// I/O operations explicitly forbidden, but sleeping is allowed
- #define GFP_NOFS (__GFP_WAIT | __GFP_IO)// VFS operations explicitly forbidden, but sleeping is allowed
- #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)// default configuration for kernel-internal allocations
- #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_RECLAIMABLE)
- #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)// default configuration for user-space allocations
- #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
- __GFP_HIGHMEM)// extension of GFP_USER for pages used by user space; allows highmem pages with no direct kernel mapping — harmless for user space, whose address space is always reached through page tables
- #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_HIGHMEM | \
- __GFP_MOVABLE)// like GFP_HIGHUSER, but allocated from the virtual zone ZONE_MOVABLE
- #define GFP_NOFS_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_MOVABLE)
- #define GFP_USER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_MOVABLE)
- #define GFP_HIGHUSER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
- __GFP_HARDWALL | __GFP_HIGHMEM | \
- __GFP_MOVABLE)
- 
- #ifdef CONFIG_NUMA
- #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
- #else
- #define GFP_THISNODE ((__force gfp_t)0)
- #endif
-
- /* This mask makes up all the page movable related flags */
- #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
- 
- /* Control page allocator reclaim behavior */
- #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
- __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
- __GFP_NORETRY|__GFP_NOMEMALLOC)
- 
- /* Control allocation constraints */
- #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
- 
- /* Do not use these with a slab allocator */
- #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
- 
- /* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
- platforms, used as appropriate on others */
- 
- #define GFP_DMA __GFP_DMA
- 
- /* 4GB DMA on some platforms */
- #define GFP_DMA32 __GFP_DMA32
- /* convenience wrappers around the core allocators */
- #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
- #define __get_free_page(gfp_mask) \
- __get_free_pages((gfp_mask),0)
- #define __get_dma_pages(gfp_mask, order) \
- __get_free_pages((gfp_mask) | GFP_DMA,(order))
- /* Allocate 2^order pages and return the kernel virtual address of the
-  * first page, or 0 on failure.  Unlike alloc_pages(), the caller gets
-  * an address rather than a struct page instance. */
- fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
- {
- struct page * page;
- page = alloc_pages(gfp_mask, order);// delegate the real work to the buddy allocator
- if (!page)
- return 0;// allocation failed
- return (unsigned long) page_address(page);// translate struct page to its virtual address
- }
- /* alloc_pages() allocates on the node of the CPU we are running on */
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order)
根據上面的代碼,可以得出各個分配函數之間的關係如下圖所示:
![](http://static.javashuo.com/static/loading.gif)
主要的函數是alloc_pages_node。alloc_pages_node源代碼的詳細分析如下:
- /* Allocate 2^order pages from node nid (or the local node if nid < 0). */
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- {
- if (unlikely(order >= MAX_ORDER))// sanity check: refuse overly large allocations
- return NULL;
- 
- /* Unknown node is current node */
- if (nid < 0)// a negative (nonexistent) node id means: use the node of the CPU we are currently running on
- nid = numa_node_id();
- 
- return __alloc_pages(gfp_mask, order,
- NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));// delegate to __alloc_pages with the matching zonelist; gfp_zone selects the zone to allocate from
- }
- static inline enum zone_type gfp_zone(gfp_t flags)// map the zone modifier bits in flags to the zone to allocate from
- {
- int base = 0;
- 
- #ifdef CONFIG_NUMA
- if (flags & __GFP_THISNODE)// __GFP_THISNODE zonelists are stored behind the regular ones
- base = MAX_NR_ZONES;
- #endif
- 
- #ifdef CONFIG_ZONE_DMA
- if (flags & __GFP_DMA)
- return base + ZONE_DMA;
- #endif
- #ifdef CONFIG_ZONE_DMA32
- if (flags & __GFP_DMA32)
- return base + ZONE_DMA32;
- #endif
- if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
- (__GFP_HIGHMEM | __GFP_MOVABLE))// both bits set: use the pseudo zone ZONE_MOVABLE
- return base + ZONE_MOVABLE;
- #ifdef CONFIG_HIGHMEM
- if (flags & __GFP_HIGHMEM)
- return base + ZONE_HIGHMEM;
- #endif
- return base + ZONE_NORMAL;// default when no zone modifier is given
- }
__alloc_pages源代碼的詳細分析如下:
- /*
-  * Heart of the zoned buddy allocator: try ever harder to find 2^order
-  * free pages.  gfp_mask controls how the search may behave, order is
-  * the size of the request, and zonelist is the ordered list of zones
-  * that are scanned for one with enough free memory.
-  * (Reconstructed from the garbled original quote; matches 2.6.24.)
-  */
- struct page * fastcall
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
- 		struct zonelist *zonelist)
- {
- 	const gfp_t wait = gfp_mask & __GFP_WAIT;	/* may we sleep? */
- 	struct zone **z;
- 	struct page *page;
- 	struct reclaim_state reclaim_state;
- 	struct task_struct *p = current;
- 	int do_retry;
- 	int alloc_flags;
- 	int did_some_progress;
- 
- 	/* With __GFP_WAIT the caller may block right away, giving other
- 	 * processes a chance to free pages first. */
- 	might_sleep_if(wait);
- 
- 	/* Cheap fault-injection check before doing any real work */
- 	if (should_fail_alloc_page(gfp_mask, order))
- 		return NULL;
- 
- restart:
- 	z = zonelist->zones;	/* z points at the first (preferred) zone */
- 
- 	if (unlikely(*z == NULL)) {
- 		/*
- 		 * Happens if we have an empty zonelist as a result of
- 		 * GFP_THISNODE being used on a memoryless node
- 		 */
- 		return NULL;
- 	}
- 
- 	/* First attempt: allocate with the relaxed pages_low watermark */
- 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- 				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
- 	if (page)
- 		goto got_pg;
- 
- 	/*
- 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- 	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
- 	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- 	 * using a larger set of nodes after it has established that the
- 	 * allowed per node queues are empty and that nodes are
- 	 * over allocated.
- 	 */
- 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
- 		goto nopage;
- 
- 	/* Wake kswapd so background reclaim starts freeing pages */
- 	for (z = zonelist->zones; *z; z++)
- 		wakeup_kswapd(*z, order);
- 
- 	/*
- 	 * OK, we're below the kswapd watermark and have kicked background
- 	 * reclaim. Now things get more complex, so set up alloc_flags according
- 	 * to how we want to proceed.
- 	 *
- 	 * The caller may dip into page reserves a bit more if the caller
- 	 * cannot run direct reclaim, or if the caller has realtime scheduling
- 	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- 	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
- 	 */
- 	alloc_flags = ALLOC_WMARK_MIN;	/* second pass: strict pages_min mark */
- 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
- 		alloc_flags |= ALLOC_HARDER;	/* realtime or atomic: dig deeper */
- 	if (gfp_mask & __GFP_HIGH)
- 		alloc_flags |= ALLOC_HIGH;	/* urgent: may use reserve pages */
- 	if (wait)
- 		alloc_flags |= ALLOC_CPUSET;
- 
- 	/*
- 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
- 	 * coming from realtime tasks go deeper into reserves.
- 	 *
- 	 * This is the last chance, in general, before the goto nopage.
- 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- 	 */
- 	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
- 	if (page)
- 		goto got_pg;
- 
- 	/* This allocation should allow future memory freeing. */
- 
- rebalance:
- 	/* Second attempt failed.  A task that is itself freeing memory
- 	 * (PF_MEMALLOC) or is being OOM-killed (TIF_MEMDIE), and is not in
- 	 * interrupt context, may ignore the watermarks completely. */
- 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- 			&& !in_interrupt()) {
- 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
- nofail_alloc:
- 			/* go through the zonelist yet again, ignoring mins */
- 			page = get_page_from_freelist(gfp_mask, order,
- 				zonelist, ALLOC_NO_WATERMARKS);
- 			if (page)
- 				goto got_pg;
- 			if (gfp_mask & __GFP_NOFAIL) {
- 				/* __GFP_NOFAIL: wait for write congestion to
- 				 * ease, then retry forever */
- 				congestion_wait(WRITE, HZ/50);
- 				goto nofail_alloc;
- 			}
- 		}
- 		goto nopage;
- 	}
- 
- 	/* Atomic allocations - we can't balance anything */
- 	if (!wait)
- 		goto nopage;
- 
- 	cond_resched();
- 
- 	/* We now go into synchronous reclaim */
- 	cpuset_memory_pressure_bump();
- 	p->flags |= PF_MEMALLOC;	/* mark: we are the reclaimer */
- 	reclaim_state.reclaimed_slab = 0;	/* also account freed slab pages */
- 	p->reclaim_state = &reclaim_state;
- 
- 	/* Push rarely used pages out to swap to make room */
- 	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
- 
- 	p->reclaim_state = NULL;
- 	p->flags &= ~PF_MEMALLOC;
- 
- 	cond_resched();
- 
- 	if (order != 0)
- 		drain_all_local_pages();
- 
- 	if (likely(did_some_progress)) {
- 		/* Reclaim freed something: retry with the same alloc_flags */
- 		page = get_page_from_freelist(gfp_mask, order,
- 						zonelist, alloc_flags);
- 		if (page)
- 			goto got_pg;
- 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- 		/* No progress: fall back to the OOM killer, provided VFS
- 		 * operations are allowed and retrying is permitted */
- 		if (!try_set_zone_oom(zonelist)) {
- 			/* another OOM kill is already in flight */
- 			schedule_timeout_uninterruptible(1);
- 			goto restart;
- 		}
- 
- 		/*
- 		 * Go through the zonelist yet one more time, keep
- 		 * very high watermark here, this is only to catch
- 		 * a parallel oom killing, we must fail if we're still
- 		 * under heavy pressure.
- 		 */
- 		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- 			zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
- 		if (page) {
- 			clear_zonelist_oom(zonelist);
- 			goto got_pg;
- 		}
- 
- 		/* The OOM killer will not help higher order allocs so fail:
- 		 * killing a task rarely yields a large contiguous block */
- 		if (order > PAGE_ALLOC_COSTLY_ORDER) {
- 			clear_zonelist_oom(zonelist);
- 			goto nopage;
- 		}
- 
- 		/* Kill the task deemed guiltiest of hogging memory; that
- 		 * likely frees many pages, then restart the allocation */
- 		out_of_memory(zonelist, gfp_mask, order);
- 		clear_zonelist_oom(zonelist);
- 		goto restart;
- 	}
- 
- 	/*
- 	 * Don't let big-order allocations loop unless the caller explicitly
- 	 * requests that.  Wait for some write requests to complete then retry.
- 	 *
- 	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- 	 * <= 3, but that may not be true in other implementations.
- 	 */
- 	do_retry = 0;
- 	if (!(gfp_mask & __GFP_NORETRY)) {
- 		if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
- 						(gfp_mask & __GFP_REPEAT))
- 			do_retry = 1;
- 		if (gfp_mask & __GFP_NOFAIL)
- 			do_retry = 1;
- 	}
- 	if (do_retry) {
- 		congestion_wait(WRITE, HZ/50);
- 		goto rebalance;
- 	}
- 
- nopage:
- 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- 		printk(KERN_WARNING "%s: page allocation failure."
- 			" order:%d, mode:0x%x\n",
- 			p->comm, order, gfp_mask);
- 		dump_stack();
- 		show_mem();
- 	}
- got_pg:
- 	return page;
- }
- <p><span style="font-size:18px">get_page_from_freelist源代碼的詳細分析以下:</span></p>
- <p></p>
- <pre name="code" class="cpp">static struct page *
- get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, int alloc_flags)
- {
- struct zone **z;// cursor over the zones of the zonelist
- struct page *page = NULL;
- int classzone_idx = zone_idx(zonelist->zones[0]);// index of the preferred zone; zone_idx(zone) is ((zone) - (zone)->zone_pgdat->node_zones)
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
- 
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- z = zonelist->zones;// start with the first (most preferred) zone
- // walk every allowed zone until one of them can satisfy the request
- do {
- /*
- * In NUMA, this could be a policy zonelist which contains
- * zones that may not be allowed by the current gfp_mask.
- * Check the zone is allowed by the current flags
- */
- if (unlikely(alloc_should_filter_zonelist(zonelist))) {// whether filtering is needed is decided from zonelist->zlcache_ptr
- if (highest_zoneidx == -1)
- highest_zoneidx = gfp_zone(gfp_mask);// highest zone the mask permits
- if (zone_idx(*z) > highest_zoneidx)// skip zones the mask does not allow
- continue;
- }
- 
- if (NUMA_BUILD && zlc_active &&// zonelist cache active (set after the first zone failed):
- !zlc_zone_worth_trying(zonelist, z, allowednodes))// skip zones recently found short of free pages
- continue;
- zone = *z;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))// the current cpuset forbids allocating from this zone
- goto try_next_zone;
- 
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {// watermarks must be honoured for this allocation
- unsigned long mark;// pick the watermark that matches alloc_flags
- if (alloc_flags & ALLOC_WMARK_MIN)
- mark = zone->pages_min;
- else if (alloc_flags & ALLOC_WMARK_LOW)
- mark = zone->pages_low;
- else
- mark = zone->pages_high;
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {// this zone lacks enough free memory for the request
- if (!zone_reclaim_mode ||// if the zone cannot reclaim memory, or reclaiming
- !zone_reclaim(zone, gfp_mask, order))// yields nothing usable, mark it full
- goto this_zone_full;
- }
- }
- 
- page = buffered_rmqueue(zonelist, zone, order, gfp_mask);// hand the real allocation to the buddy system
- if (page)// allocation succeeded: stop scanning
- break;
- this_zone_full:
- if (NUMA_BUILD)
- zlc_mark_zone_full(zonelist, z);// remember this zone is full so later scans skip it cheaply
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup) {// after the first zone fails, set up the zonelist cache once
- /* we do zlc_setup after the first zone is tried */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- } while (*(++z) != NULL);
- 
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {// first scan failed with the cache active: disable it and rescan, so zones on other nodes get a fair try
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;
- }
- </pre>
- <p><span style="font-size:18px; color:#ff0000">關於上面這段代碼中zlc_active的作用還不太明白,還望理解的人指點一下。</span></p>
- <p></p>
- <pre name="code" class="cpp">struct zonelist {
- struct zonelist_cache *zlcache_ptr; // NULL or &zlcache; NULL means no zonelist cache is used
- struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited list of zones, in order of preference
- #ifdef CONFIG_NUMA
- struct zonelist_cache zlcache; // optional: caches which zones are currently considered full
- #endif
- };</pre><br>
- <pre name="code" class="cpp">struct zonelist_cache {
- unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid: node id of each zone in the zonelist */
- DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* bit set => corresponding zone is full */
- unsigned long last_full_zap; /* when fullzones was last zap'd (jiffies) */
- };</pre><br>
- <span style="font-size:18px">zone_watermark_ok源代碼詳細分析以下:</span><br>
- <p></p>
- <p></p>
- <pre name="code" class="cpp">int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
- {
- /* free_pages may go negative - that's OK */
- long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;// zone_page_state reads a per-zone counter; here: the number of free pages
- int o;
- 
- if (alloc_flags & ALLOC_HIGH)// ALLOC_HIGH: lower the watermark by half
- min -= min / 2;
- if (alloc_flags & ALLOC_HARDER)// ALLOC_HARDER: lower it by another quarter
- min -= min / 4;
- 
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])// refuse if free pages would drop below the watermark plus the emergency lowmem reserve for this class zone
- return 0;
- for (o = 0; o < order; o++) {// otherwise check each order below the requested one
- /* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;// blocks of this order are too small for the request: discount them
- 
- /* Require fewer higher order pages to be free */
- min >>= 1;// halve the required minimum per order: higher-order blocks hold more pages (if order 0 needs 2^n pages, order 1 needs only 2^(n-1) blocks of two pages each)
- 
- if (free_pages <= min)// not enough suitably large blocks left: refuse the allocation
- return 0;
- }
- return 1;
- }
- </pre><span style="font-size:18px">buffered_rmqueue源代碼詳細分析以下:</span>
- <p></p>
- <p></p>
- <pre name="code" class="cpp">static struct page *buffered_rmqueue(struct zonelist *zonelist,
- struct zone *zone, int order, gfp_t gfp_flags)
- {
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);// was a cache-cold page requested? double negation folds the flag bit to exactly 0 or 1, suitable as an array index below
- int cpu;
- int migratetype = allocflags_to_migratetype(gfp_flags);// derive the page migrate type from the allocation flags
- 
- again:
- cpu = get_cpu();// pin this CPU
- if (likely(order == 0)) {// single page: serve it from the per-CPU page cache
- struct per_cpu_pages *pcp;
- 
- pcp = &zone_pcp(zone, cpu)->pcp[cold];// this CPU's hot or cold page cache for the zone
- local_irq_save(flags);// IRQs off: interrupt handlers allocate single pages too, and reclaim may IPI CPUs to force them to drain these caches
- if (!pcp->count) {// cache is empty: refill it
- pcp->count = rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);// pull a batch of pages (pcp->batch many) from the buddy system into the cache
- if (unlikely(!pcp->count))// still empty: the buddy system is out of pages as well, fail
- goto failed;
- }
- 
- /* Find a page of the appropriate migrate type */
- list_for_each_entry(page, &pcp->list, lru)// scan the cached pages for one of the wanted migrate type
- if (page_private(page) == migratetype)
- break;
- 
- /* Allocate more to the pcp list if necessary */
- if (unlikely(&page->lru == &pcp->list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);
- page = list_entry(pcp->list.next, struct page, lru);
- }
- 
- list_del(&page->lru);// unlink the chosen page from the per-CPU list and drop the count
- pcp->count--;
- } else {
- spin_lock_irqsave(&zone->lock, flags);// multi-page request: go straight to the buddy lists under the zone lock
- page = __rmqueue(zone, order, migratetype);
- spin_unlock(&zone->lock);
- if (!page)
- goto failed;
- }
- 
- __count_zone_vm_events(PGALLOC, zone, 1 << order);// update allocation statistics
- zone_statistics(zonelist, zone);
- local_irq_restore(flags);
- put_cpu();
- 
- VM_BUG_ON(bad_range(zone, page));
- if (prep_new_page(page, order, gfp_flags))// prepare the new page; if it is unusable, retry from scratch
- goto again;
- return page;
- 
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }</pre><span style="color:rgb(255,0,0); font-family:Arial; font-size:18px; line-height:26px"> 我也知道有不少的細節都沒有分析到位,可是我也沒有辦法,曾經想着把裏面涉及到的每個函數都分析到位,可是那樣的話本身至關的痛苦,由於那樣的結果就是 不少天都沒有辦法前進一點,會讓人至關的有挫敗感,最後只能選擇大概先都過一遍,由於本身是一個內核的初學者,而內核先後的關聯又很大,也只能先過一遍, 到後面我會從新回來看我寫得博客,能增進一些分析就增進一些分析。若是您認爲上面確實有很重要的地方我沒有分析到,但願您指 點。</span><br>
- <br>
- <br>
- <p></p>
- <br>
- <p></p>
- <p><br>
- </p>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <br>
- <p></p>