通過這段代碼可以看出,它調用了kmem_cache_alloc_node函數,在task_struct的緩存區域task_struct_cachep中,從指定的節點node上分配了一塊內存
static struct kmem_cache *task_struct_cachep; task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); static inline struct task_struct *alloc_task_struct_node(int node) { return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); }
一、在系統初始化的時候,task_struct_cachep 會被 kmem_cache_create 函數創建。
二、這個函數也比較容易看懂,專門用於分配 task_struct 對象的緩存。這個緩存區的名字就叫 task_struct。
三、緩存區中每一塊的大小正好等於 task_struct 的大小,也即 arch_task_struct_size。
一、kmem_cache_alloc_node函數的作用?
一、有了這個緩存區,每次創建task_struct的時候,我們就不用到內存裏面去分配,先在緩存裏面看看有沒有直接可用的,這就是kmem_cache_alloc_node的作用
二、kmem_cache_free的作用
當一個進程結束,task_struct 也不用直接被銷燬,而是放回到緩存中,這就是kmem_cache_free的作用,
這樣,新進程創建的時候,我們就可以直接用現成的緩存中的task_struct了
struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ unsigned long flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ #ifdef CONFIG_SLUB_CPU_PARTIAL int cpu_partial; /* Number of per cpu partial objects to keep around */ #endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ struct kmem_cache_order_objects max; struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); ...... const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ ...... struct kmem_cache_node *node[MAX_NUMNODES]; };
一、在 struct kmem_cache 裏面,有個變量 struct list_head list,這個結構我們已經看到過多次了
二、我們可以想象一下,對於操作系統來說,要創建和管理的緩存絕對不止task_struct,難道mm_struct就不需要嗎?
三、fs_struct就不需要嗎?都需要,因此所有的緩存最後都會放在一個鏈表裏面,這就是LIST_HEAD(slab_caches)
對於緩存來說,其實就是分配了連續幾頁的大內存塊,然後根據緩存對象的大小,切成小內存塊。所以我們這裏有三個kmem_cache_order_objects 類型的變量:
一、這裏面有order,就是2的order次方個頁面的大內存塊,
二、objects就是能夠存放的緩存對象的數量
最終,我們將大內存塊切分成小內存塊,樣子就像下面這樣
每一項的結構都是緩存對象後面跟一個下一個空閒對象的指針,這樣非常方便將所有的空閒對象鏈成一個鏈,其實這就相當於我們數據結構
裏面學的,用數組實現一個可隨機插入和刪除的鏈表
所以,這裏面有三個變量:size是包含這個指針的大小,object_size是純對象的大小,offset就是把下一個空閒對象的指針存放在這一項裏的偏移量
那這些緩存對象哪些被分配了,哪些還空着,什麼情況下整個大內存塊被分配完,需要向夥伴系統申請幾個頁形成新的大內存塊?這些信息該由誰來維護呢?
接下來就是最重要的兩個成員變量出場的時候了:kmem_cache_cpu和kmem_cache_node。其中kmem_cache_cpu是每個CPU一個(cpu_slab是__percpu變量),kmem_cache_node是每個NUMA節點一個(node[MAX_NUMNODES]),我們只需要看一個節點裏面的情況
我們來看一下,kmem_cache_cpu裏面是如何存放緩存塊的
struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ #endif ...... };
在這裏,page指向大內存塊的第一個頁,緩存塊就是從裏面分配的;freelist指向大內存塊裏面第一個空閒的項。按照上面說的,這一項會有指針指向下一個空閒的項,最終所有空閒的項會形成一個鏈表
partial指向的頁也是大內存塊的第一個頁,之所以名叫partial(部分),就是因爲它裏面部分被分配出去了,部分是空的。這是一個備用列表,當page滿了,就會從這裏找
我們來看一下kmem_cache_node的定義
struct kmem_cache_node { spinlock_t list_lock; ...... #ifdef CONFIG_SLUB unsigned long nr_partial; struct list_head partial; ...... #endif };
這裏面也有一個 partial,是一個鏈表。這個鏈表裏存放的是部分空閒的大內存塊。這是 kmem_cache_cpu 裏面的 partial的備用列表,若是那裏沒有,就到這裏來找。
下面我們就來看看這個分配過程。kmem_cache_alloc_node 會調用 slab_alloc_node。你還是先重點看這裏面的註釋,這裏面說的就是快速通道和普通通道的概念
/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. * * The fastpath works by first checking if the lockless freelist can be used. * If not then __slab_alloc is called for slow processing. * * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; ...... tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); ...... object = c->freelist; page = c->page; if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); stat(s, ALLOC_SLOWPATH); } ...... return object; }
快速通道很簡單,取出 cpu_slab 也即kmem_cache_cpu 的 freelist,這就是第一個空閒的項,可以直接返回了。如果沒有空閒的了,則只好進入普通通道,調用 __slab_alloc。
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; struct page *page; ...... redo: ...... /* must check again c->freelist in case of cpu migration or IRQ */ freelist = c->freelist; if (freelist) goto load_freelist; freelist = get_freelist(s, page); if (!freelist) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; } load_freelist: c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); return freelist; new_slab: if (slub_percpu_partial(c)) { page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); goto redo; } freelist = new_slab_objects(s, gfpflags, node, &c); ...... return freeli
若是真的還不行,那就要到 new_slab_objects 了
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { void *freelist; struct kmem_cache_cpu *c = *pc; struct page *page; freelist = get_partial(s, flags, node, c); if (freelist) return freelist; page = new_slab(s, flags, node); if (page) { c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); freelist = page->freelist; page->freelist = NULL; stat(s, ALLOC_SLAB); c->page = page; *pc = c; } else freelist = NULL; return freelis
在這裏面,get_partial 會根據 node id找到相應的 kmem_cache_node,而後調用 get_partial_node,開始在這個節點進行分配
/* * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, struct kmem_cache_cpu *c, gfp_t flags) { struct page *page, *page2; void *object = NULL; int available = 0; int objects; ...... list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t; t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) break; available += objects; if (!object) { c->page = page; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) || available > slub_cpu_partial(s) / 2) break; } ...... return object;
acquire_slab 會從 kmem_cache_node的partial 鏈表中拿下一大塊內存來,而且將 freelist也就是第一塊空閒的緩存塊,賦值給t
並且當第一輪循環的時候,將kmem_cache_cpu的page指向取下來的這一大塊內存,返回的object就是這塊內存裏面的第一個緩存塊t
若是kmem_cache_cpu也有一個partial,就會進行第二輪,再次取下一大塊內存來,此次調用put_cpu_partial,放到 kmem_cache_cpu的 partial 裏面。
若是kmem_cache_node裏面也沒有空閒的內存,這就說明原來分配的頁裏面都放滿了,就要回到 new_slab_objects 函數,裏面new_slab 函數會調用 allocate_slab。
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p; int idx, order; bool shuffle; flags &= gfp_allowed_mask; ...... page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) goto out; stat(s, ORDER_FALLBACK); } ...... return page; }
在這裏,咱們看到了alloc_slab_page 分配頁面。分配的時候,要按kmem_cache_order_objects 裏面的 order來。若是第一次分配不成功、說明內存已經很緊張了,那就換成min版本的kmem_cache_order_objects
好了,這個複雜的層層分配機制,咱們就講到這裏,你理解到這裏也就夠用了
虛擬地址空間非常大,物理內存不可能有這麼多的空間放得下,所以一般情況下,頁面只有在被使用的時候,纔會放在物理內存
如果過一段時間不被使用,即便用戶進程並沒有釋放,物理內存管理也有責任做一定的干預,
例如將這些物理內存中的頁面換出到硬盤上去;將空出的物理內存,交給活躍的進程去使用
一、分配內存的時候發現沒有地方了,就試圖回收一下
二、內存管理系統主動去做,而不是等真的出事再做,這就是內核線程kswapd
/* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; for ( ; ; ) { ...... kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); ...... reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); ...... } }
例如,我們解析申請一個頁面的時候,會調用get_page_from_freelist,接下來的調用鏈是 get_page_from_freelist->node_reclaim->__node_reclaim->shrink_node
通過這個調用鏈,可以看出,頁面換出也是以內存節點爲單位的
kswapd這裏的調用鏈是 balance_pgdat->kswapd_shrink_node->shrink_node,也是以內存節點爲單位的,最後調用的也是shrink_node。shrink_node會調用 shrink_node_memcg。
這裏面有一個循環處理頁面的列表,看這個函數的註釋,其實和上面我們想表達的內存換出是一樣的
/* * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *lru_pages) { ...... unsigned long nr[NR_LRU_LISTS]; enum lru_list lru; ...... while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned; for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, memcg, sc); } } ...... } ......
enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc) { if (is_active_lru(lru)) { if (inactive_list_is_low(lruvec, is_file_lru(lru), memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
從上面的代碼能夠看出:
一、shrink_list會先縮減活躍頁面列表,再壓縮不活躍的頁面列表,
二、對於不活躍列表的縮減,shrink_inactive_list就需要對頁面進行回收;
三、對於匿名頁來說,需要分配swap,將內存頁寫入文件系統;
四、對於內存映射關聯了文件的,我們需要將在內存中對於文件的修改寫回到文件中
好了,對於物理內存的管理就講到這裏,咱們來總結一下,對於物理內存來說,從下層到上層的關係及分配模式如何:
一、物理內存分NUMA節點,分別進行管理
二、每一個 NUMA 節點分紅多個內存區域;
三、每一個內存區域分紅多個物理頁面;
四、夥伴系統將多個連續的頁面作爲一個大的內存塊分配給上層;
五、kswapd 負責物理頁面的換入換出;
六、Slub Allocator 將從夥伴系統申請的大內存塊切成小內存,分配給其他系統