kmalloc、vmalloc和malloc這三個經常使用的API函數具有相當的份量,三者看上去很類似,但在實現上大有講究。kmalloc基於slab分配器,slab緩衝區建立在一個連續的物理地址的大塊內存之上,因此緩衝對象也是物理地址連續的。如果在內核中不需要連續的物理地址,而僅僅需要內核空間裏連續的虛擬地址的內存塊,該如何處理呢?這時vmalloc()就派上用場了。
vmalloc()函數聲明如下:
[mm/vmalloc.c]
/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Returns whatever __vmalloc_node_flags() returns for this request.
 */
void *vmalloc(unsigned long size)
{
	/*
	 * No NUMA node preference (NUMA_NO_NODE); the allocation mask is
	 * GFP_KERNEL with __GFP_HIGHMEM added.
	 */
	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				    GFP_KERNEL | __GFP_HIGHMEM);
}
vmalloc使用的分配掩碼是「GFP_KERNEL|__GFP_HIGHMEM」,說明會優先使用高端內存High Memory。
/*
 * Allocate virtually contiguous memory inside the default vmalloc window.
 *
 * Forwards every argument to __vmalloc_node_range(), fixing the address
 * range to VMALLOC_START..VMALLOC_END and passing 0 as the extra vm_flags.
 *
 * @size:     requested size in bytes
 * @align:    requested alignment
 * @gfp_mask: allocation flags
 * @prot:     page protection for the mapping
 * @node:     NUMA node to allocate on
 * @caller:   caller address recorded by the callee
 *
 * Returns the result of __vmalloc_node_range().
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				gfp_mask, prot, 0, node, caller);
}
這裏的VMALLOC_START和VMALLOC_END是vmalloc中最重要的宏,這兩個宏定義在arch/arm/include/asm/pgtable.h頭文件中。ARM64架構定義在arch/arm64/include/asm/pgtable.h頭文件中。VMALLOC_START是vmalloc區域的開始地址,它是在high_memory指定的高端內存開始地址再加上8MB大小的安全區域(VMALLOC_OFFSET)。在ARM Vexpress平臺上,vmalloc的內存範圍是從0xf000_0000到0xff00_0000,大小爲240MB,high_memory全局變量的計算在sanity_check_meminfo()函數中。
[arch/arm/include/pgtable.h]
/* Size of the gap added above high_memory: 8 * 1024 * 1024 bytes. */
#define VMALLOC_OFFSET (8*1024*1024)
/*
 * Start of the vmalloc range: high_memory plus VMALLOC_OFFSET, with the low
 * bits masked off so the result is a multiple of VMALLOC_OFFSET.
 */
#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
/* Fixed end address of the vmalloc range. */
#define VMALLOC_END 0xff000000UL
[vmalloc()-> __vmalloc_node() -> __vmalloc_node_range()]
/*
 * Allocate @size bytes of virtually contiguous memory inside [@start, @end].
 *
 * @size:     requested size in bytes; rounded up with PAGE_ALIGN()
 * @align:    requested alignment
 * @start:    lowest virtual address to consider
 * @end:      highest virtual address to consider
 * @gfp_mask: allocation flags
 * @prot:     page protection for the mapping
 * @vm_flags: extra flags OR-ed with VM_ALLOC | VM_UNINITIALIZED
 * @node:     NUMA node to allocate on
 * @caller:   caller address handed to __get_vm_area_node()
 *
 * Returns the mapped address, or NULL on failure. Failures detected in this
 * function (bad size, no vm area) emit a warning through the fail label; a
 * NULL from __vmalloc_area_node() is returned directly.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, unsigned long vm_flags, int node,
			const void *caller)
{
	struct vm_struct *area;
	void *addr;
	/* Keep the caller's unrounded size for the warning and kmemleak. */
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	/* Reject an empty request or one needing more pages than totalram_pages. */
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	/* Reserve the virtual address range and its vm_struct descriptor. */
	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
				vm_flags, start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	/* Allocate the backing pages and map them into the reserved range. */
	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
	if (!addr)
		return NULL;

	/*
	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
	 * flag. It means that vm_struct is not fully initialized.
	 * Now, it is fully initialized, so remove this flag here.
	 */
	clear_vm_uninitialized_flag(area);

	/*
	 * A ref_count = 2 is needed because vm_struct allocated in
	 * __get_vm_area_node() contains a reference to the virtual address of
	 * the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, real_size, 2, gfp_mask);

	return addr;

fail:
	warn_alloc_failed(gfp_mask, 0,
			  "vmalloc: allocation failure: %lu bytes\n",
			  real_size);
	return NULL;
}
在__vmalloc_node_range()函數中,第9行代碼vmalloc分配的大小要以頁面大小對齊。如果vmalloc要分配的大小爲10Byte,那麼vmalloc還是會分配出一個頁,剩下的4086Byte就浪費了。
第10行代碼,判斷要分配的內存大小不能爲0或者不能大於系統的全部內存。
[vmalloc->__vmalloc_node_range()->__get_vm_area_node()]
/*
 * Allocate a vm_struct descriptor and reserve a matching range of virtual
 * addresses between @start and @end.
 *
 * @size:     requested size in bytes; rounded up with PAGE_ALIGN()
 * @align:    requested alignment (recomputed when VM_IOREMAP is set)
 * @flags:    VM_* flags stored in the descriptor
 * @start:    lowest virtual address to consider
 * @end:      highest virtual address to consider
 * @node:     NUMA node for the descriptor allocations
 * @gfp_mask: allocation flags; only the GFP_RECLAIM_MASK bits are used for
 *            the descriptor itself
 * @caller:   caller address recorded in the descriptor
 *
 * Returns the initialised vm_struct, or NULL if the size is zero or either
 * allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	/* Calling this from interrupt context is treated as a bug. */
	BUG_ON(in_interrupt());
	/*
	 * For VM_IOREMAP the alignment becomes a power of two derived from
	 * fls(size), clamped between PAGE_SHIFT and IOREMAP_MAX_ORDER.
	 */
	if (flags & VM_IOREMAP)
		align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);

	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/* Unless VM_NO_GUARD is set, reserve one extra page of address space. */
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		/* No address range available: release the descriptor again. */
		kfree(area);
		return NULL;
	}

	/* Copy the reserved range into the descriptor and link the two. */
	setup_vmalloc_vm(area, va, flags, caller);

	return area;
}
在__get_vm_area_node()函數中,第7行代碼確保當前不在中斷上下文中,因爲這個函數有可能睡眠。
第8行代碼又計算了一次對齊。
第10行代碼分配了一個struct vm_struct數據結構來描述這個vmalloc區域。
第12行代碼,如果flags中沒有定義VM_NO_GUARD標誌位,那麼要多分配一個頁來做安全墊,例如我們要分配4KB的大小內存,vmalloc分配了8KB的內存塊。
下面重點要看下第15行代碼的alloc_vmap_area()函數。
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size:     size of the region; must be non-zero with no bits outside
 *            PAGE_MASK (enforced with BUG_ON)
 * @align:    alignment; must be a power of two (enforced with BUG_ON)
 * @vstart:   lowest address the region may start at
 * @vend:     address the region must not extend past
 * @node:     NUMA node for the vmap_area allocation
 * @gfp_mask: allocation flags; only the GFP_RECLAIM_MASK bits are used
 *
 * Returns the new vmap_area, already inserted into the tree and list,
 * ERR_PTR(-ENOMEM) if the vmap_area itself cannot be allocated, or
 * ERR_PTR(-EBUSY) if no hole is found even after one
 * purge_vmap_area_lazy() retry.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;		/* set once the lazy-purge retry has been used */
	struct vmap_area *first;

	BUG_ON(!size);
	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(!is_power_of_2(align));

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		/* Also entered by goto when the cached start lies below vstart. */
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		/* Resume right after the cached area. */
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)
			goto nocache;
		/* addr + size wrapped around the address space. */
		if (addr + size < addr)
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		/*
		 * Descend the tree looking for the lowest area whose end is
		 * at or above addr; stop early if an area contains addr.
		 */
		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		/* No area ends at or above addr: the candidate address is free. */
		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		/* Remember the largest hole that was skipped over. */
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;

		/* Past the last area: everything above addr is unused. */
		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_entry(first->list.next,
				struct vmap_area, list);
	}

found:
	if (addr + size > vend)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	/* The next search may start from this area. */
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(va->va_start & (align-1));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	/* Purge lazily freed areas once, then search again. */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	if (printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: "
			"use vmalloc=<size> to increase size.\n", size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}
alloc_vmap_area()在vmalloc整個空間中查找一塊大小合適的而且沒有人使用的空間,這段空間稱爲hole。注意這個參數vstart是指VMALLOC_START,vend是指VMALLOC_END。
第25行代碼,free_vmap_cache、cached_hole_size和cached_vstart這幾個變量是幾年前增加的一個優化選項,核心思想是從上一次查找的結果中開始查找。這裏假設暫時忽略free_vmap_cache這個優化,從47行代碼開始看起。
查找的地址從VMALLOC_START開始,首先從vmap_area_root這棵紅黑樹上查找,這個紅黑樹裏存放着系統中正在使用的vmalloc區塊,遍歷左子葉節點找區間地址最小區塊。如果區塊的開始地址等於VMALLOC_START,說明這區塊是第一塊vmalloc區塊。如果紅黑樹沒有一個節點,說明整個vmalloc區間都是空的,見第66行代碼。
第54~64行代碼,這裏遍歷的結果是返回起始地址最小vmalloc區塊,這個區塊有可能是VMALLOC_START開始的,也有可能不是。
然後從VMALLOC_START地址開始,查找每一個已存在的vmalloc的區塊的縫隙hole能否容納目前要分配內存的大小。如果不能在已有vmalloc區塊的縫隙中找到合適的hole,那麼從最後一塊vmalloc區塊的結束地址開始一個新的vmalloc區域,見第71~83行代碼。
第92行代碼,找到新區塊hole後,調用__insert_vmap_area()函數把這個hole註冊到紅黑樹上。
/*
 * Insert @va into the vmap_area_root red-black tree and into the list
 * headed by vmap_area_list, right after its tree predecessor.
 *
 * Hits BUG() if @va neither starts below the end of a visited node nor ends
 * above that node's start.
 */
static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	/* Walk down to the empty link where the new node belongs. */
	while (*p) {
		struct vmap_area *tmp_va;

		parent = *p;
		tmp_va = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp_va->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp_va->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		/* No predecessor in the tree: add directly after the list head. */
		list_add_rcu(&va->list, &vmap_area_list);
}
回到__get_vm_area_node()函數的第16行代碼上,把剛找到的struct vmap_area *va的相關信息填到struct vm_struct *vm中。
/*
 * Fill in the vm_struct @vm from the reserved range @va and link the two
 * together, all while holding vmap_area_lock.
 *
 * @vm:     descriptor to initialise
 * @va:     reserved address range; supplies the address and size
 * @flags:  stored in vm->flags
 * @caller: stored in vm->caller
 */
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
			      unsigned long flags, const void *caller)
{
	spin_lock(&vmap_area_lock);
	vm->flags = flags;
	vm->addr = (void *)va->va_start;
	vm->size = va->va_end - va->va_start;
	vm->caller = caller;
	va->vm = vm;
	/* Mark the vmap_area as now having a vm_struct attached. */
	va->flags |= VM_VM_AREA;
	spin_unlock(&vmap_area_lock);
}
回到__vmalloc_node_range()函數中的第16行代碼中的 __vmalloc_area_node()。
[vmalloc()->__vmalloc_node_range()->__vmalloc_area_node()]
/*
 * Allocate the physical pages backing @area and map them into its range.
 *
 * @area:     vm_struct describing the reserved virtual range
 * @gfp_mask: allocation flags for the pages
 * @prot:     page protection for the mapping
 * @node:     NUMA node to allocate on, or NUMA_NO_NODE
 *
 * Returns area->addr on success. Returns NULL on failure: if the page
 * pointer array cannot be allocated, the area is removed and freed here;
 * if a page allocation or the mapping fails, a warning is emitted and
 * vfree(area->addr) is called.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node)
{
	const int order = 0;	/* pages are allocated one at a time */
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	/* Flags for the page pointer array: reclaim bits plus __GFP_ZERO. */
	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
	/* Flags for the data pages: caller's mask plus __GFP_NOWARN. */
	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;

	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		/* Array larger than one page: allocate it via __vmalloc_node(). */
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, area->caller);
		area->flags |= VM_VPAGES;
	} else {
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node == NUMA_NO_NODE)
			page = alloc_page(alloc_mask);
		else
			page = alloc_pages_node(node, alloc_mask, order);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
		if (gfp_mask & __GFP_WAIT)
			cond_resched();
	}

	/* A non-zero return from map_vm_area() is treated as failure. */
	if (map_vm_area(area, prot, pages))
		goto fail;
	return area->addr;

fail:
	warn_alloc_failed(gfp_mask, order,
			  "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
			  (area->nr_pages*PAGE_SIZE), area->size);
	vfree(area->addr);
	return NULL;
}
在__vmalloc_area_node()函數中,首先計算vmalloc分配內存大小有幾個頁面,而後使用alloc_page()這個API來分配物理頁面,而且使用area->pages保存已分配的頁面page數據結構指針,最後調用map_vm_area()函數來創建頁面映射。
map_vm_area()函數最後調用vmap_page_range_noflush()來創建頁面映射關係。
/*
 * Map the pages in @pages at virtual addresses @start..@end, working through
 * the range one PGD entry at a time.
 *
 * @start: first virtual address; must be below @end (BUG_ON otherwise)
 * @end:   address at which the walk stops
 * @prot:  page protection passed down to the lower levels
 * @pages: page array passed down to the lower levels
 *
 * Returns the final value of the running index nr on success, or the
 * non-zero value returned by vmap_pud_range() on failure.
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;	/* running index into pages, updated by the callees */

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		/* next: end of the current PGD-sized chunk, clamped to end. */
		next = pgd_addr_end(addr, end);
		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}
pgd_offset_k()首先從init_mm中獲取指向PGD頁面目錄的基地址,然後通過地址addr來找到對應的PGD表項。while循環裏從開始地址addr到結束地址,按照PGDIR_SIZE的大小依次調用vmap_pud_range()來處理PGD頁表。pgd_offset_k()宏定義如下:
/* Index of the PGD entry that covers virtual address addr. */
#define pgd_index(addr)		((addr) >> PGDIR_SHIFT)

/* Pointer to the PGD entry for addr in the page directory of mm. */
#define pgd_offset(mm, addr)	((mm)->pgd + pgd_index(addr))

/* Same lookup, performed against init_mm. */
#define pgd_offset_k(addr)	pgd_offset(&init_mm, addr)

/*
 * End of the PGDIR_SIZE-aligned region that contains addr, clamped to end.
 * Subtracting 1 on both sides of the comparison keeps the result correct
 * when __boundary wraps to 0 at the top of the address space.
 */
#define pgd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
})
vmap_pud_range()函數會依次調用vmap_pmd_range()。在ARM Vexpress平臺中,頁表是二級頁表,因此PUD和PMD都指向PGD,最後直接調用vmap_pte_range()。
/*
 * Install PTEs for the virtual range @addr..@end under @pmd, one page from
 * @pages per PAGE_SIZE step.
 *
 * @pmd:   PMD entry under which the PTEs are looked up or allocated
 * @addr:  first virtual address to map
 * @end:   address at which the loop stops
 * @prot:  page protection used to build each PTE
 * @pages: array of pages to map, indexed by *@nr
 * @nr:    in/out running index into @pages, incremented per mapped page
 *
 * Returns 0 on success, -ENOMEM if pte_alloc_kernel() fails or a page
 * pointer is NULL, and -EBUSY if a PTE is already populated.
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		/* Refuse to overwrite an entry that is already in use. */
		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}
在此場景中,對應的pmd頁表項內容爲空,即pmd_none(*(pmd)),所以需要新分配pte頁表項。
/*
 * Allocate one page for a kernel PTE table.
 *
 * Gets a free page with PGALLOC_GFP and, if that succeeds, passes it to
 * clean_pte_table(). Neither @mm nor @address is used in the body.
 *
 * Returns the page as a pte_t pointer, or NULL if the allocation failed.
 */
static inline pte_t *
pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP);

	if (pte)
		clean_pte_table(pte);
	return pte;
}
mk_pte()宏利用剛分配的page頁面和頁面屬性prot來新生成一個PTE entry,最後通過set_pte_at()函數把PTE entry設置到硬件頁表PTE頁表項中。