vmalloc是一個接口函數, 內核代碼使用它來分配在虛擬內存中連續但在物理內存中不一定連續的內存。
只須要一個參數, 即分配大小, 以字節爲單位。
使用vmalloc的最著名的實例是內核對模塊的實現. 由於模塊可能在任何時候加載, 若是模塊數據比較多, 那麼沒法保證有足夠的連續內存可用, 特別是在系統已經運行了比較長時間的狀況下.
若是可以用小塊內存拼接出足夠的內存, 那麼使用vmalloc能夠規避該問題。
由於用於vmalloc的內存頁總是必須映射在內核地址空間中, 所以使用ZONE_HIGHMEM內存域的頁要優於其餘內存域. 這使得內核能夠節省更寶貴的較低端內存域, 而又不會帶來額外的壞處. 所以, vmalloc等映射函數是內核出於自身的目的(並不是由於用戶空間應用程序)使用高端內存頁的少數情形之一.
內核在管理虛擬內存中的VMALLOC區域時, 內核必須跟蹤哪些子區域被使用、哪些是空閒的. 爲此定義了一個數據結構vm_struct:
31struct vm_struct { 32 struct vm_struct *next; 33 void *addr; 34 unsigned long size; 35 unsigned long flags; 36 struct page **pages; 37 unsigned int nr_pages; 38 phys_addr_t phys_addr; 39 const void *caller; 40};
一個vm_struct表明一個vmalloc區域。
經過next形成一個鏈表。
addr是映射的首地址,size爲映射地址區間的大小。
pages是一組指針,這些指針描述映射到這個區間裏面的一個個真實的物理頁對應的page指針。
nr_pages表示該地址區間映射了多少物理頁。
phys_addr僅當用ioremap映射了由物理地址描述的物理內存區域時才須要。該信息保存在phys_addr中。
caller保存調用者的返回地址, 由__builtin_return_address(0)取得(見下文__vmalloc_node_flags的實現)。
flags的取值以下:
12/* bits in flags of vmalloc's vm_struct below */ 13#define VM_IOREMAP 0x00000001 /* ioremap() and friends */ 14#define VM_ALLOC 0x00000002 /* vmalloc() */ 15#define VM_MAP 0x00000004 /* vmap()ed pages */ 16#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ 17#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ 18#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ 19#define VM_NO_GUARD 0x00000040 /* don't add guard page */ 20#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ 21/* bits [20..32] reserved for arch specific ioremap internals */
由於vmalloc的調用函數都在一塊兒,貼上以下:
1710/** 1711 * __vmalloc_node - allocate virtually contiguous memory 1712 * @size: allocation size 1713 * @align: desired alignment 1714 * @gfp_mask: flags for the page level allocator 1715 * @prot: protection mask for the allocated pages 1716 * @node: node to use for allocation or NUMA_NO_NODE 1717 * @caller: caller's return address 1718 * 1719 * Allocate enough pages to cover @size from the page level 1720 * allocator with @gfp_mask flags. Map them into contiguous 1721 * kernel virtual space, using a pagetable protection of @prot. 1722 */ 1723static void *__vmalloc_node(unsigned long size, unsigned long align, 1724 gfp_t gfp_mask, pgprot_t prot, 1725 int node, const void *caller) 1726{ 1727 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1728 gfp_mask, prot, 0, node, caller); 1729} 1730 ... 1737 1738static inline void *__vmalloc_node_flags(unsigned long size, 1739 int node, gfp_t flags) 1740{ 1741 return __vmalloc_node(size, 1, flags, PAGE_KERNEL, 1742 node, __builtin_return_address(0)); 1743} 1744 1745/** 1746 * vmalloc - allocate virtually contiguous memory 1747 * @size: allocation size 1748 * Allocate enough pages to cover @size from the page level 1749 * allocator and map them into contiguous kernel virtual space. 1750 * 1751 * For tight control over page level allocator and protection flags 1752 * use __vmalloc() instead. 1753 */ 1754void *vmalloc(unsigned long size) 1755{ 1756 return __vmalloc_node_flags(size, NUMA_NO_NODE, 1757 GFP_KERNEL | __GFP_HIGHMEM); 1758} 1759EXPORT_SYMBOL(vmalloc);
最終調到__vmalloc_node_range,並把VMALLOC_START和VMALLOC_END傳入,該函數是vmalloc的主要實現,用來從(start, end)中申請一段大小爲size的虛擬地址空間,並給這塊虛擬地址空間申請物理內存(基本是不連續的),並寫入頁表。
VMALLOC_START和VMALLOC_END在arm中的定義以下:
36/* 37 * Just any arbitrary offset to the start of the vmalloc VM area: the 38 * current 8MB value just means that there will be a 8MB "hole" after the 39 * physical memory until the kernel virtual memory starts. That means that 40 * any out-of-bounds memory accesses will hopefully be caught. 41 * The vmalloc() routines leaves a hole of 4kB between each vmalloced 42 * area for the same reason. ;) 43 */ 44#define VMALLOC_OFFSET (8*1024*1024) 45#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) 46#define VMALLOC_END 0xff800000UL 47
能夠看出VMALLOC_END爲0xff800000UL,但VMALLOC_START與high_memory有關。high_memory在sanity_check_meminfo中被確定:
static void * __initdata vmalloc_min = 1082 (void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET); 1083 1084/* 1085 * vmalloc=size forces the vmalloc area to be exactly 'size' 1086 * bytes. This can be used to increase (or decrease) the vmalloc 1087 * area - the default is 240m. 1088 */ 1089static int __init early_vmalloc(char *arg) 1090{ 1091 unsigned long vmalloc_reserve = memparse(arg, NULL); 1092 1093 if (vmalloc_reserve < SZ_16M) { 1094 vmalloc_reserve = SZ_16M; 1095 pr_warn("vmalloc area too small, limiting to %luMB\n", 1096 vmalloc_reserve >> 20); 1097 } 1098 1099 if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) { 1100 vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M); 1101 pr_warn("vmalloc area is too big, limiting to %luMB\n", 1102 vmalloc_reserve >> 20); 1103 } 1104 1105 vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve); 1106 return 0; 1107} 1108early_param("vmalloc", early_vmalloc); 1109 1110phys_addr_t arm_lowmem_limit __initdata = 0; 1111 1112void __init sanity_check_meminfo(void) 1113{ 1114 phys_addr_t memblock_limit = 0; 1115 int highmem = 0; 1116 phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1; 1117 struct memblock_region *reg; 1118 bool should_use_highmem = false; 1119 1120 for_each_memblock(memory, reg) { 1121 phys_addr_t block_start = reg->base; 1122 phys_addr_t block_end = reg->base + reg->size; 1123 phys_addr_t size_limit = reg->size; 1124 1125 if (reg->base >= vmalloc_limit) 1126 highmem = 1; 1127 else 1128 size_limit = vmalloc_limit - reg->base; 1129 1130 1131 if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { 1132 1133 if (highmem) { 1134 pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n", 1135 &block_start, &block_end); 1136 memblock_remove(reg->base, reg->size); 1137 should_use_highmem = true; 1138 continue; 1139 } 1140 1141 if (reg->size > size_limit) { 1142 phys_addr_t overlap_size = reg->size - size_limit; 1143 1144 pr_notice("Truncating RAM at %pa-%pa to -%pa", 1145 &block_start, &block_end, 
&vmalloc_limit); 1146 memblock_remove(vmalloc_limit, overlap_size); 1147 block_end = vmalloc_limit; 1148 should_use_highmem = true; 1149 } 1150 } 1151 1152 if (!highmem) { 1153 if (block_end > arm_lowmem_limit) { 1154 if (reg->size > size_limit) 1155 arm_lowmem_limit = vmalloc_limit; 1156 else 1157 arm_lowmem_limit = block_end; 1158 } 1159 1160 /* 1161 * Find the first non-pmd-aligned page, and point 1162 * memblock_limit at it. This relies on rounding the 1163 * limit down to be pmd-aligned, which happens at the 1164 * end of this function. 1165 * 1166 * With this algorithm, the start or end of almost any 1167 * bank can be non-pmd-aligned. The only exception is 1168 * that the start of the bank 0 must be section- 1169 * aligned, since otherwise memory would need to be 1170 * allocated when mapping the start of bank 0, which 1171 * occurs before any free memory is mapped. 1172 */ 1173 if (!memblock_limit) { 1174 if (!IS_ALIGNED(block_start, PMD_SIZE)) 1175 memblock_limit = block_start; 1176 else if (!IS_ALIGNED(block_end, PMD_SIZE)) 1177 memblock_limit = arm_lowmem_limit; 1178 } 1179 1180 } 1181 } 1182 1183 if (should_use_highmem) 1184 pr_notice("Consider using a HIGHMEM enabled kernel.\n"); 1185 1186 high_memory = __va(arm_lowmem_limit - 1) + 1; 1187 1188 if (!memblock_limit) 1189 memblock_limit = arm_lowmem_limit; 1190 1191 /* 1192 * Round the memblock limit down to a pmd size. This 1193 * helps to ensure that we will allocate memory from the 1194 * last full pmd, which should be mapped. 1195 */ 1196 memblock_limit = round_down(memblock_limit, PMD_SIZE); 1197 1198 memblock_set_current_limit(memblock_limit); 1199}
若是在bootargs裏面沒有vmalloc=的字段,vmalloc佔用的虛擬地址空間爲240MB;若是設置了該參數大小爲P,會把VMALLOC_END-P賦給vmalloc_min(P會被限制在SZ_16M到VMALLOC_END-(PAGE_OFFSET+SZ_32M)之間)。
sanity_check_meminfo對於vmalloc來講最重要的作用就是根據memblock裏面的內存塊確定arm_lowmem_limit的地址,使其不會與vmalloc區間重疊。
OK, VMALLOC_START與VMALLOC_END至此都已確定。
下面來看下這個函數的實現:
1649/** 1650 * __vmalloc_node_range - allocate virtually contiguous memory 1651 * @size: allocation size 1652 * @align: desired alignment 1653 * @start: vm area range start 1654 * @end: vm area range end 1655 * @gfp_mask: flags for the page level allocator 1656 * @prot: protection mask for the allocated pages 1657 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 1658 * @node: node to use for allocation or NUMA_NO_NODE 1659 * @caller: caller's return address 1660 * 1661 * Allocate enough pages to cover @size from the page level 1662 * allocator with @gfp_mask flags. Map them into contiguous 1663 * kernel virtual space, using a pagetable protection of @prot. 1664 */ 1665void *__vmalloc_node_range(unsigned long size, unsigned long align, 1666 unsigned long start, unsigned long end, gfp_t gfp_mask, 1667 pgprot_t prot, unsigned long vm_flags, int node, 1668 const void *caller) 1669{ 1670 struct vm_struct *area; 1671 void *addr; 1672 unsigned long real_size = size; 1673 1674 size = PAGE_ALIGN(size); 1675 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1676 goto fail; 1677 1678 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | 1679 vm_flags, start, end, node, gfp_mask, caller); 1680 if (!area) 1681 goto fail; 1682 1683 addr = __vmalloc_area_node(area, gfp_mask, prot, node); 1684 if (!addr) 1685 return NULL; 1686 1687 /* 1688 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 1689 * flag. It means that vm_struct is not fully initialized. 1690 * Now, it is fully initialized, so remove this flag here. 1691 */ 1692 clear_vm_uninitialized_flag(area); 1693 1694 /* 1695 * A ref_count = 2 is needed because vm_struct allocated in 1696 * __get_vm_area_node() contains a reference to the virtual address of 1697 * the vmalloc'ed block. 
1698 */ 1699 kmemleak_alloc(addr, real_size, 2, gfp_mask); 1700 1701 return addr; 1702 1703fail: 1704 warn_alloc_failed(gfp_mask, 0, 1705 "vmalloc: allocation failure: %lu bytes\n", 1706 real_size); 1707 return NULL; 1708} 1709
先調用__get_vm_area_node在vmap_area組成的紅黑樹中找到一個位置,把由該空間組成的vmap_area插入紅黑樹。
而後調用setup_vmalloc_vm 把該空間保存在vm_struct中。
1317static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1318 unsigned long flags, const void *caller) 1319{ 1320 spin_lock(&vmap_area_lock); 1321 vm->flags = flags; 1322 vm->addr = (void *)va->va_start; 1323 vm->size = va->va_end - va->va_start; 1324 vm->caller = caller; 1325 va->vm = vm; 1326 va->flags |= VM_VM_AREA; 1327 spin_unlock(&vmap_area_lock); 1328}
而後回到__vmalloc_node_range中,申請完虛擬地址空間後,接着調用__vmalloc_area_node 來申請具體的物理頁,並把這些頁和對應的虛擬地址填入頁表。
1591static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1592 pgprot_t prot, int node) 1593{ 1594 const int order = 0; 1595 struct page **pages; 1596 unsigned int nr_pages, array_size, i; 1597 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1598 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; 1599 1600 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 1601 array_size = (nr_pages * sizeof(struct page *)); 1602 1603 area->nr_pages = nr_pages; 1604 /* Please note that the recursion is strictly bounded. */ 1605 if (array_size > PAGE_SIZE) { 1606 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1607 PAGE_KERNEL, node, area->caller); 1608 area->flags |= VM_VPAGES; 1609 } else { 1610 pages = kmalloc_node(array_size, nested_gfp, node); 1611 } 1612 area->pages = pages; 1613 if (!area->pages) { 1614 remove_vm_area(area->addr); 1615 kfree(area); 1616 return NULL; 1617 } 1618 1619 for (i = 0; i < area->nr_pages; i++) { 1620 struct page *page; 1621 1622 if (node == NUMA_NO_NODE) 1623 page = alloc_page(alloc_mask); 1624 else 1625 page = alloc_pages_node(node, alloc_mask, order); 1626 1627 if (unlikely(!page)) { 1628 /* Successfully allocated i pages, free them in __vunmap() */ 1629 area->nr_pages = i; 1630 goto fail; 1631 } 1632 area->pages[i] = page; 1633 if (gfpflags_allow_blocking(gfp_mask)) 1634 cond_resched(); 1635 } 1636 1637 if (map_vm_area(area, prot, pages)) 1638 goto fail; 1639 return area->addr; 1640 1641fail: 1642 warn_alloc_failed(gfp_mask, order, 1643 "vmalloc: allocation failure, allocated %ld of %ld bytes\n", 1644 (area->nr_pages*PAGE_SIZE), area->size); 1645 vfree(area->addr); 1646 return NULL; 1647} 1648
首先爲pages數組申請一段連續的虛擬地址空間(不超過1頁時使用kmalloc,超過1頁時調vmalloc來保證虛擬地址空間的連續性),用來存入申請的物理頁對應的page結構體,而後會申請nr_pages個物理頁(注意是一次申請一頁,所以申請到的頁基本是不連續的)。
最終經過map_vm_area把這些物理頁和虛擬地址空間對應起來,寫入頁表。
另外,社區還有對vmalloc的頁面進行區分的patch。以下:
diff --git a/fs/proc/page.c b/fs/proc/page.c index 792c78a49174..fc83dae1af7b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -156,6 +156,8 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_BALLOON; if (PageTable(page)) u |= 1 << KPF_PGTABLE; + if (PageVMalloc(page)) + u |= 1 << KPF_VMALLOC; if (page_is_idle(page)) u |= 1 << KPF_IDLE; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 42619e16047f..c51ddd27bfb4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,6 +141,11 @@ struct page { spinlock_t ptl; #endif }; + struct { /* VMalloc pages */ + struct vm_struct *vm_area; + unsigned long vm_offset; + unsigned long _vm_id; /* MAPPING_VMalloc */ + }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 901943e4754b..5232433175c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -699,6 +699,31 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg) */ PAGE_TYPE_OPS(Table, table) +/* + * vmalloc pages may be mapped to userspace, so we need some other way + * to distinguish them from other kinds of pages. Use page->mapping + * for this purpose. Values below 0x1000 cannot be real pointers. 
+ */ +#define MAPPING_VMalloc (void *)0x440 + +#define PAGE_MAPPING_OPS(name) \ +static __always_inline int Page##name(struct page *page) \ +{ \ + return page->mapping == MAPPING_##name; \ +} \ +static __always_inline void __SetPage##name(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(page->mapping != NULL, page); \ + page->mapping = MAPPING_##name; \ +} \ +static __always_inline void __ClearPage##name(struct page *page) \ +{ \ + VM_BUG_ON_PAGE(page->mapping != MAPPING_##name, page); \ + page->mapping = NULL; \ +} + +PAGE_MAPPING_OPS(VMalloc) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6800968b8f47 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -36,5 +36,6 @@ #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 +#define KPF_VMALLOC 27 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5fbf27e7f956..98bc690d472d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1535,7 +1535,7 @@ static void __vunmap(const void *addr, int deallocate_pages) for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; - BUG_ON(!page); + __ClearPageVMalloc(page); __free_pages(page, 0); } @@ -1704,6 +1704,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = i; goto fail; } + __SetPageVMalloc(page); + page->vm_area = area; + page->vm_offset = i; area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index cce853dca691..25cc21855be4 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -132,6 +132,7 @@ static const char * const page_flag_names[] = { [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", [KPF_PGTABLE] = "g:pgtable", + [KPF_VMALLOC] = "V:vmalloc", [KPF_ZERO_PAGE] = 
"z:zero_page", [KPF_IDLE] = "i:idle_page",