Topic: Linux Memory Management Series
Keywords: RMAP, page migration.
Page migration was originally introduced to give NUMA systems the ability to move a process's pages to an arbitrary memory node; later, memory compaction and memory hotplug came to use the same facility.
Page migration moves an old page's contents to a new page. It must allocate the new page and copy the old page's contents into it, then use RMAP to tear down the old page's mappings and re-establish those mappings on the new page.
Linux provides the migrate_pages system call: it reads the source node set from old_nodes and the destination node set from new_nodes, then passes the target process's mm_struct to do_migrate_pages, which performs the migration.
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
        const unsigned long __user *, old_nodes,
        const unsigned long __user *, new_nodes)
{
    const struct cred *cred = current_cred(), *tcred;
    struct mm_struct *mm = NULL;
    struct task_struct *task;
    nodemask_t task_nodes;
    int err;
    nodemask_t *old;
    nodemask_t *new;
    NODEMASK_SCRATCH(scratch);

    if (!scratch)
        return -ENOMEM;

    old = &scratch->mask1;
    new = &scratch->mask2;

    err = get_nodes(old, old_nodes, maxnode);
    if (err)
        goto out;

    err = get_nodes(new, new_nodes, maxnode);
    if (err)
        goto out;
...
    mm = get_task_mm(task);
    put_task_struct(task);

    if (!mm) {
        err = -EINVAL;
        goto out;
    }

    err = do_migrate_pages(mm, old, new,
        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
...
}
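To see the user-space side of this interface, here is a minimal sketch that invokes the raw system call. It assumes a machine with at least two NUMA nodes; the node numbers and the single-word nodemask are illustrative. On success, migrate_pages(2) returns the number of pages that could not be moved.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    unsigned long old_nodes = 1UL << 0;    /* source: node 0 (assumed to exist) */
    unsigned long new_nodes = 1UL << 1;    /* destination: node 1 (assumed to exist) */

    /* maxnode bounds the bitmasks; 64 covers one unsigned long of nodemask */
    long rc = syscall(SYS_migrate_pages, getpid(), 64,
                      &old_nodes, &new_nodes);
    if (rc < 0)
        perror("migrate_pages");
    else
        printf("pages that could not be moved: %ld\n", rc);
    return 0;
}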
do_migrate_pages ultimately hands the work to migrate_pages: do_migrate_pages --> migrate_to_node --> migrate_pages.
migrate_pages-------------------------------------core function for page migration
  unmap_and_move
    get_new_page------------------------------allocate a new page
    __unmap_and_move--------------------------migrate the page to the new page
      move_to_new_page
        page_mapping----------------------find the address space the page belongs to
        migrate_page----------------------carry the old page's state over to the new page
          migrate_page_copy
      remove_migration_ptes-------------use reverse mapping (RMAP) to find every PTE that mapped the old page
        remove_migration_pte----------handle one of those virtual addresses
from is the list of pages to be migrated; get_new_page is a function pointer that allocates the new page; put_new_page is a function pointer that frees the target page if migration fails; private is passed through to get_new_page; mode is the migration mode; reason records why the migration was requested.
int migrate_pages(struct list_head *from, new_page_t get_new_page,
        free_page_t put_new_page, unsigned long private,
        enum migrate_mode mode, int reason)
{
    int retry = 1;
    int nr_failed = 0;
    int nr_succeeded = 0;
    int pass = 0;
    struct page *page;
    struct page *page2;
    int swapwrite = current->flags & PF_SWAPWRITE;
    int rc;

    if (!swapwrite)
        current->flags |= PF_SWAPWRITE;

    for(pass = 0; pass < 10 && retry; pass++) {--------------------------make up to 10 passes; each pass takes a page off "from" and calls unmap_and_move() to migrate it; MIGRATEPAGE_SUCCESS means the migration succeeded.
        retry = 0;

        list_for_each_entry_safe(page, page2, from, lru) {
            cond_resched();

            if (PageHuge(page))
                rc = unmap_and_move_huge_page(get_new_page,
                        put_new_page, private, page,
                        pass > 2, mode);
            else
                rc = unmap_and_move(get_new_page, put_new_page,
                        private, page, pass > 2, mode);

            switch(rc) {
            case -ENOMEM:
                goto out;
            case -EAGAIN:
                retry++;
                break;
            case MIGRATEPAGE_SUCCESS:
                nr_succeeded++;
                break;
            default:
                /*
                 * Permanent failure (-EBUSY, -ENOSYS, etc.):
                 * unlike -EAGAIN case, the failed page is
                 * removed from migration page list and not
                 * retried in the next outer loop.
                 */
                nr_failed++;
                break;
            }
        }
    }
    rc = nr_failed + retry;
out:
    if (nr_succeeded)
        count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
    if (nr_failed)
        count_vm_events(PGMIGRATE_FAIL, nr_failed);
    trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);

    if (!swapwrite)
        current->flags &= ~PF_SWAPWRITE;

    return rc;
}
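To show how a caller plugs into this interface, here is a hedged sketch modeled on the NUMA path of this kernel era (migrate_to_node() in mm/mempolicy.c and its new_node_page() callback); the wrapper function is hypothetical, but the new_page_t signature and the migrate_pages() invocation match the code above.

/* Illustrative get_new_page callback: allocate the replacement page on
 * the destination node carried in "private". */
static struct page *new_node_page(struct page *page, unsigned long node,
                                  int **result)
{
    return alloc_pages_exact_node((int)node,
            GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

/* Hypothetical caller: "pagelist" holds pages already isolated from the LRU. */
static int migrate_list_to_node(struct list_head *pagelist, int target_node)
{
    int err = migrate_pages(pagelist, new_node_page, NULL,
                target_node, MIGRATE_SYNC, MR_SYSCALL);
    if (err)
        putback_movable_pages(pagelist);    /* return the failures to the LRU */
    return err;
}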
newpage is the page allocated by get_new_page; __unmap_and_move tries to migrate page into this newly allocated newpage.
__unmap_and_move is called by unmap_and_move; force is set to 1 once the number of passes exceeds 2 (pass > 2).
static int __unmap_and_move(struct page *page, struct page *newpage,
                int force, enum migrate_mode mode)
{
    int rc = -EAGAIN;
    int page_was_mapped = 0;
    struct anon_vma *anon_vma = NULL;

    if (!trylock_page(page)) {------------------------------------try to lock the page: false means another process already holds the page lock, true means the current process took it.
        if (!force || mode == MIGRATE_ASYNC)----------------------lock not taken: skip this page unless this is a forced, non-asynchronous migration.
            goto out;

        /*
         * It's not safe for direct compaction to call lock_page.
         * For example, during page readahead pages are added locked
         * to the LRU. Later, when the IO completes the pages are
         * marked uptodate and unlocked. However, the queueing
         * could be merging multiple pages for one bio (e.g.
         * mpage_readpages). If an allocation happens for the
         * second or third page, the process can end up locking
         * the same page twice and deadlocking. Rather than
         * trying to be clever about what pages can be locked,
         * avoid the use of lock_page for direct compaction
         * altogether.
         */
        if (current->flags & PF_MEMALLOC)-------------------------we may be on the direct memory-compaction path, where sleeping on the page lock is unsafe; skip this page.
            goto out;

        lock_page(page);
    }

    if (PageWriteback(page)) {
        /*
         * Only in the case of a full synchronous migration is it
         * necessary to wait for PageWriteback. In the async case,
         * the retry loop is too short and in the sync-light case,
         * the overhead of stalling is too much
         */
        if (mode != MIGRATE_SYNC) {
            rc = -EBUSY;
            goto out_unlock;
        }
        if (!force)
            goto out_unlock;
        wait_on_page_writeback(page);
    }
    /*
     * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
     * we cannot notice that anon_vma is freed while we migrates a page.
     * This get_anon_vma() delays freeing anon_vma pointer until the end
     * of migration. File cache pages are no problem because of page_lock()
     * File Caches may use write_page() or lock_page() in migration, then,
     * just care Anon page here.
     */
    if (PageAnon(page) && !PageKsm(page)) {
        /*
         * Only page_lock_anon_vma_read() understands the subtleties of
         * getting a hold on an anon_vma from outside one of its mms.
         */
        anon_vma = page_get_anon_vma(page);
        if (anon_vma) {
            /*
             * Anon page
             */
        } else if (PageSwapCache(page)) {
            /*
             * We cannot be sure that the anon_vma of an unmapped
             * swapcache page is safe to use because we don't
             * know in advance if the VMA that this page belonged
             * to still exists. If the VMA and others sharing the
             * data have been freed, then the anon_vma could
             * already be invalid.
             *
             * To avoid this possibility, swapcache pages get
             * migrated but are not remapped when migration
             * completes
             */
        } else {
            goto out_unlock;
        }
    }

    if (unlikely(isolated_balloon_page(page))) {
        /*
         * A ballooned page does not need any special attention from
         * physical to virtual reverse mapping procedures.
         * Skip any attempt to unmap PTEs or to remap swap cache,
         * in order to avoid burning cycles at rmap level, and perform
         * the page migration right away (proteced by page lock).
         */
        rc = balloon_page_migrate(newpage, page, mode);
        goto out_unlock;
    }

    /*
     * Corner case handling:
     * 1. When a new swap-cache page is read into, it is added to the LRU
     * and treated as swapcache but it has no rmap yet.
     * Calling try_to_unmap() against a page->mapping==NULL page will
     * trigger a BUG. So handle it here.
     * 2. An orphaned page (see truncate_complete_page) might have
     * fs-private metadata. The page can be picked up due to memory
     * offlining. Everywhere else except page reclaim, the page is
     * invisible to the vm, so the page can not be migrated. So try to
     * free the metadata, so the page can be freed.
     */
    if (!page->mapping) {
        VM_BUG_ON_PAGE(PageAnon(page), page);
        if (page_has_private(page)) {
            try_to_free_buffers(page);
            goto out_unlock;
        }
        goto skip_unmap;
    }

    /* Establish migration ptes or remove ptes */
    if (page_mapped(page)) {----------------------------------------------the page still has PTE mappings; try_to_unmap() removes them all, leaving migration entries behind.
        try_to_unmap(page,
            TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
        page_was_mapped = 1;
    }

skip_unmap:
    if (!page_mapped(page))-----------------------------------------------all mappings have been removed; migrate the page to the newly allocated newpage.
        rc = move_to_new_page(newpage, page, page_was_mapped, mode);

    if (rc && page_was_mapped)--------------------------------------------a nonzero rc means the migration failed; remove_migration_ptes() removes the migration PTEs and remaps the old page.
        remove_migration_ptes(page, page);

    /* Drop an anon_vma reference if we took one */
    if (anon_vma)
        put_anon_vma(anon_vma);

out_unlock:
    unlock_page(page);
out:
    return rc;
}
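It is worth spelling out what try_to_unmap() with TTU_MIGRATION leaves behind: each PTE is replaced not with an empty entry but with a "migration entry", a swap-format PTE that encodes the old page. A condensed fragment in the spirit of try_to_unmap_one() (variable names taken from that context) looks roughly like this:

/* Condensed sketch: replace a just-cleared present PTE with a migration
 * entry. "pteval" is the PTE value cleared from the page table. */
swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
pte_t swp_pte = swp_entry_to_pte(entry);

if (pte_soft_dirty(pteval))
    swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);    /* faults on this PTE now wait for migration */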
move_to_new_page tries to migrate the contents of page into newpage; mode selects asynchronous or synchronous migration.
static int move_to_new_page(struct page *newpage, struct page *page,
                int page_was_mapped, enum migrate_mode mode)
{
    struct address_space *mapping;
    int rc;

    /*
     * Block others from accessing the page when we get around to
     * establishing additional references. We are the only one
     * holding a reference to the new page at this point.
     */
    if (!trylock_page(newpage))--------------------------------------------failure to take the lock means another process holds the new page's lock, which should never happen here; treat it as a BUG.
        BUG();

    /* Prepare mapping for the new page.*/
    newpage->index = page->index;
    newpage->mapping = page->mapping;
    if (PageSwapBacked(page))
        SetPageSwapBacked(newpage);

    mapping = page_mapping(page);-------------------------------------------find the page's address space: NULL for slab or anonymous pages, swap_address_space() for swap-cache pages, page->mapping for ordinary page-cache pages.
    if (!mapping)
        rc = migrate_page(mapping, newpage, page, mode);--------------------no mapping (slab or anonymous page): migrate_page() carries the old page's state over to the new page.
    else if (mapping->a_ops->migratepage)
        /*
         * Most pages have a mapping and most filesystems provide a
         * migratepage callback. Anonymous pages are part of swap
         * space which also has its own migratepage callback. This
         * is the most common path for page migration.
         */
        rc = mapping->a_ops->migratepage(mapping,
                        newpage, page, mode);-------------------------------there is a mapping: call the address space's migratepage method.
    else
        rc = fallback_migrate_page(mapping, newpage, page, mode);

    if (rc != MIGRATEPAGE_SUCCESS) {
        newpage->mapping = NULL;
    } else {
        mem_cgroup_migrate(page, newpage, false);
        if (page_was_mapped)
            remove_migration_ptes(page, newpage);
        page->mapping = NULL;
    }

    unlock_page(newpage);

    return rc;
}
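For the mapping->a_ops->migratepage branch, most filesystems simply point the callback at a generic helper; in kernels of this era, ext4 wires it to buffer_migrate_page() in its address_space_operations (abridged below; the other methods are elided).

/* Abridged from fs/ext4/inode.c: the migratepage callback handles pages
 * that carry buffer_heads. */
static const struct address_space_operations ext4_aops = {
    .readpage       = ext4_readpage,
    .writepage      = ext4_writepage,
    .migratepage    = buffer_migrate_page,
    /* ... other methods elided ... */
};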
The migrate_page function performs the actual page copy.
int migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page,
        enum migrate_mode mode)
{
    int rc;

    BUG_ON(PageWriteback(page));    /* Writeback must be complete */

    rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);-------------for an anonymous page this does nothing and returns success directly.
    if (rc != MIGRATEPAGE_SUCCESS)
        return rc;

    migrate_page_copy(newpage, page);--------------------------------------------------copy page into newpage.
    return MIGRATEPAGE_SUCCESS;
}
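migrate_page_copy() itself is mostly bookkeeping: copy the data, then carry the interesting page flags over. A hedged condensation of what it does (the function name here is illustrative; the real helper also handles huge pages, mlock state, and dirtying more carefully):

/* Condensed sketch of migrate_page_copy(): copy the contents, then
 * transfer the page flags that describe the old page's state. */
static void migrate_page_copy_sketch(struct page *newpage, struct page *page)
{
    copy_highpage(newpage, page);       /* copy the actual page contents */

    if (PageError(page))
        SetPageError(newpage);
    if (PageReferenced(page))
        SetPageReferenced(newpage);
    if (PageUptodate(page))
        SetPageUptodate(newpage);
    if (PageActive(page))
        SetPageActive(newpage);
    if (PageDirty(page))
        SetPageDirty(newpage);          /* the real code dirties more carefully */
}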
remove_migration_ptes uses the RMAP reverse-mapping machinery to find every PTE that mapped the old page, then establishes the corresponding mappings to the new page.
static void remove_migration_ptes(struct page *old, struct page *new)
{
    struct rmap_walk_control rwc = {
        .rmap_one = remove_migration_pte,
        .arg = old,
    };

    rmap_walk(new, &rwc);
}

static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                 unsigned long addr, void *old)
{
    struct mm_struct *mm = vma->vm_mm;
    swp_entry_t entry;
    pmd_t *pmd;
    pte_t *ptep, pte;
    spinlock_t *ptl;

    if (unlikely(PageHuge(new))) {
        ptep = huge_pte_offset(mm, addr);
        if (!ptep)
            goto out;
        ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
    } else {
        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
            goto out;

        ptep = pte_offset_map(pmd, addr);-------------------------------------find the page-table entry for addr in mm.

        /*
         * Peek to check is_swap_pte() before taking ptlock? No, we
         * can race mremap's move_ptes(), which skips anon_vma lock.
         */

        ptl = pte_lockptr(mm, pmd);
    }

    spin_lock(ptl);
    pte = *ptep;
    if (!is_swap_pte(pte))
        goto unlock;

    entry = pte_to_swp_entry(pte);

    if (!is_migration_entry(entry) ||
        migration_entry_to_page(entry) != old)
        goto unlock;

    get_page(new);
    pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
    if (pte_swp_soft_dirty(*ptep))
        pte = pte_mksoft_dirty(pte);

    /* Recheck VMA as permissions can change since migration started */
    if (is_write_migration_entry(entry))
        pte = maybe_mkwrite(pte, vma);

#ifdef CONFIG_HUGETLB_PAGE
    if (PageHuge(new)) {
        pte = pte_mkhuge(pte);
        pte = arch_make_huge_pte(pte, vma, new, 0);
    }
#endif
    flush_dcache_page(new);
    set_pte_at(mm, addr, ptep, pte);-----------------------------------------install a PTE pointing at the new page, re-establishing the mapping.

    if (PageHuge(new)) {
        if (PageAnon(new))
            hugepage_add_anon_rmap(new, vma, addr);
        else
            page_dup_rmap(new);
    } else if (PageAnon(new))
        page_add_anon_rmap(new, vma, addr);
    else
        page_add_file_rmap(new);---------------------------------------------add the new page to the RMAP reverse-mapping system.

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, addr, ptep);---------------------------------------update the corresponding cache.

unlock:
    pte_unmap_unlock(ptep, ptl);
out:
    return SWAP_AGAIN;
}
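The last piece of the picture is what happens when a thread touches the page mid-migration: the fault handler recognizes the migration entry and simply blocks until remove_migration_pte() has run. A condensed fragment from the do_swap_page() path of this kernel era:

/* Condensed: in do_swap_page(), a migration entry is not a real swap
 * entry; the faulting task just waits for the migration to finish. */
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
    if (is_migration_entry(entry)) {
        migration_entry_wait(mm, pmd, address);    /* sleep until the migration PTE is gone */
        goto out;
    }
}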