Topic: Linux Memory Management Series
Keywords: RMAP, page migration.
Page migration was originally introduced to give NUMA systems the ability to move a process's pages to an arbitrary memory node; later, memory compaction and memory hotplug came to use the same facility.
Page migration moves an old page's contents to a new page. It must allocate the new page and copy the old page's contents into it, then use RMAP to tear down the old page's mappings and re-establish those mappings on the new page.
Linux provides the migrate_pages system call: it reads the source node set from old_nodes and the destination node set from new_nodes, then passes the target process's mm_struct to do_migrate_pages, which performs the migration.
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
        const unsigned long __user *, old_nodes,
        const unsigned long __user *, new_nodes)
{
    const struct cred *cred = current_cred(), *tcred;
    struct mm_struct *mm = NULL;
    struct task_struct *task;
    nodemask_t task_nodes;
    int err;
    nodemask_t *old;
    nodemask_t *new;
    NODEMASK_SCRATCH(scratch);

    if (!scratch)
        return -ENOMEM;

    old = &scratch->mask1;
    new = &scratch->mask2;

    err = get_nodes(old, old_nodes, maxnode);
    if (err)
        goto out;

    err = get_nodes(new, new_nodes, maxnode);
    if (err)
        goto out;
...
    mm = get_task_mm(task);
    put_task_struct(task);

    if (!mm) {
        err = -EINVAL;
        goto out;
    }

    err = do_migrate_pages(mm, old, new,
        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
...
}
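To see the user-space side of this interface, here is a minimal sketch that invokes the raw system call. It assumes a machine with at least two NUMA nodes; the node numbers and the single-word nodemask are illustrative. On success, migrate_pages(2) returns the number of pages that could not be moved.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    unsigned long old_nodes = 1UL << 0;    /* source: node 0 (assumed to exist) */
    unsigned long new_nodes = 1UL << 1;    /* destination: node 1 (assumed to exist) */

    /* maxnode bounds the bitmasks; 64 covers one unsigned long of nodemask */
    long rc = syscall(SYS_migrate_pages, getpid(), 64,
                      &old_nodes, &new_nodes);
    if (rc < 0)
        perror("migrate_pages");
    else
        printf("pages that could not be moved: %ld\n", rc);
    return 0;
}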
do_migrate_pages ultimately hands the work to migrate_pages: do_migrate_pages --> migrate_to_node --> migrate_pages.
migrate_pages-------------------------------------core function for page migration
  unmap_and_move
    get_new_page------------------------------allocate a new page
    __unmap_and_move--------------------------migrate the page to the new page
      move_to_new_page
        page_mapping----------------------find the address space the page belongs to
        migrate_page----------------------carry the old page's state over to the new page
          migrate_page_copy
      remove_migration_ptes-------------use reverse mapping (RMAP) to find every PTE that mapped the old page
        remove_migration_pte----------handle one of those virtual addresses
from is the list of pages to be migrated; get_new_page is a function pointer that allocates the new page; put_new_page is a function pointer that frees the target page if migration fails; private is passed through to get_new_page; mode is the migration mode; reason records why the migration was requested.
int migrate_pages(struct list_head *from, new_page_t get_new_page,
        free_page_t put_new_page, unsigned long private,
        enum migrate_mode mode, int reason)
{
    int retry = 1;
    int nr_failed = 0;
    int nr_succeeded = 0;
    int pass = 0;
    struct page *page;
    struct page *page2;
    int swapwrite = current->flags & PF_SWAPWRITE;
    int rc;

    if (!swapwrite)
        current->flags |= PF_SWAPWRITE;

    for(pass = 0; pass < 10 && retry; pass++) {--------------------------make up to 10 passes; each pass takes a page off "from" and calls unmap_and_move() to migrate it; MIGRATEPAGE_SUCCESS means the migration succeeded.
        retry = 0;

        list_for_each_entry_safe(page, page2, from, lru) {
            cond_resched();

            if (PageHuge(page))
                rc = unmap_and_move_huge_page(get_new_page,
                        put_new_page, private, page,
                        pass > 2, mode);
            else
                rc = unmap_and_move(get_new_page, put_new_page,
                        private, page, pass > 2, mode);

            switch(rc) {
            case -ENOMEM:
                goto out;
            case -EAGAIN:
                retry++;
                break;
            case MIGRATEPAGE_SUCCESS:
                nr_succeeded++;
                break;
            default:
                /*
                 * Permanent failure (-EBUSY, -ENOSYS, etc.):
                 * unlike -EAGAIN case, the failed page is
                 * removed from migration page list and not
                 * retried in the next outer loop.
                 */
                nr_failed++;
                break;
            }
        }
    }
    rc = nr_failed + retry;
out:
    if (nr_succeeded)
        count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
    if (nr_failed)
        count_vm_events(PGMIGRATE_FAIL, nr_failed);
    trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);

    if (!swapwrite)
        current->flags &= ~PF_SWAPWRITE;

    return rc;
}
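To show how a caller plugs into this interface, here is a hedged sketch modeled on the NUMA path of this kernel era (migrate_to_node() in mm/mempolicy.c and its new_node_page() callback); the wrapper function is hypothetical, but the new_page_t signature and the migrate_pages() invocation match the code above.

/* Illustrative get_new_page callback: allocate the replacement page on
 * the destination node carried in "private". */
static struct page *new_node_page(struct page *page, unsigned long node,
                                  int **result)
{
    return alloc_pages_exact_node((int)node,
            GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

/* Hypothetical caller: "pagelist" holds pages already isolated from the LRU. */
static int migrate_list_to_node(struct list_head *pagelist, int target_node)
{
    int err = migrate_pages(pagelist, new_node_page, NULL,
                target_node, MIGRATE_SYNC, MR_SYSCALL);
    if (err)
        putback_movable_pages(pagelist);    /* return the failures to the LRU */
    return err;
}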
newpage is the page allocated by get_new_page; __unmap_and_move tries to migrate page into this newly allocated newpage.
__unmap_and_move is called by unmap_and_move; force is set to 1 once the number of passes exceeds 2 (pass > 2).
static int __unmap_and_move(struct page *page, struct page *newpage,
                int force, enum migrate_mode mode)
{
    int rc = -EAGAIN;
    int page_was_mapped = 0;
    struct anon_vma *anon_vma = NULL;

    if (!trylock_page(page)) {------------------------------------try to lock the page: false means another process already holds the page lock, true means the current process took it.
        if (!force || mode == MIGRATE_ASYNC)----------------------lock not taken: skip this page unless this is a forced, non-asynchronous migration.
            goto out;

        /*
         * It's not safe for direct compaction to call lock_page.
         * For example, during page readahead pages are added locked
         * to the LRU. Later, when the IO completes the pages are
         * marked uptodate and unlocked. However, the queueing
         * could be merging multiple pages for one bio (e.g.
         * mpage_readpages). If an allocation happens for the
         * second or third page, the process can end up locking
         * the same page twice and deadlocking. Rather than
         * trying to be clever about what pages can be locked,
         * avoid the use of lock_page for direct compaction
         * altogether.
         */
        if (current->flags & PF_MEMALLOC)-------------------------we may be on the direct memory-compaction path, where sleeping on the page lock is unsafe; skip this page.
            goto out;

        lock_page(page);
    }

    if (PageWriteback(page)) {
        /*
         * Only in the case of a full synchronous migration is it
         * necessary to wait for PageWriteback. In the async case,
         * the retry loop is too short and in the sync-light case,
         * the overhead of stalling is too much
         */
        if (mode != MIGRATE_SYNC) {
            rc = -EBUSY;
            goto out_unlock;
        }
        if (!force)
            goto out_unlock;
        wait_on_page_writeback(page);
    }
    /*
     * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
     * we cannot notice that anon_vma is freed while we migrates a page.
     * This get_anon_vma() delays freeing anon_vma pointer until the end
     * of migration. File cache pages are no problem because of page_lock()
     * File Caches may use write_page() or lock_page() in migration, then,
     * just care Anon page here.
     */
    if (PageAnon(page) && !PageKsm(page)) {
        /*
         * Only page_lock_anon_vma_read() understands the subtleties of
         * getting a hold on an anon_vma from outside one of its mms.
         */
        anon_vma = page_get_anon_vma(page);
        if (anon_vma) {
            /*
             * Anon page
             */
        } else if (PageSwapCache(page)) {
            /*
             * We cannot be sure that the anon_vma of an unmapped
             * swapcache page is safe to use because we don't
             * know in advance if the VMA that this page belonged
             * to still exists. If the VMA and others sharing the
             * data have been freed, then the anon_vma could
             * already be invalid.
             *
             * To avoid this possibility, swapcache pages get
             * migrated but are not remapped when migration
             * completes
             */
        } else {
            goto out_unlock;
        }
    }

    if (unlikely(isolated_balloon_page(page))) {
        /*
         * A ballooned page does not need any special attention from
         * physical to virtual reverse mapping procedures.
         * Skip any attempt to unmap PTEs or to remap swap cache,
         * in order to avoid burning cycles at rmap level, and perform
         * the page migration right away (proteced by page lock).
         */
        rc = balloon_page_migrate(newpage, page, mode);
        goto out_unlock;
    }

    /*
     * Corner case handling:
     * 1. When a new swap-cache page is read into, it is added to the LRU
     * and treated as swapcache but it has no rmap yet.
     * Calling try_to_unmap() against a page->mapping==NULL page will
     * trigger a BUG. So handle it here.
     * 2. An orphaned page (see truncate_complete_page) might have
     * fs-private metadata. The page can be picked up due to memory
     * offlining. Everywhere else except page reclaim, the page is
     * invisible to the vm, so the page can not be migrated. So try to
     * free the metadata, so the page can be freed.
     */
    if (!page->mapping) {
        VM_BUG_ON_PAGE(PageAnon(page), page);
        if (page_has_private(page)) {
            try_to_free_buffers(page);
            goto out_unlock;
        }
        goto skip_unmap;
    }

    /* Establish migration ptes or remove ptes */
    if (page_mapped(page)) {----------------------------------------------the page still has PTE mappings; try_to_unmap() removes them all, leaving migration entries behind.
        try_to_unmap(page,
            TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
        page_was_mapped = 1;
    }

skip_unmap:
    if (!page_mapped(page))-----------------------------------------------all mappings have been removed; migrate the page to the newly allocated newpage.
        rc = move_to_new_page(newpage, page, page_was_mapped, mode);

    if (rc && page_was_mapped)--------------------------------------------a nonzero rc means the migration failed; remove_migration_ptes() removes the migration PTEs and remaps the old page.
        remove_migration_ptes(page, page);

    /* Drop an anon_vma reference if we took one */
    if (anon_vma)
        put_anon_vma(anon_vma);

out_unlock:
    unlock_page(page);
out:
    return rc;
}
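It is worth spelling out what try_to_unmap() with TTU_MIGRATION leaves behind: each PTE is replaced not with an empty entry but with a "migration entry", a swap-format PTE that encodes the old page. A condensed fragment in the spirit of try_to_unmap_one() (variable names taken from that context) looks roughly like this:

/* Condensed sketch: replace a just-cleared present PTE with a migration
 * entry. "pteval" is the PTE value cleared from the page table. */
swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
pte_t swp_pte = swp_entry_to_pte(entry);

if (pte_soft_dirty(pteval))
    swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);    /* faults on this PTE now wait for migration */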
move_to_new_page tries to migrate the contents of page into newpage; mode selects asynchronous or synchronous migration.
static int move_to_new_page(struct page *newpage, struct page *page,
                int page_was_mapped, enum migrate_mode mode)
{
    struct address_space *mapping;
    int rc;

    /*
     * Block others from accessing the page when we get around to
     * establishing additional references. We are the only one
     * holding a reference to the new page at this point.
     */
    if (!trylock_page(newpage))--------------------------------------------failure to take the lock means another process holds the new page's lock, which should never happen here; treat it as a BUG.
        BUG();

    /* Prepare mapping for the new page.*/
    newpage->index = page->index;
    newpage->mapping = page->mapping;
    if (PageSwapBacked(page))
        SetPageSwapBacked(newpage);

    mapping = page_mapping(page);-------------------------------------------find the page's address space: NULL for slab or anonymous pages, swap_address_space() for swap-cache pages, page->mapping for ordinary page-cache pages.
    if (!mapping)
        rc = migrate_page(mapping, newpage, page, mode);--------------------no mapping (slab or anonymous page): migrate_page() carries the old page's state over to the new page.
    else if (mapping->a_ops->migratepage)
        /*
         * Most pages have a mapping and most filesystems provide a
         * migratepage callback. Anonymous pages are part of swap
         * space which also has its own migratepage callback. This
         * is the most common path for page migration.
         */
        rc = mapping->a_ops->migratepage(mapping,
                        newpage, page, mode);-------------------------------there is a mapping: call the address space's migratepage method.
    else
        rc = fallback_migrate_page(mapping, newpage, page, mode);

    if (rc != MIGRATEPAGE_SUCCESS) {
        newpage->mapping = NULL;
    } else {
        mem_cgroup_migrate(page, newpage, false);
        if (page_was_mapped)
            remove_migration_ptes(page, newpage);
        page->mapping = NULL;
    }

    unlock_page(newpage);

    return rc;
}
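For the mapping->a_ops->migratepage branch, most filesystems simply point the callback at a generic helper; in kernels of this era, ext4 wires it to buffer_migrate_page() in its address_space_operations (abridged below; the other methods are elided).

/* Abridged from fs/ext4/inode.c: the migratepage callback handles pages
 * that carry buffer_heads. */
static const struct address_space_operations ext4_aops = {
    .readpage       = ext4_readpage,
    .writepage      = ext4_writepage,
    .migratepage    = buffer_migrate_page,
    /* ... other methods elided ... */
};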
The migrate_page function performs the actual page copy.
int migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page,
        enum migrate_mode mode)
{
    int rc;

    BUG_ON(PageWriteback(page));    /* Writeback must be complete */

    rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);-------------for an anonymous page this does nothing and returns success directly.
    if (rc != MIGRATEPAGE_SUCCESS)
        return rc;

    migrate_page_copy(newpage, page);--------------------------------------------------copy page into newpage.
    return MIGRATEPAGE_SUCCESS;
}
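migrate_page_copy() itself is mostly bookkeeping: copy the data, then carry the interesting page flags over. A hedged condensation of what it does (the function name here is illustrative; the real helper also handles huge pages, mlock state, and dirtying more carefully):

/* Condensed sketch of migrate_page_copy(): copy the contents, then
 * transfer the page flags that describe the old page's state. */
static void migrate_page_copy_sketch(struct page *newpage, struct page *page)
{
    copy_highpage(newpage, page);       /* copy the actual page contents */

    if (PageError(page))
        SetPageError(newpage);
    if (PageReferenced(page))
        SetPageReferenced(newpage);
    if (PageUptodate(page))
        SetPageUptodate(newpage);
    if (PageActive(page))
        SetPageActive(newpage);
    if (PageDirty(page))
        SetPageDirty(newpage);          /* the real code dirties more carefully */
}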
remove_migration_ptes uses the RMAP reverse-mapping machinery to find every PTE that mapped the old page, then establishes the corresponding mappings to the new page.
static void remove_migration_ptes(struct page *old, struct page *new)
{
    struct rmap_walk_control rwc = {
        .rmap_one = remove_migration_pte,
        .arg = old,
    };

    rmap_walk(new, &rwc);
}

static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                 unsigned long addr, void *old)
{
    struct mm_struct *mm = vma->vm_mm;
    swp_entry_t entry;
    pmd_t *pmd;
    pte_t *ptep, pte;
    spinlock_t *ptl;

    if (unlikely(PageHuge(new))) {
        ptep = huge_pte_offset(mm, addr);
        if (!ptep)
            goto out;
        ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
    } else {
        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
            goto out;

        ptep = pte_offset_map(pmd, addr);-------------------------------------find the page-table entry for addr in mm.

        /*
         * Peek to check is_swap_pte() before taking ptlock? No, we
         * can race mremap's move_ptes(), which skips anon_vma lock.
         */

        ptl = pte_lockptr(mm, pmd);
    }

    spin_lock(ptl);
    pte = *ptep;
    if (!is_swap_pte(pte))
        goto unlock;

    entry = pte_to_swp_entry(pte);

    if (!is_migration_entry(entry) ||
        migration_entry_to_page(entry) != old)
        goto unlock;

    get_page(new);
    pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
    if (pte_swp_soft_dirty(*ptep))
        pte = pte_mksoft_dirty(pte);

    /* Recheck VMA as permissions can change since migration started */
    if (is_write_migration_entry(entry))
        pte = maybe_mkwrite(pte, vma);

#ifdef CONFIG_HUGETLB_PAGE
    if (PageHuge(new)) {
        pte = pte_mkhuge(pte);
        pte = arch_make_huge_pte(pte, vma, new, 0);
    }
#endif
    flush_dcache_page(new);
    set_pte_at(mm, addr, ptep, pte);-----------------------------------------install a PTE pointing at the new page, re-establishing the mapping.

    if (PageHuge(new)) {
        if (PageAnon(new))
            hugepage_add_anon_rmap(new, vma, addr);
        else
            page_dup_rmap(new);
    } else if (PageAnon(new))
        page_add_anon_rmap(new, vma, addr);
    else
        page_add_file_rmap(new);---------------------------------------------add the new page to the RMAP reverse-mapping system.

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, addr, ptep);---------------------------------------update the corresponding cache.

unlock:
    pte_unmap_unlock(ptep, ptl);
out:
    return SWAP_AGAIN;
}
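The last piece of the picture is what happens when a thread touches the page mid-migration: the fault handler recognizes the migration entry and simply blocks until remove_migration_pte() has run. A condensed fragment from the do_swap_page() path of this kernel era:

/* Condensed: in do_swap_page(), a migration entry is not a real swap
 * entry; the faulting task just waits for the migration to finish. */
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
    if (is_migration_entry(entry)) {
        migration_entry_wait(mm, pmd, address);    /* sleep until the migration PTE is gone */
        goto out;
    }
}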