/* The kswapd thread is mainly responsible for periodically swapping pages out; the code below walks through its implementation. */
/*
 * inactive_shortage - how many pages the system is below its target.
 *
 * The target is freepages.high + inactive_target. Free pages,
 * inactive-clean pages and inactive-dirty pages all count towards it.
 *
 * Returns the number of pages still missing, or 0 when the target is met.
 */
int inactive_shortage(void)
{
	int deficit = freepages.high + inactive_target;

	/* Subtract everything that already counts towards the target. */
	deficit -= nr_free_pages();
	deficit -= nr_inactive_clean_pages();
	deficit -= nr_inactive_dirty_pages;

	return deficit > 0 ? deficit : 0;
}
/*
* Check if there are zones with a severe shortage of free pages,
* or if all zones have a minor shortage.
*/
int free_shortage(void)
{
pg_data_t *pgdat = pgdat_list;//節點
int sum = 0;
int freeable = nr_free_pages() + nr_inactive_clean_pages();//實際空閒
int freetarget = freepages.high + inactive_target / 3;//理論空閒
//實際小於理論,直接返回差值,表示須要擴充
/* Are we low on free pages globally? */
if (freeable < freetarget)
return freetarget - freeable;
/* If not, are we very low on any particular zone? */
do {
int i;
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;//獲取管理區
if (zone->size && (zone->inactive_clean_pages +
zone->free_pages < zone->pages_min+1)) {//空閒頁面+乾淨不活躍頁面是否小於最低水準
/* + 1 to have overlap with alloc_pages() !! */
sum += zone->pages_min + 1;
sum -= zone->free_pages;
sum -= zone->inactive_clean_pages;
}
}
pgdat = pgdat->node_next;
} while (pgdat);
return sum;
}
/**
 * refill_inactive_scan - age pages on the active list and deactivate idle ones
 * @priority: scan depth; at most (nr_active_pages >> priority) pages are
 *            examined, so priority 0 covers the whole list
 * @oneshot: when non-zero, stop as soon as one page has been deactivated
 *
 * Pages are taken from the tail of the active list under
 * pagemap_lru_lock. Referenced pages are aged up, others are aged down;
 * a page whose age reached 0 and that has no users beyond the expected
 * ones is moved off the active list. Pages that stay active are rotated
 * to the head of the list.
 *
 * Returns 1 if at least one page was deactivated, 0 otherwise.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head *tail;
	struct page *victim;
	int budget;
	int deactivated = 0;

	/* The LRU lists may only be touched with the lock held. */
	spin_lock(&pagemap_lru_lock);

	for (budget = nr_active_pages >> priority; budget > 0; budget--) {
		int keep_active = 1;

		tail = active_list.prev;
		if (tail == &active_list)
			break;		/* list is empty */
		victim = list_entry(tail, struct page, lru);

		/* A non-active page here means list corruption; drop it. */
		if (!PageActive(victim)) {
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(tail);
			nr_active_pages--;
			continue;
		}

		if (PageTestandClearReferenced(victim)) {
			/* Recently referenced: extend its lifetime. */
			age_page_up_nolock(victim);
		} else {
			age_page_down_ageonly(victim);
			/*
			 * We hold no reference of our own, so the test must
			 * be stricter than the one in deactivate_page();
			 * otherwise unfreeable pages could bounce between
			 * the active and inactive_dirty lists forever.
			 *
			 * SUBTLE: buffer pages may legitimately have one
			 * extra reference, hence the 2-vs-1 threshold.
			 */
			if (victim->age == 0 &&
			    page_count(victim) <= (victim->buffers ? 2 : 1)) {
				deactivate_page_nolock(victim);
				keep_active = 0;
			}
		}

		/* Still active: rotate it to the other end of the list. */
		if (keep_active || PageActive(victim)) {
			list_del(tail);
			list_add(tail, &active_list);
			continue;
		}

		/* The page really left the active list. */
		deactivated = 1;
		if (oneshot)
			break;
	}

	spin_unlock(&pagemap_lru_lock);

	return deactivated;
}
/*
 * do_try_to_free_pages - one round of memory reclaim.
 *
 * @gfp_mask is handed to every helper called here. @user is passed to
 * page_launder() as its sync argument and to refill_inactive() as its
 * user argument.
 *
 * Returns the accumulated progress reported by page_launder() and
 * refill_inactive(), or 1 when no shortage remained after laundering and
 * only the slab caches were reaped.
 */
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int progress = 0;

	/*
	 * Launder inactive-dirty pages when free pages are short, or when
	 * there are more inactive-dirty pages than free plus inactive-clean
	 * pages put together.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		progress += page_launder(gfp_mask, user);

	/* Nothing is short any more: just trim the slab caches. */
	if (!free_shortage() && !inactive_shortage()) {
		kmem_cache_reap(gfp_mask);
		return 1;
	}

	/*
	 * Still short: shrink the dentry and inode caches, then move pages
	 * from the active list to the inactive lists.
	 */
	shrink_dcache_memory(6, gfp_mask);
	shrink_icache_memory(6, gfp_mask);
	progress += refill_inactive(gfp_mask, user);

	return progress;
}
/**
 * page_launder - try to turn inactive_dirty pages into reclaimable pages
 * @gfp_mask: allocation flags; the second (IO-issuing) pass is only run
 *            when __GFP_IO is set
 * @sync: non-zero permits synchronous buffer IO (wait == 2) during the
 *        second pass while maxlaunder is 0; it is cleared if the first
 *        pass already cleaned at least one page
 *
 * Walks the inactive_dirty list from its tail under pagemap_lru_lock,
 * examining at most as many pages as were on the list when the pass
 * started. The first pass does no IO for dirty pages; if memory is still
 * short afterwards and IO is allowed, the list is walked once more with
 * launder_loop set and writeout is started.
 *
 * Returns the number of pages counted in cleaned_pages: pages moved to
 * the inactive_clean list plus buffer-cache-only pages whose buffers
 * were freed.
 */
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	/* Bound the walk so that rotated pages are not processed twice. */
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/*
		 * Page is or was in use (referenced, aged up, extra users
		 * without buffers, or ramdisk)? Move it to the active list.
		 */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			/* No writepage method: nothing we can do, make it active. */
			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/*
			 * OK, do a physical asynchronous write to swap.
			 * The dirty bit is cleared first, and an extra
			 * reference is held across the call because the
			 * LRU lock is dropped while writepage runs.
			 */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			/* Any result other than 1: writepage took the page on. */
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

		/*
		 * The page is not dirty. If it still has buffers attached,
		 * try to free them.
		 */
		if (page->buffers) {
			int wait, clearedbuf;
			int freed_page = 0;

			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchrounous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);

			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed: back onto the dirty list. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/*
			 * The page was only in the buffer cache (it has no
			 * mapping), so freeing its buffers counts as a
			 * freed page.
			 */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				freed_page = 1;
				cleaned_pages++;

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
				cleaned_pages++;
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we ar accessing
			 * the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);

			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;
			continue;
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * Clean page that belongs to a mapping.
			 *
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}
/*
 * refill_inactive - move pages towards the inactive lists until enough
 * progress has been made.
 *
 * The locks should become finer grained eventually, but for now this
 * arrangement lets us do page allocations without holding the kernel
 * lock etc.
 *
 * The work quota is inactive_shortage() + free_shortage() pages, so that
 * swap-out can be clustered. A user process (@user non-zero) cares about
 * latency rather than throughput, so its quota is only
 * (1 << page_cluster) pages.
 *
 * Returns non-zero if the quota was reduced at all, i.e. if at least one
 * scan or swap-out step reported success.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int prio, remaining, quota, progressed;

	remaining = inactive_shortage() + free_shortage();
	if (user)
		remaining = (1 << page_cluster);
	quota = remaining;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	/* Start at the gentlest scan depth (6) and work down towards 0. */
	for (prio = 6; prio >= 0; ) {
		progressed = 0;

		/* Yield the CPU if a reschedule has been requested. */
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		/* Deactivate pages from the active list, one per call. */
		while (refill_inactive_scan(prio, 1)) {
			progressed = 1;
			if (--remaining <= 0)
				goto out;
		}

		/*
		 * don't be too light against the d/i cache since
		 * refill_inactive() almost never fail when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(prio, gfp_mask);
		shrink_icache_memory(prio, gfp_mask);

		/* Then, try to page stuff out of process address spaces. */
		while (swap_out(prio, gfp_mask)) {
			progressed = 1;
			if (--remaining <= 0)
				goto out;
		}

		/*
		 * Stop once either shortage is gone: we then have enough
		 * free memory, or page_launder() will be able to make
		 * enough.
		 */
		if (!(inactive_shortage() && free_shortage()))
			goto out;

		/* Only scan harder when this round achieved nothing. */
		if (!progressed)
			prio--;
	}

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--remaining <= 0)
			goto out;
	}

out:
	return (remaining < quota);
}
/*
 * swap_out - pick the mm with the largest swap_cnt and try to page
 * something out of it.
 *
 * @priority: scales the loop counter, (nr_threads << SWAP_SHIFT) >> priority
 * @gfp_mask: passed through to swap_out_mm()
 *
 * Returns whatever swap_out_mm() returned for the chosen mm, or 0 when
 * no mm could be selected.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	int counter;
	int __ret = 0;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *         not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	/*
	 * NOTE(review): every path through the loop body below ends in
	 * "break" (or jumps back to "select"), so the body runs at most
	 * once per call regardless of counter.
	 */
	for (; counter >= 0; counter--) {
		struct list_head *p;
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int assign = 0;
		int found_task = 0;
	select:
		spin_lock(&mmlist_lock);
		p = init_mm.mmlist.next;
		for (; p != &init_mm.mmlist; p = p->next) {
			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
			if (mm->rss <= 0)
				continue;
			found_task++;
			/*
			 * Refresh swap_cnt?
			 *
			 * This only happens on the second pass, which is
			 * entered when no mm with a non-zero swap_cnt was
			 * found. Every mm then gets a fresh budget derived
			 * from its rss (at least SWAP_MIN), so selection
			 * starts again from the largest mm. swap_cnt gives
			 * each mm its turn; rss decides who goes first.
			 */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}
		/* "best" is now the mm with the largest swap_cnt, if any. */

		/* Make sure it doesn't disappear */
		if (best)
			atomic_inc(&best->mm_users);
		spin_unlock(&mmlist_lock);

		/*
		 * We have dropped the mmlist_lock, but we
		 * know that "mm" still exists: we are running
		 * with the big kernel lock, and exit_mm()
		 * cannot race with us.
		 */
		if (!best) {
			/*
			 * Tasks exist but all have swap_cnt == 0: redo the
			 * selection once with assign set. A second failure
			 * falls through to the break.
			 */
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			/* Found a candidate: page out from it, drop our ref. */
			__ret = swap_out_mm(best, gfp_mask);
			mmput(best);
			break;
		}
	}
	return __ret;
}
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 *
 * NOTE(review): as written below, this function returns 1 only when
 * mm->swap_cnt is already 0; every other path returns 0.
 *
 * Parameters: mm is the address space being scanned, vma and address
 * identify the mapping being examined, page_table points at its pte.
 * gfp_mask is not referenced in this function body.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
swp_entry_t entry;
struct page * page;
int onlist;
pte = *page_table;/* read the page table entry */
if (!pte_present(pte))/* nothing to do if the page is not present */
goto out_failed;
page = pte_page(pte);/* the struct page this pte refers to */
if ((!VALID_PAGE(page)) || PageReserved(page))/* skip invalid or reserved pages */
goto out_failed;
if (!mm->swap_cnt)
return 1;
/* A page is about to be examined: charge it against this mm's budget. */
mm->swap_cnt--;
onlist = PageActive(page);/* remember whether the page is flagged active */
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {/* accessed bit was set (and is now cleared) */
age_page_up(page);/* recently used: age the page up */
goto out_failed;
}
if (!onlist)/* Pages not flagged active are aged down here; the page is only */
/* considered further once page->age has reached 0 (checked below). */
age_page_down_ageonly(page);
/*
* If the page is in active use by us, or if the page
* is in active use by others, don't unmap it or
* (worse) start unneeded IO.
*/
if (page->age > 0)
goto out_failed;
if (TryLockPage(page))
goto out_failed;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
* is needed on CPUs which update the accessed and dirty
* bits in hardware.
*//* The old pte value is kept in "pte"; the mapping itself is now gone. */
pte = ptep_get_and_clear(page_table);
flush_tlb_page(vma, address);
/*
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
* any IO - it's already up-to-date on disk.
*
* Return 0, as we didn't actually free any real
* memory, and we should just continue our scan.
*/
if (PageSwapCache(page)) {/* page already has a swap cache entry */
entry.val = page->index;
if (pte_dirty(pte))
set_page_dirty(page);/* carry the pte's dirty state over to the page */
set_swap_pte:
swap_duplicate(entry);/* presumably takes another reference on the swap entry -- TODO confirm */
set_pte(page_table, swp_entry_to_pte(entry));/* the pte now holds the swap entry instead of the page */
drop_pte:
UnlockPage(page);
mm->rss--;/* one fewer resident page for this mm */
deactivate_page(page);/* presumably moves the page off the active list -- TODO confirm */
page_cache_release(page);/* drop a reference on the page */
out_failed:
return 0;
}
/*
* Is it a clean page? Then it must be recoverable
* by just paging it in again, and we can just drop
* it..
*
* However, this won't actually free any real
* memory, as the page will just be in the page cache
* somewhere, and as such we should just continue
* our scan.
*
* Basically, this just makes it possible for us to do
* some real work in the future in "refill_inactive()".
*/
flush_cache_page(vma, address);
if (!pte_dirty(pte))
goto drop_pte;
/*
* Ok, it's really dirty. That means that
* we should either create a new swap cache
* entry for it, or we should write it back
* to its own backing store.
*/
if (page->mapping) {
set_page_dirty(page);
goto drop_pte;
}
/*
* This is a dirty, swappable page. First of all,
* get a suitable swap entry for it, and make sure
* we have the swap cache set up to associate the
* page with that swap entry.
*/
entry = get_swap_page();
if (!entry.val)
goto out_unlock_restore; /* No swap space left */
/* Add it to the swap cache and mark it dirty */
add_to_swap_cache(page, entry);
set_page_dirty(page);
goto set_swap_pte;
/* No swap entry available: put the saved pte back and give up. */
out_unlock_restore:
set_pte(page_table, pte);
UnlockPage(page);
return 0;
}