Memory Management Model

An aside: my understanding of the Linux kernel is still shallow, so corrections are very welcome if anything here is wrong, as are questions and discussion!

 

 

First, we need to understand how each process maintains its own independent address space; my machine, for instance, has 8 GB of RAM. Readers who have looked into this will know that virtual memory is the technique that solves this problem, but what concrete model does Linux use to meet this design requirement of the operating system? Let's start from fragments of the Linux source code. (All kernel source below is taken from the 3.19.3 kernel of a 64-bit Fedora 21 system.)

struct page, defined in <include/linux/mm_types.h>, is the kernel's descriptor for a physical page frame (it is often loosely called the "page table", but the page tables proper are a separate structure). Most of the functions that operate on the individual fields/bits of this structure live in <include/linux/mm.h>.

 

  1 struct page {
  2     /* First double word block */
  3     unsigned long flags;        /* Atomic flags, some possibly
  4                      * updated asynchronously */
  5     union {
  6         struct address_space *mapping;    /* If low bit clear, points to
  7                          * inode address_space, or NULL.
  8                          * If page mapped as anonymous
  9                          * memory, low bit is set, and
 10                          * it points to anon_vma object:
 11                          * see PAGE_MAPPING_ANON below.
 12                          */
 13         void *s_mem;            /* slab first object */
 14     };
 15 
 16     /* Second double word */
 17     struct {
 18         union {
 19             pgoff_t index;        /* Our offset within mapping. */
 20             void *freelist;        /* sl[aou]b first free object */
 21             bool pfmemalloc;    /* If set by the page allocator,
 22                          * ALLOC_NO_WATERMARKS was set
 23                          * and the low watermark was not
 24                          * met implying that the system
 25                          * is under some pressure. The
 26                          * caller should try ensure
 27                          * this page is only used to
 28                          * free other pages.
 29                          */
 30         };
 31 
 32         union {
 33 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 34     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 35             /* Used for cmpxchg_double in slub */
 36             unsigned long counters;
 37 #else
 38             /*
 39              * Keep _count separate from slub cmpxchg_double data.
 40              * As the rest of the double word is protected by
 41              * slab_lock but _count is not.
 42              */
 43             unsigned counters;
 44 #endif
 45 
 46             struct {
 47 
 48                 union {
 49                     /*
 50                      * Count of ptes mapped in
 51                      * mms, to show when page is
 52                      * mapped & limit reverse map
 53                      * searches.
 54                      *
 55                      * Used also for tail pages
 56                      * refcounting instead of
 57                      * _count. Tail pages cannot
 58                      * be mapped and keeping the
 59                      * tail page _count zero at
 60                      * all times guarantees
 61                      * get_page_unless_zero() will
 62                      * never succeed on tail
 63                      * pages.
 64                      */
 65                     atomic_t _mapcount;
 66 
 67                     struct { /* SLUB */
 68                         unsigned inuse:16;
 69                         unsigned objects:15;
 70                         unsigned frozen:1;
 71                     };
 72                     int units;    /* SLOB */
 73                 };
 74                 atomic_t _count;        /* Usage count, see below. */
 75             };
 76             unsigned int active;    /* SLAB */
 77         };
 78     };
 79 
 80     /* Third double word block */
 81     union {
 82         struct list_head lru;    /* Pageout list, eg. active_list
 83                      * protected by zone->lru_lock !
 84                      * Can be used as a generic list
 85                      * by the page owner.
 86                      */
 87         struct {        /* slub per cpu partial pages */
 88             struct page *next;    /* Next partial slab */
 89 #ifdef CONFIG_64BIT
 90             int pages;    /* Nr of partial slabs left */
 91             int pobjects;    /* Approximate # of objects */
 92 #else
 93             short int pages;
 94             short int pobjects;
 95 #endif
 96         };
 97 
 98         struct slab *slab_page; /* slab fields */
 99         struct rcu_head rcu_head;    /* Used by SLAB
100                          * when destroying via RCU
101                          */
102 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
103         pgtable_t pmd_huge_pte; /* protected by page->ptl */
104 #endif
105     };
106 
107     /* Remainder is not double word aligned */
108     union {
109         unsigned long private;        /* Mapping-private opaque data:
110                           * usually used for buffer_heads
111                          * if PagePrivate set; used for
112                          * swp_entry_t if PageSwapCache;
113                          * indicates order in the buddy
114                          * system if PG_buddy is set.
115                          */
116 #if USE_SPLIT_PTE_PTLOCKS
117 #if ALLOC_SPLIT_PTLOCKS
118         spinlock_t *ptl;
119 #else
120         spinlock_t ptl;
121 #endif
122 #endif
123         struct kmem_cache *slab_cache;    /* SL[AU]B: Pointer to slab */
124         struct page *first_page;    /* Compound tail pages */
125     };
126 
127 #ifdef CONFIG_MEMCG
128     struct mem_cgroup *mem_cgroup;
129 #endif
130 
131     /*
132      * On machines where all RAM is mapped into kernel address space,
133      * we can simply calculate the virtual address. On machines with
134      * highmem some memory is mapped into kernel virtual memory
135      * dynamically, so we need a place to store that address.
136      * Note that this field could be 16 bits on x86 ... ;)
137      *
138      * Architectures with slow multiplication can define
139      * WANT_PAGE_VIRTUAL in asm/page.h
140      */
141 #if defined(WANT_PAGE_VIRTUAL)
142     void *virtual;            /* Kernel virtual address (NULL if
143                        not kmapped, ie. highmem) */
144 #endif /* WANT_PAGE_VIRTUAL */
145 
146 #ifdef CONFIG_KMEMCHECK
147     /*
148      * kmemcheck wants to track the status of each byte in a page; this
149      * is a pointer to such a status block. NULL if not tracked.
150      */
151     void *shadow;
152 #endif
153 
154 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
155     int _last_cpupid;
156 #endif
157 }

 

The comments inside the struct page definition already explain every field in detail, but a few important ones are still worth repeating:

 

(1) void *virtual: the page's kernel virtual address. (On a 64-bit system a C void * pointer is up to 64 bits wide, giving an addressing space of 2^64 bytes, about 16 EiB, which far exceeds the physical RAM of today's mainstream machines (roughly 8 GB or 16 GB); this gap is what makes virtual addressing and swapping possible.) Mapping the virtual address held in virtual to a physical one goes through the four-level page tables.

(2) pgoff_t index: this field is defined in the same union as freelist; index is used by several modules of the memory-management subsystem, for example the page cache.

(3) unsigned long flags: flag fields are rarely declared as long, which suggests how much information is packed in here. It holds the page's status bits, such as locked/unlocked, swapped out (for virtual memory), active, and so on. A short sketch of how these bits are normally accessed follows after this list.
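To make the role of flags more concrete, here is a minimal sketch (my own illustration, not code from the kernel sources quoted in this post) of the usual pattern: kernel code does not poke the bits directly but goes through the accessors generated in <linux/page-flags.h>, such as PageLocked(), PageDirty()/SetPageDirty() and PageActive():

/*
 * Sketch: how kernel code typically queries/updates page->flags.
 * The accessors below are generated in <linux/page-flags.h>; this
 * fragment only illustrates the pattern and is not taken from any
 * specific kernel function.
 */
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/printk.h>

static void inspect_page(struct page *page)
{
    if (PageLocked(page))        /* PG_locked: page is locked, e.g. for I/O */
        pr_info("page is locked\n");

    if (!PageDirty(page))        /* PG_dirty: contents differ from backing store */
        SetPageDirty(page);      /* set the dirty bit atomically */

    if (PageActive(page))        /* PG_active: page sits on the active LRU list */
        pr_info("page is on the active list\n");
}

Each accessor expands to an atomic bit operation on page->flags, which is why the field can be updated asynchronously, as the comment in the structure notes.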

 

Before continuing with the memory-management mechanism, there is one very important point: the correspondence between processes and memory in Linux.

Each process in Linux maintains a PCB (process control block), and that PCB is the task_struct defined in /include/linux/sched.h. Among the members defined in this structure are:

 

struct mm_struct *mm, *active_mm;

 

This member is one of the bridges between a process and memory management, and it also shows that the relationship between a process and memory blocks/pages is one-to-many (or many-to-many once memory shared between processes is considered). When a process is loaded, what the operating system actually does is map part of the memory described through task_struct into physical memory; pages that have not been mapped are handled later by swapping (demand paging).

Compare this with how a program is loaded on Windows: there the work is done by the loader, which parses the binary structures of the PE executable (the IDT, IAT, and so on) and loads it, and in most cases the image ends up at the same preferred virtual base address, 0x00400000. In Linux, by contrast, the layout of a process in its address space is set up by the kernel at load time, and the physical pages backing it are handed out by the allocator; the buddy algorithm tries to keep those pages as contiguous as possible. (There is something I am still not clear about here: if Linux allocates memory this dynamically, how does it deal with dynamically loaded libraries? Windows DLLs are relocated by computing offsets; what does Linux do?)

The process starts executing on the pages that are already resident in physical memory. When it jumps to a virtual address whose page has not been loaded, a page fault is raised; the fault triggers the page-swapping process, which lets the program continue. This is the essence of virtual memory.
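To see demand paging from user space, here is a small hypothetical demo program (my own sketch, not kernel code): it maps a large anonymous region with mmap(), and the physical pages are only faulted in when they are first touched, which shows up in the process's minor-fault counter reported by getrusage():

/* Hypothetical user-space demo: anonymous mmap() pages are faulted in lazily. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long minor_faults(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void)
{
    size_t len = 64UL * 1024 * 1024;                 /* 64 MB of virtual space */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    long before = minor_faults();
    memset(p, 0xab, len);                            /* first touch: pages are faulted in now */
    long after = minor_faults();

    printf("minor faults while touching the region: %ld\n", after - before);
    munmap(p, len);
    return 0;
}

Each first touch of a page raises a fault that the kernel resolves by allocating a physical frame and filling in the page tables, which is exactly the mechanism described above.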

 

  1 struct task_struct {
  2     volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
  3     void *stack;
  4     atomic_t usage;
  5     unsigned int flags;    /* per process flags, defined below */
  6     unsigned int ptrace;
  7 
  8 #ifdef CONFIG_SMP
  9     struct llist_node wake_entry;
 10     int on_cpu;
 11     struct task_struct *last_wakee;
 12     unsigned long wakee_flips;
 13     unsigned long wakee_flip_decay_ts;
 14 
 15     int wake_cpu;
 16 #endif
 17     int on_rq;
 18 
 19     int prio, static_prio, normal_prio;
 20     unsigned int rt_priority;
 21     const struct sched_class *sched_class;
 22     struct sched_entity se;
 23     struct sched_rt_entity rt;
 24 #ifdef CONFIG_CGROUP_SCHED
 25     struct task_group *sched_task_group;
 26 #endif
 27     struct sched_dl_entity dl;
 28 
 29 #ifdef CONFIG_PREEMPT_NOTIFIERS
 30     /* list of struct preempt_notifier: */
 31     struct hlist_head preempt_notifiers;
 32 #endif
 33 
 34 #ifdef CONFIG_BLK_DEV_IO_TRACE
 35     unsigned int btrace_seq;
 36 #endif
 37 
 38     unsigned int policy;
 39     int nr_cpus_allowed;
 40     cpumask_t cpus_allowed;
 41 
 42 #ifdef CONFIG_PREEMPT_RCU
 43     int rcu_read_lock_nesting;
 44     union rcu_special rcu_read_unlock_special;
 45     struct list_head rcu_node_entry;
 46 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 47 #ifdef CONFIG_PREEMPT_RCU
 48     struct rcu_node *rcu_blocked_node;
 49 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 50 #ifdef CONFIG_TASKS_RCU
 51     unsigned long rcu_tasks_nvcsw;
 52     bool rcu_tasks_holdout;
 53     struct list_head rcu_tasks_holdout_list;
 54     int rcu_tasks_idle_cpu;
 55 #endif /* #ifdef CONFIG_TASKS_RCU */
 56 
 57 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 58     struct sched_info sched_info;
 59 #endif
 60 
 61     struct list_head tasks;
 62 #ifdef CONFIG_SMP
 63     struct plist_node pushable_tasks;
 64     struct rb_node pushable_dl_tasks;
 65 #endif
 66 
 67     struct mm_struct *mm, *active_mm;
 68 #ifdef CONFIG_COMPAT_BRK
 69     unsigned brk_randomized:1;
 70 #endif
 71     /* per-thread vma caching */
 72     u32 vmacache_seqnum;
 73     struct vm_area_struct *vmacache[VMACACHE_SIZE];
 74 #if defined(SPLIT_RSS_COUNTING)
 75     struct task_rss_stat    rss_stat;
 76 #endif
 77 /* task state */
 78     int exit_state;
 79     int exit_code, exit_signal;
 80     int pdeath_signal;  /*  The signal sent when the parent dies  */
 81     unsigned int jobctl;    /* JOBCTL_*, siglock protected */
 82 
 83     /* Used for emulating ABI behavior of previous Linux versions */
 84     unsigned int personality;
 85 
 86     unsigned in_execve:1;    /* Tell the LSMs that the process is doing an
 87                  * execve */
 88     unsigned in_iowait:1;
 89 
 90     /* Revert to default priority/policy when forking */
 91     unsigned sched_reset_on_fork:1;
 92     unsigned sched_contributes_to_load:1;
 93 
 94 #ifdef CONFIG_MEMCG_KMEM
 95     unsigned memcg_kmem_skip_account:1;
 96 #endif
 97 
 98     unsigned long atomic_flags; /* Flags needing atomic access. */
 99 
100     pid_t pid;
101     pid_t tgid;
102 
103 #ifdef CONFIG_CC_STACKPROTECTOR
104     /* Canary value for the -fstack-protector gcc feature */
105     unsigned long stack_canary;
106 #endif
107     /*
108      * pointers to (original) parent process, youngest child, younger sibling,
109      * older sibling, respectively.  (p->father can be replaced with
110      * p->real_parent->pid)
111      */
112     struct task_struct __rcu *real_parent; /* real parent process */
113     struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
114     /*
115      * children/sibling forms the list of my natural children
116      */
117     struct list_head children;    /* list of my children */
118     struct list_head sibling;    /* linkage in my parent's children list */
119     struct task_struct *group_leader;    /* threadgroup leader */
120 
121     /*
122      * ptraced is the list of tasks this task is using ptrace on.
123      * This includes both natural children and PTRACE_ATTACH targets.
124      * p->ptrace_entry is p's link on the p->parent->ptraced list.
125      */
126     struct list_head ptraced;
127     struct list_head ptrace_entry;
128 
129     /* PID/PID hash table linkage. */
130     struct pid_link pids[PIDTYPE_MAX];
131     struct list_head thread_group;
132     struct list_head thread_node;
133 
134     struct completion *vfork_done;        /* for vfork() */
135     int __user *set_child_tid;        /* CLONE_CHILD_SETTID */
136     int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
137 
138     cputime_t utime, stime, utimescaled, stimescaled;
139     cputime_t gtime;
140 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
141     struct cputime prev_cputime;
142 #endif
143 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
144     seqlock_t vtime_seqlock;
145     unsigned long long vtime_snap;
146     enum {
147         VTIME_SLEEPING = 0,
148         VTIME_USER,
149         VTIME_SYS,
150     } vtime_snap_whence;
151 #endif
152     unsigned long nvcsw, nivcsw; /* context switch counts */
153     u64 start_time;        /* monotonic time in nsec */
154     u64 real_start_time;    /* boot based time in nsec */
155 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
156     unsigned long min_flt, maj_flt;
157 
158     struct task_cputime cputime_expires;
159     struct list_head cpu_timers[3];
160 
161 /* process credentials */
162     const struct cred __rcu *real_cred; /* objective and real subjective task
163                      * credentials (COW) */
164     const struct cred __rcu *cred;    /* effective (overridable) subjective task
165                      * credentials (COW) */
166     char comm[TASK_COMM_LEN]; /* executable name excluding path
167                      - access with [gs]et_task_comm (which lock
168                        it with task_lock())
169                      - initialized normally by setup_new_exec */
170 /* file system info */
171     int link_count, total_link_count;
172 #ifdef CONFIG_SYSVIPC
173 /* ipc stuff */
174     struct sysv_sem sysvsem;
175     struct sysv_shm sysvshm;
176 #endif
177 #ifdef CONFIG_DETECT_HUNG_TASK
178 /* hung task detection */
179     unsigned long last_switch_count;
180 #endif
181 /* CPU-specific state of this task */
182     struct thread_struct thread;
183 /* filesystem information */
184     struct fs_struct *fs;
185 /* open file information */
186     struct files_struct *files;
187 /* namespaces */
188     struct nsproxy *nsproxy;
189 /* signal handlers */
190     struct signal_struct *signal;
191     struct sighand_struct *sighand;
192 
193     sigset_t blocked, real_blocked;
194     sigset_t saved_sigmask;    /* restored if set_restore_sigmask() was used */
195     struct sigpending pending;
196 
197     unsigned long sas_ss_sp;
198     size_t sas_ss_size;
199     int (*notifier)(void *priv);
200     void *notifier_data;
201     sigset_t *notifier_mask;
202     struct callback_head *task_works;
203 
204     struct audit_context *audit_context;
205 #ifdef CONFIG_AUDITSYSCALL
206     kuid_t loginuid;
207     unsigned int sessionid;
208 #endif
209     struct seccomp seccomp;
210 
211 /* Thread group tracking */
212        u32 parent_exec_id;
213        u32 self_exec_id;
214 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
215  * mempolicy */
216     spinlock_t alloc_lock;
217 
218     /* Protection of the PI data structures: */
219     raw_spinlock_t pi_lock;
220 
221 #ifdef CONFIG_RT_MUTEXES
222     /* PI waiters blocked on a rt_mutex held by this task */
223     struct rb_root pi_waiters;
224     struct rb_node *pi_waiters_leftmost;
225     /* Deadlock detection and priority inheritance handling */
226     struct rt_mutex_waiter *pi_blocked_on;
227 #endif
228 
229 #ifdef CONFIG_DEBUG_MUTEXES
230     /* mutex deadlock detection */
231     struct mutex_waiter *blocked_on;
232 #endif
233 #ifdef CONFIG_TRACE_IRQFLAGS
234     unsigned int irq_events;
235     unsigned long hardirq_enable_ip;
236     unsigned long hardirq_disable_ip;
237     unsigned int hardirq_enable_event;
238     unsigned int hardirq_disable_event;
239     int hardirqs_enabled;
240     int hardirq_context;
241     unsigned long softirq_disable_ip;
242     unsigned long softirq_enable_ip;
243     unsigned int softirq_disable_event;
244     unsigned int softirq_enable_event;
245     int softirqs_enabled;
246     int softirq_context;
247 #endif
248 #ifdef CONFIG_LOCKDEP
249 # define MAX_LOCK_DEPTH 48UL
250     u64 curr_chain_key;
251     int lockdep_depth;
252     unsigned int lockdep_recursion;
253     struct held_lock held_locks[MAX_LOCK_DEPTH];
254     gfp_t lockdep_reclaim_gfp;
255 #endif
256 
257 /* journalling filesystem info */
258     void *journal_info;
259 
260 /* stacked block device info */
261     struct bio_list *bio_list;
262 
263 #ifdef CONFIG_BLOCK
264 /* stack plugging */
265     struct blk_plug *plug;
266 #endif
267 
268 /* VM state */
269     struct reclaim_state *reclaim_state;
270 
271     struct backing_dev_info *backing_dev_info;
272 
273     struct io_context *io_context;
274 
275     unsigned long ptrace_message;
276     siginfo_t *last_siginfo; /* For ptrace use.  */
277     struct task_io_accounting ioac;
278 #if defined(CONFIG_TASK_XACCT)
279     u64 acct_rss_mem1;    /* accumulated rss usage */
280     u64 acct_vm_mem1;    /* accumulated virtual memory usage */
281     cputime_t acct_timexpd;    /* stime + utime since last update */
282 #endif
283 #ifdef CONFIG_CPUSETS
284     nodemask_t mems_allowed;    /* Protected by alloc_lock */
285     seqcount_t mems_allowed_seq;    /* Seqence no to catch updates */
286     int cpuset_mem_spread_rotor;
287     int cpuset_slab_spread_rotor;
288 #endif
289 #ifdef CONFIG_CGROUPS
290     /* Control Group info protected by css_set_lock */
291     struct css_set __rcu *cgroups;
292     /* cg_list protected by css_set_lock and tsk->alloc_lock */
293     struct list_head cg_list;
294 #endif
295 #ifdef CONFIG_FUTEX
296     struct robust_list_head __user *robust_list;
297 #ifdef CONFIG_COMPAT
298     struct compat_robust_list_head __user *compat_robust_list;
299 #endif
300     struct list_head pi_state_list;
301     struct futex_pi_state *pi_state_cache;
302 #endif
303 #ifdef CONFIG_PERF_EVENTS
304     struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
305     struct mutex perf_event_mutex;
306     struct list_head perf_event_list;
307 #endif
308 #ifdef CONFIG_DEBUG_PREEMPT
309     unsigned long preempt_disable_ip;
310 #endif
311 #ifdef CONFIG_NUMA
312     struct mempolicy *mempolicy;    /* Protected by alloc_lock */
313     short il_next;
314     short pref_node_fork;
315 #endif
316 #ifdef CONFIG_NUMA_BALANCING
317     int numa_scan_seq;
318     unsigned int numa_scan_period;
319     unsigned int numa_scan_period_max;
320     int numa_preferred_nid;
321     unsigned long numa_migrate_retry;
322     u64 node_stamp;            /* migration stamp  */
323     u64 last_task_numa_placement;
324     u64 last_sum_exec_runtime;
325     struct callback_head numa_work;
326 
327     struct list_head numa_entry;
328     struct numa_group *numa_group;
329 
330     /*
331      * numa_faults is an array split into four regions:
332      * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
333      * in this precise order.
334      *
335      * faults_memory: Exponential decaying average of faults on a per-node
336      * basis. Scheduling placement decisions are made based on these
337      * counts. The values remain static for the duration of a PTE scan.
338      * faults_cpu: Track the nodes the process was running on when a NUMA
339      * hinting fault was incurred.
340      * faults_memory_buffer and faults_cpu_buffer: Record faults per node
341      * during the current scan window. When the scan completes, the counts
342      * in faults_memory and faults_cpu decay and these values are copied.
343      */
344     unsigned long *numa_faults;
345     unsigned long total_numa_faults;
346 
347     /*
348      * numa_faults_locality tracks if faults recorded during the last
349      * scan window were remote/local. The task scan period is adapted
350      * based on the locality of the faults with different weights
351      * depending on whether they were shared or private faults
352      */
353     unsigned long numa_faults_locality[2];
354 
355     unsigned long numa_pages_migrated;
356 #endif /* CONFIG_NUMA_BALANCING */
357 
358     struct rcu_head rcu;
359 
360     /*
361      * cache last used pipe for splice
362      */
363     struct pipe_inode_info *splice_pipe;
364 
365     struct page_frag task_frag;
366 
367 #ifdef    CONFIG_TASK_DELAY_ACCT
368     struct task_delay_info *delays;
369 #endif
370 #ifdef CONFIG_FAULT_INJECTION
371     int make_it_fail;
372 #endif
373     /*
374      * when (nr_dirtied >= nr_dirtied_pause), it's time to call
375      * balance_dirty_pages() for some dirty throttling pause
376      */
377     int nr_dirtied;
378     int nr_dirtied_pause;
379     unsigned long dirty_paused_when; /* start of a write-and-pause period */
380 
381 #ifdef CONFIG_LATENCYTOP
382     int latency_record_count;
383     struct latency_record latency_record[LT_SAVECOUNT];
384 #endif
385     /*
386      * time slack values; these are used to round up poll() and
387      * select() etc timeout values. These are in nanoseconds.
388      */
389     unsigned long timer_slack_ns;
390     unsigned long default_timer_slack_ns;
391 
392 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
393     /* Index of current stored address in ret_stack */
394     int curr_ret_stack;
395     /* Stack of return addresses for return function tracing */
396     struct ftrace_ret_stack    *ret_stack;
397     /* time stamp for last schedule */
398     unsigned long long ftrace_timestamp;
399     /*
400      * Number of functions that haven't been traced
401      * because of depth overrun.
402      */
403     atomic_t trace_overrun;
404     /* Pause for the tracing */
405     atomic_t tracing_graph_pause;
406 #endif
407 #ifdef CONFIG_TRACING
408     /* state flags for use by tracers */
409     unsigned long trace;
410     /* bitmask and counter of trace recursion */
411     unsigned long trace_recursion;
412 #endif /* CONFIG_TRACING */
413 #ifdef CONFIG_MEMCG
414     struct memcg_oom_info {
415         struct mem_cgroup *memcg;
416         gfp_t gfp_mask;
417         int order;
418         unsigned int may_oom:1;
419     } memcg_oom;
420 #endif
421 #ifdef CONFIG_UPROBES
422     struct uprobe_task *utask;
423 #endif
424 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
425     unsigned int    sequential_io;
426     unsigned int    sequential_io_avg;
427 #endif
428 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
429     unsigned long    task_state_change;
430 #endif
431 };

 

 

Silly question 1:

 

The MMU is a hardware unit designed specifically to solve the problem of mapping virtual addresses to physical addresses, so why does it have to show up in the Linux source code? Why describe it once more in software?

Mapping a virtual address to a physical address (at present) requires a walk through four levels of page-table indices. The definition of mm_struct contains a pointer of type pgd_t * named pgd (Page Global Directory); starting from it, the walk descends through the pud (Page Upper Directory), the pmd (Page Middle Directory) and finally the pte (Page Table Entry), the last level referring to the actual page. Unfortunately I have not yet located the definition of pte_t in the 3.19 sources (it is architecture-specific, e.g. under arch/x86/include/asm/ for x86); according to the books, an entry essentially records the physical page frame, from which the corresponding struct page descriptor can be obtained.
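As an illustration of that four-level walk, here is a minimal sketch written in the style of the kernel's own helpers (pgd_offset(), pud_offset(), pmd_offset(), pte_offset_map(), pte_page()); it is not a verbatim kernel function, and it ignores locking and huge pages for clarity:

/*
 * Sketch of a software page-table walk for a 4-level layout.
 * Not a copy of any kernel function; error handling is minimal.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>

static struct page *walk_to_page(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    struct page *page = NULL;

    pgd = pgd_offset(mm, addr);          /* level 1: page global directory */
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return NULL;

    pud = pud_offset(pgd, addr);         /* level 2: page upper directory */
    if (pud_none(*pud) || pud_bad(*pud))
        return NULL;

    pmd = pmd_offset(pud, addr);         /* level 3: page middle directory */
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return NULL;

    pte = pte_offset_map(pmd, addr);     /* level 4: page table entry */
    if (pte_present(*pte))               /* page is resident in RAM */
        page = pte_page(*pte);           /* entry -> struct page descriptor */
    pte_unmap(pte);

    return page;                         /* NULL => not mapped / swapped out */
}

If pte_present() fails, the page is not resident (for example it has been swapped out), and a real access at that point would raise the page fault described next.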

 

So we can summarize as follows: while a program executes it performs a great many jumps, and every jump takes one operand, an address; that address is a virtual address. The MMU then translates this virtual address, and during the walk a page-table entry is reached. From that entry it is first determined whether the page is already present in physical memory; if it is not, a swap (page-in) operation is performed as described above. Once the page has been brought in, address translation continues, and the same virtual address now reaches a different physical page, namely the one that was just swapped in.

 

Silly question 2:

 

The virtual-memory mechanism looks as if physical memory and external storage were given a single common address encoding, and that common encoding is the virtual address. The "encoding" is not necessarily sequential or one-to-one, but the mapping from virtual addresses onto page-table indices is certainly a surjection.

That was my initial understanding of virtual memory. It seems fine on the surface, but a problem appears once the independence of each process's address space is considered: the same address in two different processes may well map to different physical locations, yet if everything shared one common encoding, the same logical address could never map to different physical addresses.

The answer is really one simple sentence: every process maintains its own page tables!
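A tiny hypothetical user-space experiment makes this concrete: after fork(), parent and child print the same virtual address for a local variable, yet they see different values, because each process walks its own page tables (helped along by copy-on-write):

/*
 * Hypothetical demo: after fork(), parent and child see the same virtual
 * address for `value`, but writes land in different physical pages,
 * because each process has its own page tables.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int value = 1;

    pid_t pid = fork();
    if (pid == 0) {                 /* child: modify its private copy */
        value = 2;
        printf("child : &value=%p value=%d\n", (void *)&value, value);
        return 0;
    }

    waitpid(pid, NULL, 0);          /* parent: same virtual address, old value */
    printf("parent: &value=%p value=%d\n", (void *)&value, value);
    return 0;
}

Both lines normally print the same virtual address; the values differ because each task_struct reaches, through its mm pointer, a separate mm_struct and therefore a separate pgd.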

 

Finally, one big diagram sums up everything above.

 
