A disk driver's job is to move data between disk space and memory space. The previous post discussed the Page, Segment, Block, and Sector concepts; this post, using the 3.14 kernel as an example, discusses how the memory side of that exchange is organized and managed. As we know, to bridge the speed gap between the CPU and memory, computer systems introduced the hardware cache, whose speed approaches that of the CPU's internal registers and thus improves system efficiency. The same idea applies to the speed mismatch between memory and disk. Moreover, a disk may be mechanical, and from a lifetime perspective it is not suited to frequent reads and writes, so the kernel dedicates part of memory as a cache, improving disk access speed while extending disk life. This cache is the disk cache, which comprises the page cache (a disk cache that operates on whole pages of data), the dentry cache (caching dentry objects that describe filesystem pathnames), and the inode cache (caching inode objects that describe on-disk inodes). This post focuses on the page cache. With the page cache, data the kernel has already read need not be fetched from disk again, and writes need not go to disk immediately. The page cache can be seen as part of the specific-filesystem layer.
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
└── vfs_read(f.file, buf, count, &pos);
    └── file->f_op->read(file, buf, count, pos);
        └── do_sync_read(file, buf, count, pos);
            ├── filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
            │       generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos)
            │       └── filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1);
            └── wait_on_sync_kiocb(&kiocb);
-----------------------------------Page Cache----------------------------------------------------
int mpage_readpage(struct page *page, get_block_t get_block)
├── do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block);
└── mpage_bio_submit(READ, bio);
    └── submit_bio(rw, bio);
In the vast majority of cases, the kernel goes through the page cache when reading from or writing to disk. New pages are added to the page cache to satisfy read requests from user-mode processes. If a page is not already in the cache, a new page is added to the cache and then filled with data read from disk. If there is enough free memory, the page is kept in the cache for a long time, so that other processes can use it without touching the disk.
Similarly, before writing a page of data to a block device, the kernel first checks whether the corresponding page is already in the cache; if not, a new entry is added and filled with the data to be written to disk. The I/O transfer does not start immediately: the disk update is delayed for a few seconds, giving processes a chance to make further modifications to the data before it is written.
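Putting the read side together, satisfying a read boils down to "look the page up, and only on a miss allocate a new page and fill it from disk". Below is a minimal sketch of that logic, loosely modeled on 3.14's do_generic_file_read(); the function name is ours, and locking plus the wait for PG_uptodate after ->readpage() are omitted for brevity.

#include <linux/pagemap.h>
#include <linux/mm.h>

static struct page *read_one_page(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        page = find_get_page(mapping, index);   /* cache hit: no disk access */
        if (page)
                return page;

        page = page_cache_alloc_cold(mapping);  /* miss: allocate a fresh page */
        if (!page)
                return NULL;

        if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
                page_cache_release(page);       /* lost a race: retry the lookup */
                return find_get_page(mapping, index);
        }

        /* fill the new page from disk via the fs's readpage method;
         * real code must then wait until PG_uptodate is set */
        if (mapping->a_ops->readpage(NULL, page)) {
                page_cache_release(page);
                return NULL;
        }
        return page;
}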
The data kept in a page-cache page may be one of several kinds: contents of regular files, contents of directories, data read directly from a block device file (bypassing the filesystem), data of user-mode processes that has been swapped out to disk, or pages belonging to special filesystems such as shm.
Since we are establishing a relationship between a region of disk space and a region of memory, structures are needed to represent that relationship. On the disk side, the storage space ultimately belongs to a file, and Linux represents a file with an inode structure; on the memory side, the kernel uses an address_space to organize a set of memory pages. Hence we can find the corresponding address_space object embedded in the inode structure, and that file becomes the owner of those pages. Following the code briefly, we can draw the relationship diagram below; this section is organized around this diagram.
The inode is the kernel structure describing a file; for more discussion of the inode, see the earlier article on the three key structures of Linux device files: inode, file, file_operations. Here we mainly care about the i_mapping and i_data members.
//3.14/include/linux/fs.h
527 struct inode {
541         struct address_space    *i_mapping;
594         struct address_space    i_data;
616 };
struct inode
--541--> points to the address_space object used for this inode's page cache; for a regular file it normally points to the i_data field below
--594--> the address_space object embedded in (owned by) this inode; a small sketch follows
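To make the i_mapping/i_data relationship concrete, here is a small sketch (the helper name is ours, not a kernel API): at open time file->f_mapping is copied from inode->i_mapping, and for a regular file i_mapping normally points at the inode's own embedded i_data, while for a block device file it points at the address_space of the bdev's master inode instead.

#include <linux/fs.h>

static struct address_space *file_pagecache(struct file *filp)
{
        struct inode *inode = file_inode(filp); /* 3.14 helper */

        /* filp->f_mapping was set from inode->i_mapping at open time */
        return inode->i_mapping;                /* usually == &inode->i_data */
}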
The core structure of the page cache is the address_space object, a data structure embedded in the inode object of the pages' owner. Many pages in the cache may belong to the same owner and thus be linked to the same address_space object, which also establishes the link between the owner's pages and the operations on those pages.
412 struct address_space {
413         struct inode            *host;          /* owner: inode, block_device */
414         struct radix_tree_root  page_tree;      /* radix tree of all pages */
415         spinlock_t              tree_lock;      /* and lock protecting it */
416         unsigned int            i_mmap_writable;/* count VM_SHARED mappings */
417         struct rb_root          i_mmap;         /* tree of private and shared mappings */
418         struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
419         struct mutex            i_mmap_mutex;   /* protect tree, count, list */
420         /* Protected by tree_lock together with the radix tree */
421         unsigned long           nrpages;        /* number of total pages */
422         pgoff_t                 writeback_index;/* writeback starts here */
423         const struct address_space_operations *a_ops;   /* methods */
424         unsigned long           flags;          /* error bits/gfp mask */
425         struct backing_dev_info *backing_dev_info; /* device readahead, etc */
426         spinlock_t              private_lock;   /* for use by the address_space */
427         struct list_head        private_list;   /* ditto */
428         void                    *private_data;  /* ditto */
429 } __attribute__((aligned(sizeof(long))));
struct address_space
--413--> the inode object that owns this address_space object
--414--> the root of the radix tree holding this owner's page-cache pages
--425--> points to the backing_dev_info object, which describes the block device holding the owner's data and is usually embedded in the request-queue descriptor of that block device
This describes the root of a radix tree; the kernel uses this structure to quickly look up, insert, and delete the page-cache pages owned by an inode.
64 struct radix_tree_root {
65         unsigned int            height;
66         gfp_t                   gfp_mask;
67         struct radix_tree_node  __rcu *rnode;
68 };

50 struct radix_tree_node {
51         unsigned int    height;         /* Height from the bottom */
52         unsigned int    count;
53         union {
54                 struct radix_tree_node *parent; /* Used when ascending tree */
55                 struct rcu_head rcu_head;       /* Used when freeing node */
56         };
57         void __rcu      *slots[RADIX_TREE_MAP_SIZE];
58         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
59 };
struct radix_tree_node
--51--> depth of the current tree, not counting the leaf level
--52--> counter of non-empty slot pointers in this node
--57--> slots is an array of 64 pointers (RADIX_TREE_MAP_SIZE); each element points either to another node (struct radix_tree_node) or to a page descriptor (struct page). Upper-level nodes point to other nodes; bottom-level (leaf) nodes point to page descriptors.
--58--> the two-dimensional tags array marks entries of the radix tree so that pages in a given state (e.g. dirty) can be found quickly; a lookup-and-tag sketch follows the flag list below. The possible flags of a page are:
 74 enum pageflags {
 75         PG_locked,              /* Page is locked. Don't touch. */
 76         PG_error,
 77         PG_referenced,
 78         PG_uptodate,
 79         PG_dirty,
 80         PG_lru,
 81         PG_active,
 82         PG_slab,
 83         PG_owner_priv_1,        /* Owner use. If pagecache, fs may use */
 84         PG_arch_1,
 85         PG_reserved,
 86         PG_private,             /* If pagecache, has fs-private data */
 87         PG_private_2,           /* If pagecache, has fs aux data */
 88         PG_writeback,           /* Page is under writeback */
 93         PG_compound,            /* A compound page */
 95         PG_swapcache,           /* Swap page: swp_entry_t in private */
 96         PG_mappedtodisk,        /* Has blocks allocated on-disk */
 97         PG_reclaim,             /* To be reclaimed asap */
 98         PG_swapbacked,          /* Page is backed by RAM/swap */
 99         PG_unevictable,         /* Page is "unevictable" */
112         __NR_PAGEFLAGS,
113
114         /* Filesystems */
115         PG_checked = PG_owner_priv_1,
116
117         /* Two page bits are conscripted by FS-Cache to maintain local caching
118          * state.  These bits are set on pages belonging to the netfs's inodes
119          * when those inodes are being locally cached.
120          */
121         PG_fscache = PG_private_2,      /* page backed by cache */
122
123         /* XEN */
124         PG_pinned = PG_owner_priv_1,
125         PG_savepinned = PG_dirty,
126
127         /* SLOB */
128         PG_slob_free = PG_private,
129 };
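As a quick illustration of how the radix tree and its tags are used, here is a hedged sketch (function name ours) that looks a page up in an address_space's tree and tags it dirty; the real kernel does this inside find_get_page() and __set_page_dirty(), using RCU for lookups and tree_lock for updates.

#include <linux/fs.h>
#include <linux/radix-tree.h>
#include <linux/pagemap.h>

static void tag_page_dirty(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        spin_lock_irq(&mapping->tree_lock);
        page = radix_tree_lookup(&mapping->page_tree, index);
        if (page)
                /* PAGECACHE_TAG_DIRTY mirrors PG_dirty so writeback can
                 * find dirty pages without scanning the whole tree */
                radix_tree_tag_set(&mapping->page_tree, index,
                                   PAGECACHE_TAG_DIRTY);
        spin_unlock_irq(&mapping->tree_lock);
}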
page is the kernel's page descriptor. Through radix-tree operations we can eventually find a set of pages, all belonging to one inode. Note the pgoff_t index member, which gives the page's index within the owner's whole set of cached pages. At this point we have gone from a file's inode all the way to the page-cache pages it owns; what remains is to use the block device driver to move data between these cached pages and the disk.
 44 struct page {
 48         union {
 49                 struct address_space *mapping;  /* If low bit clear, points to
                                                     * inode address_space, or NULL. */
 57         };
 59         /* Second double word */
 60         struct {
 61                 union {
 62                         pgoff_t index;          /* Our offset within mapping. */
 73                 };
121         };
198 }
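Because a page records both its owner (mapping) and its position (index), the byte offset of the page within the file can be recovered with a single shift; this is essentially what page_offset() in pagemap.h does (the helper name below is ours).

#include <linux/pagemap.h>

static loff_t page_byte_offset(struct page *page)
{
        return (loff_t)page->index << PAGE_CACHE_SHIFT; /* index * page size */
}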
頁高速緩存的基本操做是增刪查更,在此基礎上能夠封裝更高級的API
static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
void delete_from_page_cache(struct page *page)
struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages)
/* blocks if the page lock cannot be acquired */
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
/* a trylock variant returns immediately when the lock is unavailable */
/* blocks for the lock, but creates a new page if none exists */
struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask)
struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data)
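As a usage example, the sketch below reads one page of a file through the cache with read_cache_page(), using the filesystem's own ->readpage() as the filler; this mirrors what the kernel's read_mapping_page() wrapper does. The two helper names are ours.

#include <linux/pagemap.h>

static int readpage_filler(void *data, struct page *page)
{
        struct address_space *mapping = data;

        return mapping->a_ops->readpage(NULL, page);
}

static struct page *get_file_page(struct inode *inode, pgoff_t index)
{
        struct address_space *mapping = inode->i_mapping;

        /* returns a page with up-to-date data, or an ERR_PTR on failure */
        return read_cache_page(mapping, index, readpage_filler, mapping);
}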
A block is the smallest logical unit of operation in the VFS or a filesystem; one page-cache page can be made up of several blocks, and their relationship is as follows:
Each block buffer is described by a corresponding buffer_head object, a descriptor containing everything the kernel must know about how to handle the block.
62 struct buffer_head {
63         unsigned long b_state;          /* buffer state bitmap (see above) */
64         struct buffer_head *b_this_page;/* circular list of page's buffers */
65         struct page *b_page;            /* the page this bh is mapped to */
66
67         sector_t b_blocknr;             /* start block number */
68         size_t b_size;                  /* size of mapping */
69         char *b_data;                   /* pointer to data within the page */
70
71         struct block_device *b_bdev;
72         bh_end_io_t *b_end_io;          /* I/O completion */
73         void *b_private;                /* reserved for b_end_io */
74         struct list_head b_assoc_buffers; /* associated with another mapping */
75         struct address_space *b_assoc_map;      /* mapping this buffer is
76                                                    associated with */
77         atomic_t b_count;               /* users using this buffer_head */
78 };
struct buffer_head
--63--> buffer state bitmap
--64--> pointer to the next element in the circular list of the page's buffers (a traversal sketch follows this list)
--65--> pointer to the descriptor of the buffer page that holds this block
--67--> starting logical block number of the block on the device (the usage counter is b_count at line 77)
--68--> block size
--69--> position of the block buffer within the buffer page; in fact, its encoding depends on whether the page is in high memory: if so, b_data holds the offset of the block buffer relative to the start of the page, otherwise it holds the buffer's linear address
--71--> pointer to the block device this buffer belongs to
--72--> I/O completion method
--73--> pointer to data reserved for the I/O completion method
--74--> pointers for the list of indirect blocks associated with an inode
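The b_this_page field links all buffers of one page into a circular list; the sketch below (function name ours) walks that list, something real code only does with the page locked or the address_space's private_lock held.

#include <linux/buffer_head.h>

static void for_each_buffer(struct page *page)
{
        struct buffer_head *head, *bh;

        if (!page_has_buffers(page))    /* the page may have no buffers attached */
                return;

        bh = head = page_buffers(page); /* first buffer of the page */
        do {
                pr_info("block %llu, size %zu, state %#lx\n",
                        (unsigned long long)bh->b_blocknr,
                        bh->b_size, bh->b_state);
                bh = bh->b_this_page;   /* circular: stop when back at head */
        } while (bh != head);
}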
The possible bits of b_state are listed below:
//include/linux/buffer_head.h
19 enum bh_state_bits {
20         BH_Uptodate,    /* Contains valid data */
21         BH_Dirty,       /* Is dirty */
22         BH_Lock,        /* Is locked */
23         BH_Req,         /* Has been submitted for I/O */
24         BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
25                           * IO completion of other buffers in the page
26                           */
27
28         BH_Mapped,      /* Has a disk mapping */
29         BH_New,         /* Disk mapping was newly created by get_block */
30         BH_Async_Read,  /* Is under end_buffer_async_read I/O */
31         BH_Async_Write, /* Is under end_buffer_async_write I/O */
32         BH_Delay,       /* Buffer is not yet allocated on disk */
33         BH_Boundary,    /* Block is followed by a discontiguity */
34         BH_Write_EIO,   /* I/O error on write */
35         BH_Unwritten,   /* Buffer is allocated on disk but not written */
36         BH_Quiet,       /* Buffer Error Prinks to be quiet */
37         BH_Meta,        /* Buffer contains metadata */
38         BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
39         BH_Defer_Completion, /* Defer AIO completion to workqueue */
40
41         BH_PrivateStart,/* not a state bit, but the first bit available
42                          * for private allocation by other entities
43                          */
44 };
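Each of these bits comes with test/set/clear helpers generated by the BUFFER_FNS() macro in buffer_head.h, which is how kernel code actually manipulates b_state; a tiny illustration (helper name ours):

#include <linux/buffer_head.h>

static void dirty_if_valid(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))       /* tests BH_Uptodate */
                return;
        set_buffer_dirty(bh);           /* sets BH_Dirty */
        /* mark_buffer_dirty(bh) would additionally dirty the page and
         * tag it in the owner's radix tree */
}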
Adding the page that holds block-device buffers to the page cache:
static int grow_buffers(struct block_device *bdev, sector_t block, int size)
Removing the page that holds block-device buffers from the page cache:
int try_to_free_buffers(struct page *page)
int try_to_release_page(struct page *page, gfp_t gfp_mask)
Searching for a block in the page cache
When the kernel needs to read or write a single physical device block (for example a superblock), it must first check whether the buffer for the requested block is already in the page cache. Searching the page cache for a given block buffer (identified by the address bdev of its block-device descriptor and its logical block number nr) can be broken into three steps, corresponding to the three functions below:
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
__getblk(struct block_device *bdev, sector_t block, unsigned size)
__bread(struct block_device *bdev, sector_t block, unsigned size)
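__find_get_block() only searches the cache, __getblk() additionally creates the buffer if it is missing, and __bread() further reads the block from disk when the buffer is not up to date. A typical use, sketched below with our own function name, is how a filesystem reads its superblock (cf. the sb_bread() wrapper):

#include <linux/buffer_head.h>

static struct buffer_head *read_one_block(struct block_device *bdev,
                                          sector_t nr, unsigned size)
{
        struct buffer_head *bh = __bread(bdev, nr, size);

        if (!bh)
                return NULL;            /* I/O error */
        /* bh->b_data now holds the `size` bytes of block `nr`;
         * the caller must brelse(bh) when done */
        return bh;
}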
Block buffers are eventually submitted to the generic block layer for I/O; the related APIs are as follows:
int submit_bh(int rw, struct buffer_head *bh)
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
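As a usage illustration, here is a condensed sketch of what 3.14's sync_dirty_buffer() (fs/buffer.c) does: write one dirty buffer through the generic block layer and wait for completion. Details such as WRITE_SYNC are simplified.

#include <linux/buffer_head.h>

static int write_buffer_sync(struct buffer_head *bh)
{
        int ret = 0;

        lock_buffer(bh);
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);                           /* ref dropped at completion */
                bh->b_end_io = end_buffer_write_sync; /* unlocks bh, drops ref */
                ret = submit_bh(WRITE, bh);           /* hand off to block layer */
                wait_on_buffer(bh);                   /* sleep until I/O finishes */
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
        } else {
                unlock_buffer(bh);                    /* nothing to write */
        }
        return ret;
}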
In the dirty-page writeback mechanism, the kernel first uses a backing_dev_info object to describe a bdi device (backing device info, i.e. a non-volatile storage device); all backing_dev_info objects are linked onto the bdi_list, as can be seen from the registration function bdi_register(). It is the bdi's slowness that motivates the page-caching mechanism discussed in this article, and with it the dirty-page writeback problem: when a cached page is modified, the kernel sets PG_dirty in the corresponding page object, marking it a "dirty page"; dirty pages must be written back to disk at a suitable time. For writeback, kernels before 2.6.2x/3x dynamically created and destroyed pdflush threads; since 2.6.2x/3x the kernel has reworked this area and performs writeback through the writeback mechanism. The core structures and methods of the writeback mechanism are related as follows.
As can be seen, one logical disk maps to one gendisk object, one request_queue object, and one backing_dev_info object; this backing_dev_info object is the core structure of dirty-page writeback.
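In 3.14 this chain is visible in the code: the backing_dev_info is embedded in the request queue, which blk_alloc_queue_node() initializes with bdi_init(). A sketch (helper name ours):

#include <linux/genhd.h>
#include <linux/blkdev.h>

static struct backing_dev_info *disk_to_bdi(struct gendisk *disk)
{
        /* gendisk -> request_queue -> embedded backing_dev_info */
        return &disk->queue->backing_dev_info;
}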
 64 struct backing_dev_info {
 65         struct list_head bdi_list;
 97         struct bdi_writeback wb;  /* default writeback info for this bdi */
100         struct list_head work_list;
102         struct device *dev;
110 };
struct backing_dev_info
--65--> list node linking all backing_dev_info objects onto the global bdi_list
--97--> the embedded bdi_writeback object, which uses delayed work to write back dirty pages
--100--> list of pending wb_writeback_work items waiting to be processed for this bdi device
--102--> the device object: the bdi is registered as a device
37 /*
38  * Passed into wb_writeback(), essentially a subset of writeback_control
39  */
40 struct wb_writeback_work {
41         long nr_pages;
42         struct super_block *sb;
43         unsigned long *older_than_this;
44         enum writeback_sync_modes sync_mode;
45         unsigned int tagged_writepages:1;
46         unsigned int for_kupdate:1;
47         unsigned int range_cyclic:1;
48         unsigned int for_background:1;
49         unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
50         enum wb_reason reason;          /* why was writeback initiated? */
51
52         struct list_head list;          /* pending work list */
53         struct completion *done;        /* set if the caller waits */
54 };
51 struct bdi_writeback {
52         struct backing_dev_info *bdi;   /* our parent bdi */
53         unsigned int nr;
54
55         unsigned long last_old_flush;   /* last old data flush */
56
57         struct delayed_work dwork;      /* work item used for writeback */
58         struct list_head b_dirty;       /* dirty inodes */
59         struct list_head b_io;          /* parked for writeback */
60         struct list_head b_more_io;     /* parked for more writeback */
61         spinlock_t list_lock;           /* protects the b_* lists */
62 };
struct bdi_writeback
--57--> the delayed-work object; when it fires, it ultimately calls the function below to process dirty pages
778 /*
779  * Explicit flushing or periodic writeback of "old" data.
780  *
781  * Define "old": the first time one of an inode's pages is dirtied, we mark the
782  * dirtying-time in the inode's address_space.  So this periodic writeback code
783  * just walks the superblock inode list, writing back any inodes which are
784  * older than a specific point in time.
785  *
786  * Try to run once per dirty_writeback_interval.  But if a writeback event
787  * takes longer than a dirty_writeback_interval interval, then leave a
788  * one-second gap.
789  *
790  * older_than_this takes precedence over nr_to_write.  So we'll only write back
791  * all dirty pages if they are all attached to "old" mappings.
792  */
793 static long wb_writeback(struct bdi_writeback *wb,
794                          struct wb_writeback_work *work)
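To close the loop, here is a condensed sketch of how the delayed work gets armed, modeled on bdi_wakeup_thread_delayed() in mm/backing-dev.c (the wb_lock/BDI_registered handling is omitted, and the function name is ours). When dwork later fires, bdi_writeback_workfn() runs wb_do_writeback(), which dequeues wb_writeback_work items and finally calls the wb_writeback() shown above.

#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/jiffies.h>

static void kick_writeback(struct backing_dev_info *bdi)
{
        /* dirty_writeback_interval is in centiseconds (default 500 = 5 s) */
        unsigned long timeout = msecs_to_jiffies(dirty_writeback_interval * 10);

        /* bdi_wq is the kernel's global writeback workqueue */
        queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
}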