A disk driver's job is to move data between disk space and memory space. The previous post discussed the Page, Segment, Block, and Sector concepts; this post, using the 3.14 kernel as an example, discusses how the memory side of that exchange is organized and managed. As we know, to bridge the speed gap between the CPU and memory, computer systems introduced the hardware cache, whose speed approaches that of the CPU's internal registers and thus improves system efficiency. The same idea applies to the speed mismatch between memory and disk. Moreover, a disk may be mechanical, and from a lifetime perspective it is not suited to frequent reads and writes, so the kernel dedicates part of memory as a cache, improving disk access speed while extending disk life. This cache is the disk cache, which comprises the page cache (a disk cache that operates on whole pages of data), the dentry cache (caching dentry objects that describe filesystem pathnames), and the inode cache (caching inode objects that describe on-disk inodes). This post focuses on the page cache. With the page cache, data the kernel has already read need not be fetched from disk again, and writes need not go to disk immediately. The page cache can be seen as part of the specific-filesystem layer.
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
└── vfs_read(f.file, buf, count, &pos);
    └── file->f_op->read(file, buf, count, pos);
        └── do_sync_read(file, buf, count, pos);
            ├── filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
            │       generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos)
            │       └── filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1);
            └── wait_on_sync_kiocb(&kiocb);
-----------------------------------Page Cache----------------------------------------------------
int mpage_readpage(struct page *page, get_block_t get_block)
├── do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block);
└── mpage_bio_submit(READ, bio);
    └── submit_bio(rw, bio);
In the vast majority of cases, the kernel goes through the page cache when reading from or writing to disk. New pages are added to the page cache to satisfy read requests from user-mode processes. If a page is not already in the cache, a new page is added to the cache and then filled with data read from disk. If there is enough free memory, the page is kept in the cache for a long time, so that other processes can use it without touching the disk.
Similarly, before writing a page of data to a block device, the kernel first checks whether the corresponding page is already in the cache; if not, a new entry is added and filled with the data to be written to disk. The I/O transfer does not start immediately: the disk update is delayed for a few seconds, giving processes a chance to make further modifications to the data before it is written.
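Putting the read side together, satisfying a read boils down to "look the page up, and only on a miss allocate a new page and fill it from disk". Below is a minimal sketch of that logic, loosely modeled on 3.14's do_generic_file_read(); the function name is ours, and locking plus the wait for PG_uptodate after ->readpage() are omitted for brevity.

#include <linux/pagemap.h>
#include <linux/mm.h>

static struct page *read_one_page(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        page = find_get_page(mapping, index);   /* cache hit: no disk access */
        if (page)
                return page;

        page = page_cache_alloc_cold(mapping);  /* miss: allocate a fresh page */
        if (!page)
                return NULL;

        if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
                page_cache_release(page);       /* lost a race: retry the lookup */
                return find_get_page(mapping, index);
        }

        /* fill the new page from disk via the fs's readpage method;
         * real code must then wait until PG_uptodate is set */
        if (mapping->a_ops->readpage(NULL, page)) {
                page_cache_release(page);
                return NULL;
        }
        return page;
}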
The data kept in a page-cache page may be one of several kinds: contents of regular files, contents of directories, data read directly from a block device file (bypassing the filesystem), data of user-mode processes that has been swapped out to disk, or pages belonging to special filesystems such as shm.
Since we are establishing a relationship between a region of disk space and a region of memory, structures are needed to represent that relationship. On the disk side, the storage space ultimately belongs to a file, and Linux represents a file with an inode structure; on the memory side, the kernel uses an address_space to organize a set of memory pages. Hence we can find the corresponding address_space object embedded in the inode structure, and that file becomes the owner of those pages. Following the code briefly, we can draw the relationship diagram below; this section is organized around this diagram.
The inode is the kernel structure describing a file; for more discussion of the inode, see the earlier article on the three key structures of Linux device files: inode, file, file_operations. Here we mainly care about the i_mapping and i_data members.
//3.14/include/linux/fs.h
527 struct inode {
541         struct address_space    *i_mapping;
594         struct address_space    i_data;
616 };
struct inode
--541--> points to the address_space object used for this inode's page cache; for a regular file it normally points to the i_data field below
--594--> the address_space object embedded in (owned by) this inode; a small sketch follows
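To make the i_mapping/i_data relationship concrete, here is a small sketch (the helper name is ours, not a kernel API): at open time file->f_mapping is copied from inode->i_mapping, and for a regular file i_mapping normally points at the inode's own embedded i_data, while for a block device file it points at the address_space of the bdev's master inode instead.

#include <linux/fs.h>

static struct address_space *file_pagecache(struct file *filp)
{
        struct inode *inode = file_inode(filp); /* 3.14 helper */

        /* filp->f_mapping was set from inode->i_mapping at open time */
        return inode->i_mapping;                /* usually == &inode->i_data */
}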
The core structure of the page cache is the address_space object, a data structure embedded in the inode object of the pages' owner. Many pages in the cache may belong to the same owner and thus be linked to the same address_space object, which also establishes the link between the owner's pages and the operations on those pages.
412 struct address_space {
413         struct inode            *host;          /* owner: inode, block_device */
414         struct radix_tree_root  page_tree;      /* radix tree of all pages */
415         spinlock_t              tree_lock;      /* and lock protecting it */
416         unsigned int            i_mmap_writable;/* count VM_SHARED mappings */
417         struct rb_root          i_mmap;         /* tree of private and shared mappings */
418         struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
419         struct mutex            i_mmap_mutex;   /* protect tree, count, list */
420         /* Protected by tree_lock together with the radix tree */
421         unsigned long           nrpages;        /* number of total pages */
422         pgoff_t                 writeback_index;/* writeback starts here */
423         const struct address_space_operations *a_ops;   /* methods */
424         unsigned long           flags;          /* error bits/gfp mask */
425         struct backing_dev_info *backing_dev_info; /* device readahead, etc */
426         spinlock_t              private_lock;   /* for use by the address_space */
427         struct list_head        private_list;   /* ditto */
428         void                    *private_data;  /* ditto */
429 } __attribute__((aligned(sizeof(long))));
struct address_space
--413--> the inode object that owns this address_space object
--414--> the root of the radix tree holding this owner's page-cache pages
--425--> points to the backing_dev_info object, which describes the block device holding the owner's data and is usually embedded in the request-queue descriptor of that block device
This describes the root of a radix tree; the kernel uses this structure to quickly look up, insert, and delete the page-cache pages owned by an inode.
64 struct radix_tree_root {
65         unsigned int            height;
66         gfp_t                   gfp_mask;
67         struct radix_tree_node  __rcu *rnode;
68 };

50 struct radix_tree_node {
51         unsigned int    height;         /* Height from the bottom */
52         unsigned int    count;
53         union {
54                 struct radix_tree_node *parent; /* Used when ascending tree */
55                 struct rcu_head rcu_head;       /* Used when freeing node */
56         };
57         void __rcu      *slots[RADIX_TREE_MAP_SIZE];
58         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
59 };
struct radix_tree_node
--51--> depth of the current tree, not counting the leaf level
--52--> counter of non-empty slot pointers in this node
--57--> slots is an array of 64 pointers (RADIX_TREE_MAP_SIZE); each element points either to another node (struct radix_tree_node) or to a page descriptor (struct page). Upper-level nodes point to other nodes; bottom-level (leaf) nodes point to page descriptors.
--58--> the two-dimensional tags array marks entries of the radix tree so that pages in a given state (e.g. dirty) can be found quickly; a lookup-and-tag sketch follows the flag list below. The possible flags of a page are:
 74 enum pageflags {
 75         PG_locked,              /* Page is locked. Don't touch. */
 76         PG_error,
 77         PG_referenced,
 78         PG_uptodate,
 79         PG_dirty,
 80         PG_lru,
 81         PG_active,
 82         PG_slab,
 83         PG_owner_priv_1,        /* Owner use. If pagecache, fs may use */
 84         PG_arch_1,
 85         PG_reserved,
 86         PG_private,             /* If pagecache, has fs-private data */
 87         PG_private_2,           /* If pagecache, has fs aux data */
 88         PG_writeback,           /* Page is under writeback */
 93         PG_compound,            /* A compound page */
 95         PG_swapcache,           /* Swap page: swp_entry_t in private */
 96         PG_mappedtodisk,        /* Has blocks allocated on-disk */
 97         PG_reclaim,             /* To be reclaimed asap */
 98         PG_swapbacked,          /* Page is backed by RAM/swap */
 99         PG_unevictable,         /* Page is "unevictable" */
112         __NR_PAGEFLAGS,
113
114         /* Filesystems */
115         PG_checked = PG_owner_priv_1,
116
117         /* Two page bits are conscripted by FS-Cache to maintain local caching
118          * state.  These bits are set on pages belonging to the netfs's inodes
119          * when those inodes are being locally cached.
120          */
121         PG_fscache = PG_private_2,      /* page backed by cache */
122
123         /* XEN */
124         PG_pinned = PG_owner_priv_1,
125         PG_savepinned = PG_dirty,
126
127         /* SLOB */
128         PG_slob_free = PG_private,
129 };
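As a quick illustration of how the radix tree and its tags are used, here is a hedged sketch (function name ours) that looks a page up in an address_space's tree and tags it dirty; the real kernel does this inside find_get_page() and __set_page_dirty(), using RCU for lookups and tree_lock for updates.

#include <linux/fs.h>
#include <linux/radix-tree.h>
#include <linux/pagemap.h>

static void tag_page_dirty(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        spin_lock_irq(&mapping->tree_lock);
        page = radix_tree_lookup(&mapping->page_tree, index);
        if (page)
                /* PAGECACHE_TAG_DIRTY mirrors PG_dirty so writeback can
                 * find dirty pages without scanning the whole tree */
                radix_tree_tag_set(&mapping->page_tree, index,
                                   PAGECACHE_TAG_DIRTY);
        spin_unlock_irq(&mapping->tree_lock);
}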
page is the kernel's page descriptor. Through radix-tree operations we can eventually find a set of pages, all belonging to one inode. Note the pgoff_t index member, which gives the page's index within the owner's whole set of cached pages. At this point we have gone from a file's inode all the way to the page-cache pages it owns; what remains is to use the block device driver to move data between these cached pages and the disk.
 44 struct page {
 48         union {
 49                 struct address_space *mapping;  /* If low bit clear, points to
                                                     * inode address_space, or NULL. */
 57         };
 59         /* Second double word */
 60         struct {
 61                 union {
 62                         pgoff_t index;          /* Our offset within mapping. */
 73                 };
121         };
198 }
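Because a page records both its owner (mapping) and its position (index), the byte offset of the page within the file can be recovered with a single shift; this is essentially what page_offset() in pagemap.h does (the helper name below is ours).

#include <linux/pagemap.h>

static loff_t page_byte_offset(struct page *page)
{
        return (loff_t)page->index << PAGE_CACHE_SHIFT; /* index * page size */
}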
頁高速緩存的基本操做是增刪查更,在此基礎上能夠封裝更高級的API
static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
void delete_from_page_cache(struct page *page)
struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages)
/* blocks if the page lock cannot be acquired */
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
/* a trylock variant returns immediately when the lock is unavailable */
/* blocks for the lock, but creates a new page if none exists */
struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask)
struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data)
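As a usage example, the sketch below reads one page of a file through the cache with read_cache_page(), using the filesystem's own ->readpage() as the filler; this mirrors what the kernel's read_mapping_page() wrapper does. The two helper names are ours.

#include <linux/pagemap.h>

static int readpage_filler(void *data, struct page *page)
{
        struct address_space *mapping = data;

        return mapping->a_ops->readpage(NULL, page);
}

static struct page *get_file_page(struct inode *inode, pgoff_t index)
{
        struct address_space *mapping = inode->i_mapping;

        /* returns a page with up-to-date data, or an ERR_PTR on failure */
        return read_cache_page(mapping, index, readpage_filler, mapping);
}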
A block is the smallest logical unit of operation in the VFS or a filesystem; one page-cache page can be made up of several blocks, and their relationship is as follows:
Each block buffer is described by a corresponding buffer_head object, a descriptor containing everything the kernel must know about how to handle the block.
62 struct buffer_head {
63         unsigned long b_state;          /* buffer state bitmap (see above) */
64         struct buffer_head *b_this_page;/* circular list of page's buffers */
65         struct page *b_page;            /* the page this bh is mapped to */
66
67         sector_t b_blocknr;             /* start block number */
68         size_t b_size;                  /* size of mapping */
69         char *b_data;                   /* pointer to data within the page */
70
71         struct block_device *b_bdev;
72         bh_end_io_t *b_end_io;          /* I/O completion */
73         void *b_private;                /* reserved for b_end_io */
74         struct list_head b_assoc_buffers; /* associated with another mapping */
75         struct address_space *b_assoc_map;      /* mapping this buffer is
76                                                    associated with */
77         atomic_t b_count;               /* users using this buffer_head */
78 };
struct buffer_head
--63--> buffer state bitmap
--64--> pointer to the next element in the circular list of the page's buffers (a traversal sketch follows this list)
--65--> pointer to the descriptor of the buffer page that holds this block
--67--> starting logical block number of the block on the device (the usage counter is b_count at line 77)
--68--> block size
--69--> position of the block buffer within the buffer page; in fact, its encoding depends on whether the page is in high memory: if so, b_data holds the offset of the block buffer relative to the start of the page, otherwise it holds the buffer's linear address
--71--> pointer to the block device this buffer belongs to
--72--> I/O completion method
--73--> pointer to data reserved for the I/O completion method
--74--> pointers for the list of indirect blocks associated with an inode
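The b_this_page field links all buffers of one page into a circular list; the sketch below (function name ours) walks that list, something real code only does with the page locked or the address_space's private_lock held.

#include <linux/buffer_head.h>

static void for_each_buffer(struct page *page)
{
        struct buffer_head *head, *bh;

        if (!page_has_buffers(page))    /* the page may have no buffers attached */
                return;

        bh = head = page_buffers(page); /* first buffer of the page */
        do {
                pr_info("block %llu, size %zu, state %#lx\n",
                        (unsigned long long)bh->b_blocknr,
                        bh->b_size, bh->b_state);
                bh = bh->b_this_page;   /* circular: stop when back at head */
        } while (bh != head);
}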
The possible bits of b_state are listed below:
//include/linux/buffer_head.h
19 enum bh_state_bits {
20         BH_Uptodate,    /* Contains valid data */
21         BH_Dirty,       /* Is dirty */
22         BH_Lock,        /* Is locked */
23         BH_Req,         /* Has been submitted for I/O */
24         BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
25                           * IO completion of other buffers in the page
26                           */
27
28         BH_Mapped,      /* Has a disk mapping */
29         BH_New,         /* Disk mapping was newly created by get_block */
30         BH_Async_Read,  /* Is under end_buffer_async_read I/O */
31         BH_Async_Write, /* Is under end_buffer_async_write I/O */
32         BH_Delay,       /* Buffer is not yet allocated on disk */
33         BH_Boundary,    /* Block is followed by a discontiguity */
34         BH_Write_EIO,   /* I/O error on write */
35         BH_Unwritten,   /* Buffer is allocated on disk but not written */
36         BH_Quiet,       /* Buffer Error Prinks to be quiet */
37         BH_Meta,        /* Buffer contains metadata */
38         BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
39         BH_Defer_Completion, /* Defer AIO completion to workqueue */
40
41         BH_PrivateStart,/* not a state bit, but the first bit available
42                          * for private allocation by other entities
43                          */
44 };
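Each of these bits comes with test/set/clear helpers generated by the BUFFER_FNS() macro in buffer_head.h, which is how kernel code actually manipulates b_state; a tiny illustration (helper name ours):

#include <linux/buffer_head.h>

static void dirty_if_valid(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))       /* tests BH_Uptodate */
                return;
        set_buffer_dirty(bh);           /* sets BH_Dirty */
        /* mark_buffer_dirty(bh) would additionally dirty the page and
         * tag it in the owner's radix tree */
}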
Adding the page that holds block-device buffers to the page cache:
static int grow_buffers(struct block_device *bdev, sector_t block, int size)
Removing the page that holds block-device buffers from the page cache:
int try_to_free_buffers(struct page *page)
int try_to_release_page(struct page *page, gfp_t gfp_mask)
Searching for a block in the page cache
When the kernel needs to read or write a single physical device block (for example a superblock), it must first check whether the buffer for the requested block is already in the page cache. Searching the page cache for a given block buffer (identified by the address bdev of its block-device descriptor and its logical block number nr) can be broken into three steps, corresponding to the three functions below:
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
__getblk(struct block_device *bdev, sector_t block, unsigned size)
__bread(struct block_device *bdev, sector_t block, unsigned size)
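__find_get_block() only searches the cache, __getblk() additionally creates the buffer if it is missing, and __bread() further reads the block from disk when the buffer is not up to date. A typical use, sketched below with our own function name, is how a filesystem reads its superblock (cf. the sb_bread() wrapper):

#include <linux/buffer_head.h>

static struct buffer_head *read_one_block(struct block_device *bdev,
                                          sector_t nr, unsigned size)
{
        struct buffer_head *bh = __bread(bdev, nr, size);

        if (!bh)
                return NULL;            /* I/O error */
        /* bh->b_data now holds the `size` bytes of block `nr`;
         * the caller must brelse(bh) when done */
        return bh;
}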
Block buffers are eventually submitted to the generic block layer for I/O; the related APIs are as follows:
int submit_bh(int rw, struct buffer_head *bh)
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
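As a usage illustration, here is a condensed sketch of what 3.14's sync_dirty_buffer() (fs/buffer.c) does: write one dirty buffer through the generic block layer and wait for completion. Details such as WRITE_SYNC are simplified.

#include <linux/buffer_head.h>

static int write_buffer_sync(struct buffer_head *bh)
{
        int ret = 0;

        lock_buffer(bh);
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);                           /* ref dropped at completion */
                bh->b_end_io = end_buffer_write_sync; /* unlocks bh, drops ref */
                ret = submit_bh(WRITE, bh);           /* hand off to block layer */
                wait_on_buffer(bh);                   /* sleep until I/O finishes */
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
        } else {
                unlock_buffer(bh);                    /* nothing to write */
        }
        return ret;
}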
In the dirty-page writeback mechanism, the kernel first uses a backing_dev_info object to describe a bdi device (backing device info, i.e. a non-volatile storage device); all backing_dev_info objects are linked onto the bdi_list, as can be seen from the registration function bdi_register(). It is the bdi's slowness that motivates the page-caching mechanism discussed in this article, and with it the dirty-page writeback problem: when a cached page is modified, the kernel sets PG_dirty in the corresponding page object, marking it a "dirty page"; dirty pages must be written back to disk at a suitable time. For writeback, kernels before 2.6.2x/3x dynamically created and destroyed pdflush threads; since 2.6.2x/3x the kernel has reworked this area and performs writeback through the writeback mechanism. The core structures and methods of the writeback mechanism are related as follows.
As can be seen, one logical disk maps to one gendisk object, one request_queue object, and one backing_dev_info object; this backing_dev_info object is the core structure of dirty-page writeback.
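In 3.14 this chain is visible in the code: the backing_dev_info is embedded in the request queue, which blk_alloc_queue_node() initializes with bdi_init(). A sketch (helper name ours):

#include <linux/genhd.h>
#include <linux/blkdev.h>

static struct backing_dev_info *disk_to_bdi(struct gendisk *disk)
{
        /* gendisk -> request_queue -> embedded backing_dev_info */
        return &disk->queue->backing_dev_info;
}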
 64 struct backing_dev_info {
 65         struct list_head bdi_list;
 97         struct bdi_writeback wb;  /* default writeback info for this bdi */
100         struct list_head work_list;
102         struct device *dev;
110 };
struct backing_dev_info
--65--> list node linking all backing_dev_info objects onto the global bdi_list
--97--> the embedded bdi_writeback object, which uses delayed work to write back dirty pages
--100--> list of pending wb_writeback_work items waiting to be processed for this bdi device
--102--> the device object: the bdi is registered as a device
37 /*
38  * Passed into wb_writeback(), essentially a subset of writeback_control
39  */
40 struct wb_writeback_work {
41         long nr_pages;
42         struct super_block *sb;
43         unsigned long *older_than_this;
44         enum writeback_sync_modes sync_mode;
45         unsigned int tagged_writepages:1;
46         unsigned int for_kupdate:1;
47         unsigned int range_cyclic:1;
48         unsigned int for_background:1;
49         unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
50         enum wb_reason reason;          /* why was writeback initiated? */
51
52         struct list_head list;          /* pending work list */
53         struct completion *done;        /* set if the caller waits */
54 };
51 struct bdi_writeback {
52         struct backing_dev_info *bdi;   /* our parent bdi */
53         unsigned int nr;
54
55         unsigned long last_old_flush;   /* last old data flush */
56
57         struct delayed_work dwork;      /* work item used for writeback */
58         struct list_head b_dirty;       /* dirty inodes */
59         struct list_head b_io;          /* parked for writeback */
60         struct list_head b_more_io;     /* parked for more writeback */
61         spinlock_t list_lock;           /* protects the b_* lists */
62 };
struct bdi_writeback
--57--> the delayed-work object; when it fires, it ultimately calls the function below to process dirty pages
778 /*
779  * Explicit flushing or periodic writeback of "old" data.
780  *
781  * Define "old": the first time one of an inode's pages is dirtied, we mark the
782  * dirtying-time in the inode's address_space.  So this periodic writeback code
783  * just walks the superblock inode list, writing back any inodes which are
784  * older than a specific point in time.
785  *
786  * Try to run once per dirty_writeback_interval.  But if a writeback event
787  * takes longer than a dirty_writeback_interval interval, then leave a
788  * one-second gap.
789  *
790  * older_than_this takes precedence over nr_to_write.  So we'll only write back
791  * all dirty pages if they are all attached to "old" mappings.
792  */
793 static long wb_writeback(struct bdi_writeback *wb,
794                          struct wb_writeback_work *work)
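To close the loop, here is a condensed sketch of how the delayed work gets armed, modeled on bdi_wakeup_thread_delayed() in mm/backing-dev.c (the wb_lock/BDI_registered handling is omitted, and the function name is ours). When dwork later fires, bdi_writeback_workfn() runs wb_do_writeback(), which dequeues wb_writeback_work items and finally calls the wb_writeback() shown above.

#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/jiffies.h>

static void kick_writeback(struct backing_dev_info *bdi)
{
        /* dirty_writeback_interval is in centiseconds (default 500 = 5 s) */
        unsigned long timeout = msecs_to_jiffies(dirty_writeback_interval * 10);

        /* bdi_wq is the kernel's global writeback workqueue */
        queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
}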