磁盤性能統計

iostat統計磁盤信息的時候,使用的是/proc/diskstats。而/proc/diskstats是誰在寫入呢?

主要數據結構:

//genhd.h
/*
 * I/O counters kept per disk/partition. The two-element arrays are indexed
 * by direction (READ / WRITE), as diskstats_show does with ios[READ] and
 * ios[WRITE]. ticks, io_ticks and time_in_queue are passed through
 * jiffies_to_msecs() before being printed by diskstats_show.
 */
struct disk_stats {
    unsigned long sectors[2];    /* READs and WRITEs */
    unsigned long ios[2];
    unsigned long merges[2];
    unsigned long ticks[2]; // jiffies delta
    /*
     * Original note: time from entering the queue until the I/O completes.
     * NOTE(review): the field-10 description in this document says this
     * value grows whenever I/O is in progress -- confirm which is meant.
     */
    unsigned long io_ticks;
    unsigned long time_in_queue;
};

 

proc初始化:

//block/genhd.c
/*
 * Init hook (marked __init) that creates the "diskstats" and "partitions"
 * proc entries and binds each one to its file_operations table.
 * The return values of proc_create() are not checked; always returns 0.
 */
static int __init proc_genhd_init(void)
{
    proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
    proc_create("partitions", 0, NULL, &proc_partitions_operations);
    return 0;
}

 

/*
 * File operations registered for the "diskstats" proc entry.
 * Only .open is specific to diskstats; read, llseek and release are the
 * generic seq_file helpers.
 */
static const struct file_operations proc_diskstats_operations = {
    .open        = diskstats_open,
    .read        = seq_read,
    .llseek        = seq_lseek,
    .release    = seq_release,
};
/*
 * Open handler for the diskstats entry: attaches the diskstats_op
 * seq_operations table to the file via seq_open() and returns its result.
 * The inode argument is not used here.
 */
static int diskstats_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &diskstats_op);
}
/*
 * seq_file callbacks used when the diskstats entry is read.
 * start/next/stop are the disk_seqf_* iteration helpers; .show is
 * diskstats_show, which formats the output lines.
 */
static const struct seq_operations diskstats_op = {
    .start    = disk_seqf_start,
    .next    = disk_seqf_next,
    .stop    = disk_seqf_stop,
    .show    = diskstats_show
};

能夠看到,diskstats_show這個函數纔是關鍵:

/*
 * seq_file .show callback: prints one line per partition of the disk gp.
 * Each line holds major, minor and device name followed by 11 statistics
 * columns.
 * This is an excerpt: the "......" below marks code omitted from the
 * listing, and no return statement is shown.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
    ......
     /* NOTE(review): flag name suggests partition 0 is visited even when empty -- confirm */
     disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
    while ((hd = disk_part_iter_next(&piter))) {
        /*
         * part_round_stats() is called between part_stat_lock() and
         * part_stat_unlock(), before the counters are read below;
         * presumably it brings the time-based counters up to date.
         */
        cpu = part_stat_lock();
        part_round_stats(cpu, hd);
        part_stat_unlock();
        /*
         * Column order: read ios, merges, sectors, ticks; then the same
         * four for writes; then in-flight count, io_ticks, time_in_queue.
         * Every tick value is converted from jiffies to milliseconds.
         */
        seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
               "%u %lu %lu %lu %u %u %u %u\n",
               MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
               disk_name(gp, hd->partno, buf),
               part_stat_read(hd, ios[READ]),
               part_stat_read(hd, merges[READ]),
               part_stat_read(hd, sectors[READ]),
               jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
               part_stat_read(hd, ios[WRITE]),
               part_stat_read(hd, merges[WRITE]),
               part_stat_read(hd, sectors[WRITE]),
               jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
               part_in_flight(hd),
               jiffies_to_msecs(part_stat_read(hd, io_ticks)),
               jiffies_to_msecs(part_stat_read(hd, time_in_queue))
            );
    }
    disk_part_iter_exit(&piter);
}

 

/proc/diskstats各列具體的含義參考下面:

$cat /proc/diskstats

22 0 hdc 159807 57894 6328277 1476593 179991 467858 5184662 2664218 0 886604 4140851

$cat /sys/block/hdc/stat

159807 57894 6328277 1476593 179989 467844 5184534 2664218 0 886604 4140851


/proc/diskstats文件比/sys/block/hdc/stat文件多3個域,從左至右分別對應主設備號,次設備號和設備名稱。後續的11個域在這兩個文件裏是相同的,它們的含義將在下面解釋。除了第9個域,全部的域都是從啓動時的累積值。

第1個域:讀磁盤的次數,成功完成讀的總次數。

第2個域:合併讀次數(第6個域對應合併寫次數)。爲了效率,相鄰的讀或寫請求可能會被合併。例如兩次4K的讀在最終被提交到磁盤以前可能會合併成一次8K的讀,而後才被計數(和排隊),所以只算一次I/O操做。這個域使你知道這樣的合併有多頻繁。

第3個域:讀扇區數,成功讀取的扇區總數。

第4個域:讀花費的毫秒數,這是全部讀操做所花費的毫秒數(用__make_request()到end_that_request_last()測量)。

第5個域:寫完成的次數,成功寫完成的總次數。

第6個域:合併寫次數,含義與第2個域相同,只是統計的是寫請求的合併。

第7個域:寫扇區數,成功寫入的扇區總數。

第8個域:寫花費的毫秒數,這是全部寫操做所花費的毫秒數(用__make_request()到end_that_request_last()測量)。

第9個域:當前正在進行的I/O請求數,只有這個域會回到0。當請求被交給相應的request_queue_t時增加,請求完成時減少。

第10個域:花在I/O操做上的毫秒數,只要第9個域不爲0,這個域就會增加。

第11個域:花在I/O操做上的加權毫秒數,在每次I/O開始,I/O結束,I/O合併時這個域都會增長。這能夠爲I/O完成時間以及可能正在累積的積壓請求提供一個便利的測量標準。

 

而驅動層須要怎麼提供這些數據呢?driver須要調用類似這樣的一組函數:

part_stat_inc、part_stat_add、__part_stat_add(其中part_stat_add是調用 __part_stat_add,只不過它同時操做partition)

 

iostat是怎麼根據/proc/diskstats來獲得各項數據呢?

 1 //iostat.c function read_diskstats_stat
 2 if ((fp = fopen(DISKSTATS, "r")) == NULL)
 3       return;
 4 
 5    while (fgets(line, 256, fp) != NULL) {
 6 
 7       /* major minor name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq */
 8       i = sscanf(line, "%u %u %s %lu %lu %llu %lu %lu %lu %llu %lu %lu %lu %lu",
 9          &major, &minor, dev_name,
10          &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec,
11          &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks);
12 
13       if (i == 14) {
14      /* Device */
15      sdev.rd_ios     = rd_ios;
16      sdev.rd_merges  = rd_merges_or_rd_sec;
17      sdev.rd_sectors = rd_sec_or_wr_ios;
18      sdev.rd_ticks   = rd_ticks_or_wr_sec;
19      sdev.wr_ios     = wr_ios;
20      sdev.wr_merges  = wr_merges;
21      sdev.wr_sectors = wr_sec;
22      sdev.wr_ticks   = wr_ticks;
23      sdev.ios_pgr    = ios_pgr;
24      sdev.tot_ticks  = tot_ticks;
25      sdev.rq_ticks   = rq_ticks;
26       }
27       else if (i == 7) {
28      /* Partition */
29      if (DISPLAY_EXTENDED(flags) || (!dlist_idx && !DISPLAY_PARTITIONS(flags)))
30         continue;
31 
32      sdev.rd_ios     = rd_ios;
33      sdev.rd_sectors = rd_merges_or_rd_sec;
34      sdev.wr_ios     = rd_sec_or_wr_ios;
35      sdev.wr_sectors = rd_ticks_or_wr_sec;
36       }
37       else
38      /* Unknown entry: Ignore it */
39      continue;
 /*
  * write_ext_stat: print one extended-statistics row for the device named
  * shi->name. Every rate is derived from the difference between two
  * samples, ioi and ioj (ioj's counters are subtracted from ioi's), over
  * the interval itv. fctr divides the two sector-rate columns.
  * Parameters curr and flags are not referenced in this body.
  * The number at the start of each line below is the listing's own line
  * number, not part of the C source.
  */
 1 void write_ext_stat(int curr, unsigned long long itv, int flags, int fctr,
 2             struct io_hdr_stats *shi, struct io_stats *ioi,
 3             struct io_stats *ioj)
 4 {
 5    unsigned long long rd_sec, wr_sec;
 6    double tput, util, await, svctm, arqsz, nr_ios;
 7     
 8    /*
 9     * Counters overflows are possible, but don't need to be handled in
10     * a special way: the difference is still properly calculated if the
11     * result is of the same type as the two values.
12     * Exception is field rq_ticks which is incremented by the number of
13     * I/O in progress times the number of milliseconds spent doing I/O.
14     * But the number of I/O in progress (field ios_pgr) happens to be
15     * sometimes negative...
16     */
17    nr_ios = (ioi->rd_ios - ioj->rd_ios) + (ioi->wr_ios - ioj->wr_ios); /* reads + writes between the two samples */
18    tput = ((double) nr_ios) * HZ / itv; /* I/Os per second, assuming itv is in 1/HZ units -- TODO confirm */
19    util = S_VALUE(ioj->tot_ticks, ioi->tot_ticks, itv);
20    svctm = tput ? util / tput : 0.0; /* 0.0 when there was no I/O, avoiding a division by zero */
21    /*
22     * Kernel gives ticks already in milliseconds for all platforms
23     * => no need for further scaling.
24     */
25    await = nr_ios ?
26       ((ioi->rd_ticks - ioj->rd_ticks) + (ioi->wr_ticks - ioj->wr_ticks)) /
27       nr_ios : 0.0; /* average read+write ticks per I/O */
28 
29    rd_sec = ioi->rd_sectors - ioj->rd_sectors;
      /* Counter went backwards while the old value fit in 32 bits: keep only the low 32 bits of the difference */
30    if ((ioi->rd_sectors < ioj->rd_sectors) && (ioj->rd_sectors <= 0xffffffff))
31       rd_sec &= 0xffffffff;
32    wr_sec = ioi->wr_sectors - ioj->wr_sectors;
      /* Same 32-bit masking for the write-sector difference */
33    if ((ioi->wr_sectors < ioj->wr_sectors) && (ioj->wr_sectors <= 0xffffffff))
34       wr_sec &= 0xffffffff;
35 
36    arqsz = nr_ios ? (rd_sec + wr_sec) / nr_ios : 0.0; /* average sectors per I/O */
37 
38    /*      DEV   rrq/s wrq/s   r/s   w/s  rsec  wsec  rqsz  qusz await svctm %util */
39    printf("%-13s %8.2f %8.2f %7.2f %7.2f %8.2f %8.2f %8.2f %8.2f %7.2f %6.2f %6.2f\n",
40       shi->name,
41       S_VALUE(ioj->rd_merges, ioi->rd_merges, itv),
42       S_VALUE(ioj->wr_merges, ioi->wr_merges, itv),
43       S_VALUE(ioj->rd_ios, ioi->rd_ios, itv),
44       S_VALUE(ioj->wr_ios, ioi->wr_ios, itv),
45       ll_s_value(ioj->rd_sectors, ioi->rd_sectors, itv) / fctr,
46       ll_s_value(ioj->wr_sectors, ioi->wr_sectors, itv) / fctr,
47       arqsz,
48       S_VALUE(ioj->rq_ticks, ioi->rq_ticks, itv) / 1000.0,
49       await,
50       /* The ticks output is biased to output 1000 ticks per second */
51       svctm,
52       /* Again: Ticks in milliseconds */
53       util / 10.0);
54 }
相關文章
相關標籤/搜索