When iostat reports disk statistics, it reads /proc/diskstats. So who fills in /proc/diskstats?

The main data structure:
// genhd.h
struct disk_stats {
    unsigned long sectors[2];   /* READs and WRITEs */
    unsigned long ios[2];
    unsigned long merges[2];
    unsigned long ticks[2];     /* delta in jiffies */
    unsigned long io_ticks;     /* time from entering the queue to I/O completion */
    unsigned long time_in_queue;
};
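These counters are kept per CPU (on SMP builds) and per partition; whoever reads them sums across all CPUs. The accessors in genhd.h of this kernel generation look roughly like the abridged sketch below (SMP variants only, treat it as illustrative rather than an exact copy):

/* genhd.h, SMP case, abridged sketch */
#define __part_stat_add(cpu, part, field, addnd)                        \
    (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))

/* reading sums the per-cpu copies of one field */
#define part_stat_read(part, field)                                     \
({                                                                      \
    typeof((part)->dkstats->field) res = 0;                             \
    unsigned int _cpu;                                                  \
    for_each_possible_cpu(_cpu)                                         \
        res += per_cpu_ptr((part)->dkstats, _cpu)->field;               \
    res;                                                                \
})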
proc initialization:
// block/genhd.c
static int __init proc_genhd_init(void)
{
    proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
    proc_create("partitions", 0, NULL, &proc_partitions_operations);
    return 0;
}

static const struct file_operations proc_diskstats_operations = {
    .open    = diskstats_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = seq_release,
};

static int diskstats_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &diskstats_op);
}

static const struct seq_operations diskstats_op = {
    .start = disk_seqf_start,
    .next  = disk_seqf_next,
    .stop  = disk_seqf_stop,
    .show  = diskstats_show
};
As you can see, diskstats_show() is the function that really matters:
static int diskstats_show(struct seq_file *seqf, void *v)
{
    ......
    disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
    while ((hd = disk_part_iter_next(&piter))) {
        cpu = part_stat_lock();
        part_round_stats(cpu, hd);
        part_stat_unlock();
        seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
                   "%u %lu %lu %lu %u %u %u %u\n",
                   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
                   disk_name(gp, hd->partno, buf),
                   part_stat_read(hd, ios[READ]),
                   part_stat_read(hd, merges[READ]),
                   part_stat_read(hd, sectors[READ]),
                   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
                   part_stat_read(hd, ios[WRITE]),
                   part_stat_read(hd, merges[WRITE]),
                   part_stat_read(hd, sectors[WRITE]),
                   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
                   part_in_flight(hd),
                   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
                   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
            );
    }
    disk_part_iter_exit(&piter);
}
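Note the part_round_stats(cpu, hd) call before each line is printed: io_ticks and time_in_queue are only brought up to date lazily, when statistics are read or when an I/O starts or completes. In kernels of this era the update lives in block/blk-core.c and looks roughly like the following sketch (illustrative, not a verbatim copy):

static void part_round_stats_single(int cpu, struct hd_struct *part,
                                    unsigned long now)
{
    if (now == part->stamp)
        return;

    if (part_in_flight(part)) {
        /* weighted time: in-flight I/Os times the jiffies elapsed since the last update */
        __part_stat_add(cpu, part, time_in_queue,
                        part_in_flight(part) * (now - part->stamp));
        /* wall-clock time during which at least one I/O was in flight */
        __part_stat_add(cpu, part, io_ticks, now - part->stamp);
    }
    part->stamp = now;
}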
The concrete meaning of each /proc/diskstats column is described below:
$cat /proc/diskstats
22 0 hdc 159807 57894 6328277 1476593 179991 467858 5184662 2664218 0 886604 4140851
$cat /sys/block/hdc/stat
159807 57894 6328277 1476593 179989 467844 5184534 2664218 0 886604 4140851
/proc/diskstats has three more fields than /sys/block/hdc/stat; from left to right they are the major device number, the minor device number, and the device name. The remaining 11 fields are identical in the two files, and their meanings are explained below. Except for field 9, all of them are cumulative counters since boot.
Field 1: reads completed, the total number of reads completed successfully.
Field 2: reads merged; field 6 is writes merged. Reads and writes which are adjacent to each other may be merged for efficiency, so two 4K reads may become a single 8K read before it is finally handed to the disk, and it is counted (and queued) as a single I/O. This field tells you how often that happens.
Field 3: sectors read, the total number of sectors read successfully.
Field 4: milliseconds spent reading, the total number of milliseconds spent by all reads (measured from __make_request() to end_that_request_last()).
Field 5: writes completed, the total number of writes completed successfully.
Field 7: sectors written, the total number of sectors written successfully.
Field 8: milliseconds spent writing, the total number of milliseconds spent by all writes (measured from __make_request() to end_that_request_last()).
Field 9: I/Os currently in progress. This is the only field that should go back to zero; it is incremented when a request is handed to the appropriate request_queue_t and decremented when the request completes.
Field 10: milliseconds spent doing I/O. This field keeps growing as long as field 9 is nonzero.
Field 11: weighted milliseconds spent doing I/O. At each I/O start, I/O completion and I/O merge this field is incremented by the number of I/Os in progress times the milliseconds spent doing I/O since the last update. It provides an easy measure of both I/O completion time and the backlog that may be accumulating.
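To make the last two fields concrete, here is a small worked example with made-up numbers: if over a 1000 ms sampling interval field 10 grows by 600, the device was busy 60% of the time, which is what iostat reports as %util; if field 11 grows by 2400 over the same interval, the average number of requests outstanding (iostat's avgqu-sz) was 2400 / 1000 = 2.4.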
And how does the driver layer supply this data? A driver calls a group of functions along these lines:
part_stat_inc, part_stat_add and __part_stat_add (part_stat_add simply calls __part_stat_add, except that when called on a partition it also accounts the same value to the whole-disk entry, part0, at the same time).
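In practice most drivers do not call these macros directly; the block layer's accounting helpers in block/blk-core.c do it for them. A rough sketch of the completion-side accounting in kernels of this era (modelled on blk_account_io_done(); details such as where the hd_struct comes from vary by version, so treat this as illustrative):

static void blk_account_io_done(struct request *req)
{
    if (blk_do_io_stat(req)) {
        unsigned long duration = jiffies - req->start_time;
        const int rw = rq_data_dir(req);        /* READ or WRITE */
        struct hd_struct *part = req->part;     /* partition the request targeted */
        int cpu;

        cpu = part_stat_lock();
        part_stat_inc(cpu, part, ios[rw]);              /* fields 1 / 5 */
        part_stat_add(cpu, part, ticks[rw], duration);  /* fields 4 / 8 */
        part_round_stats(cpu, part);                    /* fields 10 / 11 */
        part_dec_in_flight(part, rw);                   /* field 9 */
        part_stat_unlock();
    }
}

The sector counters (fields 3/7) and merge counters (fields 2/6) are bumped through the same macros elsewhere in the submit, merge and completion paths.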
How does iostat turn /proc/diskstats into the numbers it reports?
// iostat.c, function read_diskstats_stat()
if ((fp = fopen(DISKSTATS, "r")) == NULL)
    return;

while (fgets(line, 256, fp) != NULL) {

    /* major minor name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq */
    i = sscanf(line, "%u %u %s %lu %lu %llu %lu %lu %lu %llu %lu %lu %lu %lu",
               &major, &minor, dev_name,
               &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec,
               &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks);

    if (i == 14) {
        /* Device */
        sdev.rd_ios     = rd_ios;
        sdev.rd_merges  = rd_merges_or_rd_sec;
        sdev.rd_sectors = rd_sec_or_wr_ios;
        sdev.rd_ticks   = rd_ticks_or_wr_sec;
        sdev.wr_ios     = wr_ios;
        sdev.wr_merges  = wr_merges;
        sdev.wr_sectors = wr_sec;
        sdev.wr_ticks   = wr_ticks;
        sdev.ios_pgr    = ios_pgr;
        sdev.tot_ticks  = tot_ticks;
        sdev.rq_ticks   = rq_ticks;
    }
    else if (i == 7) {
        /* Partition */
        if (DISPLAY_EXTENDED(flags) || (!dlist_idx && !DISPLAY_PARTITIONS(flags)))
            continue;

        sdev.rd_ios     = rd_ios;
        sdev.rd_sectors = rd_merges_or_rd_sec;
        sdev.wr_ios     = rd_sec_or_wr_ios;
        sdev.wr_sectors = rd_ticks_or_wr_sec;
    }
    else
        /* Unknown entry: Ignore it */
        continue;
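The two sscanf result counts correspond to the two line formats /proc/diskstats can contain: whole disks always carry the full 11 statistics fields (14 tokens including major, minor and name), while on older 2.6 kernels partition lines carry only four statistics fields (reads, sectors read, writes, sectors written), hence the i == 7 branch and the deliberately overloaded variable names such as rd_merges_or_rd_sec.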
void write_ext_stat(int curr, unsigned long long itv, int flags, int fctr,
                    struct io_hdr_stats *shi, struct io_stats *ioi,
                    struct io_stats *ioj)
{
    unsigned long long rd_sec, wr_sec;
    double tput, util, await, svctm, arqsz, nr_ios;

    /*
     * Counters overflows are possible, but don't need to be handled in
     * a special way: the difference is still properly calculated if the
     * result is of the same type as the two values.
     * Exception is field rq_ticks which is incremented by the number of
     * I/O in progress times the number of milliseconds spent doing I/O.
     * But the number of I/O in progress (field ios_pgr) happens to be
     * sometimes negative...
     */
    nr_ios = (ioi->rd_ios - ioj->rd_ios) + (ioi->wr_ios - ioj->wr_ios);
    tput   = ((double) nr_ios) * HZ / itv;
    util   = S_VALUE(ioj->tot_ticks, ioi->tot_ticks, itv);
    svctm  = tput ? util / tput : 0.0;
    /*
     * Kernel gives ticks already in milliseconds for all platforms
     * => no need for further scaling.
     */
    await = nr_ios ?
        ((ioi->rd_ticks - ioj->rd_ticks) + (ioi->wr_ticks - ioj->wr_ticks)) /
        nr_ios : 0.0;

    rd_sec = ioi->rd_sectors - ioj->rd_sectors;
    if ((ioi->rd_sectors < ioj->rd_sectors) && (ioj->rd_sectors <= 0xffffffff))
        rd_sec &= 0xffffffff;
    wr_sec = ioi->wr_sectors - ioj->wr_sectors;
    if ((ioi->wr_sectors < ioj->wr_sectors) && (ioj->wr_sectors <= 0xffffffff))
        wr_sec &= 0xffffffff;

    arqsz = nr_ios ? (rd_sec + wr_sec) / nr_ios : 0.0;

    /*   DEV  rrq/s wrq/s  r/s  w/s  rsec  wsec  rqsz  qusz  await svctm %util */
    printf("%-13s %8.2f %8.2f %7.2f %7.2f %8.2f %8.2f %8.2f %8.2f %7.2f %6.2f %6.2f\n",
           shi->name,
           S_VALUE(ioj->rd_merges, ioi->rd_merges, itv),
           S_VALUE(ioj->wr_merges, ioi->wr_merges, itv),
           S_VALUE(ioj->rd_ios, ioi->rd_ios, itv),
           S_VALUE(ioj->wr_ios, ioi->wr_ios, itv),
           ll_s_value(ioj->rd_sectors, ioi->rd_sectors, itv) / fctr,
           ll_s_value(ioj->wr_sectors, ioi->wr_sectors, itv) / fctr,
           arqsz,
           S_VALUE(ioj->rq_ticks, ioi->rq_ticks, itv) / 1000.0,
           await,
           /* The ticks output is biased to output 1000 ticks per second */
           svctm,
           /* Again: Ticks in milliseconds */
           util / 10.0);
}
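A few notes on the arithmetic above. S_VALUE(old, new, itv) is sysstat's rate macro, roughly ((new) - (old)) / itv * HZ, i.e. the per-second change between the two samples (the exact definition lives in sysstat's common headers; this is a summary, not a verbatim quote). util therefore comes out as milliseconds of busy time per second of wall time, so util / 10.0 is the %util percentage; S_VALUE(rq_ticks) / 1000.0 is the average queue length (avgqu-sz); await is the mean per-request latency derived from fields 4 and 8; and svctm is simply util / tput, a derived quantity rather than something the kernel measures directly.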