What is iowait?
As the name suggests, iowait is processes waiting because of I/O. Put a bit more precisely: the system is doing I/O, no process is getting any work done, and the CPU is spinning in the idle task. So iowait arises only when two conditions hold: a process is waiting on I/O, and while it waits there is no runnable process.
How is iowait calculated?
First, how do users see iowait?
We normally see it with vmstat; the wa column (marked in red in the original screenshot) is iowait.
vmstat derives this number from /proc/stat, so what you see is just a surface figure that roughly reflects the system's iowait level. As for what each field in /proc/stat means, google it; I won't repeat it here.
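To make the relationship concrete, here is a minimal sketch (not vmstat's actual source) of how the wa percentage can be derived by sampling the aggregate cpu line of /proc/stat twice, one second apart:

/* sketch.c -- illustration only, not vmstat itself:
 * sample the aggregate "cpu" line of /proc/stat twice and compute the
 * iowait share of the interval, i.e. what vmstat reports as "wa".
 * Field order: user nice system idle iowait irq softirq steal */
#include <stdio.h>
#include <unistd.h>

static void sample(unsigned long long v[8])
{
        FILE *f = fopen("/proc/stat", "r");
        fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7]);
        fclose(f);
}

int main(void)
{
        unsigned long long a[8], b[8], total = 0;
        int i;

        sample(a);
        sleep(1);
        sample(b);
        for (i = 0; i < 8; i++)
                total += b[i] - a[i];
        printf("wa: %.1f%%\n", 100.0 * (b[4] - a[4]) / total);   /* index 4 = iowait */
        return 0;
}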
So where do the numbers in /proc/stat come from?
In the kernel, proc_misc.c is the file that emits them; the corresponding function is show_stat.
Part of the code:
for_each_possible_cpu(i) {
        int j;
        user = cputime64_add(user, kstat_cpu(i).cpustat.user);
        nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
        system = cputime64_add(system, kstat_cpu(i).cpustat.system);
        idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
        iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
        irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
        softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
        steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
        for (j = 0; j < NR_IRQS; j++)
                sum += kstat_cpu(i).irqs[j];
}
….
seq_printf(p,
        "\nctxt %llu\n"
        "btime %lu\n"
        "processes %lu\n"
        "procs_running %lu\n"
        "procs_blocked %lu\n",
        nr_context_switches(),
        (unsigned long)jif,
        total_forks,
        nr_running(),
        nr_iowait());
…
This code produces the numbers you see in /proc/stat, and from it we learn that iowait comes from:
iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
So who updates cpustat.iowait?
We find the function account_system_time:
void account_system_time(struct task_struct *p, int hardirq_offset,
                         cputime_t cputime)
{
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        struct rq *rq = this_rq();      /* the current run queue (per-CPU on SMP) */
        cputime64_t tmp;

        p->stime = cputime_add(p->stime, cputime);

        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)           /* handling a hard interrupt */
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())                       /* handling a soft interrupt */
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
        else if (p != rq->idle)                         /* a normal task is running, not idle */
                cpustat->system = cputime64_add(cpustat->system, tmp);
        else if (atomic_read(&rq->nr_iowait) > 0)       /* no interrupt, CPU idle, tasks waiting on I/O: iowait */
                cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
                cpustat->idle = cputime64_add(cpustat->idle, tmp);

        /* Account for system time used */
        acct_update_integrals(p);
}
From this we can see that when a CPU accrues iowait, there must be processes on that CPU doing I/O and waiting for it to complete (rq->nr_iowait > 0), while nothing else is runnable on that CPU (p == rq->idle): the CPU is idle.
Who produces iowait?
So who modifies rq->nr_iowait?
Now we finally get to the point.
void __sched io_schedule(void)
{
        struct rq *rq = &__raw_get_cpu_var(runqueues);

        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
        schedule();
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
}

long __sched io_schedule_timeout(long timeout)
{
        struct rq *rq = &__raw_get_cpu_var(runqueues);
        long ret;

        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
        ret = schedule_timeout(timeout);
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
        return ret;
}
So we have found the source of iowait: the functions io_schedule and io_schedule_timeout. As the names suggest, both perform a process switch, and the reason for the switch is I/O; io_schedule_timeout additionally takes a sleep duration, the timeout.
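For intuition, this is the typical shape of a kernel wait path that marks its sleep as I/O wait. It is an illustrative sketch only; the wait queue wq and buffer head bh are placeholders, not code from this post:

/* illustrative sketch: a kernel-style wait loop that goes to sleep via
 * io_schedule() instead of schedule(), so the idle time it causes is
 * accounted as iowait (nr_iowait is bumped inside io_schedule itself) */
DEFINE_WAIT(wait);

for (;;) {
        prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
        if (buffer_uptodate(bh))        /* the I/O we are waiting for is done */
                break;
        io_schedule();                  /* sleep; counted toward iowait */
}
finish_wait(&wq, &wait);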
Let's use SystemTap to trace exactly who calls these two functions, and when.
Here we use our engine as the example and trace the searcher_server process.
The stap script Block.stp (only part of the script is shown):
probe io_schedule = kernel.function("io_schedule"){
    #if(tid() == target_pid){
    if(isinstr(execname(), "searcher_server")){
        stat[execname(), tid(), probefunc()]++;
        count++;
        printf("trace time:%s\n", ctime(gettimeofday_s()));
        print_stack(backtrace());
    }
}
probe io_schedule_timeout = kernel.function("io_schedule_timeout"){
    #if(tid() == target_pid){
    if(isinstr(execname(), "searcher_server")){
        stat[execname(), tid(), probefunc()]++;
        count++;
        printf("trace time:%s\n", ctime(gettimeofday_s()));
        print_stack(backtrace());
    }
}
probe io_schedule, io_schedule_timeout{
}
probe begin{
    printf("begin %s\n", ctime(gettimeofday_s()));
    if($# == 2){
        if(@1 == "pid")
            target_pid = strtol(@2, 10)
        if(@1 == "name")
            target_name = @2
        printf("pid:%d name:%s\n", target_pid, target_name);
    } else {
        printf("arguments error\n");
        exit();
    }
}
probe end{
    printf("end %s\n", ctime(gettimeofday_s()));
}
probe timer.ms(1000){
    printf("%s running...\n", ctime(gettimeofday_s()));
    foreach([proc, tid, func] in stat- limit 100){
        printf("%s:%d=>%s %d\n", proc, tid, func, stat[proc, tid, func]);
    }
    delete stat;
    printf("%s average schedule times:%d\n", ctime(gettimeofday_s()), count);
    count = 0;
}
The idea of the script: within each one-second window, count how many times each process called these two functions, and print the stack at every call so we can see clearly which system-call path ended up here.
First, run the machine in its most normal state:
At this point the new engine's searcher QPS is 1500+, CPU busy is 88%, iowait is almost 0, and all mmap'd memory is pinned with MAP_LOCKED.
sudo stap block.stp pid 5739 -DMAXSKIPPED=1000000
Fri Jul 6 05:57:21 2012 average schedule times:0
Fri Jul 6 05:57:22 2012 running...
Fri Jul 6 05:57:22 2012 average schedule times:0
Fri Jul 6 05:57:23 2012 running...
Fri Jul 6 05:57:23 2012 average schedule times:0
Fri Jul 6 05:57:24 2012 running...
Fri Jul 6 05:57:24 2012 average schedule times:0
Fri Jul 6 05:57:25 2012 running...
Fri Jul 6 05:57:25 2012 average schedule times:0
…
After running for a while we see no calls into io_schedule at all, which matches our expectation.
Next, run dd while stap keeps capturing:
sudo stap block.stp pid 5739 -DMAXSKIPPED=1000000 > directdd
Start two dd processes writing 10 GB each, bypassing the page cache (direct writes):
dd if=/dev/zero of=a count=20000000 oflag=direct
dd if=/dev/zero of=b count=20000000 oflag=direct
20 GB written in total.
Direct dd produces very little iowait, peaking at only about 1.4, yet it still hurts the searcher noticeably: vmstat shows that once dd starts, context switches jump from 20K+ to 80K+ per second, while the number of blocked searcher threads stays in the single digits.
Log writes:
trace time:Fri Jul 6 06:46:23 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff800153e7 : sync_buffer+0x3b/0x3f [kernel]
0xffffffff800639e6 : __wait_on_bit+0x40/0x6e [kernel]
0xffffffff80063a80 : out_of_line_wait_on_bit+0x6c/0x78 [kernel]
0xffffffff8003a9d3 : sync_dirty_buffer+0x96/0xcb [kernel]
0xffffffff8000fd42 : generic_file_buffered_write+0x1cc/0x675 [kernel]
0x00000ffffffff800
trace time:Fri Jul 6 06:46:23 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff80028a90 : get_request_wait+0xd8/0x11f [kernel]
0xffffffff8000bfff : __make_request+0x33d/0x401 [kernel]
0xffffffff8001c049 : generic_make_request+0x211/0x228 [kernel]
0xffffffff80033472 : submit_bio+0xe4/0xeb [kernel]
0xffffffff8001a793 : submit_bh+0xf1/0x111 [kernel]
0x00000ffffffff800
grep -E 'run|ks_searcher' directdd | more
searcher_server:21001=>io_schedule 1
searcher_server:20813=>io_schedule 1
Fri Jul 6 06:46:20 2012 running...
Fri Jul 6 06:46:21 2012 running...
Fri Jul 6 06:46:22 2012 running...
Fri Jul 6 06:46:23 2012 running...
Fri Jul 6 06:46:24 2012 running...
searcher_server:20813=>io_schedule 2
searcher_server:21014=>io_schedule 1
Fri Jul 6 06:46:25 2012 running...
Fri Jul 6 06:46:26 2012 running...
Fri Jul 6 06:46:27 2012 running...
Fri Jul 6 06:46:28 2012 running...
Fri Jul 6 06:46:29 2012 running...
stap tracing shows the extra context switches are all caused by the direct dd:
Because the writes are direct, every write has to go through io_schedule:
0xffffffff80062391 : schedule+0x1/0xcd4 [kernel]
0xffffffff800637ba : io_schedule+0x3f/0x67 [kernel]
0xffffffff800f281b : __blockdev_direct_IO+0x8bc/0xa35 [kernel]
0xffffffff800c4c91 : generic_file_direct_IO+0xff/0x119 [kernel]
0xffffffff8001edd1 : generic_file_direct_write+0x60/0xf2 [kernel]
0xffffffff8001646e : __generic_file_aio_write_nolock+0x2b8/0x3b6 [kernel]
The rise in searcher latency is related both to the searcher's relatively mild io_schedule calls and to the process switching, but at this point the main cause should be the context switching, which also brings frequent process migration, TLB flushes and cache pollution.
Now a new experiment: remove dd's direct flag and let the page cache take effect.
The searcher's environment and load are the same as above.
dd with the page cache hurts the searcher even more. Let's first look at the vmstat data:
The average number of blocked threads is high; at some moments the number of runnable threads even drops to 0.
Write requests at the block layer:
trace time:Fri Jul 6 07:13:45 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff80028a90 : get_request_wait+0xd8/0x11f [kernel]
0xffffffff8000bfff : __make_request+0x33d/0x401 [kernel]
0xffffffff8001c049 : generic_make_request+0x211/0x228 [kernel]
0xffffffff80033472 : submit_bio+0xe4/0xeb [kernel]
0xffffffff8001a793 : submit_bh+0xf1/0x111 [kernel]
0x00000ffffffff800
Sync buffer:
trace time:Fri Jul 6 07:13:46 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff800153e7 : sync_buffer+0x3b/0x3f [kernel]
0xffffffff800639e6 : __wait_on_bit+0x40/0x6e [kernel]
0xffffffff80063a80 : out_of_line_wait_on_bit+0x6c/0x78 [kernel]
0xffffffff8003a9d3 : sync_dirty_buffer+0x96/0xcb [kernel]
0xffffffff8001cdc3 : mpage_writepages+0x1bf/0x37e [kernel]
0x00000ffffffff800
At this point the dirty ratio is already above 40%, so writers have to go through blk_congestion_wait, which is about the harshest punishment there is.
trace time:Fri Jul 6 07:13:48 2012
0xffffffff800631bb : io_schedule_timeout+0x1/0x79 [kernel]
0xffffffff8003b426 : blk_congestion_wait+0x67/0x81 [kernel]
0xffffffff800c7e68 : balance_dirty_pages_ratelimited_nr+0x17d/0x1fa [kernel]
0xffffffff8000fd81 : generic_file_buffered_write+0x20b/0x675 [kernel]
0xffffffff8001651f : __generic_file_aio_write_nolock+0x369/0x3b6 [kernel]
0xffffffff8002157b : generic_file_aio_write+0x65/0xc1 [kernel]
0x00000ffffffff800
Some of the pages the searcher uses get flushed out, so it has to do synchronous page reads:
trace time:Fri Jul 6 07:13:49 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff80028936 : sync_page+0x3e/0x43 [kernel]
0xffffffff800638fe : __wait_on_bit_lock+0x36/0x66 [kernel]
0xffffffff8003fbad : __lock_page+0x5e/0x64 [kernel]
0xffffffff800139f8 : find_lock_page+0x69/0xa2 [kernel]
0xffffffff800c45a5 : grab_cache_page_write_begin+0x2c/0x89 [kernel]
0x00000ffffffff800
searcher_server:21010=>io_schedule 3
searcher_server:21003=>io_schedule 1
Fri Jul 6 07:14:39 2012 running...
searcher_server:21010=>io_schedule 7
Fri Jul 6 07:14:40 2012 running...
searcher_server:21008=>io_schedule 1
Fri Jul 6 07:14:41 2012 running...
Fri Jul 6 07:14:42 2012 running...
searcher_server:21004=>io_schedule 1
Fri Jul 6 07:14:43 2012 running...
searcher_server:21014=>io_schedule 11
searcher_server:21015=>io_schedule 1
searcher_server:21008=>io_schedule 1
Fri Jul 6 07:14:44 2012 running...
Fri Jul 6 07:14:45 2012 running...
Fri Jul 6 07:14:46 2012 running...
searcher_server:21003=>io_schedule 2
Fri Jul 6 07:14:47 2012 running...
Fri Jul 6 07:14:48 2012 running...
Fri Jul 6 07:14:49 2012 running...
When dd runs with the page cache it easily reaches the 10% background dirty ratio and then the 40% dirty ratio; once 40% is hit, buffered writes effectively become synchronous writes. stap tracing shows each blk_congestion_wait costs about 100 ms, i.e. a thread is blocked for 100 ms at a time, which is fatal.
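If you want to see how close a machine is to those thresholds, the kernel exports all the relevant numbers; here is a small helper sketch (the 10%/40% figures above are that kernel's defaults, so check your own sysctls):

/* dirty_watch.c -- sketch: print the vm dirty thresholds and the amount
 * of memory currently dirty, to see how close buffered writers are to
 * being throttled (blk_congestion_wait / forced synchronous writeback). */
#include <stdio.h>
#include <string.h>

static void show(const char *path)
{
        char buf[256];
        FILE *f = fopen(path, "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("%-40s %s", path, buf);
        if (f)
                fclose(f);
}

int main(void)
{
        char line[256];
        FILE *f;

        show("/proc/sys/vm/dirty_background_ratio");
        show("/proc/sys/vm/dirty_ratio");

        f = fopen("/proc/meminfo", "r");
        while (f && fgets(line, sizeof(line), f))
                if (!strncmp(line, "Dirty:", 6) || !strncmp(line, "MemTotal:", 9))
                        printf("%s", line);
        if (f)
                fclose(f);
        return 0;
}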
To reduce the impact of I/O, we disable the log
and run the experiment again.
With log writing turned off there is still iowait. Who is causing it?
trace time:Fri Jul 6 02:08:28 2012
0xffffffff8006377c : io_schedule+0x1/0x67 [kernel]
0xffffffff80028936 : sync_page+0x3e/0x43 [kernel]
0xffffffff800638fe : __wait_on_bit_lock+0x36/0x66 [kernel]
0xffffffff8003fbad : __lock_page+0x5e/0x64 [kernel]
0xffffffff80013881 : filemap_nopage+0x268/0x360 [kernel]
0xffffffff8000898c : __handle_mm_fault+0x1fa/0xf99 [kernel]
0x00000ffffffff800
All of our memory is mlock'd, yet there are still sync_page calls. Why?
Tracking it down with blktrace and debugfs, it turned out to be a data file used by one of the algorithms:
/path/of/data
This file's data had been evicted by dd, so it had to be read back into memory.
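As an aside, a quicker way to confirm that a file's pages have been evicted (the post used blktrace and debugfs) would be mincore(); a rough sketch, not the method actually used here:

/* resident.c -- sketch: count how many pages of a file are currently in
 * the page cache, using mincore(). Handy for confirming that dd evicted
 * a data file; this is not the blktrace/debugfs method used in the post. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        long pg = sysconf(_SC_PAGESIZE);
        int fd = open(argv[1], O_RDONLY);
        struct stat st;
        size_t pages, resident = 0, i;
        unsigned char *vec;
        void *p;

        if (fd < 0 || fstat(fd, &st) < 0)
                return 1;
        pages = (st.st_size + pg - 1) / pg;
        p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        vec = malloc(pages);
        if (p == MAP_FAILED || mincore(p, st.st_size, vec) < 0)
                return 1;
        for (i = 0; i < pages; i++)
                resident += vec[i] & 1;
        printf("%zu of %zu pages resident\n", resident, pages);
        return 0;
}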
I then wrote a small program to lock this data in memory as well:
./lock /path/of/data
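The lock tool itself is not shown in the post; a minimal sketch of what such a program might look like (map the file with MAP_LOCKED so its pages are faulted in and pinned, then stay alive to hold the lock):

/* lock.c -- sketch of a "./lock <file>" helper: map the file and pin its
 * pages in RAM so dd cannot evict them. The author's actual tool is not
 * shown in the post; this is only what such a program might look like. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        int fd = open(argv[1], O_RDONLY);
        struct stat st;
        void *p;

        if (fd < 0 || fstat(fd, &st) < 0)
                return 1;
        /* MAP_LOCKED faults every page in and locks it, the same way the
         * engine locks its index; plain mmap() + mlock() would also work */
        p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED | MAP_LOCKED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("locked %lld bytes of %s\n", (long long)st.st_size, argv[1]);
        pause();        /* keep the mapping, and therefore the lock, alive */
        return 0;
}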
With the data locked, rerun dd with page cache.
The iowait level drops quite a bit again. So at this point, who is still producing iowait?
trace time:Fri Jul 6 03:45:04 2012
0xffffffff800631bb : io_schedule_timeout+0x1/0x79 [kernel]
0xffffffff8003b426 : blk_congestion_wait+0x67/0x81 [kernel]
0xffffffff800ca438 : try_to_free_pages+0x252/0x2d7 [kernel]
0xffffffff8000f40d : __alloc_pages+0x1cb/0x2ce [kernel]
0xffffffff80008e62 : __handle_mm_fault+0x6d0/0xf99 [kernel]
0xffffffff80066b25 : do_page_fault+0x4cb/0x830 [kernel]
trace time:Fri Jul 6 03:45:04 2012
0xffffffff800631bb : io_schedule_timeout+0x1/0x79 [kernel]
0xffffffff8003b426 : blk_congestion_wait+0x67/0x81 [kernel]
0xffffffff800ca438 : try_to_free_pages+0x252/0x2d7 [kernel]
0xffffffff8000f40d : __alloc_pages+0x1cb/0x2ce [kernel]
0xffffffff8002600b : tcp_sendmsg+0x567/0xb0e [kernel]
0xffffffff80037c60 : do_sock_write+0xc6/0x102 [kernel]
It turns out memory has become so scarce that allocations have to go through try_to_free_pages (rarely reached in normal operation); getting there means the system has pitifully little memory left. But there is no way around it as long as the searcher still calls malloc. Those allocations come from two places: 1) memory requested through the mempool, which could in principle be eliminated entirely; 2) memory used by the STL code inside the algorithm .so.
Disabling the log and locking all data in memory lowers the iowait level, but making the searcher completely immune takes more work, for example not allocating memory at all.
How do we eliminate the iowait of the searcher (or of any application)?
1. No I/O
Don't write logs, or hand log writing to a dedicated thread; the searcher itself should do no buffered writes and no disk reads, especially operations like sync_page.
2. Everything in memory, and no memory allocation
Read the data once, keep it all in memory and lock it; make the mempool good enough that at least 99% of cases never have to allocate.
3. Minimize the I/O impact of other applications
In practice this means containing the damage of things like dd, for example with cgroups; when scp'ing several large files, drop each file's page cache promptly during the transfer and keep the system's dirty ratio below 10%, and above all never let it reach 40% (a sketch of dropping one file's cache follows this list).
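For the scp case in point 3, one way to drop a single file's cache during or after the copy is posix_fadvise(DONTNEED); a hedged sketch, where drop_file_cache is a made-up helper name:

/* sketch for point 3: after (or while) copying a large file, drop its
 * page cache so it does not push the system toward the dirty thresholds.
 * drop_file_cache() is an illustrative helper, not an existing tool. */
#include <fcntl.h>
#include <unistd.h>

int drop_file_cache(const char *path)
{
        int ret;
        int fd = open(path, O_RDONLY);

        if (fd < 0)
                return -1;
        fdatasync(fd);          /* write back dirty pages so DONTNEED can free them */
        ret = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);    /* whole file */
        close(fd);
        return ret;
}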
Tools:
SystemTap; Blktrace + Blkparse + Debugfs. Used together, they can find the inode that each I/O's disk block belongs to, and even inspect the block's contents.