[apue] 等待子進程的那些事兒

時間 2019-11-17

標籤 apue 等待進程那些事兒简体版

原文原文鏈接

談到等待子進程，首先想到的就是SIGCHLD信號與wait函數族，本文試圖釐清兩者的方方面面，以及組合使用時可能不小心掉進去的坑。

1. 首先談單獨使用SIGCHLD的場景。下面是一段典型的代碼片斷：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 
 4 #define CLD_NUM 2
 5 static void sig_cld (int signo) // SIGCHLD handler: re-installs itself, then reaps one child with wait()
 6 {
 7     pid_t pid = 0; 
 8     int status = 0; 
 9     printf ("SIGCHLD received\n"); // NOTE(review): the article's sample output shows "SIGCLD received" instead
10     if (signal (SIGCHLD, sig_cld) == SIG_ERR) // re-arm the handler for the next SIGCHLD
11         perror ("signal error"); 
12     if ((pid = wait (&status)) < 0) // waits for any child, not a specific one
13         perror ("wait(in signal) error"); 
14     printf ("pid (wait in signal) = %d\n", pid); // printed even when wait failed (pid is then negative)
15 }
16 
17 int main () // Demo driver: installs sig_cld for SIGCHLD, forks CLD_NUM children, then pauses once per child
18 {
19     pid_t pid = 0; 
20     __sighandler_t ret = signal (SIGCHLD, sig_cld);
21     if (ret == SIG_ERR)
22         perror ("signal error"); 
23     else 
24         printf ("old handler %lx\n", (unsigned long) ret); // a pointer must not be passed to %x; cast and print with %lx
25 
26     for (int i=0; i<CLD_NUM; ++ i)
27     {
28         if ((pid = fork ()) < 0)
29             perror ("fork error"); 
30         else if (pid == 0) 
31         {
32             sleep (3); // child: exit after three seconds
33             printf ("child %u exit\n", getpid ()); 
34             _exit (0); 
35         }
36 
37         sleep (1); // parent: start the children one second apart
38     }
39 
40     for (int i=0; i<CLD_NUM; ++ i)
41     {
42         pause (); // sleep until a signal handler has run
43         printf ("wake up by signal %d\n", i); 
44     }
45 
46     printf ("parent exit\n"); 
47     return 0; 
48 }

父進程啓動了兩個子進程，在SIGCHLD信號處理器中調用wait等待已結束的子進程，回收進程信息，防止產生殭屍進程（zombie）。上面的代碼會有如下的輸出：

old handler 0
child 28542 exit
SIGCLD received
pid (wait in signal) = 28542
wake up by signal 0
child 28543 exit
SIGCLD received
pid (wait in signal) = 28543
wake up by signal 1
parent exit

當然，捕獲SIGCHLD也可以使用sigaction接口：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 
 4 #define CLD_NUM 2
 5 static void sig_cld (int signo, siginfo_t *info, void* param) // SA_SIGINFO-style handler: reaps the child named in info->si_pid
 6 {
 7     int status = 0; 
 8     if (signo == SIGCHLD)
 9     {
10         if (info->si_code == CLD_EXITED || // only act on the three termination codes checked here
11                 info->si_code == CLD_KILLED || 
12                 info->si_code == CLD_DUMPED)
13         {
14             //printf ("child %d die\n", info->si_pid); 
15             if (waitpid (info->si_pid, &status, 0) < 0) // wait for exactly this child rather than any child
16                 perror ("wait(in signal) error"); 
17             printf ("pid (wait in signal) = %d\n", info->si_pid); // printed whether or not waitpid succeeded
18         }
19         else 
20         {
21             printf ("unknown signal code %d\n", info->si_code); 
22         }
23     }
24 }
25 
26 int main () // Demo driver: same flow as the signal() version, but the handler is registered with sigaction()
27 {
28     pid_t pid = 0; 
29     struct sigaction act; 
30     sigemptyset (&act.sa_mask); // no additional signals are added to the handler's mask
31     act.sa_sigaction = sig_cld; 
32     act.sa_flags = SA_SIGINFO | SA_NOCLDSTOP; // three-argument handler; SA_NOCLDSTOP per the article suppresses stop/continue notifications
33     int ret = sigaction (SIGCHLD, &act, 0); 
34     if (ret == -1)
35         perror ("sigaction error"); 
36 
37     for (int i=0; i<CLD_NUM; ++ i)
38     {
39         if ((pid = fork ()) < 0)
40             perror ("fork error"); 
41         else if (pid == 0) 
42         {
43             sleep (3); // child: exit after three seconds
44             printf ("child %u exit\n", getpid ()); 
45             _exit (0); 
46         }
47 
48         sleep (1); // parent: start the children one second apart
49     }
50 
51     for (int i=0; i<CLD_NUM; ++ i)
52     {
53         pause (); // sleep until a signal handler has run
54         printf ("wake up by signal %d\n", i); 
55     }
56 
57     printf ("parent exit\n"); 
58     return 0; 
59 }

輸出是一樣的。

關於signal與sigaction的區別，有如下幾點：

a) 使用sigaction可以避免重新安裝信號處理器的問題；

b) 使用sigaction可以在wait之前得知是哪一個子進程結束了，這是通過指定SA_SIGINFO標誌位，並提供帶siginfo_t參數的信號處理器來實現的（info->si_pid就是結束的進程號）；

c) 使用sigaction可以獲取除子進程結束以外的狀態變更通知，例如掛起、繼續，默認接收相應通知，除非指定SA_NOCLDSTOP標誌。而對於signal而言，沒有辦法不接收子進程非結束狀態的通知（此時調用wait可能會卡死）；

d) 使用sigaction可以自動wait已結束的子進程，只要指定SA_NOCLDWAIT標誌即可。此時在信號處理器中不用再調用wait函數了。

　　當使用SA_NOCLDWAIT標誌位時，使用systemtap能夠觀察到子進程仍是向父進程發送了SIGCHLD信號的：

30049    cldsig           30048 cldsig           17     SIGCHLD         
30050    cldsig           30048 cldsig           17     SIGCHLD

　　很有可能是系統內部自動wait了相關子進程。

　　另外在使用SA_NOCLDWAIT時，能夠不指定信號處理器，此時sa_sigaction字段能夠設置爲SIG_DFL。

關於SIGCHLD信號，有如下幾點須要注意：

a) 如果在註冊信號之前，就已經有已結束但未等待的子進程存在，則事件不會被觸發；

b) 能夠爲SIGCHLD註冊一個處理器，也能夠忽略該信號（SIG_IGN），忽略時系統自動回收已結束的子進程；

　　當正常捕獲SIGCHLD時，使用systemtap是能夠觀察到子進程向父進程發送的SIGCHLD信號的：

29877    cldsig           29876 cldsig           17     SIGCHLD         
29878    cldsig           29876 cldsig           17     SIGCHLD         
29876    cldsig           27771 bash             17     SIGCHLD

　　當忽略SIGCHLD時，是看不到的，只能看到父進程結束時向bash發送的SIGCHLD信號：

29893    cldsig           27771 bash             17     SIGCHLD

　　這裏注意一下兩者在細節處的一點區別。

c) 還有一個SIGCLD信號，在大多數unix like系統中與SIGCHLD表現一致，在某些古老的unix系統上，可能有獨特的表現須要注意，這方面請參考 apue 第十章第七節

　　在我測試的環境上（CentOS 6.7），該信號被定義爲SIGCHLD，因此是完全相同的；

關於使用信號等待子進程最後須要談的一點就是信號的競爭行爲，對上面的例子稍加修改，就能夠演示一下：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 
 4 #define CLD_NUM 2
 5 void pid_remove (pid_t pid) // Stand-in for removing pid from a bookkeeping structure; it only logs the call
 6 {
 7     printf ("remove pid %u\n", pid); 
 8 }
 9 void pid_add (pid_t pid) // Stand-in for adding pid to a bookkeeping structure; it only logs the call
10 {
11     printf ("add pid %u\n", pid); 
12 }
13 
14 static void sig_cld (int signo) // SIGCHLD handler: re-installs itself, reaps one child, then calls pid_remove on it
15 {
16     pid_t pid = 0; 
17     int status = 0; 
18     printf ("SIGCHLD received\n"); // NOTE(review): the article's sample output shows "SIGCLD received" instead
19     if (signal (SIGCHLD, sig_cld) == SIG_ERR) // re-arm the handler for the next SIGCHLD
20         perror ("signal error"); 
21     if ((pid = wait (&status)) < 0) // waits for any child, not a specific one
22         perror ("wait(in signal) error"); 
23     printf ("pid (wait in signal) = %d\n", pid); 
24     pid_remove (pid); // also reached when wait failed, in which case pid is negative
25 }
26 
27 int main () // Race demo: children exit at once, so the handler's pid_remove can run before the parent's pid_add
28 {
29     pid_t pid = 0; 
30     __sighandler_t ret = signal (SIGCHLD, sig_cld);
31     if (ret == SIG_ERR)
32         perror ("signal error"); 
33     else 
34         printf ("old handler %lx\n", (unsigned long) ret); // a pointer must not be passed to %x; cast and print with %lx
35 
36     for (int i=0; i<CLD_NUM; ++ i)
37     {
38         if ((pid = fork ()) < 0)
39             perror ("fork error"); 
40         else if (pid == 0) 
41         {
42             //sleep (3); 
43             printf ("child %u exit\n", getpid ()); // child exits immediately (the sleep is disabled on purpose)
44             _exit (0); 
45         }
46 
47         sleep (1); // deliberately delays pid_add so the race is easy to observe
48         pid_add (pid);  // also reached when fork failed, in which case pid is negative
49     }
50 
51     sleep (1); 
52     printf ("parent exit\n"); 
53     return 0; 
54 }

父進程在啓動子進程後須要將它的信息經過pid_add添加到某種數據結構中，當收到SIGCHLD信號後，又經過pid_remove將它從這個數據結構中移出。

在上面的例子中，子進程一啓動就退出了，快到甚至父進程還沒來得及執行pid_add就先執行了pid_remove，這必然導致某種問題。

（注意，爲了能更好的呈現信號競爭的問題，這裏故意在父進程sleep以後調用pid_add），執行結果以下：

old handler 0
child 31213 exit
SIGCLD received
pid (wait in signal) = 31213
remove pid 31213
add pid 31213
child 31214 exit
SIGCLD received
pid (wait in signal) = 31214
remove pid 31214
add pid 31214
parent exit

可以看到，remove總是在add之前執行。而解決方案也很直接，就是在pid_add完成之前，我們需要屏蔽SIGCHLD信號：

 1     for (int i=0; i<CLD_NUM; ++ i) // revised fork loop: SIGCHLD stays blocked until pid_add has run
 2     {
 3         sigset_t mask; 
 4         sigemptyset(&mask);
 5         sigaddset(&mask, SIGCHLD);
 6         sigprocmask(SIG_BLOCK, &mask, NULL); // block SIGCHLD before fork so the handler cannot run early
 7         if ((pid = fork ()) < 0)
 8             perror ("fork error"); 
 9         else if (pid == 0) 
10         {
11             sigprocmask(SIG_UNBLOCK, &mask, NULL); // child: undo the block taken just before fork
12             //sleep (3); 
13             printf ("child %u exit\n", getpid ()); 
14             _exit (0); 
15         }
16 
17         sleep (1);
18         pid_add (pid);  // also reached when fork failed, in which case pid is negative
19         sigprocmask(SIG_UNBLOCK, &mask, NULL); // parent: unblock only after pid_add, so a pending SIGCHLD is handled afterwards
20     }

這裏用到了sigprocmask去屏蔽以及解除某種信號的屏蔽。新的代碼運行結果以下：

old handler 0
child 31246 exit
add pid 31246
SIGCLD received
pid (wait in signal) = 31246
remove pid 31246
child 31247 exit
SIGCLD received
pid (wait in signal) = 31247
remove pid 31247
add pid 31247
parent exit

能夠看到一切正常了，add此次位於remove以前。

總結一下，使用SIGCHLD信號適合異步等待子進程的場景，並且通常搭配wait來回收子進程。

2. 而後談單獨使用wait函數族的場景。典型代碼以下：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 
 4 #define CLD_NUM 2
 5 int main () // Demo driver: forks CLD_NUM children and reaps them synchronously with wait(), no signal handler
 6 {
 7     pid_t pid = 0; 
 8     for (int i=0; i<CLD_NUM; ++ i)
 9     {
10         if ((pid = fork ()) < 0)
11             perror ("fork error"); 
12         else if (pid == 0) 
13         {
14             sleep (3); // child: exit after three seconds
15             printf ("child %u exit\n", getpid ()); 
16             _exit (0); 
17         }
18 
19         sleep (1); // parent: start the children one second apart
20     }
21 
22     int status = 0; 
23     for (int i=0; i<CLD_NUM; ++ i) // one wait() call per child that was started
24     {
25         if ((pid = wait (&status)) < 0)
26             perror ("wait error"); 
27 
28         printf ("pid = %d\n", pid); // printed even when wait failed (pid is then negative)
29     }
30 
31     printf ("parent exit\n"); 
32     return 0; 
33 }

與以前場景不一樣的是，這裏父進程同步等待啓動的子進程結束。上面的代碼會有以下輸出：

child 28583 exit
child 28584 exit
pid = 28583
pid = 28584
parent exit

關於wait函數族，須要注意如下幾點：

a) wait用於等待任何一個子進程，相當於waitpid(-1, status, 0); 當沒有任何子進程存在時，返回-1，errno設置爲ECHILD；

b) waitpid相對於wait的優點在於：

　　i) 能夠指定子進程(組)來等待；

　　ii) 能夠捕獲子進程除結束之外的其它狀態變動通知，如掛起（WUNTRACED）、繼續（WCONTINUED）等；

　　iii) 能夠不阻塞的測試某個子進程是否已結束（WNOHANG）；

c) wait函數族可被信號中斷，此時返回-1，errno設置爲EINTR，必要時須要重啓wait；

總結一下，使用wait函數族適合同步等待子進程，例如某種命令執行器進程，一般配合waitpid來回收子進程。

3. 最後談談混合使用同步wait與異步wait函數族的場景。

其實前面已經提到SIGCHLD要搭配wait使用，但那是異步使用wait的單一場景，而這裏講的混合，是指同時在信號處理器與執行流程中使用wait。

例如bash，它除了在主流程中同步等待前臺正在運行的子進程，還必需在信號處理器中異步接收後臺運行子進程的狀態反饋，這樣就不得不混合使用wait。

同步等待某個子進程通常使用waitpid，而在信號處理器中通常使用wait，典型的代碼以下所示：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 #include <errno.h> 
 4 
 5 #define CLD_NUM 2
 6 
 7 static void sig_cld (int signo) // SIGCLD handler: re-installs itself and reaps one child with wait()
 8 {
 9     pid_t pid = 0; 
10     int status = 0; 
11     printf ("SIGCLD received\n"); 
12     if (signal (SIGCLD, sig_cld) == SIG_ERR) // re-arm the handler for the next SIGCLD
13         perror ("signal error"); 
14 
15     if ((pid = wait (&status)) < 0) // waits for any child, not a specific one
16         perror ("wait(in signal) error"); 
17     else
18         printf ("pid (wait in signal) = %d\n", pid); // unlike the earlier listings, printed only on success
19 }
20 
21 int main () // Mixed demo: the handler reaps children asynchronously while main waitpid()s on the last child forked
22 {
23     pid_t pid = 0; 
24     __sighandler_t ret = signal (SIGCLD, sig_cld);
25     if (ret == SIG_ERR)
26         perror ("signal error"); 
27     else 
28         printf ("old handler %lx\n", (unsigned long) ret); // a pointer must not be passed to %x; cast and print with %lx
29 
30     for (int i=0; i<CLD_NUM; ++ i)
31     {
32         if ((pid = fork ()) < 0)
33             perror ("fork error"); 
34         else if (pid == 0) 
35         {
36             if (i % 2 == 0) { 
37                 // simulate background
38                 sleep (3); 
39             }
40             else {
41                 // simulate foreground
42                 sleep (4); 
43             }
44 
45             printf ("child %u exit\n", getpid ()); 
46             _exit (0); 
47         }
48 
49         sleep (1); 
50     }
51 
52     int status = 0; 
53     printf ("before wait pid %u\n", pid); // pid still holds the result of the last fork, i.e. the "foreground" child
54     if (waitpid (pid, &status, 0) < 0)
55         printf ("wait %u error %d\n", pid, errno); // no retry here: any failure, EINTR included, is only reported
56     else
57         printf ("wait child pid = %d\n", pid); 
58 
59     sleep (2);
60     printf ("parent exit\n"); 
61     return 0; 
62 }

父進程啓動兩個子進程，第一個休眠3秒後退出，第二個休眠4秒後退出，因爲父進程同步等待的是第二個子進程，所以第二個進程模擬前臺進程，第一個進程模擬後臺進程。運行輸出以下：

old handler 0
before wait pid 2481
child 2480 exit
SIGCLD received
pid (wait in signal) = 2480
wait 2481 error 4
child 2481 exit
SIGCLD received
pid (wait in signal) = 2481
parent exit

此時同步等待的waitpid被信號中斷了（EINTR)，此種狀況下，咱們須要重啓waitpid：

 1     int status = 0; 
 2     while (1) { // repeat waitpid for as long as it fails with EINTR
 3         printf ("before wait pid %u\n", pid); 
 4         if (waitpid (pid, &status, 0) < 0)
 5         {
 6             int err = errno; // save errno before the printf below can overwrite it
 7             printf ("wait %u error %d\n", pid, err); 
 8             if (err == EINTR)
 9                 continue; // interrupted by a signal: restart the wait
10         }
11         else
12             printf ("wait child pid = %d\n", pid); 
13 
14         break; // success, or an error other than EINTR: stop waiting
15     }

若是因EINTR引起的錯誤，則從新調用waitpid；不然，退出。新的代碼輸出以下：

old handler 0
before wait pid 2513
child 2512 exit
SIGCLD received
pid (wait in signal) = 2512
wait 2513 error 4
before wait pid 2513
child 2513 exit
SIGCLD received
wait(in signal) error: No child processes
wait child pid = 2513
parent exit

能夠看到兩個進程退出時，都收到了SIGCHLD信號，只是前臺進程被waitpid優先等待到了，因此信號處理器中的wait返回的ECHILD錯誤，可是若是還有其它子進程在運行，這裏將會在信號處理器的wait中卡死。

以前提到，可使用SIG_IGN來自動回收子進程，這裏試一下使用SIG_IGN來代替sig_cld，看看有什麼改觀。

old handler 0
before wait pid 2557
child 2556 exit
child 2557 exit
wait 2557 error 10
parent exit

一樣的，兩個子進程都走了忽略信號，而同步等待的waitpid因沒有進程可等返回了ECHILD。由於waitpid是指定進程等待的，因此即便還有其它子進程存在，這個也會返回錯誤，不會卡死在那裏。

相比上面的方法，似乎好了一點，但是因爲我們沒有安裝處理器，所以無從得知哪一個後臺進程結束了，這並不是我們想要的結果。

以前提到，可使用sigaction代替signal以獲取更多的控制，咱們看看換新的方式捕獲信號，會不會有一些改變，新的代碼邏輯以下：

 1 #include "../apue.h" 
 2 #include <sys/wait.h> 
 3 #include <errno.h> 
 4 
 5 #define CLD_NUM 2
 6 
 7 static void sig_cld (int signo, siginfo_t *info, void* param) // SA_SIGINFO-style handler: reaps the child named in info->si_pid
 8 {
 9     int status = 0; 
10     if (signo == SIGCHLD)
11     {
12         if (info->si_code == CLD_EXITED || // only act on the three termination codes checked here
13                 info->si_code == CLD_KILLED || 
14                 info->si_code == CLD_DUMPED)
15         {
16             if (waitpid (info->si_pid, &status, 0) < 0) // wait for exactly this child rather than any child
17                 err_ret ("wait(in signal) %u error", info->si_pid); // err_ret comes from apue.h; presumably prints the message plus the errno text
18             else 
19                 printf ("pid (wait in signal) = %d\n", info->si_pid); 
20         }
21         else 
22         {
23             printf ("unknown signal code %d\n", info->si_code); 
24         }
25     }
26 }
27 
28 int main () // Mixed demo with sigaction: the handler reaps by pid while main retries waitpid() on the last child forked
29 {
30     pid_t pid = 0; 
31     struct sigaction act; 
32     sigemptyset (&act.sa_mask); // no additional signals are added to the handler's mask
33     act.sa_sigaction = sig_cld; 
34     act.sa_flags = SA_SIGINFO | SA_NOCLDSTOP; // three-argument handler; SA_NOCLDSTOP per the article suppresses stop/continue notifications
35     int ret = sigaction (SIGCHLD, &act, 0); 
36     if (ret == -1)
37         perror ("sigaction error"); 
38 
39     for (int i=0; i<CLD_NUM; ++ i)
40     {
41         if ((pid = fork ()) < 0)
42             perror ("fork error"); 
43         else if (pid == 0) 
44         {
45             if (i % 2 == 0) { 
46                 // simulate background
47                 sleep (3); 
48             }
49             else {
50                 // simulate foreground
51                 sleep (4); 
52             }
53 
54             printf ("child %u exit\n", getpid ()); 
55             _exit (0); 
56         }
57 
58         sleep (1); 
59     }
60 
61     int status = 0; 
62     while (1) { // repeat waitpid for as long as it fails with EINTR
63         printf ("before wait pid %u\n", pid); // pid still holds the result of the last fork, i.e. the "foreground" child
64         if (waitpid (pid, &status, 0) < 0)
65         {
66             int err = errno; // save errno before the printf below can overwrite it
67             printf ("wait %u error %d\n", pid, err); 
68             if (err == EINTR)
69                 continue; // interrupted by a signal: restart the wait
70         }
71         else
72             printf ("wait child pid = %d\n", pid); 
73 
74         break; // success, or an error other than EINTR: stop waiting
75     }
76 
77     sleep (2);
78     printf ("parent exit\n"); 
79     return 0; 
80 }

運行輸出以下：

before wait pid 2585
child 2584 exit
pid (wait in signal) = 2584
wait 2585 error 4
before wait pid 2585
child 2585 exit
wait(in signal) 2585 error: No child processes
wait child pid = 2585
parent exit

結果與使用signal很類似，可是由於在信號處理器中咱們能明確的知道是哪一個子進程終結了，使用的是waitpid而不是wait，因此即便還有其它子進程未結束，也不會在信號處理器的waitpid中卡住。

結論是不管使用signal還是sigaction，同步等待的waitpid總比SIGCHLD信號處理器中的wait(xxx)具有更高的優先級。

固然，這個前提是在父進程同步waitpid以前，子進程尚未結束；若是要等待的子進程先結束了，SIGCHLD固然先被執行，這種狀況下，建議先使用sigprocmask屏蔽SIGCHLD信號，而後在waitpid以前解除屏蔽。雖然不能保證徹底解決信號競爭的問題，也能極大的緩解此種狀況，即便出現了信號競爭，致使同步等待的waitpid返回ECHILD，咱們也能從這些錯誤碼中得知發生的事情，不會出現卡死的狀況。

出於好奇，咱們看一下改使用SIG_IGN後的運行效果：

before wait pid 2613
child 2612 exit
child 2613 exit
wait 2613 error 10
parent exit

與使用signal時並無二致，仍然是忽略信號佔了上風。

結論是不管使用signal仍是sigaction，當忽略SIGCHLD信號時，信號優先於wait被忽略。

出於一樣的緣由，這種方式咱們並不採納。以前提到，sigaction還有一種高級的忽略SIGCHLD的方式，即指定SA_NOCLDWAIT標誌位，同時給信號處理器指定SIG_DFL，這種狀況下，咱們看看輸出會有什麼變化：

before wait pid 2719
child 2718 exit
child 2719 exit
wait 2719 error 10
parent exit

可以看到，與使用SIG_IGN並無二致。

與SIG_IGN不一樣的是，咱們能夠爲SIGCHLD提供一個處理器，雖然在此信號處理器中無需再次等待子進程，可是咱們擁有了獲取子進程信息的能力，相對而言，比SIG_IGN更有用一些。新的輸出以下：

before wait pid 2737
child 2736 exit
pid (auto wait in signal) = 2736
wait 2737 error 4
before wait pid 2737
child 2737 exit
pid (auto wait in signal) = 2737
wait 2737 error 10
parent exit

能夠看到，同步waitpid仍然返回ECHILD，顯然是信號更具備優先級。

好了，到這裏就全明瞭了，對於混合使用同步與異步wait的應用來講，最佳的方法應該是同步waitpid等待前臺進程，異步使用sigaction註冊SIGCHLD信號處理器等待後臺進程，且不設置SA_NOCLDWAIT標誌位。

在處理器中也應使用waitpid等待子進程，如返回ECHILD錯誤，證實該子進程是前臺進程，已經被同步wait掉了，不須要後續處理；不然做爲後臺進程處理。

沒有多少人會有機會寫一個shell，但是並非只有shell才有混合使用同步、異步等待子進程的場景，考慮下面這個場景：

 1 #include "../apue.h" 
 2 #include <unistd.h> 
 3 #include <sys/wait.h> 
 4 
 5 #define PAGER "${PAGER:-more}"
 6 
 7 #define USE_SIG 2
 8 static void sig_cld (int signo) // SIGCLD handler: re-installs itself and reaps one child with wait()
 9 {
10     pid_t pid = 0; 
11     int status = 0; 
12     printf ("SIGCLD received\n"); 
13     if (signal (SIGCLD, sig_cld) == SIG_ERR) // re-arm the handler for the next SIGCLD
14         perror ("signal error"); 
15 
16     if ((pid = wait (&status)) < 0) // waits for any child, not a specific one
17         perror ("wait(in signal) error"); 
18 
19     printf ("pid (wait in signal) = %d\n", pid); // printed even when wait failed (pid is then negative)
20 }
21 
22 void install_handler (__sighandler_t h) // Installs h as the SIGCLD disposition and prints the previous disposition
23 {
24     __sighandler_t ret = signal (SIGCLD, h);
25     if (ret == SIG_ERR)
26         perror ("signal error"); 
27     else 
28         printf ("old handler %lx\n", (unsigned long) ret); // a pointer must not be passed to %x; cast and print with %lx
29 }
30 
31 int main (int argc, char *argv[]) // Copies the file named by argv[1] into a pager started with popen(), then reports pclose()'s result
32 {
33     int n = 0; 
34 #if USE_SIG == 1 // USE_SIG chooses the SIGCLD disposition: 1 = sig_cld handler, 2 = SIG_IGN, anything else = leave unchanged
35     install_handler (sig_cld); 
36 #elif USE_SIG == 2
37     install_handler (SIG_IGN); 
38 #endif
39 
40     char line[MAXLINE] = { 0 }; 
41     FILE *fpin = NULL, *fpout = NULL; 
42     if (argc != 2)
43         err_quit ("usage: ppage <pathname>"); 
44 
45     fpin = fopen (argv[1], "r"); 
46     if (fpin == NULL)
47         err_sys ("can't open %s", argv[1]); 
48 
49     fpout = popen (PAGER, "w"); // start the pager command with a pipe connected to its standard input
50     if (fpout == NULL)
51         err_sys ("popen %s error", PAGER); 
52 
53     while (fgets (line, MAXLINE, fpin) != NULL) { // copy the file to the pager line by line
54         if (fputs (line, fpout) == EOF)
55             err_sys ("fputs error to pipe"); 
56     }
57 
58     if (ferror (fpin)) // distinguish a read error from plain end-of-file
59         err_sys ("fgets error"); 
60 
61     int ret = pclose(fpout); // pclose waits for the pager child itself, which is where the SIGCLD disposition matters
62     if (ret == -1)
63         err_sys ("pclose error"); 
64     else 
65         printf ("worker return %d\n", ret); 
66 
67     return 0; // NOTE(review): fpin is never fclose'd and n is never used
68 }