【原創】遇到一個 rabbitmqctl 無法退出的問題


---------------------------- ---------------------------- ---------------------------- ----------------------------
重大發現:這兩天再次深究問題根源,結果又有重大發現,詳見文章最後新增部分!!!
---------------------------- ---------------------------- ---------------------------- ----------------------------

      這兩天又遇到了一個奇怪的問題:簡單地說,就是在 VMware 虛擬機中,通過 rabbitmq 的 init 腳本無法正常 stop 掉相應的進程(實際上調用的是 rabbitmqctl stop <pidfile> )。

問題表現

確認 RabbitMQ 的運行狀態和對應的 pid 文件內容
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps aux|grep rabbit
root     11813  0.0  0.0 106368  1004 pts/2    S    19:21   0:00 /bin/sh /etc/init.d/rabbitmq start
root     11815  0.0  0.0 106100  1200 pts/2    S    19:21   0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server
root     11817 46.1  0.3 2806172 53252 pts/2   Sl   19:21   0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1"
root     12534  0.0  0.0 103252   836 pts/2    S+   19:21   0:00 grep rabbit
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 
[root@gfsd1 ~]# cat /var/run/rabbitmq/pid 
11817
[root@gfsd1 ~]#
通過 init 腳本 stop 服務
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...

(卡住)
查看日誌,發現 RabbitMQ 服務已經執行了停止動作。
[root@gfsd1 ~]# tail -f /usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq\@rmq_1.log
...
=INFO REPORT==== 14-Mar-2016::19:21:38 ===
Stopping RabbitMQ

=INFO REPORT==== 14-Mar-2016::19:21:38 ===
stopped TCP Listener on [::]:6672

=ERROR REPORT==== 14-Mar-2016::19:21:38 ===
AMQP connection <0.424.0> (running), channel 0 - error:
{amqp_error,connection_forced,
            "broker forced connection closure with reason 'shutdown'",none}

=INFO REPORT==== 14-Mar-2016::19:21:38 ===
Halting Erlang VM
對應代碼在 rabbit.erl 中,如下所示

可以看到,最終調用了 init:stop() 來停止 Erlang VM。
      然而,此時雖然 RabbitMQ 服務進程已經退出,但執行 rabbitmqctl stop <pidfile> 的進程卻退不出來!
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps ajxf|grep rabbit
27541 13082 13082 27541 pts/2    13082 S+       0   0:00  |   |   \_ /bin/sh /etc/init.d/rabbitmq stop
13082 13127 13082 27541 pts/2    13082 Sl+      0   0:00  |   |       \_ /usr/local/rabbitmq/erts-6.0/bin/beam.smp -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -hidden -sname rabbitmqctl13127 -boot start_clean -s rabbit_control_main -nodename rabbitmq@rmq_1 -extra stop /var/run/rabbitmq/pid
12431 13687 13686 12431 pts/5    13686 S+       0   0:00  |       \_ grep rabbit
[root@gfsd1 ~]#
通過 Ctrl+C 中斷調用後,可以看到 rabbitmqctl stop <pidfile> 調用進程才終止。
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...

(卡住)

^C
BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded
       (v)ersion (k)ill (D)b-tables (d)istribution
remove pid file and dir
rabbitmq-server.
[root@gfsd1 ~]# ^C
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps ajxf|grep rabbit  
27541  1658  1657 27541 pts/2     1657 S+       0   0:00  |   |   \_ grep rabbit
[root@gfsd1 ~]#  
[root@gfsd1 ~]# 
[root@gfsd1 ~]#  ll /var/run/rabbitmq/pid
ls: cannot access /var/run/rabbitmq/pid: No such file or directory
[root@gfsd1 ~]#

問題分析

既然 RabbitMQ 服務可以正常停止,那麼問題就只有一個:爲什麼 rabbitmqctl stop <pidfile> 會退不出來?讓我們查看一下相關代碼

在 rabbitmqctl 腳本中
...
exec ${ERL_DIR}erl \
    -pa "${RABBITMQ_HOME}/ebin" \
    -noinput \
    -hidden \
    ${RABBITMQ_CTL_ERL_ARGS} \
    -sname rabbitmqctl$$ \
    -boot "${CLEAN_BOOT_FILE}" \
    -s rabbit_control_main \        # 入口函數
    -nodename $RABBITMQ_NODENAME \
    -extra "$@"
在 rabbit_control_main.erl 中
...
-define(EXTERNAL_CHECK_INTERVAL, 1000).
...
start() ->
    ...
    {Command, Opts, Args} =
        case parse_arguments(init:get_plain_arguments(), NodeStr) of
            {ok, Res}  -> Res;
            no_command -> print_error("could not recognise command", []),
                          usage()
        end,
...
    %% The reason we don't use a try/catch here is that rpc:call turns
    %% thrown errors into normal return values
    case catch action(Command, Node, Args, Opts, Inform) of
        ok ->
            case Quiet of
                true  -> ok;
                false -> io:format("...done.~n")
            end,
            rabbit_misc:quit(0);
        {ok, Info} ->
            case Quiet of
                true  -> ok;
                false -> io:format("...done (~p).~n", [Info])
            end,
            rabbit_misc:quit(0);
        ...
    end.
對應到實際情況,我們的 Command 爲 stop ,因此會調用
...
action(stop, Node, Args, _Opts, Inform) ->
    Inform("Stopping and halting node ~p", [Node]),
    Res = call(Node, {rabbit, stop_and_halt, []}),
    case {Res, Args} of
        {ok, [PidFile]} -> wait_for_process_death(              %% 等待指定 pid 的進程退出
                             read_pid_file(PidFile, false));    %% 讀取 pid 文件的內容
        {ok, [_, _| _]} -> exit({badarg, Args});
        _               -> ok
    end,
    Res;
...
wait_for_process_death(Pid) ->
    case process_up(Pid) of
        true  -> timer:sleep(?EXTERNAL_CHECK_INTERVAL),   %% 只要認爲指定 pid 的進程未退出,則每隔 1s 檢查一次
                 wait_for_process_death(Pid);
        false -> ok
    end.
...
% Test using some OS clunkiness since we shouldn't trust
% rpc:call(os, getpid, []) at this point
process_up(Pid) ->
    with_os([{unix, fun () ->
                            run_ps(Pid) =:= 0    %% Linux 中會執行這裏。從上面的斷定知道,只有不等於 0 時,才認爲目標進程已經退出
                    end},
             {win32, fun () ->
                             Cmd = "tasklist /nh /fi \"pid eq " ++ Pid ++ "\" ",
                             Res = rabbit_misc:os_cmd(Cmd ++ "2>&1"),
                             case re:run(Res, "erl\\.exe", [{capture, none}]) of
                                 match -> true;
                                 _     -> false
                             end
                     end}]).

with_os(Handlers) ->
    {OsFamily, _} = os:type(),
    case proplists:get_value(OsFamily, Handlers) of
        undefined -> throw({unsupported_os, OsFamily});
        Handler   -> Handler()
    end.

run_ps(Pid) ->
    Port = erlang:open_port({spawn, "ps -p " ++ Pid},            %% 這裏就是整個問題的關鍵了
                            [exit_status, {line, 16384},
                             use_stdio, stderr_to_stdout]),
    exit_loop(Port).

exit_loop(Port) ->
    receive
        {Port, {exit_status, Rc}} -> Rc;                 %% 對應上面,只有 Rc 不等於 0 時,纔會認爲目標進程已退出
        {Port, _}                 -> exit_loop(Port)
    end.
知道了代碼執行流程,簡單添加一些關鍵打印,重新測試如下

問題復現

存在問題的環境
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps aux|grep rabbit
root     11813  0.0  0.0 106368  1004 pts/2    S    19:21   0:00 /bin/sh /etc/init.d/rabbitmq start
root     11815  0.0  0.0 106100  1200 pts/2    S    19:21   0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server
root     11817 46.1  0.3 2806172 53252 pts/2   Sl   19:21   0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1"
root     12534  0.0  0.0 103252   836 pts/2    S+   19:21   0:00 grep rabbit
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 
[root@gfsd1 ~]# cat /var/run/rabbitmq/pid 
11817
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: /var/run/rabbitmq/pid
Stopping and halting node rabbitmq@rmq_1 ...
Args = ["/var/run/rabbitmq/pid"]
 ...
action_stop => Res = ok   Args = ["/var/run/rabbitmq/pid"]
 ...
read_pid_file => PidFile : "/var/run/rabbitmq/pid"   Wait = false
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Data = {data,{eol,"11817 pts/2    00:00:08 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Data = {data,{eol,"11817 pts/2    00:00:08 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "11817"
run_ps => erlang:open_port -> ps -p "11817"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Exit = {exit_status,0}
...
(永遠停不下來)
...
^C
BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded
       (v)ersion (k)ill (D)b-tables (d)istribution
^Cremove pid file and dir
rabbitmq-server.
[root@gfsd1 ~]# ^C
[root@gfsd1 ~]#
可以看到,不管執行多少次循環,ps 永遠返回 {exit_status,0} ,因此導致 rabbitmqctl stop <pidfile> 無法停止。

正常環境中的輸出
[root@Betty ~]# 
[root@Betty ~]# /etc/init.d/rabbitmq start
Starting rabbitmq-server: SUCCESS
rabbitmq-server.
[root@Betty ~]# 
[root@Betty ~]# 
[root@Betty ~]# ps aux|grep rabbit        
root      2431  0.0  0.0 106456  1004 pts/3    S    14:40   0:00 /bin/sh /etc/init.d/rabbitmq start
root      2434  0.0  0.0 106092  1172 pts/3    S    14:40   0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/sbin/rabbitmq-server
root      2436 42.8  1.4 2296732 56332 pts/3   Sl   14:40   0:02 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672
root      2567  0.0  0.0 103256   856 pts/3    S+   14:40   0:00 grep rabbit
[root@Betty ~]# 
[root@Betty ~]# 
[root@Betty ~]# cat /var/run/rabbitmq/pid 
2436
[root@Betty ~]# 
[root@Betty ~]# 
[root@Betty ~]# ps -p 2436
  PID TTY          TIME CMD
 2436 pts/3    00:00:02 beam.smp
[root@Betty ~]# 
[root@Betty ~]# 
[root@Betty ~]# /etc/init.d/rabbitmq stop
Stopping rabbitmq-server: Stopping and halting node rmq_betty@Betty ...
Args = ["/var/run/rabbitmq/pid"]
 ...
read_pid_file => PidFile : "/var/run/rabbitmq/pid"   Wait = false
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Data = {data,{eol," 2436 pts/3    00:00:02 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Data = {data,{eol," 2436 pts/3    00:00:02 beam.smp"}}
exit_loop => Exit = {exit_status,0}
wait_for_process_death Pid = "2436"
run_ps => erlang:open_port -> ps -p "2436"
exit_loop => Data = {data,{eol,"  PID TTY          TIME CMD"}}
exit_loop => Exit = {exit_status,1}
...done.
rabbitmq-server.
[root@Betty ~]#
可以看到,循環檢測 3 次後就返回了 {exit_status,1} ,從而 rabbitmqctl stop <pidfile> 可以正常退出。

問題深刻

深刻研究一下 erlang:open_port/2 的使用

手冊內容以下:

-=-=-=-=-=- 我是開始研究大數據問題的分隔線 -=-=-=-=-=-=-

open_port(PortName, PortSettings) -> port()
Returns a port identifier as the result of opening a new Erlang port. A port can be seen as an external Erlang process.

PortName is one of the following:

{spawn, Command}
Starts an external program.  Command is the name of the external program which will be run.  Command runs
outside the Erlang work space unless an Erlang driver with the name Command is found. If found, that driver will
be started. A driver runs in the Erlang workspace, which means that it is linked with the Erlang runtime system.
When starting external programs on Solaris, the system call vfork is used in preference to  fork for performance
reasons, although it has a history of being less robust. If there are problems with using  vfork, setting the
environment variable ERL_NO_VFORK to any value will cause fork to be used instead.
For external programs, the PATH is searched (or an equivalent method is used to find programs, depending on
operating system). This is done by invoking the shell on certain platforms. The first space separated token of
the command will be considered as the name of the executable (or driver). This (among other things) makes
this option unsuitable for running programs having spaces in file or directory names. Use {spawn_executable,
Command} instead if spaces in executable file names is desired.

{line, L}
Messages are delivered on a per line basis. Each line (delimited by the OS-dependent newline sequence) is
delivered in one single message. The message data format is {Flag, Line}, where  Flag is either  eol or
noeol and Line is the actual data delivered (without the newline sequence).
消息以行爲基礎單位進行投遞;
每一行輸出內容(由操作系統相關的換行序列確定)作爲單獨一條消息投遞;
消息數據格式爲 {Flag, Line} ,其中 Flag 是 eol 或者 noeol ;Line 爲實際投遞的數據內容(不包含換行序列)

L specifies the maximum line length in bytes. Lines longer than this will be delivered in more than one message,
with the  Flag set to  noeol for all but the last message. If end of file is encountered anywhere else than
immediately following a newline sequence, the last line will also be delivered with the Flag set to noeol. In
all other cases, lines are delivered with Flag set to eol.
L 指定了以字節爲單位的最大行內容長度;
超過規定長度的行,將會以多條消息進行投遞,除最後一條消息外,均設置 Flag 爲 noeol ;
若遇到了文件的結束,但沒有換行序列,則最後一行內容一樣會被投遞,且設置 Flag 爲 noeol ;
在其他情況下,行數據會設置 Flag 爲 eol 進行投遞。

The {packet, N} and {line, L} settings are mutually exclusive.

exit_status
This is only valid for  {spawn, Command} where  Command refers to an external program, and for
{spawn_executable, Command}.

When the external process connected to the port exits, a message of the form  {Port,
{exit_status,Status}} is sent to the connected process, where Status is the exit status of the external
process. If the program aborts, on Unix the same convention is used as the shells do (i.e., 128+signal).
If the eof option has been given as well, the eof message and the exit_status message appear in an
unspecified order.
當與 port 連接的外部進程退出時,形式爲 {Port,{exit_status,Status}} 的消息會被發送到與該 port 連接的 erlang 進程,其中 Status 的內容爲外部進程的退出狀態;
如果外部程序異常終止(abort),在 Unix 系統中,將採用與 shell 相同的退出碼約定(即 128+signal)

If the port program closes its stdout without exiting, the exit_status option will not work.
如果 port 程序關閉了其 stdout 卻沒有退出,則 exit_status 選項不起作用。

use_stdio
This is only valid for {spawn, Command} and {spawn_executable, Command}. It allows the standard
input and output (file descriptors 0 and 1) of the spawned (UNIX) process for communication with Erlang.
該選項允許被創建的 Unix 進程通過 stdin 和 stdout(文件描述符 0 和 1)與 Erlang 進行通信。

stderr_to_stdout
Affects ports to external programs. The executed program gets its standard error file redirected to its standard
output file. stderr_to_stdout and nouse_stdio are mutually exclusive.
該選項對外部程序的 port 產生影響;
可執行程序的 stderr 會被重定向到其 stdout 上;
stderr_to_stdout 與 nouse_stdio 互斥,不能同時使用。

-=-=-=-=-=- 我是開始研究大數據問題的分隔線 -=-=-=-=-=-=-

從手冊中可以知道,RabbitMQ 代碼中是通過 Erlang Port 調用操作系統的 ps 命令,並根據該命令的退出狀態碼(exit_status)來判定對應的進程是否已停止的。



但在我們的環境中,卻遇到了外部進程(ps)退出狀態碼不正確的問題。

解決辦法

1.修改 init 腳本,將調用 rabbitmqctl stop <pidfile> 的地方,改成 rabbitmqctl stop(需要相應調整針對 pidfile 的各種條件判定);
2.不要使用 VMware 虛擬機,因爲在 KVM 虛擬機上沒有發現此問題(VMware 粉不要噴我~)。(後續更新:根本原因其實與 VMware 無關,而是 ps 文件被替換,詳見文章最後新增部分。)



---------------------------- ---------------------------- ---------------------------- ----------------------------
重大發現:根本原因是服務器被攻擊了,ps 文件被替換掉了!

【被攻擊機器】

確認問題 ps 的退出碼
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps -p 123
  PID TTY          TIME CMD
  123 ?        00:00:00 kthrotld/1
[root@gfsd1 ~]# 
[root@gfsd1 ~]# echo $?
0
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ps -p 1234
  PID TTY          TIME CMD
[root@gfsd1 ~]# 
[root@gfsd1 ~]# echo $?   
0                          -- 正常的 ps 程序,此處應該返回 1
[root@gfsd1 ~]#
確認問題 ps 程序有哪些特徵
[root@gfsd1 ~]# uname -a
Linux gfsd1 2.6.32-431.el6.x86_64 #1 SMP Fri Nov 22 03:15:09 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux
[root@gfsd1 ~]#

ps 文件大小和正常機器上不一樣
[root@gfsd1 ~]# ll /bin/ps 
-rwxr-xr-x 1 root root 1223123 Mar 18 10:42 /bin/ps
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 

64bit 機器上出現了 32bit 的 ps 文件
[root@gfsd1 ~]# file /bin/ps   
/bin/ps: ELF 32-bit LSB executable, Intel 80386, version 1 (SYSV), statically linked, for GNU/Linux 2.2.5, not stripped
[root@gfsd1 ~]# 

正常機器上 ps 是動態連接的
[root@gfsd1 ~]# ldd /bin/ps       
        not a dynamic executable
[root@gfsd1 ~]#
跟蹤問題 ps 的執行
[root@gfsd1 ~]# 
[root@gfsd1 ~]# strace -f ps -p 1234
execve("/bin/ps", ["ps", "-p", "1234"], [/* 32 vars */]) = 0   -- 調用問題 ps 程序
[ Process PID=14740 runs in 32 bit mode. ]
...
set_thread_area(0xffc3db64)             = 0
set_tid_address(0xa12c878)              = 14740
...
getrlimit(RLIMIT_STACK, {rlim_cur=-4284481536, rlim_max=564187122106368*1024}) = 0
_sysctl({0x2081076ec, -3941488, (nil), (nil), (nil), 18429817727855099905}) = 0
...
close(3)                                = -1 EBADF (Bad file descriptor)
...
close(1023)                             = -1 EBADF (Bad file descriptor)
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
stat64("/bin/ps", {st_mode=S_IFREG|0755, st_size=1223123, ...}) = 0
getppid()                               = 14734
readlink("/proc/14734/exe", "/usr/bin/strace", 255) = 15
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
readlink("/proc/14740/exe", "/bin/ps", 1024) = 7
access("/usr/bin/dpkgd/ps", F_OK)       = 0
pipe([3, 4])                            = 0
clone(Process 14845 attached
child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0) = 14845    -- 創建了一個子進程(正常 ps 沒有該行爲)
...
[pid 14845] dup2(4, 1 <unfinished ...>
[pid 14740] <... fstat64 resumed> {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
[pid 14845] <... dup2 resumed> )        = 1
[pid 14740] mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
[pid 14845] close(4)                    = 0
...
[pid 14845] execve("/bin/sh", ["sh", "-c", "/usr/bin/dpkgd/ps -p 1234"], [/* 40 vars */] <unfinished ...>   -- 在新建的子進程中調用的是 /usr/bin/dpkgd/ 目錄下的 ps 文件!!!
[pid 14740] read(3,  <unfinished ...>
[pid 14845] <... execve resumed> )      = 0
[ Process PID=14845 runs in 64 bit mode. ]
...
[pid 14845] getdents(4, /* 281 entries */, 32768) = 7352
[pid 14845] stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/1/stat", O_RDONLY) = 5
[pid 14845] read(5, "1 (init) S 0 1 1 0 -1 4202752 30"..., 1023) = 260
[pid 14845] close(5)                    = 0
[pid 14845] open("/proc/1/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 870
[pid 14845] close(5)                    = 0
[pid 14845] stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/2/stat", O_RDONLY) = 5
[pid 14845] read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 155
[pid 14845] close(5)                    = 0
[pid 14845] open("/proc/2/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 656
[pid 14845] close(5)                    = 0
...
[pid 14845] stat("/proc/28465", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
[pid 14845] open("/proc/28465/stat", O_RDONLY) = 5
[pid 14845] read(5, "28465 (inet_gethost) S 28451 284"..., 1023) = 237
[pid 14845] close(5)                    = 0
[pid 14845] open("/proc/28465/status", O_RDONLY) = 5
[pid 14845] read(5, "Name:\tinet_gethost\nState:\tS (sle"..., 1023) = 884
[pid 14845] close(5)                    = 0
[pid 14845] getdents(4, /* 0 entries */, 32768) = 0
[pid 14845] close(4)                    = 0
[pid 14845] fstat(1, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
[pid 14845] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4528922000
[pid 14845] write(1, "  PID TTY          TIME CMD\n", 28) = 28
[ Process PID=14740 runs in 32 bit mode. ]
[pid 14740] <... read resumed> "  PID TTY          TIME CMD\n", 4096) = 28
[ Process PID=14845 runs in 64 bit mode. ]
[pid 14845] exit_group(1)               = ?                        -- 子進程中的 ps 退出碼爲 1
Process 14845 detached
[ Process PID=14740 runs in 32 bit mode. ]
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xfffffffff778a000
--- SIGCHLD (Child exited) @ 0 (0) ---
write(1, "  PID TTY          TIME CMD\n", 28  PID TTY          TIME CMD
) = 28
read(3, "", 4096)                       = 0
close(3)                                = 0
waitpid(14845, [{WIFEXITED(s) && WEXITSTATUS(s) == 1}], 0) = 14845      -- 能夠看到,問題 ps 成功拿到了正確 ps 的退出碼
munmap(0xf778b000, 4096)                = 0
munmap(0xf778a000, 4096)                = 0
exit_group(0)                           = ?       -- 但問題 ps 最終以退出碼 0 退出
[root@gfsd1 ~]#
由上面的信息,可以知道原本正常的 ps 程序被移動到了下面的位置
[root@gfsd1 ~]# ll /usr/bin/dpkgd/
total 436
-rwxr-xr-x 1 root root 145872 Jan 28 05:00 lsof
-rwxr-xr-x 1 root root 128192 Jan 28 05:00 netstat
-rwxr-xr-x 1 root root  87088 Jan 28 05:00 ps
-rwxr-xr-x 1 root root  74840 Jan 28 05:00 ss
[root@gfsd1 ~]# 
[root@gfsd1 ~]# file /usr/bin/dpkgd/*
/usr/bin/dpkgd/lsof:    ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/netstat: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/ps:      ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
/usr/bin/dpkgd/ss:      ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
[root@gfsd1 ~]# 
[root@gfsd1 ~]# 
[root@gfsd1 ~]# ldd /usr/bin/dpkgd/* 
/usr/bin/dpkgd/lsof:
        linux-vdso.so.1 =>  (0x00007fff86bff000)
        libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
        libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
        libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
        /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/netstat:
        linux-vdso.so.1 =>  (0x00007fff3ddc2000)
        libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
        libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
        libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
        /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/ps:
        linux-vdso.so.1 =>  (0x00007fff961b6000)
        libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000)
        libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003ef8600000)
        libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
        libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000)
        /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
/usr/bin/dpkgd/ss:
        linux-vdso.so.1 =>  (0x00007fffb51e1000)
        libresolv.so.2 => /lib64/libresolv.so.2 (0x0000003ef9e00000)
        libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000)
        /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000)
[root@gfsd1 ~]#
可以看到,被替換掉的可執行程序不止 ps 一個(還包括 lsof、netstat、ss)...

【正常機器】

正常 ps 程序的特徵
[root@YOYO ~]# uname -a
Linux YOYO 2.6.32-358.el6.x86_64 #1 SMP Fri Feb 22 00:31:26 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux
[root@YOYO ~]# 
[root@YOYO ~]# ll /bin/ps 
-rwxr-xr-x. 1 root root 87088 11月 15 2012 /bin/ps
[root@YOYO ~]# 
[root@YOYO ~]# file /bin/ps 
/bin/ps: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped
[root@YOYO ~]# 
[root@YOYO ~]# ldd /bin/ps 
        linux-vdso.so.1 =>  (0x00007fffb6d24000)
        libselinux.so.1 => /lib64/libselinux.so.1 (0x00000034e7000000)
        libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003f38400000)
        libc.so.6 => /lib64/libc.so.6 (0x0000003f38000000)
        libdl.so.2 => /lib64/libdl.so.2 (0x0000003f37c00000)
        /lib64/ld-linux-x86-64.so.2 (0x0000003f37800000)
[root@YOYO ~]#
跟蹤正常 ps 程序的執行
[root@YOYO ~]# strace -f ps -p 1234                
execve("/bin/ps", ["ps", "-p", "1234"], [/* 28 vars */]) = 0
...
getdents(4, /* 241 entries */, 32768)   = 6192
stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/1/stat", O_RDONLY)          = 5
read(5, "1 (init) S 0 1 1 0 -1 4202752 29"..., 1023) = 256
close(5)                                = 0
open("/proc/1/status", O_RDONLY)        = 5
read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 866
close(5)                                = 0
stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/2/stat", O_RDONLY)          = 5
read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 156
close(5)                                = 0
open("/proc/2/status", O_RDONLY)        = 5
read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 654
close(5)                                = 0
...
stat("/proc/25701", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0
open("/proc/25701/stat", O_RDONLY)      = 5
read(5, "25701 (bash) S 25697 25701 25701"..., 1023) = 257
close(5)                                = 0
open("/proc/25701/status", O_RDONLY)    = 5
read(5, "Name:\tbash\nState:\tS (sleeping)\nT"..., 1023) = 879
close(5)                                = 0
getdents(4, /* 0 entries */, 32768)     = 0
close(4)                                = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 2), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4d2da3c000
write(1, "  PID TTY          TIME CMD\n", 28  PID TTY          TIME CMD
) = 28
exit_group(1)                           = ?
[root@YOYO ~]#
可以看到,正常情況下,當指定的 pid 不存在時,ps -p xxx 是以退出碼 1 退出的。

---------------------------- ---------------------------- ---------------------------- ----------------------------

與此木馬程序相關的文章:這裏 。
系統漏洞相關文檔:《NTRG_ElasticBotnetReport_06102015》
漏洞號:CVE-2015-1427

問題 ps 中的符號片斷
linux

...
08048b30 T CAttackIe::UpdateCurVariant(unsigned int)
08048b36 T CTcpAttack::UpdateCurVariant(unsigned int)
08048be4 T CAttackIcmp::MakePacket(unsigned int)
08048c5a T CAttackSyn::MakePacket(unsigned int)
08048d02 T CAttackUdp::MakePacket(unsigned int)
08048dce T CPacketAttack::SendPacket()
08048eac T CPacketAttack::Create()
08048f7e T CAttackAmp::Create()
08049070 T CPacketAttack::UpdateCurVariant(unsigned int)
08049306 T CTcpAttack::Do(unsigned int)
08049546 T CAttackPrx::MakePacket(unsigned int)
080497d6 T CAttackAmp::MakePacket(unsigned int)
0804996c T CAttackDns::MakePacket(unsigned int)
08049bf8 T CAttackCc::Destroy()
08049cc8 T CAttackCc::Create()
08049e8c T CAttackTns::Destroy()
08049f5c T CAttackTns::Create()
0804a05a T CAttackPrx::Create()
0804a2a6 T CAttackDns::Create()
0804a4f2 T CAttackCompress::CreatePackets()
0804a6ec T CAttackCompress::MakePacket(unsigned int)
0804a8be T CAttackCompress::Create()
...
相關文章
相關標籤/搜索