前段時間的兩個工作.
一個是entity集群庫, 可以通過entity_id調用任意節點上的entity.
一個是名字服務, 可以爲一系列pid註冊名字, 並可以用這些名字調用對應的pid.
兩者都會遇到同樣的一些問題: 當我們使用GenServer.call/2時, 發生了什麼, 會有什麼異常情況發生? 哪些異常應該捕獲? 以什麼樣的方式處理這些異常/錯誤?
當call的pid所在的node崩潰時, 會有什麼異常? 在調用開始前崩潰和在調用過程中崩潰, 有什麼不同?
當call的pid所在的node網絡突然中斷呢? 會有什麼表現?
當call的pid崩潰時呢?
是否應該捕獲timeout?
這些問題在文檔中並沒有答案. 所以, 探索一下.
erlang: OTP-21.0.9
gen_server.erl:203
%% -----------------------------------------------------------------
%% Make a call to a generic server.
%% If the server is located at another node, that node will
%% be monitored.
%% If the client is trapping exits and is linked server termination
%% is handled here (? Shall we do that here (or rely on timeouts) ?).
%% -----------------------------------------------------------------
%% call/2: delegates to gen:call/3 and unwraps a successful {ok, Res}.
%% Any {'EXIT', Reason} produced by the `catch` is re-raised as an exit
%% whose reason is wrapped together with this call's {Module, call, Args},
%% which is why callers see exit reasons shaped like
%% {Reason, {gen_server, call, [Name, Request]}}.
call(Name, Request) ->
    case catch gen:call(Name, '$gen_call', Request) of
        {ok,Res} ->
            Res;
        {'EXIT',Reason} ->
            exit({Reason, {?MODULE, call, [Name, Request]}})
    end.

%% call/3: same as call/2, but passes an explicit Timeout to gen:call/4
%% and includes it in the argument list of the wrapped exit reason.
call(Name, Request, Timeout) ->
    case catch gen:call(Name, '$gen_call', Request, Timeout) of
        {ok,Res} ->
            Res;
        {'EXIT',Reason} ->
            exit({Reason, {?MODULE, call, [Name, Request, Timeout]}})
    end.
gen.erl:160
%% do_call/4: performs one synchronous request against Process (any
%% non-atom process identifier, per the guard).
%%
%% Returns {ok, Reply} when a reply tagged with the monitor reference
%% arrives. Otherwise the calling process exits with:
%%   {nodedown, Node} - the monitor fired with reason noconnection
%%   Reason           - the monitor fired with any other reason
%%   timeout          - nothing matched within Timeout
do_call(Process, Label, Request, Timeout) when is_atom(Process) =:= false ->
    %% The monitor reference doubles as the unique tag for the reply.
    Mref = erlang:monitor(process, Process),

    %% OTP-21:
    %% Auto-connect is asynchronous. But we still use 'noconnect' to make sure
    %% we send on the monitored connection, and not trigger a new auto-connect.
    %%
    erlang:send(Process, {Label, {self(), Mref}, Request}, [noconnect]),

    receive
        {Mref, Reply} ->
            %% Success: drop the monitor and flush any pending 'DOWN' for it.
            erlang:demonitor(Mref, [flush]),
            {ok, Reply};
        {'DOWN', Mref, _, _, noconnection} ->
            %% A noconnection 'DOWN' is reported to the caller as nodedown.
            Node = get_node(Process),
            exit({nodedown, Node});
        {'DOWN', Mref, _, _, Reason} ->
            %% Any other 'DOWN' reason is propagated unchanged.
            exit(Reason)
    after Timeout ->
            %% No reply in time: drop the monitor before exiting.
            erlang:demonitor(Mref, [flush]),
            exit(timeout)
    end.
可以看到, call一個process的過程: 先monitor目標process, 再以noconnect選項發送請求, 然後在receive中等待回覆、DOWN消息或超時.
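可能的情況有: 收到{Mref, Reply}, 正常返回{ok, Reply}; 收到reason爲noconnection的DOWN, exit({nodedown, Node}); 收到其它reason的DOWN, exit(Reason); 超過Timeout仍無消息, exit(timeout).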
那麼, 前面的各類異常, 會對應到哪些情況呢? 有沒有意外?
先看看monitor一個process時到底做了什麼.
erlang.erl:1291
%% A locally registered name is an atom; a registered process may also be
%% addressed as {Name, Node}.
-type registered_name() :: atom().
-type registered_process_identifier() :: registered_name() | {registered_name(), node()}.
%% A process monitor target is a pid, a registered name, or {Name, Node}.
-type monitor_process_identifier() :: pid() | registered_process_identifier().
-type monitor_port_identifier() :: port() | registered_name().

%% monitor/2
%% Accepts three kinds of monitor: process, port and time_offset.
%% Every variant returns a reference identifying the monitor.
-spec monitor
      (process, monitor_process_identifier()) -> MonitorRef
          when MonitorRef :: reference();
      (port, monitor_port_identifier()) -> MonitorRef
          when MonitorRef :: reference();
      (time_offset, clock_service) -> MonitorRef
          when MonitorRef :: reference().

%% Placeholder body: it only calls erlang:nif_error(undefined), so there is
%% no Erlang-level implementation to read here.
monitor(_Type, _Item) ->
    erlang:nif_error(undefined).
在monitor process時, 目標可以是pid, name, 或{name, node}元組. 但這裏沒有具體實現, 找一下nif.
調試發現, 入口在bif.c:monitor_2
Thread 5 "1_scheduler" hit Breakpoint 9, erts_monitor_create (type=type@entry=0, ref=ref@entry=140046391205050, orgn=3092376454563, trgt=trgt@entry=2645699855571, name=name@entry=18446744073709551611) at beam/erl_monitor_link.c:759 759 { (gdb) bt #0 erts_monitor_create (type=type@entry=0, ref=ref@entry=140046391205050, orgn=3092376454563, trgt=trgt@entry=2645699855571, name=name@entry=18446744073709551611) at beam/erl_monitor_link.c:759 #1 0x00000000004d5029 in monitor_2 (A__p=0x7f5f36e803e0, BIF__ARGS=0x7f5f379c0100, A__I=<optimized out>) at beam/bif.c:514 #2 0x000000000044042e in process_main () at x86_64-unknown-linux-gnu/opt/smp/beam_cold.h:59 #3 0x000000000043a0c6 in sched_thread_func (vesdp=0x7f5f35e44dc0) at beam/erl_process.c:8332 #4 0x00000000006467c9 in thr_wrapper (vtwd=0x7ffdb57b4180) at pthread/ethread.c:118 #5 0x00007f5f78d5c6ba in start_thread (arg=0x7f5f34e7f700) at pthread_create.c:333 #6 0x00007f5f7888a41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
先看本地pid的註冊, 當target爲atom時, 只是取一下pid, 然後goto local_process, 邏輯上只有name是否有值的區別.
// 如果本地pid, 名字爲空 if (is_internal_pid(target)) { name = NIL; id = target; local_process: // make ref in call process ref = erts_make_ref(BIF_P); // 判斷monitor的進程非調用進程 if (id != BIF_P->common.id) { // create monitor data mdp = erts_monitor_create(ERTS_MON_TYPE_PROC, ref, BIF_P->common.id, id, name); // insert to process monitor tree erts_monitor_tree_insert(&ERTS_P_MONITORS(BIF_P), &mdp->origin); // 從這裏開始, 遠程pid的monitor有邏輯上的不一樣 // 給調用進程mdp->target發sig, monitor target id. if (!erts_proc_sig_send_monitor(&mdp->target, id)) erts_proc_sig_send_monitor_down(&mdp->target, am_noproc); } BIF_RET(ref); }
再看遠程pid的註冊
// Remote pid: look up the distribution entry of the pid's node.
if (is_external_pid(target)) {
    ErtsDSigData dsd;
    int code;

    dep = external_pid_dist_entry(target);
    // An external pid whose dist entry is this node's own entry is handled
    // by the noproc path.
    if (dep == erts_this_dist_entry)
        goto noproc;
    id = target;
    name = NIL;
    byname = 0;

remote_process:
    ref = erts_make_ref(BIF_P);
    mdp = erts_monitor_create(ERTS_MON_TYPE_DIST_PROC, ref, BIF_P->common.id, id, name);
    erts_monitor_tree_insert(&ERTS_P_MONITORS(BIF_P), &mdp->origin);

    // From here on the logic differs from the local-pid path.
    code = erts_dsig_prepare(&dsd, dep, BIF_P, ERTS_PROC_LOCK_MAIN, ERTS_DSP_RLOCK, 0, 1);
    switch (code) {
    // If the connection is not established at call time, or the remote node
    // is NOT_ALIVE, a monitor-down with reason noconnection is sent at once.
    case ERTS_DSIG_PREP_NOT_ALIVE:
    case ERTS_DSIG_PREP_NOT_CONNECTED:
        erts_monitor_set_dead_dist(&mdp->target, dep->sysname);
        erts_proc_sig_send_monitor_down(&mdp->target, am_noconnection);
        code = ERTS_DSIG_SEND_OK;
        break;
    // Connection pending or already established.
    case ERTS_DSIG_PREP_PENDING:
    case ERTS_DSIG_PREP_CONNECTED: {
#ifdef DEBUG
        int inserted =
#endif
        // monitor a process (named or unnamed) on another node
        erts_monitor_dist_insert(&mdp->target, dep->mld);
        ASSERT(inserted);
        erts_de_runlock(dep);
        // Send the monitor request to the remote node.
        code = erts_dsig_send_monitor(&dsd, BIF_P->common.id, target, ref);
        break;
    }
    default:
        ERTS_ASSERT(! "Invalid dsig prepare result");
        code = ERTS_DSIG_SEND_OK;
        break;
    }

    // NOTE(review): byname is presumably set by the {name, node} path that
    // jumps to remote_process; that path is not shown here - confirm.
    if (byname)
        erts_deref_dist_entry(dep);

    if (code == ERTS_DSIG_SEND_YIELD)
        ERTS_BIF_YIELD_RETURN(BIF_P, ref);
    BIF_RET(ref);
}
若調用中pid退出, 本地pid, 會直接調用erl_proc_sig_queue.c:erts_proc_sig_send_monitor_down.
遠端pid, 會收到DOP_MONITOR_P_EXIT消息. dist.c:1665
case DOP_MONITOR_P_EXIT: {
    /* We are monitoring a process on the remote node which dies, we get
       {DOP_MONITOR_P_EXIT, Remote pid or name, Local pid, ref, reason} */

    /* The control message must be a 5-tuple. */
    if (tuple_arity != 5) {
        goto invalid_message;
    }

    watched = tuple[2];  /* remote proc or name which died */
    watcher = tuple[3];
    ref     = tuple[4];
    reason  = tuple[5];

    /* Validate the element types before acting on the message. */
    if (is_not_ref(ref))
        goto invalid_message;

    if (is_not_external_pid(watched) && is_not_atom(watched))
        goto invalid_message;

    if (is_not_internal_pid(watcher)) {
        if (!is_external_pid(watcher))
            goto invalid_message;
        /* An external-format watcher pid that maps to this node's own dist
           entry is silently ignored; any other external pid is invalid. */
        if (erts_this_dist_entry == external_pid_dist_entry(watcher))
            break;
        goto invalid_message;
    }

    /* Forward the monitor-down to the local watcher process. */
    erts_proc_sig_send_dist_monitor_down(dep, ref, watched, watcher, reason);
    break;
}
最終和本地pid down一樣, 都會通過send_gen_exit_signal, 通知monitor的進程.
dist.c:erts_do_net_exits
dist.c:schedule_con_monitor_link_cleanup
dist.c:con_monitor_link_cleanup
上述過程: 檢測到網絡斷開後, 對本node上monitor的pid發送reason信息. TODO: 調試拿一下callstack.
先將EchoService的代碼貼出.
defmodule Service.Echo do
  @moduledoc """
  Minimal echo `GenServer` used to observe how `GenServer.call/3` behaves when
  the callee is slow, raises, or becomes unreachable.

  Supported calls:

    * `{:sleep, timeout}` - sleeps `timeout` milliseconds, then replies with
      the request itself.
    * `{:raise_after, timeout, raise_msg}` - sleeps `timeout` milliseconds,
      then raises `raise_msg`; the server crashes without replying.
    * any other term - replies immediately with the request itself.
  """
  use GenServer
  require Record

  # Empty state record: the server keeps no data between calls.
  Record.defrecordp :state, []

  @doc """
  Starts the server, registered locally under the module name.
  """
  def start_link(params) do
    GenServer.start_link(__MODULE__, params, name: __MODULE__)
  end

  @impl true
  def init(_) do
    {:ok, state()}
  end

  # Plain messages and casts are ignored.
  @impl true
  def handle_info(_what, state) do
    {:noreply, state}
  end

  @impl true
  def handle_cast(_what, state) do
    {:noreply, state}
  end

  @impl true
  def handle_call({:sleep, timeout} = what, _from, state) do
    # Block the server so the caller can observe slow-callee behaviour.
    :timer.sleep(timeout)
    {:reply, what, state}
  end

  def handle_call({:raise_after, timeout, raise_msg}, _from, state) do
    _ = state
    :timer.sleep(timeout)
    # Crash on purpose: no reply is ever sent, so the caller exits with the
    # raised exception as the reason.
    raise raise_msg
  end

  def handle_call(what, _from, state) do
    {:reply, what, state}
  end
end
iex(xxxxxx@xxxxxx.)47> GenServer.call({Service.Echo, :"xxxxx@xxxxxx."}, :hello) ** (exit) exited in: GenServer.call({Service.Echo, :"xxxxx@xxxxxx."}, :hello, 5000) ** (EXIT) no connection to xxxxx@xxxxxx. (elixir) lib/gen_server.ex:924: GenServer.call/3
iex(xxxxx@xxxxx.)47> xxxxx.call(:echo_service, {:sleep, 50000}, 100000) ** (exit) exited in: GenServer.call(#PID<45205.2967.0>, {:sleep, 50000}, 100000) ** (EXIT) shutdown (elixir) lib/gen_server.ex:924: GenServer.call/3 (xxxxxxxxx) lib/xxxxxxx/xxxxxxxxx.ex:35: xxxxxxx/4
iex(xxxxxx@xxxxxx.)54> GenServer.call({Service.Echo, :"xxxxxx@xxxxxx."}, {:sleep, 1000000}, 50000000) ** (exit) exited in: GenServer.call({Service.Echo, :"xxxxxx@xxxxxx."}, {:sleep, 1000000}, 50000000) ** (EXIT) no connection to xxxxxx@xxxxxx. (elixir) lib/gen_server.ex:924: GenServer.call/3
iex(xxxxx@xxxxx.)48> GenServer.call({Service.Echo, :"xxxxxxx@xxxxxxx."}, {:raise_after, 3000, "hello"}) ** (exit) exited in: GenServer.call({Service.Echo, :"xxxxxxx@xxxxxx."}, {:raise_after, 3000, "hello"}, 5000) ** (EXIT) an exception was raised: ** (RuntimeError) hello lib/echo_service.ex:99: Service.Echo.handle_call/3 (stdlib) gen_server.erl:661: :gen_server.try_handle_call/4 (stdlib) gen_server.erl:690: :gen_server.handle_msg/6 (stdlib) proc_lib.erl:249: :proc_lib.init_p_do_apply/3 (elixir) lib/gen_server.ex:924: GenServer.call/3
用docker network disconnect multi-host-network xxxxx_echo_service模擬拔網線的效果
大約30-60S後, exit by no connection
iex(xxxxx@xxxxx.)48> GenServer.call({Service.Echo, :"xxxxxx@xxxxxxx."}, {:sleep, 1000000}, 50000000) ** (exit) exited in: GenServer.call({Service.Echo, :"xxxxxx@xxxxxxx."}, {:sleep, 1000000}, 50000000) ** (EXIT) no connection to xxxxxx@xxxxxxx. (elixir) lib/gen_server.ex:924: GenServer.call/3