x_service寫文件是單點存在瓶頸, 使用進程池來消除瓶頸.
分配進程時使用的隨機函數(Enum.random), 會調用Enumerable.count來得到List長度.
使用的Elixir發佈工具exrm有bug, 沒把調用固化(consolidated). 致使Enumerable.count每次調用時, 都須要call code_server來確認List模塊是否存在, 在訪問發生抖動時, 若分配主進程忽然積累了大量消息, call code_server 會變得更慢, 致使處理速度變慢, 消息越積越多, 性能愈來愈差, 並致使內存暴漲.html
Enum.map(:erlang.processes(), fn proc -> {:erlang.process\_info(proc, :memory), proc} end) |> Enum.sort(fn({{\_, a}, \_}, {{\_, b}, \_}) -> a > b end) |> List.first
對該進程process_info, 發現該進程有極高的reductions, message_queue_len, memorymongodb
iex(xxxx@xxxx)5> :erlang.process_info(pid) [registered_name: xxx, current_function: {:code_server, :call, 1}, initial_call: {:proc_lib, :init_p, 5}, status: :running, message_queue_len: 1426935, messages: [notify: {:log, :"xxx.xxxxx.xxxxxxx", <<107, 0, 0, 0, 3, 100, 97, 116, 97, 0, 82, 0, 0, 0, 2, 112, 108, 97, 121, 101, 114, 95, 115, 101, 115, 115, 105, 111, 110, 0, 11, 0, 0, 0, 52, 74, 85, 95, 89, 95, 55, ...>>}, ............ ...], links: [#PID<0.1749.0>], dictionary: [rand_seed: {%{bits: 58, jump: #Function<8.15449617/1 in :rand.mk_alg/1>, next: #Function<5.15449617/1 in :rand.mk_alg/1>, type: :exrop, uniform: #Function<6.15449617/1 in :rand.mk_alg/1>, uniform_n: #Function<7.15449617/2 in :rand.mk_alg/1>, weak_low_bits: 1}, [92689310913950825 | 4906624003405292]}, "$initial_call": {GenEvent, :init_it, 6}, "$ancestors": [XXX.Route.Supervisor, XXX.Supervisor, #PID<0.1747.0>]], trap_exit: false, error_handler: :error_handler, priority: :normal, group_leader: #PID<0.1746.0>, total_heap_size: 28600107, heap_size: 18481566, stack_size: 45, reductions: 7024105801, garbage_collection: [max_heap_size: %{error_logger: true, kill: true, size: 0}, min_bin_vheap_size: 46422, min_heap_size: 233, fullsweep_after: 65535, minor_gcs: 2], suspending: []]
該進程是個GenEvent進程有三個handle_event({:log, name, bin}, state)函數, 只有寫文件的handle_event有動過. 懷疑Enum.random的性能太差. benchmark發現平均耗時在0.7μs. 不會成爲瓶頸.app
壓測環境沒法重現, 只好將代碼回滾.
回滾後, 線上再也不出現內存暴漲.dom
線上elixir版本:async
Erlang/OTP 20 [erts-9.0] [source] [64-bit] [smp:16:16] [ds:16:16:10] [async-threads:10] [kernel-poll:false] Interactive Elixir (1.5.1) - press Ctrl+C to exit (type h() ENTER for help)
壓測環境elixir版本:ide
Erlang/OTP 20 [erts-9.3] [source] [64-bit] [smp:16:16] [ds:16:16:10] [async-threads:10] [hipe] [kernel-poll:false] Interactive Elixir (1.6.4) - press Ctrl+C to exit (type h() ENTER for help)
懷疑是elixir版本不同致使.
將壓測環境的elixir版本調整至和線上一致後, 重現過內存暴漲問題. 但沒法穩定重現.
發現 ctrl + c 後再 continue, 製造抖動能夠穩定重現內存暴漲問題.
去掉def handle_event({:log, _name, _bin}中的Enum.random後(用取餘輪轉代替random), 問題再也不出現.函數
由於去掉Enum.random能夠解決問題, 且只有特定elixir版本可重現問題, 因而比較elixir Enum.random在不一樣版本下的實現.工具
Enum.random最終會調用, random_integer(0, count_of_enumerable)
enum.ex:2763性能
defp random_integer(lower_limit, upper_limit) do lower_limit + :rand.uniform(upper_limit - lower_limit + 1) - 1 end
rand.erl:281測試
uniform/1 -spec uniform(N :: pos_integer()) -> X :: pos_integer(). uniform(N) -> {X, Seed} = uniform_s(N, seed_get()), _ = seed_put(Seed), X.
rand.erl:403
seed_get() -> case get(?SEED_DICT) of undefined -> seed(?DEFAULT_ALG_HANDLER); Old -> Old % no type checking here end.
存儲/得到seed用的進程字典, 效率不用擔憂.
由於elixir1.6.0版本就沒法重現問題(v1.5.1->v1.5.3均可重現). 對比了elixir v1.5.1/v1.6.3, 未發現可疑之處.
經過:observer.start能夠看到MainChannel的調用棧, 並trace進程的消息, 每一次Enum.random調用, 都會有一次code_server:call. 以前線上的調用棧, 也是call code_server, 這必定不是巧合.
https://elixir-lang.org/getti...
When working with Elixir projects, using the Mix build tool, you may see the output as follows:
Consolidated String.Chars Consolidated Collectable Consolidated List.Chars Consolidated IEx.Info Consolidated Enumerable Consolidated Inspect
Those are all protocols that ship with Elixir and they are being consolidated. Because a protocol can dispatch to any data type, the protocol must check on every call if an implementation for the given type exists. This may be expensive.
However, after our project is compiled using a tool like Mix, we know all modules that have been defined, including protocols and their implementations. This way, the protocol can be consolidated into a very simple and fast dispatch module.
From Elixir v1.2, protocol consolidation happens automatically for all projects. We will build our own project in the Mix and OTP guide.
反編譯v1.5.1的Enumerable, 發現確實會調用ensure_compiled.
dc = fn x -> p = :code.which(x) {:ok,{_,[{:abstract_code,{_, ac}}]}} = :beam_lib.chunks(p, [:abstract_code]) IO.write(:erl_prettypr.format(:erl_syntax.form_list(ac))) end
dc.(Enumerable) count(_@1) -> ('impl_for!'(_@1)):count(_@1). impl_for(_@1) when erlang:is_list(_@1) -> case 'impl_for?'('Elixir.Enumerable.List') of true -> 'Elixir.Enumerable.List':'__impl__'(target); false -> any_impl_for() end; 'impl_for?'(_@1) -> case 'Elixir.Code':'ensure_compiled?'(_@1) of true -> 'Elixir.Kernel':'function_exported?'(_@1, '__impl__', 1); false -> false; _@2 -> erlang:error({badbool, 'and', _@2}) end.
code.ex:575
@spec ensure_compiled(module) :: {:module, module} | {:error, :embedded | :badfile | :nofile | :on_load_failure} def ensure_compiled(module) when is_atom(module) do case :code.ensure_loaded(module) do {:error, :nofile} = error -> if is_pid(:erlang.get(:elixir_compiler_pid)) and Kernel.ErrorHandler.ensure_compiled(module, :module) do {:module, module} else error end other -> other end end
code.erl:149
-spec ensure_loaded(Module) -> {module, Module} | {error, What} when Module :: module(), What :: embedded | badfile | nofile | on_load_failure. ensure_loaded(Mod) when is_atom(Mod) -> call({ensure_loaded,Mod}).
code.erl:627
call(Req) -> code_server:call(Req).
code_server.erl:136
-spec call(term()) -> term(). call(Req) -> Ref = erlang:monitor(process, ?MODULE), ?MODULE ! {code_call, self(), Req}, receive {?MODULE, Reply} -> erlang:demonitor(Ref,[flush]), Reply; {'DOWN',Ref,process,_,_} -> exit({'DOWN',code_server,Req}) end.
~/ejoy/fuck/deps/exrm/lib/exrm/plugins » diff -u consolidation.ex ~/consolidation.ex --- consolidation.ex 2018-07-26 20:00:38.000000000 +0800 +++ /home/enjolras/consolidation.ex 2018-07-26 19:59:53.324240446 +0800 @@ -27,11 +27,13 @@ debug "Packaging consolidated protocols..." # Add overlay to relx.config which copies consolidated dir to release - consolidated_path = Path.join([Mix.Project.build_path, "consolidated"]) + consolidated_path = Path.join([Mix.Project.build_path, "lib", config.name, "consolidated"]) case File.ls(consolidated_path) do {:error, _} -> + IO.puts("consolidated notfound #{inspect consolidated_path}") config {:ok, filenames} -> + IO.puts("consolidated #{inspect filenames}") dest_path = "lib/#{config.name}-#{config.version}/consolidated" overlays = [overlay: Enum.map(filenames, fn name -> {:copy, '#{consolidated_path}/#{name}', '#{Path.join([dest_path, name])}'}
fix後, 從新打包. 如同上面同樣反編譯.
list的impl_for再也不調用'impl_for?', 也沒法再重現消息堆積問題.
impl_for(_@1) when erlang:is_list(_@1) -> case 'impl_for?'('Elixir.Enumerable.List') of true -> 'Elixir.Enumerable.List':'__impl__'(target); false -> any_impl_for() end;
而是直接返回.
impl_for(x) when erlang:is_list(x) -> 'Elixir.Enumerable.List';
enum.ex:1708
def random(enumerable) do case Enumerable.count(enumerable) do {:ok, 0} -> raise Enum.EmptyError {:ok, count} -> at(enumerable, random_integer(0, count - 1)) {:error, _} -> case take_random(enumerable, 1) do [] -> raise Enum.EmptyError [elem] -> elem end end end
enum.ex:1790
def random(enumerable) do result = case backwards_compatible_slice(enumerable) do {:ok, 0, _} -> [] {:ok, count, fun} when is_function(fun) -> fun.(random_integer(0, count - 1), 1) {:error, _} -> take_random(enumerable, 1) end case result do [] -> raise Enum.EmptyError [elem] -> elem end end # TODO: Remove me on Elixir v1.8 defp backwards_compatible_slice(args) do try do Enumerable.slice(args) catch :error, :undef -> case System.stacktrace() do [{module, :slice, [^args], _} | _] -> {:error, module} stack -> :erlang.raise(:error, :undef, stack) end end end
一樣反編譯, Enumerable.slice 同樣會調用impl_for, 也會call code_server, 沒法解釋爲何v1.6.3沒法重現.
slice(_@1) -> ('impl_for!'(_@1)):slice(_@1).