elixir:找出进程终止的原因

elixir: find out the reason of process termination

在性能测试期间,我的应用程序终止并显示以下日志:

17:17:28.187 [info]  SIGTERM received - shutting down

17:17:28.187 [info] SIGTERM received - shutting down

17:17:28.188 [error] GenServer #PID<0.3707.0> terminating
** (stop) 'stopping because dependent process <0.3703.0> died: shutdown'
Last message: {:EXIT, #PID<0.3703.0>, :shutdown}
17:17:28.189 [error] gen_server <0.3707.0> terminated with reason: "stopping because dependent process <0.3703.0> died: shutdown"
17:17:28.190 [error] CRASH REPORT Process <0.3707.0> with 0 neighbours exited with reason: "stopping because dependent process <0.3703.0> died: shutdown" in gen_server:handle_common_reply/8 line 751
17:17:28.190 [error] Supervisor {<0.3705.0>,amqp_connection_sup} had child connection started with amqp_gen_connection:start_link(<0.3706.0>, {amqp_params_network,<<"publicmq-npperfcom1">>,<<"publicmq-npperfcom1">>,<<"/publicmq-npperfcom1">>,...}) at <0.3707.0> exit with reason "stopping because dependent process <0.3703.0> died: shutdown" in context child_terminated
17:17:28.190 [error] Supervisor {<0.3705.0>,amqp_connection_sup} had child connection started with amqp_gen_connection:start_link(<0.3706.0>, {amqp_params_network,<<"publicmq-npperfcom1">>,<<"publicmq-npperfcom1">>,<<"/publicmq-npperfcom1">>,...}) at <0.3707.0> exit with reason reached_max_restart_intensity in context shutdown

生成与 amqp 连接的代码如下所示:

defmodule MyApp.Events.AmqpTransport do
  @moduledoc """
  GenServer that owns the AMQP connection and channel used for emitting events.

  The connection is opened asynchronously after `init/1` by sending the
  process a `:connect` message. When opening fails, or when a linked process
  exits, a new `:connect` attempt is scheduled after `@restart_delay`
  milliseconds and the state is reset to `nil` until the attempt succeeds.
  """

  require Logger
  use GenServer
  use AMQP

  # Delay before a reconnect attempt, in milliseconds (2 seconds).
  @restart_delay 2000

  defmodule State do
    @moduledoc false
    @type t :: %__MODULE__{
            exchange: String.t(),
            channel: AMQP.Channel.t(),
            routing_key: String.t(),
            emitter_id: String.t(),
            np_tracking_id: String.t()
          }
    defstruct [:exchange, :channel, :routing_key, :emitter_id, :np_tracking_id]
  end

  @doc """
  Starts the transport and registers it under the module name.

  The argument is ignored.
  """
  def start_link(_) do
    GenServer.start_link(__MODULE__, [], name: __MODULE__)
  end

  @impl true
  def init(_opts) do
    # Trap exits so that the death of the linked connection process arrives as
    # an `{:EXIT, pid, reason}` message instead of killing this process.
    Process.flag(:trap_exit, true)
    # Connect outside of init/1 so that starting the server does not block.
    send(self(), :connect)
    {:ok, nil}
  end

  # Opens the connection and a channel, declares the durable topic exchange and
  # stores everything in the state. On failure a retry is scheduled.
  @impl true
  def handle_info(:connect, _state) do
    username = get_conf(:username)
    password = get_conf(:password)
    host = get_conf(:host)
    port = get_conf(:port)
    # The vhost is a single URI path segment, so its slashes must be escaped.
    vhost = String.replace(get_conf(:vhost), "/", "%2f")
    amqp_url = "amqp://#{username}:#{password}@#{host}:#{port}/#{vhost}"

    # Log the endpoint only: the full URL embeds the password and must not
    # end up in the logs.
    Logger.info("amqp transport connecting to amqp://#{host}:#{port}/#{vhost}")

    case Connection.open(amqp_url) do
      {:ok, conn} ->
        # NOTE(review): the AMQP library documentation suggests
        # Process.monitor/1 on the connection instead of a link — confirm
        # which supervision strategy is intended here.
        Process.link(conn.pid)
        {:ok, chan} = Channel.open(conn)
        exchange = get_conf(:exchange)
        :ok = AMQP.Exchange.declare(chan, exchange, :topic, durable: true)

        state = %State{
          exchange: exchange,
          channel: chan,
          routing_key: get_conf(:routing_key),
          emitter_id: Application.fetch_env!(:coups_events, :emitter_id),
          np_tracking_id: Application.fetch_env!(:coups_events, :np_tracking_id)
        }

        {:noreply, state}

      {:error, err} ->
        Logger.error("amqp transport failed\n Err: #{inspect(err)}\n Retrying to connect ...")
        Process.send_after(self(), :connect, @restart_delay)
        {:noreply, nil}
    end
  end

  # A linked process exited: drop the current state and schedule a reconnect.
  def handle_info({:EXIT, pid, reason}, _state) do
    Logger.error("amqp transport failed with #{inspect(reason)}")
    Process.unlink(pid)
    Process.send_after(self(), :connect, @restart_delay)
    {:noreply, nil}
  end

  # Event publishing is elided in this listing; the state is returned unchanged.
  @impl true
  def handle_cast({:emit, _event}, state) do
    # event publishing
    {:noreply, state}
  end

  # Reads one key from the `:amqp` configuration of the `:events` application.
  defp get_conf(key) do
    conf = Application.get_env(:events, :amqp)
    conf[key]
  end
end

问题:

  1. AmqpTransport 没有捕获到退出信号(trap exit 没有生效)。为什么?
  2. 在错误日志中我只能看到 PID。能否让日志显示进程的注册名称?
  3. 那里发生了什么使进程死亡?我怎样才能进一步调查细节?

Connection 在内部是由 amqp 应用程序的监督者(supervisor)通过 amqp_sup:start_connection_sup(AmqpParams3) 启动并监督的(Connection.open/2 只是简单地委托给 :amqp_connection.start/2)。

将一个进程同时链接到两个捕获退出信号的进程是很棘手的,而且通常不是幂等的。这就是为什么官方文档建议使用 Process.monitor/1 来监视底层连接,并在连接断开时走完整的重启流程,也就是把负责监视的进程本身也一并重启。

我记得 Andrea Leopardi 曾就相关主题发表过 complaints,但它对我来说一直很管用。