检测 Elixir/OTP supervisor 子进程的启动和终止
Detecting Elixir/OTP supervisor child spawn and termination
我正在用 Elixir 构建一个作业队列,作为一项学习练习。目前,我的 worker 必须在被创建时手动将自己注册到队列中(参见 MyQuestion.Worker.start_link)。
我希望由 supervisor 在 worker 被创建或重启时把可用的 worker 注册到队列中,因为这样似乎更便于测试 worker,并能最大限度地减少耦合。
有没有办法实现我在下面代码的 MyQuestion.Supervisor 中所描述的功能?
defmodule MyQuestion.Supervisor do
  # Supervisor from the question. It starts the job queue followed by two
  # workers under a :rest_for_one strategy.
  #
  # `child_spawned/2` and `child_terminated/3` are NOT Supervisor callbacks;
  # they only illustrate the hooks the question is asking for. Nothing in this
  # module invokes them.
  use Supervisor

  # Starts the supervisor and returns the result of Supervisor.start_link/2.
  def start_link do
    Supervisor.start_link(__MODULE__, :ok)
  end

  # Supervisor callback: the queue is listed first so that, with
  # :rest_for_one, the workers listed after it are restarted when it restarts.
  def init(:ok) do
    children = [
      worker(MyQuestion.JobQueue, []),
      worker(MyQuestion.Worker, [], id: :worker_0),
      worker(MyQuestion.Worker, [], id: :worker_1)
    ]

    supervise(children, strategy: :rest_for_one)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # on worker spawn, I want to add the worker to the queue
  def child_spawned(pid, {MyQuestion.Worker, _, _}) do
    # add worker to queue
    MyQuestion.JobQueue.add_new_worker(pid)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # I want some way to do the following (imagine the callback existed)
  def child_terminated(pid, _reason, _state) do
    # with this information I could tell the job queue to mark
    # the job associated with the pid as failed and to retry
    # or maybe extract the job id from the worker state, etc.
    MyQuestion.JobQueue.remove_worker(pid)
    MyQuestion.JobQueue.restart_job_for_failed_worker(pid)
  end
end
# Sketch of the job queue from the question: an Agent registered under the
# module name whose initial state is an empty list. The parts marked
# `<... snip ...>` were elided by the author, so this module is illustrative
# and does not compile as written.
defmodule MyQuestion.JobQueue do
# Starts the Agent with `[]` as its state, registered as `MyQuestion.JobQueue`.
def start_link do
Agent.start_link(fn -> [] end, name: __MODULE__)
end
# Intended to record `pid` as an available worker; the body is only a comment.
# NOTE(review): the other MyQuestion modules in this file call
# `MyQuestion.JobQueue.add_new_worker/1`, which is not defined here —
# presumably this function was meant to carry that name; confirm.
def new_worker(pid) do
# register pid with agent state in available worker list, etc.
end
# Intended to hand `job_description` to an idle worker; body elided.
def add_job(job_description) do
# find idle worker and run job
<... snip ...>
end
<... snip ...>
end
# Worker sketch from the question: a GenServer whose start_link/0 also
# registers the freshly started pid with MyQuestion.JobQueue. The part marked
# `<... snip ...>` was elided by the author.
defmodule MyQuestion.Worker do
use GenServer
# Starts the GenServer, registers its pid with the job queue and returns
# `{:ok, pid}`. Any start result other than `{:ok, pid}` fails the match below.
def start_link do
# start worker
{:ok, worker} = GenServer.start_link(__MODULE__, [])
# Now we have a worker pid, so we can register that pid with the queue
# I wish this could be in the supervisor or else where.
MyQuestion.JobQueue.add_new_worker(worker)
# must return gen server's start link
{:ok, worker}
end
<... snip ...>
end
关键在于把两件事结合起来:调用 Process.monitor(pid)(之后你会在 handle_info 中收到通知),以及手动调用 Supervisor.start_child 来拿到各个 pid。
我之前曾尝试使用 handle_info,但它始终没有被调用。Process.monitor(pid) 必须在要接收通知的那个进程中调用,因此必须在 handle_call 函数内部调用它,才能让监视器与服务器进程关联起来。也许存在某个函数可以让代码以另一个进程的身份运行(例如 run_from_process(job_queue_pid, fn -> Process.monitor(pid_to_monitor) end)),但我没有找到。
下面附上作业队列的一个非常简单的实现。我接触 Elixir 才一天,所以代码既混乱又不合惯用法,但我还是把它附上,因为围绕这个主题的示例代码似乎很少。
请查看 HeavyIndustry.JobQueue 中的 handle_info 和 start_new_worker。这段代码有一个明显的问题:它能够在 worker 崩溃时重新启动它们,但无法在该处继续启动队列中的下一个作业(因为那需要在 handle_info 内部调用 GenServer.call,而这会导致死锁)。我认为可以通过把启动作业的进程与跟踪作业的进程分开来解决这个问题。如果运行示例代码,你最终会注意到它停止运行作业,即使队列中仍有一个作业(:crash 作业)。
defmodule HeavyIndustry.Supervisor do
  # Supervisor that boots with no children. The job queue is attached later
  # through create_children/2.
  use Supervisor

  # Starts the (initially empty) supervisor.
  def start_link, do: Supervisor.start_link(__MODULE__, :ok)

  # Supervisor callback: nothing is supervised at boot; children are added
  # dynamically under a :one_for_one strategy.
  def init(:ok) do
    no_children = []
    supervise(no_children, strategy: :one_for_one)
  end

  # Starts HeavyIndustry.JobQueue under `supervisor`, passing it the list
  # `[supervisor, worker_count]` as its single start argument. Returns the
  # result of Supervisor.start_child/2.
  def create_children(supervisor, worker_count) do
    queue_spec = worker(HeavyIndustry.JobQueue, [[supervisor, worker_count]])
    Supervisor.start_child(supervisor, queue_spec)
  end
end
defmodule HeavyIndustry.JobQueue do
  @moduledoc """
  A named GenServer that keeps a list of jobs and two lists of worker pids
  (`idle` and `busy`).

  Workers are started as `:temporary` children of the supervisor passed to
  `start_link/2` and are monitored by this process. When a monitored worker
  goes down, its job (if any) is put back to `:pending` and a replacement
  worker is started.

  Known limitation: a job that was reset to `:pending` after a crash is not
  dispatched from `handle_info/2`; it only runs again when something calls
  `start_next_job/0`.
  """
  use GenServer

  @job_queue_name __MODULE__

  ##
  # Public API
  ##

  @doc """
  Starts the queue registered under the module name.

  `args` must be `[supervisor, worker_count]`. The second argument is ignored;
  it defaults to `[]` so that a supervisor spec passing a single argument
  (as `HeavyIndustry.Supervisor.create_children/2` does) resolves to this
  function.
  """
  def start_link(args, _opts \\ []) do
    GenServer.start_link(__MODULE__, args, name: @job_queue_name)
  end

  @doc """
  Starts the configured number of workers and has the server monitor them.

  Returns `:ok`.
  """
  def setup() do
    # The :DOWN notification goes to the process that called
    # Process.monitor/1, so monitoring is requested from inside the server
    # (via :monitor_pids) rather than from the caller of setup/0.
    state = GenServer.call(@job_queue_name, :setup)
    workers = state.workers.idle
    GenServer.call(@job_queue_name, {:monitor_pids, workers})
  end

  @doc """
  Queues `job` and tries to start the next pending job.

  `from` is the pid that receives `%{answer: result, job: job}` once the job
  has run. Returns `{:ok, :started}` when this very job was dispatched,
  `{:ok, :pending}` when it stays queued, or `{:error, reason}`.
  """
  def add_job(from, job) do
    # add job to queue
    {:ok, our_job_id} =
      GenServer.call(@job_queue_name, {:create_job, %{job: job, reply_to: from}})

    # try to run the next job
    case GenServer.call(@job_queue_name, :start_next_job) do
      # started our job
      {:ok, ^our_job_id} -> {:ok, :started}
      # started *a* job
      {:ok, _other_job_id} -> {:ok, :pending}
      # couldnt start any job but its ok...
      {:error, :no_idle_workers} -> {:ok, :pending}
      # something fell over...
      {:error, e} -> {:error, e}
      # kept from the original as a last-resort fallback
      _ -> {:ok}
    end
  end

  @doc """
  Asks the server to dispatch the first pending job to the first idle worker.

  Returns `{:ok, job_id}`, `{:error, :no_idle_workers}` or
  `{:error, :no_more_jobs}`.
  """
  def start_next_job do
    GenServer.call(@job_queue_name, :start_next_job)
  end

  ##
  # Internal API
  ##

  @doc false
  def init([supervisor, n]) do
    # set some default state
    state = %{
      supervisor: supervisor,
      max_workers: n,
      jobs: [],
      workers: %{idle: [], busy: []}
    }

    {:ok, state}
  end

  @doc false
  def handle_call(:setup, _from, state) do
    workers = start_workers(state.supervisor, state.max_workers)
    state = %{state | workers: %{state.workers | idle: workers}}
    {:reply, state, state}
  end

  def handle_call({:monitor_pids, list}, _from, state) do
    Enum.each(list, &Process.monitor/1)
    {:reply, :ok, state}
  end

  def handle_call({:create_job, job}, _from, state) do
    job = %{
      job: job.job,
      reply_to: job.reply_to,
      # unique per call, unlike a timestamp, which can repeat within one tick
      id: :erlang.unique_integer([:positive]),
      # start pending, go active, then remove
      status: :pending,
      pid: nil
    }

    # add new job to jobs list
    state = %{state | jobs: state.jobs ++ [job]}
    {:reply, {:ok, job.id}, state}
  end

  def handle_call(:start_next_job, _from, state) do
    IO.puts("==> Start Next Job")
    IO.inspect(state)
    IO.puts("==================")

    # The case returns the new state explicitly: a variable rebound inside a
    # case branch is not visible after the case.
    {reply, state} =
      case {find_idle_worker(state.workers), find_next_job(state.jobs)} do
        {{:error, :no_idle_workers}, _} ->
          # no workers for job, doesnt matter if we have a job
          {{:error, :no_idle_workers}, state}

        {_, nil} ->
          # no job, doesnt matter if we have a worker
          {{:error, :no_more_jobs}, state}

        {{:ok, worker}, job} ->
          # have worker, have job, do work
          # update state to set job active and worker busy
          active_job = %{job | status: :active, pid: worker}
          jobs = (state.jobs -- [job]) ++ [active_job]
          idle = state.workers.idle -- [worker]
          busy = state.workers.busy ++ [worker]

          # The job runs in a separate process because it calls back into
          # this server, which is impossible from inside a handle_call.
          {:ok, _task} = Task.start(fn -> run_job(worker, active_job) end)

          {{:ok, active_job.id}, %{state | jobs: jobs, workers: %{idle: idle, busy: busy}}}
      end

    {:reply, reply, state}
  end

  def handle_call({:free_worker, worker}, _from, state) do
    idle = state.workers.idle ++ [worker]
    busy = state.workers.busy -- [worker]
    {:reply, :ok, %{state | workers: %{idle: idle, busy: busy}}}
  end

  def handle_call({:remove_job, job}, _from, state) do
    jobs = state.jobs -- [job]
    {:reply, :ok, %{state | jobs: jobs}}
  end

  @doc false
  def handle_info({:DOWN, _ref, :process, dead_pid, _reason}, state) do
    IO.puts("Worker collapsed: DOWN #{inspect(dead_pid)}, clear and restart job")

    # find job for collapsed worker and set it to pending again; an idle
    # worker that died has no job, in which case the list is left untouched
    jobs =
      case Enum.find(state.jobs, &(&1.pid == dead_pid)) do
        nil -> state.jobs
        job -> (state.jobs -- [job]) ++ [%{job | status: :pending, pid: nil}]
      end

    # remove the dead worker from both lists
    idle = state.workers.idle -- [dead_pid]
    busy = state.workers.busy -- [dead_pid]

    # start a replacement; we are already in the server process, so it can
    # be monitored directly
    {:ok, new_pid} = start_new_worker(state.supervisor)
    Process.monitor(new_pid)

    state = %{state | jobs: jobs, workers: %{idle: idle ++ [new_pid], busy: busy}}
    {:noreply, state}
  end

  # Ignore unrelated messages instead of crashing on a missing clause.
  def handle_info(_other, state) do
    {:noreply, state}
  end

  ##
  # Private helpers
  ##

  # Starts `count` workers and returns their pids; none for a count below 1.
  defp start_workers(_supervisor, count) when count < 1, do: []

  defp start_workers(supervisor, count) do
    Enum.map(1..count, fn _ ->
      {:ok, pid} = start_new_worker(supervisor)
      pid
    end)
  end

  # Starts one :temporary worker under `supervisor`. A reference is used as
  # the child id so that ids never collide and no atoms are created at runtime.
  defp start_new_worker(supervisor) do
    spec =
      Supervisor.Spec.worker(HeavyIndustry.Worker, [], id: make_ref(), restart: :temporary)

    Supervisor.start_child(supervisor, spec)
  end

  # Runs inside the task process: performs the job on the worker, updates the
  # queue, reports the answer and then tries to dispatch the next job.
  defp run_job(worker, job) do
    result = GenServer.call(worker, job.job)
    remove_job(job)
    free_worker(worker)
    send(job.reply_to, %{answer: result, job: job.job})
    start_next_job()
  end

  # Returns the first idle worker, if any.
  defp find_idle_worker(%{idle: []}), do: {:error, :no_idle_workers}
  defp find_idle_worker(%{idle: [worker | _rest]}), do: {:ok, worker}

  # Returns the first job whose status is :pending, or nil.
  defp find_next_job(jobs) do
    Enum.find(jobs, &(&1.status == :pending))
  end

  defp free_worker(worker) do
    GenServer.call(@job_queue_name, {:free_worker, worker})
  end

  defp remove_job(job) do
    GenServer.call(@job_queue_name, {:remove_job, job})
  end
end
defmodule HeavyIndustry.Worker do
  @moduledoc """
  Stateless GenServer that executes one job per `GenServer.call/2`.

  Supported requests: `{:sum, enumerable}`, `{:fib, n}`, `{:stop}`,
  `{:crash}` and `{:timeout}`.
  """
  use GenServer

  @doc "Starts an unnamed worker."
  def start_link do
    GenServer.start_link(__MODULE__, :ok)
  end

  @doc false
  def init(:ok) do
    # workers have no persistent state
    IO.puts("==> Worker up! #{inspect(self())}")
    {:ok, nil}
  end

  @doc false
  # Sums any enumerable of numbers; an empty enumerable yields 0.
  def handle_call({:sum, list}, _from, _state) do
    {:reply, Enum.sum(list), nil}
  end

  # Replies with the n-th Fibonacci number (deliberately slow).
  def handle_call({:fib, n}, _from, _state) do
    {:reply, fib_calc(n), nil}
  end

  # Replies and then stops the worker with a custom reason.
  def handle_call({:stop}, _from, state) do
    {:stop, "my-stop-reason", "my-stop-reply", state}
  end

  # Deliberately invalid expression: it raises at runtime so that the
  # worker crashes and the crash handling can be exercised.
  def handle_call({:crash}, _from, _state) do
    {:reply, "this will crash" ++ 1234, nil}
  end

  # Sleeps for 10 seconds before replying.
  def handle_call({:timeout}, _from, _state) do
    :timer.sleep(10000)
    {:reply, "this will timeout", nil}
  end

  # Slow fib. The guard makes a negative or non-integer argument fail with a
  # FunctionClauseError instead of recursing without end.
  defp fib_calc(0), do: 0
  defp fib_calc(1), do: 1
  defp fib_calc(n) when is_integer(n) and n > 1, do: fib_calc(n - 1) + fib_calc(n - 2)
end
defmodule Looper do
  # Demo driver: boots the supervisor and the job queue with two workers,
  # submits a fixed list of jobs and then prints every message it receives.

  # Runs the demo. Never returns, because loop/0 recurses forever.
  def start do
    {:ok, pid} = HeavyIndustry.Supervisor.start_link()
    {:ok, _job_queue} = HeavyIndustry.Supervisor.create_children(pid, 2)
    HeavyIndustry.JobQueue.setup()
    add_jobs()
    loop()
  end

  # Submits each sample job with the calling process as the reply target and
  # prints whether it started immediately or was queued.
  def add_jobs do
    jobs = [
      {:sum, [100, 200, 300]},
      {:crash},
      {:fib, 35},
      {:fib, 35},
      {:sum, [88, 88, 99]},
      {:fib, 35},
      {:fib, 35},
      {:fib, 35},
      {:sum, 0..100},
      # {:stop}, # stop not really a failure
      {:sum, [88, 88, 99]},
      # {:timeout},
      {:sum, [-1]}
    ]

    Enum.each(jobs, fn job ->
      IO.puts("~~~~> Add job: #{inspect(job)}")

      case HeavyIndustry.JobQueue.add_job(self(), job) do
        {:ok, :started} -> IO.puts("~~~~> Started job immediately")
        {:ok, :pending} -> IO.puts("~~~~> Job in queue")
        val -> IO.puts("~~~~> ... val: #{inspect(val)}")
      end
    end)
  end

  # Prints every message delivered to this process, forever.
  def loop do
    receive do
      value ->
        IO.puts("~~~~> Received: #{inspect(value)}")
        loop()
    end
  end
end
# Script entry point: runs the demo when this file is evaluated.
Looper.start
我正在用 Elixir 构建一个作业队列,作为一项学习练习。目前,我的 worker 必须在被创建时手动将自己注册到队列中(参见 MyQuestion.Worker.start_link)。
我希望由 supervisor 在 worker 被创建或重启时把可用的 worker 注册到队列中,因为这样似乎更便于测试 worker,并能最大限度地减少耦合。
有没有办法实现我在下面代码的 MyQuestion.Supervisor 中所描述的功能?
defmodule MyQuestion.Supervisor do
  # Supervisor from the question. It starts the job queue followed by two
  # workers under a :rest_for_one strategy.
  #
  # `child_spawned/2` and `child_terminated/3` are NOT Supervisor callbacks;
  # they only illustrate the hooks the question is asking for. Nothing in this
  # module invokes them.
  use Supervisor

  # Starts the supervisor and returns the result of Supervisor.start_link/2.
  def start_link do
    Supervisor.start_link(__MODULE__, :ok)
  end

  # Supervisor callback: the queue is listed first so that, with
  # :rest_for_one, the workers listed after it are restarted when it restarts.
  def init(:ok) do
    children = [
      worker(MyQuestion.JobQueue, []),
      worker(MyQuestion.Worker, [], id: :worker_0),
      worker(MyQuestion.Worker, [], id: :worker_1)
    ]

    supervise(children, strategy: :rest_for_one)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # on worker spawn, I want to add the worker to the queue
  def child_spawned(pid, {MyQuestion.Worker, _, _}) do
    # add worker to queue
    MyQuestion.JobQueue.add_new_worker(pid)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # I want some way to do the following (imagine the callback existed)
  def child_terminated(pid, _reason, _state) do
    # with this information I could tell the job queue to mark
    # the job associated with the pid as failed and to retry
    # or maybe extract the job id from the worker state, etc.
    MyQuestion.JobQueue.remove_worker(pid)
    MyQuestion.JobQueue.restart_job_for_failed_worker(pid)
  end
end
# Sketch of the job queue from the question: an Agent registered under the
# module name whose initial state is an empty list. The parts marked
# `<... snip ...>` were elided by the author, so this module is illustrative
# and does not compile as written.
defmodule MyQuestion.JobQueue do
# Starts the Agent with `[]` as its state, registered as `MyQuestion.JobQueue`.
def start_link do
Agent.start_link(fn -> [] end, name: __MODULE__)
end
# Intended to record `pid` as an available worker; the body is only a comment.
# NOTE(review): the other MyQuestion modules in this file call
# `MyQuestion.JobQueue.add_new_worker/1`, which is not defined here —
# presumably this function was meant to carry that name; confirm.
def new_worker(pid) do
# register pid with agent state in available worker list, etc.
end
# Intended to hand `job_description` to an idle worker; body elided.
def add_job(job_description) do
# find idle worker and run job
<... snip ...>
end
<... snip ...>
end
# Worker sketch from the question: a GenServer whose start_link/0 also
# registers the freshly started pid with MyQuestion.JobQueue. The part marked
# `<... snip ...>` was elided by the author.
defmodule MyQuestion.Worker do
use GenServer
# Starts the GenServer, registers its pid with the job queue and returns
# `{:ok, pid}`. Any start result other than `{:ok, pid}` fails the match below.
def start_link do
# start worker
{:ok, worker} = GenServer.start_link(__MODULE__, [])
# Now we have a worker pid, so we can register that pid with the queue
# I wish this could be in the supervisor or else where.
MyQuestion.JobQueue.add_new_worker(worker)
# must return gen server's start link
{:ok, worker}
end
<... snip ...>
end
关键在于把两件事结合起来:调用 Process.monitor(pid)(之后你会在 handle_info 中收到通知),以及手动调用 Supervisor.start_child 来拿到各个 pid。
我之前曾尝试使用 handle_info,但它始终没有被调用。Process.monitor(pid) 必须在要接收通知的那个进程中调用,因此必须在 handle_call 函数内部调用它,才能让监视器与服务器进程关联起来。也许存在某个函数可以让代码以另一个进程的身份运行(例如 run_from_process(job_queue_pid, fn -> Process.monitor(pid_to_monitor) end)),但我没有找到。
下面附上作业队列的一个非常简单的实现。我接触 Elixir 才一天,所以代码既混乱又不合惯用法,但我还是把它附上,因为围绕这个主题的示例代码似乎很少。
请查看 HeavyIndustry.JobQueue 中的 handle_info 和 start_new_worker。这段代码有一个明显的问题:它能够在 worker 崩溃时重新启动它们,但无法在该处继续启动队列中的下一个作业(因为那需要在 handle_info 内部调用 GenServer.call,而这会导致死锁)。我认为可以通过把启动作业的进程与跟踪作业的进程分开来解决这个问题。如果运行示例代码,你最终会注意到它停止运行作业,即使队列中仍有一个作业(:crash 作业)。
defmodule HeavyIndustry.Supervisor do
  # Supervisor that boots with no children. The job queue is attached later
  # through create_children/2.
  use Supervisor

  # Starts the (initially empty) supervisor.
  def start_link, do: Supervisor.start_link(__MODULE__, :ok)

  # Supervisor callback: nothing is supervised at boot; children are added
  # dynamically under a :one_for_one strategy.
  def init(:ok) do
    no_children = []
    supervise(no_children, strategy: :one_for_one)
  end

  # Starts HeavyIndustry.JobQueue under `supervisor`, passing it the list
  # `[supervisor, worker_count]` as its single start argument. Returns the
  # result of Supervisor.start_child/2.
  def create_children(supervisor, worker_count) do
    queue_spec = worker(HeavyIndustry.JobQueue, [[supervisor, worker_count]])
    Supervisor.start_child(supervisor, queue_spec)
  end
end
defmodule HeavyIndustry.JobQueue do
  @moduledoc """
  A named GenServer that keeps a list of jobs and two lists of worker pids
  (`idle` and `busy`).

  Workers are started as `:temporary` children of the supervisor passed to
  `start_link/2` and are monitored by this process. When a monitored worker
  goes down, its job (if any) is put back to `:pending` and a replacement
  worker is started.

  Known limitation: a job that was reset to `:pending` after a crash is not
  dispatched from `handle_info/2`; it only runs again when something calls
  `start_next_job/0`.
  """
  use GenServer

  @job_queue_name __MODULE__

  ##
  # Public API
  ##

  @doc """
  Starts the queue registered under the module name.

  `args` must be `[supervisor, worker_count]`. The second argument is ignored;
  it defaults to `[]` so that a supervisor spec passing a single argument
  (as `HeavyIndustry.Supervisor.create_children/2` does) resolves to this
  function.
  """
  def start_link(args, _opts \\ []) do
    GenServer.start_link(__MODULE__, args, name: @job_queue_name)
  end

  @doc """
  Starts the configured number of workers and has the server monitor them.

  Returns `:ok`.
  """
  def setup() do
    # The :DOWN notification goes to the process that called
    # Process.monitor/1, so monitoring is requested from inside the server
    # (via :monitor_pids) rather than from the caller of setup/0.
    state = GenServer.call(@job_queue_name, :setup)
    workers = state.workers.idle
    GenServer.call(@job_queue_name, {:monitor_pids, workers})
  end

  @doc """
  Queues `job` and tries to start the next pending job.

  `from` is the pid that receives `%{answer: result, job: job}` once the job
  has run. Returns `{:ok, :started}` when this very job was dispatched,
  `{:ok, :pending}` when it stays queued, or `{:error, reason}`.
  """
  def add_job(from, job) do
    # add job to queue
    {:ok, our_job_id} =
      GenServer.call(@job_queue_name, {:create_job, %{job: job, reply_to: from}})

    # try to run the next job
    case GenServer.call(@job_queue_name, :start_next_job) do
      # started our job
      {:ok, ^our_job_id} -> {:ok, :started}
      # started *a* job
      {:ok, _other_job_id} -> {:ok, :pending}
      # couldnt start any job but its ok...
      {:error, :no_idle_workers} -> {:ok, :pending}
      # something fell over...
      {:error, e} -> {:error, e}
      # kept from the original as a last-resort fallback
      _ -> {:ok}
    end
  end

  @doc """
  Asks the server to dispatch the first pending job to the first idle worker.

  Returns `{:ok, job_id}`, `{:error, :no_idle_workers}` or
  `{:error, :no_more_jobs}`.
  """
  def start_next_job do
    GenServer.call(@job_queue_name, :start_next_job)
  end

  ##
  # Internal API
  ##

  @doc false
  def init([supervisor, n]) do
    # set some default state
    state = %{
      supervisor: supervisor,
      max_workers: n,
      jobs: [],
      workers: %{idle: [], busy: []}
    }

    {:ok, state}
  end

  @doc false
  def handle_call(:setup, _from, state) do
    workers = start_workers(state.supervisor, state.max_workers)
    state = %{state | workers: %{state.workers | idle: workers}}
    {:reply, state, state}
  end

  def handle_call({:monitor_pids, list}, _from, state) do
    Enum.each(list, &Process.monitor/1)
    {:reply, :ok, state}
  end

  def handle_call({:create_job, job}, _from, state) do
    job = %{
      job: job.job,
      reply_to: job.reply_to,
      # unique per call, unlike a timestamp, which can repeat within one tick
      id: :erlang.unique_integer([:positive]),
      # start pending, go active, then remove
      status: :pending,
      pid: nil
    }

    # add new job to jobs list
    state = %{state | jobs: state.jobs ++ [job]}
    {:reply, {:ok, job.id}, state}
  end

  def handle_call(:start_next_job, _from, state) do
    IO.puts("==> Start Next Job")
    IO.inspect(state)
    IO.puts("==================")

    # The case returns the new state explicitly: a variable rebound inside a
    # case branch is not visible after the case.
    {reply, state} =
      case {find_idle_worker(state.workers), find_next_job(state.jobs)} do
        {{:error, :no_idle_workers}, _} ->
          # no workers for job, doesnt matter if we have a job
          {{:error, :no_idle_workers}, state}

        {_, nil} ->
          # no job, doesnt matter if we have a worker
          {{:error, :no_more_jobs}, state}

        {{:ok, worker}, job} ->
          # have worker, have job, do work
          # update state to set job active and worker busy
          active_job = %{job | status: :active, pid: worker}
          jobs = (state.jobs -- [job]) ++ [active_job]
          idle = state.workers.idle -- [worker]
          busy = state.workers.busy ++ [worker]

          # The job runs in a separate process because it calls back into
          # this server, which is impossible from inside a handle_call.
          {:ok, _task} = Task.start(fn -> run_job(worker, active_job) end)

          {{:ok, active_job.id}, %{state | jobs: jobs, workers: %{idle: idle, busy: busy}}}
      end

    {:reply, reply, state}
  end

  def handle_call({:free_worker, worker}, _from, state) do
    idle = state.workers.idle ++ [worker]
    busy = state.workers.busy -- [worker]
    {:reply, :ok, %{state | workers: %{idle: idle, busy: busy}}}
  end

  def handle_call({:remove_job, job}, _from, state) do
    jobs = state.jobs -- [job]
    {:reply, :ok, %{state | jobs: jobs}}
  end

  @doc false
  def handle_info({:DOWN, _ref, :process, dead_pid, _reason}, state) do
    IO.puts("Worker collapsed: DOWN #{inspect(dead_pid)}, clear and restart job")

    # find job for collapsed worker and set it to pending again; an idle
    # worker that died has no job, in which case the list is left untouched
    jobs =
      case Enum.find(state.jobs, &(&1.pid == dead_pid)) do
        nil -> state.jobs
        job -> (state.jobs -- [job]) ++ [%{job | status: :pending, pid: nil}]
      end

    # remove the dead worker from both lists
    idle = state.workers.idle -- [dead_pid]
    busy = state.workers.busy -- [dead_pid]

    # start a replacement; we are already in the server process, so it can
    # be monitored directly
    {:ok, new_pid} = start_new_worker(state.supervisor)
    Process.monitor(new_pid)

    state = %{state | jobs: jobs, workers: %{idle: idle ++ [new_pid], busy: busy}}
    {:noreply, state}
  end

  # Ignore unrelated messages instead of crashing on a missing clause.
  def handle_info(_other, state) do
    {:noreply, state}
  end

  ##
  # Private helpers
  ##

  # Starts `count` workers and returns their pids; none for a count below 1.
  defp start_workers(_supervisor, count) when count < 1, do: []

  defp start_workers(supervisor, count) do
    Enum.map(1..count, fn _ ->
      {:ok, pid} = start_new_worker(supervisor)
      pid
    end)
  end

  # Starts one :temporary worker under `supervisor`. A reference is used as
  # the child id so that ids never collide and no atoms are created at runtime.
  defp start_new_worker(supervisor) do
    spec =
      Supervisor.Spec.worker(HeavyIndustry.Worker, [], id: make_ref(), restart: :temporary)

    Supervisor.start_child(supervisor, spec)
  end

  # Runs inside the task process: performs the job on the worker, updates the
  # queue, reports the answer and then tries to dispatch the next job.
  defp run_job(worker, job) do
    result = GenServer.call(worker, job.job)
    remove_job(job)
    free_worker(worker)
    send(job.reply_to, %{answer: result, job: job.job})
    start_next_job()
  end

  # Returns the first idle worker, if any.
  defp find_idle_worker(%{idle: []}), do: {:error, :no_idle_workers}
  defp find_idle_worker(%{idle: [worker | _rest]}), do: {:ok, worker}

  # Returns the first job whose status is :pending, or nil.
  defp find_next_job(jobs) do
    Enum.find(jobs, &(&1.status == :pending))
  end

  defp free_worker(worker) do
    GenServer.call(@job_queue_name, {:free_worker, worker})
  end

  defp remove_job(job) do
    GenServer.call(@job_queue_name, {:remove_job, job})
  end
end
defmodule HeavyIndustry.Worker do
  @moduledoc """
  Stateless GenServer that executes one job per `GenServer.call/2`.

  Supported requests: `{:sum, enumerable}`, `{:fib, n}`, `{:stop}`,
  `{:crash}` and `{:timeout}`.
  """
  use GenServer

  @doc "Starts an unnamed worker."
  def start_link do
    GenServer.start_link(__MODULE__, :ok)
  end

  @doc false
  def init(:ok) do
    # workers have no persistent state
    IO.puts("==> Worker up! #{inspect(self())}")
    {:ok, nil}
  end

  @doc false
  # Sums any enumerable of numbers; an empty enumerable yields 0.
  def handle_call({:sum, list}, _from, _state) do
    {:reply, Enum.sum(list), nil}
  end

  # Replies with the n-th Fibonacci number (deliberately slow).
  def handle_call({:fib, n}, _from, _state) do
    {:reply, fib_calc(n), nil}
  end

  # Replies and then stops the worker with a custom reason.
  def handle_call({:stop}, _from, state) do
    {:stop, "my-stop-reason", "my-stop-reply", state}
  end

  # Deliberately invalid expression: it raises at runtime so that the
  # worker crashes and the crash handling can be exercised.
  def handle_call({:crash}, _from, _state) do
    {:reply, "this will crash" ++ 1234, nil}
  end

  # Sleeps for 10 seconds before replying.
  def handle_call({:timeout}, _from, _state) do
    :timer.sleep(10000)
    {:reply, "this will timeout", nil}
  end

  # Slow fib. The guard makes a negative or non-integer argument fail with a
  # FunctionClauseError instead of recursing without end.
  defp fib_calc(0), do: 0
  defp fib_calc(1), do: 1
  defp fib_calc(n) when is_integer(n) and n > 1, do: fib_calc(n - 1) + fib_calc(n - 2)
end
defmodule Looper do
  # Demo driver: boots the supervisor and the job queue with two workers,
  # submits a fixed list of jobs and then prints every message it receives.

  # Runs the demo. Never returns, because loop/0 recurses forever.
  def start do
    {:ok, pid} = HeavyIndustry.Supervisor.start_link()
    {:ok, _job_queue} = HeavyIndustry.Supervisor.create_children(pid, 2)
    HeavyIndustry.JobQueue.setup()
    add_jobs()
    loop()
  end

  # Submits each sample job with the calling process as the reply target and
  # prints whether it started immediately or was queued.
  def add_jobs do
    jobs = [
      {:sum, [100, 200, 300]},
      {:crash},
      {:fib, 35},
      {:fib, 35},
      {:sum, [88, 88, 99]},
      {:fib, 35},
      {:fib, 35},
      {:fib, 35},
      {:sum, 0..100},
      # {:stop}, # stop not really a failure
      {:sum, [88, 88, 99]},
      # {:timeout},
      {:sum, [-1]}
    ]

    Enum.each(jobs, fn job ->
      IO.puts("~~~~> Add job: #{inspect(job)}")

      case HeavyIndustry.JobQueue.add_job(self(), job) do
        {:ok, :started} -> IO.puts("~~~~> Started job immediately")
        {:ok, :pending} -> IO.puts("~~~~> Job in queue")
        val -> IO.puts("~~~~> ... val: #{inspect(val)}")
      end
    end)
  end

  # Prints every message delivered to this process, forever.
  def loop do
    receive do
      value ->
        IO.puts("~~~~> Received: #{inspect(value)}")
        loop()
    end
  end
end
# Script entry point: runs the demo when this file is evaluated.
Looper.start