From cb286fdeba1782b75d1d7bca484d80e3cd499a98 Mon Sep 17 00:00:00 2001 From: Michael Loftis Date: Sun, 30 Dec 2018 15:16:26 +0000 Subject: [PATCH] Improves RetryQueue behavior reduces to one single timer firing once a second switches to a parallel worker model --- lib/pleroma/web/federator/retry_queue.ex | 195 +++++++++++++++++++++-- test/web/retry_queue_test.exs | 31 ++-- 2 files changed, 202 insertions(+), 24 deletions(-) diff --git a/lib/pleroma/web/federator/retry_queue.ex b/lib/pleroma/web/federator/retry_queue.ex index 5f1d430084..e4340af7c1 100644 --- a/lib/pleroma/web/federator/retry_queue.ex +++ b/lib/pleroma/web/federator/retry_queue.ex @@ -7,20 +7,34 @@ defmodule Pleroma.Web.Federator.RetryQueue do require Logger - # initial timeout, 5 min - @initial_timeout 30_000 + # seconds + @initial_timeout 30 @max_retries 5 + @max_jobs 20 + def init(args) do - {:ok, args} + queue_table = :ets.new(:pleroma_retry_queue, [:bag, :protected]) + + {:ok, %{args | queue_table: queue_table, running_jobs: :sets.new()}} end def start_link() do - enabled = Pleroma.Config.get([:retry_queue, :enabled], false) + enabled = + if Mix.env() == :test, do: true, else: Pleroma.Config.get([:retry_queue, :enabled], false) if enabled do Logger.info("Starting retry queue") - GenServer.start_link(__MODULE__, %{delivered: 0, dropped: 0}, name: __MODULE__) + + linkres = + GenServer.start_link( + __MODULE__, + %{delivered: 0, dropped: 0, queue_table: nil, running_jobs: nil}, + name: __MODULE__ + ) + + maybe_kickoff_timer() + linkres else Logger.info("Retry queue disabled") :ignore @@ -31,6 +45,14 @@ def enqueue(data, transport, retries \\ 0) do GenServer.cast(__MODULE__, {:maybe_enqueue, data, transport, retries + 1}) end + def get_stats() do + GenServer.call(__MODULE__, :get_stats) + end + + def reset_stats() do + GenServer.call(__MODULE__, :reset_stats) + end + def get_retry_params(retries) do if retries > @max_retries do {:drop, "Max retries reached"} @@ -39,16 +61,118 @@ def get_retry_params(retries) do end end - def handle_cast({:maybe_enqueue, data, transport, retries}, %{dropped: drop_count} = state) do + def get_retry_timer_interval() do + Pleroma.Config.get([:retry_queue, :interval], 1000) + end + + defp ets_count_expires(table, current_time) do + :ets.select_count( + table, + [ + { + {:"$1", :"$2"}, + [{:"=<", :"$1", {:const, current_time}}], + [true] + } + ] + ) + end + + defp ets_pop_n_expired(table, current_time, desired) do + {popped, _continuation} = + :ets.select( + table, + [ + { + {:"$1", :"$2"}, + [{:"=<", :"$1", {:const, current_time}}], + [:"$_"] + } + ], + desired + ) + + popped + |> List.foldl(true, fn e, acc -> + :ets.delete_object(table, e) + acc + end) + + popped + end + + def maybe_start_job(running_jobs, queue_table) do + # we don't want to hit the ets or the DateTime more times than we have to + # could optimize slightly further by not using the count, and instead grabbing + # up to N objects early... + current_time = DateTime.to_unix(DateTime.utc_now()) + n_running_jobs = :sets.size(running_jobs) + + if n_running_jobs < @max_jobs do + n_ready_jobs = ets_count_expires(queue_table, current_time) + + if n_ready_jobs > 0 do + # figure out how many we could start + available_job_slots = @max_jobs - n_running_jobs + start_n_jobs(running_jobs, queue_table, current_time, available_job_slots) + else + running_jobs + end + else + running_jobs + end + end + + defp start_n_jobs(running_jobs, _queue_table, _current_time, 0) do + running_jobs + end + + defp start_n_jobs(running_jobs, queue_table, current_time, available_job_slots) + when available_job_slots > 0 do + candidates = ets_pop_n_expired(queue_table, current_time, available_job_slots) + + candidates + |> List.foldl(running_jobs, fn {_, e}, rj -> + {:ok, pid} = Task.start(fn -> worker(e) end) + mref = Process.monitor(pid) + :sets.add_element(mref, rj) + end) + end + + def worker({:send, data, transport, retries}) do + case transport.publish_one(data) do + {:ok, _} -> + GenServer.cast(__MODULE__, :inc_delivered) + :delivered + + {:error, _reason} -> + enqueue(data, transport, retries) + :retry + end + end + + def handle_call(:get_stats, _from, %{delivered: delivery_count, dropped: drop_count} = state) do + {:reply, %{delivered: delivery_count, dropped: drop_count}, state} + end + + def handle_call(:reset_stats, _from, %{delivered: delivery_count, dropped: drop_count} = state) do + {:reply, %{delivered: delivery_count, dropped: drop_count}, + %{state | delivered: 0, dropped: 0}} + end + + def handle_cast(:reset_stats, state) do + {:noreply, %{state | delivered: 0, dropped: 0}} + end + + def handle_cast( + {:maybe_enqueue, data, transport, retries}, + %{dropped: drop_count, queue_table: queue_table, running_jobs: running_jobs} = state + ) do case get_retry_params(retries) do {:retry, timeout} -> - Process.send_after( - __MODULE__, - {:send, data, transport, retries}, - timeout - ) - - {:noreply, state} + :ets.insert(queue_table, {timeout, {:send, data, transport, retries}}) + running_jobs = maybe_start_job(running_jobs, queue_table) + {:noreply, %{state | running_jobs: running_jobs}} {:drop, message} -> Logger.debug(message) @@ -56,6 +180,20 @@ def handle_cast({:maybe_enqueue, data, transport, retries}, %{dropped: drop_coun end end + def handle_cast(:kickoff_timer, state) do + retry_interval = get_retry_timer_interval() + Process.send_after(__MODULE__, :retry_timer_run, retry_interval) + {:noreply, state} + end + + def handle_cast(:inc_delivered, %{delivered: delivery_count} = state) do + {:noreply, %{state | delivered: delivery_count + 1}} + end + + def handle_cast(:inc_dropped, %{dropped: drop_count} = state) do + {:noreply, %{state | dropped: drop_count + 1}} + end + def handle_info({:send, data, transport, retries}, %{delivered: delivery_count} = state) do case transport.publish_one(data) do {:ok, _} -> @@ -67,12 +205,39 @@ def handle_info({:send, data, transport, retries}, %{delivered: delivery_count} end end + def handle_info( + :retry_timer_run, + %{queue_table: queue_table, running_jobs: running_jobs} = state + ) do + maybe_kickoff_timer() + running_jobs = maybe_start_job(running_jobs, queue_table) + {:noreply, %{state | running_jobs: running_jobs}} + end + + def handle_info({:DOWN, ref, :process, _pid, _reason}, state) do + %{running_jobs: running_jobs, queue_table: queue_table} = state + running_jobs = :sets.del_element(ref, running_jobs) + running_jobs = maybe_start_job(running_jobs, queue_table) + {:noreply, %{state | running_jobs: running_jobs}} + end + def handle_info(unknown, state) do Logger.debug("RetryQueue: don't know what to do with #{inspect(unknown)}, ignoring") {:noreply, state} end - defp growth_function(retries) do - round(@initial_timeout * :math.pow(retries, 3)) + if Mix.env() == :test do + defp growth_function(_retries) do + _shutit = @initial_timeout + DateTime.to_unix(DateTime.utc_now()) - 1 + end + else + defp growth_function(retries) do + round(@initial_timeout * :math.pow(retries, 3)) + DateTime.to_unix(DateTime.utc_now()) + end + end + + defp maybe_kickoff_timer() do + GenServer.cast(__MODULE__, :kickoff_timer) end end diff --git a/test/web/retry_queue_test.exs b/test/web/retry_queue_test.exs index 9351b6c24b..ecb3ce5d06 100644 --- a/test/web/retry_queue_test.exs +++ b/test/web/retry_queue_test.exs @@ -3,7 +3,8 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule MockActivityPub do - def publish_one(ret) do + def publish_one({ret, waiter}) do + send(waiter, :complete) {ret, "success"} end end @@ -15,21 +16,33 @@ defmodule Pleroma.Web.Federator.RetryQueueTest do @small_retry_count 0 @hopeless_retry_count 10 + setup do + RetryQueue.reset_stats() + end + + test "RetryQueue responds to stats request" do + assert %{delivered: 0, dropped: 0} == RetryQueue.get_stats() + end + test "failed posts are retried" do {:retry, _timeout} = RetryQueue.get_retry_params(@small_retry_count) - assert {:noreply, %{delivered: 1}} == - RetryQueue.handle_info({:send, :ok, MockActivityPub, @small_retry_count}, %{ - delivered: 0 - }) + wait_task = + Task.async(fn -> + receive do + :complete -> :ok + end + end) + + RetryQueue.enqueue({:ok, wait_task.pid}, MockActivityPub, @small_retry_count) + Task.await(wait_task) + assert %{delivered: 1, dropped: 0} == RetryQueue.get_stats() end test "posts that have been tried too many times are dropped" do {:drop, _timeout} = RetryQueue.get_retry_params(@hopeless_retry_count) - assert {:noreply, %{dropped: 1}} == - RetryQueue.handle_cast({:maybe_enqueue, %{}, nil, @hopeless_retry_count}, %{ - dropped: 0 - }) + RetryQueue.enqueue({:ok, nil}, MockActivityPub, @hopeless_retry_count) + assert %{delivered: 0, dropped: 1} == RetryQueue.get_stats() end end