From d124d8645e1455fe5d4d2ab143f3ca09967f9572 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 17 Jul 2024 12:40:07 -0400 Subject: [PATCH 1/3] Rework some Rich Media functionality for better error handling Oban should not retry jobs that are likely to fail again --- changelog.d/oban-rich-media-errors.fix | 1 + lib/pleroma/web/rich_media/backfill.ex | 24 ++++------ lib/pleroma/web/rich_media/helpers.ex | 44 ++++++++++++------- lib/pleroma/web/rich_media/parser.ex | 23 +++++----- lib/pleroma/web/rich_media/parsers/o_embed.ex | 2 +- lib/pleroma/workers/rich_media_worker.ex | 12 ++++- test/pleroma/web/rich_media/parser_test.exs | 12 ++--- test/support/http_request_mock.ex | 2 +- 8 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 changelog.d/oban-rich-media-errors.fix diff --git a/changelog.d/oban-rich-media-errors.fix b/changelog.d/oban-rich-media-errors.fix new file mode 100644 index 0000000000..b904108db9 --- /dev/null +++ b/changelog.d/oban-rich-media-errors.fix @@ -0,0 +1 @@ +Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again. diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex index f1ee83bf0f..a66422e718 100644 --- a/lib/pleroma/web/rich_media/backfill.ex +++ b/lib/pleroma/web/rich_media/backfill.ex @@ -4,6 +4,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do alias Pleroma.Web.RichMedia.Card + alias Pleroma.Web.RichMedia.Helpers alias Pleroma.Web.RichMedia.Parser alias Pleroma.Web.RichMedia.Parser.TTL alias Pleroma.Workers.RichMediaWorker @@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do Pleroma.Web.ActivityPub.ActivityPub ) - @spec run(map()) :: - :ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()} + @spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors() def run(%{"url" => url} = args) do url_hash = Card.url_to_hash(url) @@ -33,22 +33,16 @@ def run(%{"url" => url} = args) do end warm_cache(url_hash, card) + :ok - {:error, {:invalid_metadata, fields}} -> - Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}") + {:error, type} = error + when type in [:invalid_metadata, :body_too_large, :content_type, :validate] -> negative_cache(url_hash) + error - {:error, :body_too_large} -> - Logger.error("Rich media error for #{url}: :body_too_large") - negative_cache(url_hash) - - {:error, {:content_type, type}} -> - Logger.debug("Rich media error for #{url}: :content_type is #{type}") - negative_cache(url_hash) - - e -> - Logger.debug("Rich media error for #{url}: #{inspect(e)}") - {:error, e} + {:error, type} = error + when type in [:get, :head] -> + error end end diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index ea41bd285a..fba23c6577 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -5,26 +5,38 @@ defmodule Pleroma.Web.RichMedia.Helpers do alias Pleroma.Config + require Logger + + @type get_errors :: {:error, :body_too_large | :content_type | :head | :get} + + @spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors() def rich_media_get(url) do headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}] - head_check = - case Pleroma.HTTP.head(url, headers, http_options()) do - # If the HEAD request didn't reach the server for whatever reason, - # we assume the GET that comes right after won't either - {:error, _} = e -> - e + with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <- + {:head, Pleroma.HTTP.head(url, headers, http_options())}, + {_, :ok} <- {:content_type, check_content_type(headers)}, + {_, :ok} <- {:content_length, check_content_length(headers)}, + {_, {:ok, %Tesla.Env{status: 200, body: body}}} <- + {:get, Pleroma.HTTP.get(url, headers, http_options())} do + {:ok, body} + else + {:head, _} -> + Logger.debug("Rich media error for #{url}: HTTP HEAD failed") + {:error, :head} - {:ok, %Tesla.Env{status: 200, headers: headers}} -> - with :ok <- check_content_type(headers), - :ok <- check_content_length(headers), - do: :ok + {:content_type, {_, type}} -> + Logger.debug("Rich media error for #{url}: content-type is #{type}") + {:error, :content_type} - _ -> - :ok - end + {:content_length, {_, length}} -> + Logger.debug("Rich media error for #{url}: content-length is #{length}") + {:error, :body_too_large} - with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options()) + {:get, _} -> + Logger.debug("Rich media error for #{url}: HTTP GET failed") + {:error, :get} + end end defp check_content_type(headers) do @@ -32,7 +44,7 @@ defp check_content_type(headers) do {_, content_type} -> case Plug.Conn.Utils.media_type(content_type) do {:ok, "text", "html", _} -> :ok - _ -> {:error, {:content_type, content_type}} + _ -> {:error, content_type} end _ -> @@ -47,7 +59,7 @@ defp check_content_length(headers) do {_, maybe_content_length} -> case Integer.parse(maybe_content_length) do {content_length, ""} when content_length <= max_body -> :ok - {_, ""} -> {:error, :body_too_large} + {_, ""} -> {:error, maybe_content_length} _ -> :ok end diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index 7f6b5d388a..a3a522d7ab 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parser do + alias Pleroma.Web.RichMedia.Helpers require Logger @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) @@ -11,24 +12,26 @@ defp parsers do Pleroma.Config.get([:rich_media, :parsers]) end - def parse(nil), do: nil + @type parse_errors :: {:error, :rich_media_disabled | :validate} - @spec parse(String.t()) :: {:ok, map()} | {:error, any()} - def parse(url) do + @spec parse(String.t()) :: + {:ok, map()} | parse_errors() | Helpers.get_errors() + def parse(url) when is_binary(url) do with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])}, - :ok <- validate_page_url(url), - {:ok, data} <- parse_url(url) do + {_, :ok} <- {:validate, validate_page_url(url)}, + {_, {:ok, data}} <- {:parse, parse_url(url)} do data = Map.put(data, "url", url) {:ok, data} else {:config, _} -> {:error, :rich_media_disabled} - e -> e + {:validate, _} -> {:error, :validate} + {:parse, error} -> error end end defp parse_url(url) do - with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url), - {:ok, html} <- Floki.parse_document(html) do + with {:ok, body} <- Helpers.rich_media_get(url), + {:ok, html} <- Floki.parse_document(body) do html |> maybe_parse() |> clean_parsed_data() @@ -50,8 +53,8 @@ defp check_parsed_data(%{"title" => title} = data) {:ok, data} end - defp check_parsed_data(data) do - {:error, {:invalid_metadata, data}} + defp check_parsed_data(_data) do + {:error, :invalid_metadata} end defp clean_parsed_data(data) do diff --git a/lib/pleroma/web/rich_media/parsers/o_embed.ex b/lib/pleroma/web/rich_media/parsers/o_embed.ex index 0f303176ce..35ec144265 100644 --- a/lib/pleroma/web/rich_media/parsers/o_embed.ex +++ b/lib/pleroma/web/rich_media/parsers/o_embed.ex @@ -22,7 +22,7 @@ defp get_oembed_url([{"link", attributes, _children} | _]) do end defp get_oembed_data(url) do - with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do + with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do Jason.decode(json) end end diff --git a/lib/pleroma/workers/rich_media_worker.ex b/lib/pleroma/workers/rich_media_worker.ex index ecc980a28a..0db97633f3 100644 --- a/lib/pleroma/workers/rich_media_worker.ex +++ b/lib/pleroma/workers/rich_media_worker.ex @@ -14,7 +14,17 @@ def perform(%Job{args: %{"op" => "expire", "url" => url} = _args}) do end def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do - Backfill.run(args) + case Backfill.run(args) do + :ok -> + :ok + + {:error, type} + when type in [:invalid_metadata, :body_too_large, :content_type, :validate] -> + :cancel + + error -> + {:error, error} + end end @impl Oban.Worker diff --git a/test/pleroma/web/rich_media/parser_test.exs b/test/pleroma/web/rich_media/parser_test.exs index a5f2563a20..8fd75b57a6 100644 --- a/test/pleroma/web/rich_media/parser_test.exs +++ b/test/pleroma/web/rich_media/parser_test.exs @@ -20,7 +20,7 @@ test "returns error when no metadata present" do end test "doesn't just add a title" do - assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp") + assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp") end test "parses ogp" do @@ -96,7 +96,7 @@ test "rejects invalid OGP data" do end test "returns error if getting page was not successful" do - assert {:error, :overload} = Parser.parse("https://example.com/error") + assert {:error, :get} = Parser.parse("https://example.com/error") end test "does a HEAD request to check if the body is too large" do @@ -104,17 +104,17 @@ test "does a HEAD request to check if the body is too large" do end test "does a HEAD request to check if the body is html" do - assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file") + assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file") end test "refuses to crawl incomplete URLs" do url = "example.com/ogp" - assert :error == Parser.parse(url) + assert {:error, :validate} == Parser.parse(url) end test "refuses to crawl malformed URLs" do url = "example.com[]/ogp" - assert :error == Parser.parse(url) + assert {:error, :validate} == Parser.parse(url) end test "refuses to crawl URLs of private network from posts" do @@ -126,7 +126,7 @@ test "refuses to crawl URLs of private network from posts" do "https://pleroma.local/notice/9kCP7V" ] |> Enum.each(fn url -> - assert :error == Parser.parse(url) + assert {:error, :validate} == Parser.parse(url) end) end diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex index 20e410424b..ed044cf989 100644 --- a/test/support/http_request_mock.ex +++ b/test/support/http_request_mock.ex @@ -1724,7 +1724,7 @@ def post(url, query, body, headers) do ] def head(url, _query, _body, _headers) when url in @rich_media_mocks do - {:ok, %Tesla.Env{status: 404, body: ""}} + {:ok, %Tesla.Env{status: 200, body: ""}} end def head("https://example.com/pdf-file", _, _, _) do From 1e0d5934d53f95ac7737e844cf060d2cfe1be98d Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 17 Jul 2024 12:51:38 -0400 Subject: [PATCH 2/3] Fix return for cancelling job --- lib/pleroma/workers/rich_media_worker.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pleroma/workers/rich_media_worker.ex b/lib/pleroma/workers/rich_media_worker.ex index 0db97633f3..a94a85ec71 100644 --- a/lib/pleroma/workers/rich_media_worker.ex +++ b/lib/pleroma/workers/rich_media_worker.ex @@ -20,7 +20,7 @@ def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do {:error, type} when type in [:invalid_metadata, :body_too_large, :content_type, :validate] -> - :cancel + {:cancel, type} error -> {:error, error} From f753bd33805159a08cc8b447daa1c9ace137deab Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 17 Jul 2024 13:12:51 -0400 Subject: [PATCH 3/3] Explicitly handle the GET and HEAD errors --- lib/pleroma/workers/rich_media_worker.ex | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/pleroma/workers/rich_media_worker.ex b/lib/pleroma/workers/rich_media_worker.ex index a94a85ec71..30f9d9e9ee 100644 --- a/lib/pleroma/workers/rich_media_worker.ex +++ b/lib/pleroma/workers/rich_media_worker.ex @@ -22,6 +22,10 @@ def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do when type in [:invalid_metadata, :body_too_large, :content_type, :validate] -> {:cancel, type} + {:error, type} + when type in [:get, :head] -> + {:error, type} + error -> {:error, error} end