Merge branch 'oban/rich-media-invalid' into 'develop'
Rework some Rich Media functionality for better error handling. See merge request pleroma/pleroma!4182
This commit is contained in:
commit
4544505761
8 changed files with 74 additions and 50 deletions
1
changelog.d/oban-rich-media-errors.fix
Normal file
1
changelog.d/oban-rich-media-errors.fix
Normal file
|
@ -0,0 +1 @@
|
|||
Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again.
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
defmodule Pleroma.Web.RichMedia.Backfill do
|
||||
alias Pleroma.Web.RichMedia.Card
|
||||
alias Pleroma.Web.RichMedia.Helpers
|
||||
alias Pleroma.Web.RichMedia.Parser
|
||||
alias Pleroma.Web.RichMedia.Parser.TTL
|
||||
alias Pleroma.Workers.RichMediaWorker
|
||||
|
@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do
|
|||
Pleroma.Web.ActivityPub.ActivityPub
|
||||
)
|
||||
|
||||
@spec run(map()) ::
|
||||
:ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()}
|
||||
@spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors()
|
||||
def run(%{"url" => url} = args) do
|
||||
url_hash = Card.url_to_hash(url)
|
||||
|
||||
|
@ -33,22 +33,16 @@ def run(%{"url" => url} = args) do
|
|||
end
|
||||
|
||||
warm_cache(url_hash, card)
|
||||
:ok
|
||||
|
||||
{:error, {:invalid_metadata, fields}} ->
|
||||
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
|
||||
{:error, type} = error
|
||||
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
|
||||
negative_cache(url_hash)
|
||||
error
|
||||
|
||||
{:error, :body_too_large} ->
|
||||
Logger.error("Rich media error for #{url}: :body_too_large")
|
||||
negative_cache(url_hash)
|
||||
|
||||
{:error, {:content_type, type}} ->
|
||||
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
|
||||
negative_cache(url_hash)
|
||||
|
||||
e ->
|
||||
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
|
||||
{:error, e}
|
||||
{:error, type} = error
|
||||
when type in [:get, :head] ->
|
||||
error
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -5,26 +5,38 @@
|
|||
defmodule Pleroma.Web.RichMedia.Helpers do
|
||||
alias Pleroma.Config
|
||||
|
||||
require Logger
|
||||
|
||||
@type get_errors :: {:error, :body_too_large | :content_type | :head | :get}
|
||||
|
||||
@spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors()
|
||||
def rich_media_get(url) do
|
||||
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
|
||||
|
||||
head_check =
|
||||
case Pleroma.HTTP.head(url, headers, http_options()) do
|
||||
# If the HEAD request didn't reach the server for whatever reason,
|
||||
# we assume the GET that comes right after won't either
|
||||
{:error, _} = e ->
|
||||
e
|
||||
with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <-
|
||||
{:head, Pleroma.HTTP.head(url, headers, http_options())},
|
||||
{_, :ok} <- {:content_type, check_content_type(headers)},
|
||||
{_, :ok} <- {:content_length, check_content_length(headers)},
|
||||
{_, {:ok, %Tesla.Env{status: 200, body: body}}} <-
|
||||
{:get, Pleroma.HTTP.get(url, headers, http_options())} do
|
||||
{:ok, body}
|
||||
else
|
||||
{:head, _} ->
|
||||
Logger.debug("Rich media error for #{url}: HTTP HEAD failed")
|
||||
{:error, :head}
|
||||
|
||||
{:ok, %Tesla.Env{status: 200, headers: headers}} ->
|
||||
with :ok <- check_content_type(headers),
|
||||
:ok <- check_content_length(headers),
|
||||
do: :ok
|
||||
{:content_type, {_, type}} ->
|
||||
Logger.debug("Rich media error for #{url}: content-type is #{type}")
|
||||
{:error, :content_type}
|
||||
|
||||
_ ->
|
||||
:ok
|
||||
{:content_length, {_, length}} ->
|
||||
Logger.debug("Rich media error for #{url}: content-length is #{length}")
|
||||
{:error, :body_too_large}
|
||||
|
||||
{:get, _} ->
|
||||
Logger.debug("Rich media error for #{url}: HTTP GET failed")
|
||||
{:error, :get}
|
||||
end
|
||||
|
||||
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
|
||||
end
|
||||
|
||||
defp check_content_type(headers) do
|
||||
|
@ -32,7 +44,7 @@ defp check_content_type(headers) do
|
|||
{_, content_type} ->
|
||||
case Plug.Conn.Utils.media_type(content_type) do
|
||||
{:ok, "text", "html", _} -> :ok
|
||||
_ -> {:error, {:content_type, content_type}}
|
||||
_ -> {:error, content_type}
|
||||
end
|
||||
|
||||
_ ->
|
||||
|
@ -47,7 +59,7 @@ defp check_content_length(headers) do
|
|||
{_, maybe_content_length} ->
|
||||
case Integer.parse(maybe_content_length) do
|
||||
{content_length, ""} when content_length <= max_body -> :ok
|
||||
{_, ""} -> {:error, :body_too_large}
|
||||
{_, ""} -> {:error, maybe_content_length}
|
||||
_ -> :ok
|
||||
end
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule Pleroma.Web.RichMedia.Parser do
|
||||
alias Pleroma.Web.RichMedia.Helpers
|
||||
require Logger
|
||||
|
||||
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
|
||||
|
@ -11,24 +12,26 @@ defp parsers do
|
|||
Pleroma.Config.get([:rich_media, :parsers])
|
||||
end
|
||||
|
||||
def parse(nil), do: nil
|
||||
@type parse_errors :: {:error, :rich_media_disabled | :validate}
|
||||
|
||||
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
||||
def parse(url) do
|
||||
@spec parse(String.t()) ::
|
||||
{:ok, map()} | parse_errors() | Helpers.get_errors()
|
||||
def parse(url) when is_binary(url) do
|
||||
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
|
||||
:ok <- validate_page_url(url),
|
||||
{:ok, data} <- parse_url(url) do
|
||||
{_, :ok} <- {:validate, validate_page_url(url)},
|
||||
{_, {:ok, data}} <- {:parse, parse_url(url)} do
|
||||
data = Map.put(data, "url", url)
|
||||
{:ok, data}
|
||||
else
|
||||
{:config, _} -> {:error, :rich_media_disabled}
|
||||
e -> e
|
||||
{:validate, _} -> {:error, :validate}
|
||||
{:parse, error} -> error
|
||||
end
|
||||
end
|
||||
|
||||
defp parse_url(url) do
|
||||
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
|
||||
{:ok, html} <- Floki.parse_document(html) do
|
||||
with {:ok, body} <- Helpers.rich_media_get(url),
|
||||
{:ok, html} <- Floki.parse_document(body) do
|
||||
html
|
||||
|> maybe_parse()
|
||||
|> clean_parsed_data()
|
||||
|
@ -50,8 +53,8 @@ defp check_parsed_data(%{"title" => title} = data)
|
|||
{:ok, data}
|
||||
end
|
||||
|
||||
defp check_parsed_data(data) do
|
||||
{:error, {:invalid_metadata, data}}
|
||||
defp check_parsed_data(_data) do
|
||||
{:error, :invalid_metadata}
|
||||
end
|
||||
|
||||
defp clean_parsed_data(data) do
|
||||
|
|
|
@ -22,7 +22,7 @@ defp get_oembed_url([{"link", attributes, _children} | _]) do
|
|||
end
|
||||
|
||||
defp get_oembed_data(url) do
|
||||
with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
|
||||
with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
|
||||
Jason.decode(json)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -14,7 +14,21 @@ def perform(%Job{args: %{"op" => "expire", "url" => url} = _args}) do
|
|||
end
|
||||
|
||||
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
|
||||
Backfill.run(args)
|
||||
case Backfill.run(args) do
|
||||
:ok ->
|
||||
:ok
|
||||
|
||||
{:error, type}
|
||||
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
|
||||
{:cancel, type}
|
||||
|
||||
{:error, type}
|
||||
when type in [:get, :head] ->
|
||||
{:error, type}
|
||||
|
||||
error ->
|
||||
{:error, error}
|
||||
end
|
||||
end
|
||||
|
||||
@impl Oban.Worker
|
||||
|
|
|
@ -20,7 +20,7 @@ test "returns error when no metadata present" do
|
|||
end
|
||||
|
||||
test "doesn't just add a title" do
|
||||
assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp")
|
||||
assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp")
|
||||
end
|
||||
|
||||
test "parses ogp" do
|
||||
|
@ -96,7 +96,7 @@ test "rejects invalid OGP data" do
|
|||
end
|
||||
|
||||
test "returns error if getting page was not successful" do
|
||||
assert {:error, :overload} = Parser.parse("https://example.com/error")
|
||||
assert {:error, :get} = Parser.parse("https://example.com/error")
|
||||
end
|
||||
|
||||
test "does a HEAD request to check if the body is too large" do
|
||||
|
@ -104,17 +104,17 @@ test "does a HEAD request to check if the body is too large" do
|
|||
end
|
||||
|
||||
test "does a HEAD request to check if the body is html" do
|
||||
assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file")
|
||||
assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file")
|
||||
end
|
||||
|
||||
test "refuses to crawl incomplete URLs" do
|
||||
url = "example.com/ogp"
|
||||
assert :error == Parser.parse(url)
|
||||
assert {:error, :validate} == Parser.parse(url)
|
||||
end
|
||||
|
||||
test "refuses to crawl malformed URLs" do
|
||||
url = "example.com[]/ogp"
|
||||
assert :error == Parser.parse(url)
|
||||
assert {:error, :validate} == Parser.parse(url)
|
||||
end
|
||||
|
||||
test "refuses to crawl URLs of private network from posts" do
|
||||
|
@ -126,7 +126,7 @@ test "refuses to crawl URLs of private network from posts" do
|
|||
"https://pleroma.local/notice/9kCP7V"
|
||||
]
|
||||
|> Enum.each(fn url ->
|
||||
assert :error == Parser.parse(url)
|
||||
assert {:error, :validate} == Parser.parse(url)
|
||||
end)
|
||||
end
|
||||
|
||||
|
|
|
@ -1724,7 +1724,7 @@ def post(url, query, body, headers) do
|
|||
]
|
||||
|
||||
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
|
||||
{:ok, %Tesla.Env{status: 404, body: ""}}
|
||||
{:ok, %Tesla.Env{status: 200, body: ""}}
|
||||
end
|
||||
|
||||
def head("https://example.com/pdf-file", _, _, _) do
|
||||
|
|
Loading…
Reference in a new issue