Merge branch 'rich-media-tests' into 'develop'

Pleroma.Web.RichMedia.Parser: Remove test-specific codepaths

See merge request pleroma/pleroma!4053
This commit is contained in:
feld 2024-02-07 05:47:38 +00:00
commit 72480e7b2f
12 changed files with 188 additions and 196 deletions

View file

View file

@ -8,10 +8,13 @@ defmodule Pleroma.Caching do
@callback put(Cachex.cache(), any(), any(), Keyword.t()) :: {Cachex.status(), boolean()}
@callback put(Cachex.cache(), any(), any()) :: {Cachex.status(), boolean()}
@callback fetch!(Cachex.cache(), any(), function() | nil) :: any()
@callback fetch(Cachex.cache(), any(), function() | nil) ::
{atom(), any()} | {atom(), any(), any()}
# @callback del(Cachex.cache(), any(), Keyword.t()) :: {Cachex.status(), boolean()}
@callback del(Cachex.cache(), any()) :: {Cachex.status(), boolean()}
@callback stream!(Cachex.cache(), any()) :: Enumerable.t()
@callback expire_at(Cachex.cache(), binary(), number()) :: {Cachex.status(), boolean()}
@callback expire(Cachex.cache(), binary(), number()) :: {Cachex.status(), boolean()}
@callback exists?(Cachex.cache(), any()) :: {Cachex.status(), boolean()}
@callback execute!(Cachex.cache(), function()) :: any()
@callback get_and_update(Cachex.cache(), any(), function()) ::

View file

@ -18,53 +18,10 @@ defmodule Pleroma.Web.RichMedia.Helpers do
recv_timeout: 2_000
]
@spec validate_page_url(URI.t() | binary()) :: :ok | :error
defp validate_page_url(page_url) when is_binary(page_url) do
validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
page_url
|> Linkify.Parser.url?(validate_tld: validate_tld)
|> parse_uri(page_url)
end
defp validate_page_url(%URI{host: host, scheme: "https"}) do
cond do
Linkify.Parser.ip?(host) ->
:error
host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
:error
get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
:error
true ->
:ok
end
end
defp validate_page_url(_), do: :error
defp parse_uri(true, url) do
url
|> URI.parse()
|> validate_page_url
end
defp parse_uri(_, _), do: :error
defp get_tld(host) do
host
|> String.split(".")
|> Enum.reverse()
|> hd
end
def fetch_data_for_object(object) do
with true <- @config_impl.get([:rich_media, :enabled]),
{:ok, page_url} <-
HTML.extract_first_external_url_from_object(object),
:ok <- validate_page_url(page_url),
{:ok, rich_media} <- Parser.parse(page_url) do
%{page_url: page_url, rich_media: rich_media}
else

View file

@ -6,6 +6,7 @@ defmodule Pleroma.Web.RichMedia.Parser do
require Logger
@cachex Pleroma.Config.get([:cachex, :provider], Cachex)
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
defp parsers do
Pleroma.Config.get([:rich_media, :parsers])
@ -13,70 +14,66 @@ defp parsers do
def parse(nil), do: {:error, "No URL provided"}
if Pleroma.Config.get(:env) == :test do
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url), do: parse_url(url)
else
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do
with {:ok, data} <- get_cached_or_parse(url),
{:ok, _} <- set_ttl_based_on_image(data, url) do
{:ok, data}
end
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do
with :ok <- validate_page_url(url),
{:ok, data} <- get_cached_or_parse(url),
{:ok, _} <- set_ttl_based_on_image(data, url) do
{:ok, data}
end
end
defp get_cached_or_parse(url) do
case @cachex.fetch(:rich_media_cache, url, fn ->
case parse_url(url) do
{:ok, _} = res ->
{:commit, res}
defp get_cached_or_parse(url) do
case @cachex.fetch(:rich_media_cache, url, fn ->
case parse_url(url) do
{:ok, _} = res ->
{:commit, res}
{:error, reason} = e ->
# Unfortunately we have to log errors here, instead of doing that
# along with ttl setting at the bottom. Otherwise we can get log spam
# if more than one process was waiting for the rich media card
# while it was generated. Ideally we would set ttl here as well,
# so we don't override it number_of_waiters_on_generation
# times, but one, obviously, can't set ttl for not-yet-created entry
# and Cachex doesn't support returning ttl from the fetch callback.
log_error(url, reason)
{:commit, e}
end
end) do
{action, res} when action in [:commit, :ok] ->
case res do
{:ok, _data} = res ->
res
{:error, reason} = e ->
# Unfortunately we have to log errors here, instead of doing that
# along with ttl setting at the bottom. Otherwise we can get log spam
# if more than one process was waiting for the rich media card
# while it was generated. Ideally we would set ttl here as well,
# so we don't override it number_of_waiters_on_generation
# times, but one, obviously, can't set ttl for not-yet-created entry
# and Cachex doesn't support returning ttl from the fetch callback.
log_error(url, reason)
{:commit, e}
end
end) do
{action, res} when action in [:commit, :ok] ->
case res do
{:ok, _data} = res ->
res
{:error, reason} = e ->
if action == :commit, do: set_error_ttl(url, reason)
e
end
{:error, reason} = e ->
if action == :commit, do: set_error_ttl(url, reason)
e
end
{:error, e} ->
{:error, {:cachex_error, e}}
end
{:error, e} ->
{:error, {:cachex_error, e}}
end
end
defp set_error_ttl(_url, :body_too_large), do: :ok
defp set_error_ttl(_url, {:content_type, _}), do: :ok
defp set_error_ttl(_url, :body_too_large), do: :ok
defp set_error_ttl(_url, {:content_type, _}), do: :ok
# The TTL is not set for the errors above, since they are unlikely to change
# with time
# The TTL is not set for the errors above, since they are unlikely to change
# with time
defp set_error_ttl(url, _reason) do
ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
@cachex.expire(:rich_media_cache, url, ttl)
:ok
end
defp set_error_ttl(url, _reason) do
ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
@cachex.expire(:rich_media_cache, url, ttl)
:ok
end
defp log_error(url, {:invalid_metadata, data}) do
Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
end
defp log_error(url, {:invalid_metadata, data}) do
Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
end
defp log_error(url, reason) do
Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
end
defp log_error(url, reason) do
Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
end
@doc """
@ -166,4 +163,46 @@ defp clean_parsed_data(data) do
end)
|> Map.new()
end
@spec validate_page_url(URI.t() | binary()) :: :ok | :error
defp validate_page_url(page_url) when is_binary(page_url) do
validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
page_url
|> Linkify.Parser.url?(validate_tld: validate_tld)
|> parse_uri(page_url)
end
defp validate_page_url(%URI{host: host, scheme: "https"}) do
cond do
Linkify.Parser.ip?(host) ->
:error
host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
:error
get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
:error
true ->
:ok
end
end
defp validate_page_url(_), do: :error
defp parse_uri(true, url) do
url
|> URI.parse()
|> validate_page_url
end
defp parse_uri(_, _), do: :error
defp get_tld(host) do
host
|> String.split(".")
|> Enum.reverse()
|> hd
end
end

View file

@ -1,3 +1,3 @@
<link rel="alternate" type="application/json+oembed"
href="http://example.com/oembed.json"
href="https://example.com/oembed.json"
title="Bacon Lollys oEmbed Profile" />

View file

@ -336,13 +336,7 @@ test "fake statuses' preview card is not cached", %{conn: conn} do
path -> Pleroma.Test.StaticConfig.get(path)
end)
Tesla.Mock.mock(fn
%{
method: :get,
url: "https://example.com/twitter-card"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/twitter_card.html")}
Tesla.Mock.mock_global(fn
env ->
apply(HttpRequestMock, :request, [env])
end)

View file

@ -49,6 +49,7 @@ test "it displays a chat message" do
:chat_message_id_idempotency_key_cache, ^id -> {:ok, "123"}
cache, key -> NullCache.get(cache, key)
end)
|> stub(:fetch, fn :rich_media_cache, _, _ -> {:ok, {:ok, %{}}} end)
chat_message = MessageReferenceView.render("show.json", chat_message_reference: cm_ref)

View file

@ -3,7 +3,7 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.HelpersTest do
use Pleroma.DataCase, async: true
use Pleroma.DataCase, async: false
alias Pleroma.StaticStubbedConfigMock, as: ConfigMock
alias Pleroma.Web.CommonAPI
@ -14,7 +14,7 @@ defmodule Pleroma.Web.RichMedia.HelpersTest do
import Tesla.Mock
setup do
mock(fn env -> apply(HttpRequestMock, :request, [env]) end)
mock_global(fn env -> apply(HttpRequestMock, :request, [env]) end)
ConfigMock
|> stub(:get, fn

View file

@ -3,95 +3,26 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.ParserTest do
use ExUnit.Case, async: true
use Pleroma.DataCase, async: false
alias Pleroma.Web.RichMedia.Parser
import Tesla.Mock
setup do
Tesla.Mock.mock(fn
%{
method: :get,
url: "http://example.com/ogp"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/ogp.html")}
%{
method: :get,
url: "http://example.com/non-ogp"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/non_ogp_embed.html")}
%{
method: :get,
url: "http://example.com/ogp-missing-title"
} ->
%Tesla.Env{
status: 200,
body: File.read!("test/fixtures/rich_media/ogp-missing-title.html")
}
%{
method: :get,
url: "http://example.com/twitter-card"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/twitter_card.html")}
%{
method: :get,
url: "http://example.com/oembed"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/oembed.html")}
%{
method: :get,
url: "http://example.com/oembed.json"
} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/oembed.json")}
%{method: :get, url: "http://example.com/empty"} ->
%Tesla.Env{status: 200, body: "hello"}
%{method: :get, url: "http://example.com/malformed"} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/malformed-data.html")}
%{method: :get, url: "http://example.com/error"} ->
{:error, :overload}
%{
method: :head,
url: "http://example.com/huge-page"
} ->
%Tesla.Env{
status: 200,
headers: [{"content-length", "2000001"}, {"content-type", "text/html"}]
}
%{
method: :head,
url: "http://example.com/pdf-file"
} ->
%Tesla.Env{
status: 200,
headers: [{"content-length", "1000000"}, {"content-type", "application/pdf"}]
}
%{method: :head} ->
%Tesla.Env{status: 404, body: "", headers: []}
end)
:ok
mock_global(fn env -> apply(HttpRequestMock, :request, [env]) end)
end
test "returns error when no metadata present" do
assert {:error, _} = Parser.parse("http://example.com/empty")
assert {:error, _} = Parser.parse("https://example.com/empty")
end
test "doesn't just add a title" do
assert {:error, {:invalid_metadata, _}} = Parser.parse("http://example.com/non-ogp")
assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp")
end
test "parses ogp" do
assert Parser.parse("http://example.com/ogp") ==
assert Parser.parse("https://example.com/ogp") ==
{:ok,
%{
"image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -99,12 +30,12 @@ test "parses ogp" do
"description" =>
"Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.",
"type" => "video.movie",
"url" => "http://example.com/ogp"
"url" => "https://example.com/ogp"
}}
end
test "falls back to <title> when ogp:title is missing" do
assert Parser.parse("http://example.com/ogp-missing-title") ==
assert Parser.parse("https://example.com/ogp-missing-title") ==
{:ok,
%{
"image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -112,12 +43,12 @@ test "falls back to <title> when ogp:title is missing" do
"description" =>
"Directed by Michael Bay. With Sean Connery, Nicolas Cage, Ed Harris, John Spencer.",
"type" => "video.movie",
"url" => "http://example.com/ogp-missing-title"
"url" => "https://example.com/ogp-missing-title"
}}
end
test "parses twitter card" do
assert Parser.parse("http://example.com/twitter-card") ==
assert Parser.parse("https://example.com/twitter-card") ==
{:ok,
%{
"card" => "summary",
@ -125,12 +56,12 @@ test "parses twitter card" do
"image" => "https://farm6.staticflickr.com/5510/14338202952_93595258ff_z.jpg",
"title" => "Small Island Developing States Photo Submission",
"description" => "View the album on Flickr.",
"url" => "http://example.com/twitter-card"
"url" => "https://example.com/twitter-card"
}}
end
test "parses OEmbed and filters HTML tags" do
assert Parser.parse("http://example.com/oembed") ==
assert Parser.parse("https://example.com/oembed") ==
{:ok,
%{
"author_name" => "\u202E\u202D\u202Cbees\u202C",
@ -150,7 +81,7 @@ test "parses OEmbed and filters HTML tags" do
"thumbnail_width" => 150,
"title" => "Bacon Lollys",
"type" => "photo",
"url" => "http://example.com/oembed",
"url" => "https://example.com/oembed",
"version" => "1.0",
"web_page" => "https://www.flickr.com/photos/bees/2362225867/",
"web_page_short_url" => "https://flic.kr/p/4AK2sc",
@ -159,18 +90,18 @@ test "parses OEmbed and filters HTML tags" do
end
test "rejects invalid OGP data" do
assert {:error, _} = Parser.parse("http://example.com/malformed")
assert {:error, _} = Parser.parse("https://example.com/malformed")
end
test "returns error if getting page was not successful" do
assert {:error, :overload} = Parser.parse("http://example.com/error")
assert {:error, :overload} = Parser.parse("https://example.com/error")
end
test "does a HEAD request to check if the body is too large" do
assert {:error, :body_too_large} = Parser.parse("http://example.com/huge-page")
assert {:error, :body_too_large} = Parser.parse("https://example.com/huge-page")
end
test "does a HEAD request to check if the body is html" do
assert {:error, {:content_type, _}} = Parser.parse("http://example.com/pdf-file")
assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file")
end
end

View file

@ -26,9 +26,15 @@ defmodule Pleroma.CachexProxy do
@impl true
defdelegate fetch!(cache, key, func), to: Cachex
@impl true
defdelegate fetch(cache, key, func), to: Cachex
@impl true
defdelegate expire_at(cache, str, num), to: Cachex
@impl true
defdelegate expire(cache, str, num), to: Cachex
@impl true
defdelegate exists?(cache, key), to: Cachex

View file

@ -1059,7 +1059,7 @@ def get("https://example.com/ogp-missing-data", _, _, _) do
}}
end
def get("http://example.com/malformed", _, _, _) do
def get("https://example.com/malformed", _, _, _) do
{:ok,
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/malformed-data.html")}}
end
@ -1472,6 +1472,37 @@ def get("https://yahoo.com/", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/yahoo.html")}}
end
def get("https://example.com/error", _, _, _), do: {:error, :overload}
def get("https://example.com/ogp-missing-title", _, _, _) do
{:ok,
%Tesla.Env{
status: 200,
body: File.read!("test/fixtures/rich_media/ogp-missing-title.html")
}}
end
def get("https://example.com/oembed", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/oembed.html")}}
end
def get("https://example.com/oembed.json", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/oembed.json")}}
end
def get("https://example.com/twitter-card", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/twitter_card.html")}}
end
def get("https://example.com/non-ogp", _, _, _) do
{:ok,
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/non_ogp_embed.html")}}
end
def get("https://example.com/empty", _, _, _) do
{:ok, %Tesla.Env{status: 200, body: "hello"}}
end
def get(url, query, body, headers) do
{:error,
"Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"}
@ -1545,17 +1576,41 @@ def post(url, query, body, headers) do
# Most of the rich media mocks are missing HEAD requests, so we just return 404.
@rich_media_mocks [
"https://example.com/empty",
"https://example.com/error",
"https://example.com/malformed",
"https://example.com/non-ogp",
"https://example.com/oembed",
"https://example.com/oembed.json",
"https://example.com/ogp",
"https://example.com/ogp-missing-data",
"https://example.com/ogp-missing-title",
"https://example.com/twitter-card",
"https://google.com/",
"https://yahoo.com/",
"https://pleroma.local/notice/9kCP7V"
"https://pleroma.local/notice/9kCP7V",
"https://yahoo.com/"
]
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
{:ok, %Tesla.Env{status: 404, body: ""}}
end
def head("https://example.com/pdf-file", _, _, _) do
{:ok,
%Tesla.Env{
status: 200,
headers: [{"content-length", "1000000"}, {"content-type", "application/pdf"}]
}}
end
def head("https://example.com/huge-page", _, _, _) do
{:ok,
%Tesla.Env{
status: 200,
headers: [{"content-length", "2000001"}, {"content-type", "text/html"}]
}}
end
def head(url, query, body, headers) do
{:error,
"Mock response not implemented for HEAD #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"}

View file

@ -28,6 +28,9 @@ def fetch!(_, key, func) do
end
end
@impl true
def fetch(_, key, func), do: func.(key)
@impl true
def get_and_update(_, _, func) do
func.(nil)
@ -36,6 +39,9 @@ def get_and_update(_, _, func) do
@impl true
def expire_at(_, _, _), do: {:ok, true}
@impl true
def expire(_, _, _), do: {:ok, true}
@impl true
def exists?(_, _), do: {:ok, false}