From 5a39866388c411f2bcee9848352f8c420513f34f Mon Sep 17 00:00:00 2001 From: Ekaterina Vaartis Date: Sat, 27 Aug 2022 01:43:59 +0300 Subject: [PATCH] Specifically strip mentions for search indexing --- lib/mix/tasks/pleroma/search/meilisearch.ex | 1 + lib/pleroma/search/meilisearch.ex | 3 ++- priv/scrubbers/search_indexing.ex | 24 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 priv/scrubbers/search_indexing.ex diff --git a/lib/mix/tasks/pleroma/search/meilisearch.ex b/lib/mix/tasks/pleroma/search/meilisearch.ex index 72a5582282..8379a0c252 100644 --- a/lib/mix/tasks/pleroma/search/meilisearch.ex +++ b/lib/mix/tasks/pleroma/search/meilisearch.ex @@ -13,6 +13,7 @@ defmodule Mix.Tasks.Pleroma.Search.Meilisearch do def run(["index"]) do start_pleroma() + Pleroma.HTML.compile_scrubbers() meili_version = ( diff --git a/lib/pleroma/search/meilisearch.ex b/lib/pleroma/search/meilisearch.ex index 0b90971b18..7af7f460a6 100644 --- a/lib/pleroma/search/meilisearch.ex +++ b/lib/pleroma/search/meilisearch.ex @@ -122,7 +122,8 @@ def object_to_search_data(object) do end content = - with {:ok, scrubbed} <- FastSanitize.strip_tags(content_str), + with {:ok, scrubbed} <- + FastSanitize.Sanitizer.scrub(content_str, Pleroma.HTML.Scrubber.SearchIndexing), trimmed <- String.trim(scrubbed) do trimmed end diff --git a/priv/scrubbers/search_indexing.ex b/priv/scrubbers/search_indexing.ex new file mode 100644 index 0000000000..02756ab797 --- /dev/null +++ b/priv/scrubbers/search_indexing.ex @@ -0,0 +1,24 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.HTML.Scrubber.SearchIndexing do + @moduledoc """ + An HTML scrubbing policy that scrubs things for searching. 
+  """
+
+  require FastSanitize.Sanitizer.Meta
+  alias FastSanitize.Sanitizer.Meta
+
+  # Mentions are removed explicitly: an <a> whose "class" value contains "mention" scrubs to nil.
+  # Any other <a> scrubs to its children, i.e. the tag is stripped (text, presumably, is kept).
+  # NOTE(review): substring match, so e.g. class "mention hashtag" also hits - confirm intent.
+  def scrub({:a, attrs, children}) do
+    mention? =
+      Enum.any?(attrs, fn {k, v} -> k == "class" and String.contains?(v, "mention") end)
+    if mention?, do: nil, else: children
+  end
+
+  Meta.strip_comments()
+  Meta.strip_everything_not_covered()
+end