From 728643b623924144c71143a5780f6039e0ce42f6 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 8 Nov 2022 17:06:16 -0600 Subject: [PATCH 1/4] LanguageDetector: strip non-language text to (hopefully) improve accuracy --- lib/pleroma/language/language_detector.ex | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/pleroma/language/language_detector.ex b/lib/pleroma/language/language_detector.ex index b19eb45711..0be69d220c 100644 --- a/lib/pleroma/language/language_detector.ex +++ b/lib/pleroma/language/language_detector.ex @@ -15,10 +15,18 @@ def missing_dependencies do end end + # Strip tags from text, etc. + defp prepare_text(text) do + text + |> Floki.parse_fragment!() + |> Floki.filter_out(".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre") + |> Floki.text() + end + def detect(text) do provider = get_provider() - {:ok, text} = text |> FastSanitize.strip_tags() + text = prepare_text(text) word_count = text |> String.split(~r/\s+/) |> Enum.count() if word_count < @words_threshold or !provider or !provider.configured? do From f844c4ba13231a76ac163c773ac4ffeb8268c81a Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 8 Nov 2022 17:23:41 -0600 Subject: [PATCH 2/4] ActivityDraft: detect language from content_html so it can strip links --- lib/pleroma/web/common_api/activity_draft.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pleroma/web/common_api/activity_draft.ex b/lib/pleroma/web/common_api/activity_draft.ex index fdc9999f55..dc7b4c6097 100644 --- a/lib/pleroma/web/common_api/activity_draft.ex +++ b/lib/pleroma/web/common_api/activity_draft.ex @@ -236,7 +236,7 @@ defp sensitive(draft) do defp language(draft) do language = Utils.get_valid_language(draft.params[:language]) || - LanguageDetector.detect(draft.full_payload) + LanguageDetector.detect(draft.content_html <> " " <> draft.summary) %__MODULE__{draft | language: language} end From 0ed5f9dcab41bd327668df51d8d3dab1992ceff5 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 8 Nov 2022 17:25:09 -0600 Subject: [PATCH 3/4] Docker: use the lightweight fasttext model --- Dockerfile | 6 +++--- config/docker.exs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 530c6f64f1..fd521245bd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,7 @@ LABEL maintainer="hello@soapbox.pub" \ org.opencontainers.image.description="Rebased" \ org.opencontainers.image.authors="hello@soapbox.pub" \ org.opencontainers.image.vendor="soapbox.pub" \ - org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/soapbox-be" \ + org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/rebased" \ org.opencontainers.image.licenses="AGPL-3.0" \ org.opencontainers.image.url="https://soapbox.pub" \ org.opencontainers.image.revision=$VCS_REF \ @@ -48,8 +48,8 @@ RUN apt-get update &&\ mkdir -p /etc/pleroma &&\ chown -R pleroma /etc/pleroma &&\ mkdir -p /usr/share/fasttext &&\ - curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o /usr/share/fasttext/lid.176.bin &&\ - chmod 0644 /usr/share/fasttext/lid.176.bin + curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz -o /usr/share/fasttext/lid.176.ftz &&\ + chmod 0644 /usr/share/fasttext/lid.176.ftz USER pleroma diff --git a/config/docker.exs b/config/docker.exs index 40ea225b62..b204d89edf 100644 --- a/config/docker.exs +++ b/config/docker.exs @@ -36,7 +36,7 @@ provider: Pleroma.Language.LanguageDetector.Fasttext config :pleroma, Pleroma.Language.LanguageDetector.Fasttext, - model: "/usr/share/fasttext/lid.176.bin" + model: "/usr/share/fasttext/lid.176.ftz" # We can't store the secrets in this file, since this is baked into the docker image if not File.exists?("/var/lib/pleroma/secret.exs") do From 6dc55531db10690f926f63ab9b5f83623d56ef07 Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 8 Nov 2022 17:47:17 -0600 Subject: [PATCH 4/4] mix format --- lib/pleroma/language/language_detector.ex | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/pleroma/language/language_detector.ex b/lib/pleroma/language/language_detector.ex index 0be69d220c..42d200a287 100644 --- a/lib/pleroma/language/language_detector.ex +++ b/lib/pleroma/language/language_detector.ex @@ -19,7 +19,9 @@ def missing_dependencies do defp prepare_text(text) do text |> Floki.parse_fragment!() - |> Floki.filter_out(".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre") + |> Floki.filter_out( + ".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre" + ) |> Floki.text() end