From 4c1d0dbb69b055847d71eb4485bf27e89d9fab71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?marcin=20miko=C5=82ajczak?=
Date: Thu, 3 Nov 2022 00:13:09 +0100
Subject: [PATCH] Language detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: marcin mikołajczak
---
 config/description.exs                       | 22 ++++++++
 lib/pleroma/application_requirements.ex      | 22 +++++++-
 lib/pleroma/language/language_detector.ex    | 55 +++++++++++++++++++
 .../language/language_detector/fasttext.ex   | 52 ++++++++++++++++++
 .../language/language_detector/provider.ex   | 16 ++++++
 lib/pleroma/language/translation.ex          | 12 ++--
 lib/pleroma/web/common_api/activity_draft.ex | 15 ++---
 7 files changed, 178 insertions(+), 16 deletions(-)
 create mode 100644 lib/pleroma/language/language_detector.ex
 create mode 100644 lib/pleroma/language/language_detector/fasttext.ex
 create mode 100644 lib/pleroma/language/language_detector/provider.ex

diff --git a/config/description.exs b/config/description.exs
index bf50ac1dab..f3395863c8 100644
--- a/config/description.exs
+++ b/config/description.exs
@@ -3594,5 +3594,27 @@
         suggestions: ["YOUR_API_KEY"]
       }
     ]
+  },
+  %{
+    group: :pleroma,
+    key: Pleroma.Language.LanguageDetector,
+    type: :group,
+    description: "Language detection providers",
+    children: [
+      %{
+        key: :provider,
+        type: :module,
+        suggestions: [
+          Pleroma.Language.LanguageDetector.Fasttext
+        ]
+      },
+      %{
+        group: {:subgroup, Pleroma.Language.LanguageDetector.Fasttext},
+        key: :model,
+        label: "fastText language detection model",
+        type: :string,
+        suggestions: ["/usr/share/fasttext/lid.176.bin"]
+      }
+    ]
   }
 ]
diff --git a/lib/pleroma/application_requirements.ex b/lib/pleroma/application_requirements.ex
index 44b1c1705e..94d1ef7731 100644
--- a/lib/pleroma/application_requirements.ex
+++ b/lib/pleroma/application_requirements.ex
@@ -187,7 +187,27 @@ defp check_system_commands!(:ok) do
         false
       end
 
-    if Enum.all?([preview_proxy_commands_status | filter_commands_statuses], & &1) do
+    language_detector_commands_status =
+      if Pleroma.Language.LanguageDetector.missing_dependencies() == [] do
+        true
+      else
+        Logger.error(
+          "The following dependencies required by the currently enabled " <>
+            "language detection provider are not installed: " <>
+            inspect(Pleroma.Language.LanguageDetector.missing_dependencies())
+        )
+
+        false
+      end
+
+    if Enum.all?(
+         [
+           preview_proxy_commands_status,
+           language_detector_commands_status
+           | filter_commands_statuses
+         ],
+         & &1
+       ) do
       :ok
     else
       {:error,
diff --git a/lib/pleroma/language/language_detector.ex b/lib/pleroma/language/language_detector.ex
new file mode 100644
index 0000000000..3901a8b90b
--- /dev/null
+++ b/lib/pleroma/language/language_detector.ex
@@ -0,0 +1,55 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Language.LanguageDetector do
+  @moduledoc """
+  Detects the language of a text using the provider module configured under
+  `[Pleroma.Language.LanguageDetector, :provider]`.
+  """
+
+  # Texts with fewer words than this are never passed to the provider.
+  @words_threshold 4
+
+  @doc """
+  Returns the dependencies that the configured provider reports as missing.
+
+  Returns an empty list when no provider is configured, so that callers can
+  compare the result against `[]`.
+  """
+  @spec missing_dependencies() :: [String.t()]
+  def missing_dependencies do
+    provider = get_provider()
+
+    if provider do
+      provider.missing_dependencies()
+    else
+      []
+    end
+  end
+
+  @doc """
+  Detects the language of `text` after `FastSanitize.strip_tags/1` is applied.
+
+  Returns `nil` when the stripped text has fewer than `@words_threshold`
+  words, when no provider is set, or when the provider is not configured.
+  Otherwise returns the result of the provider's `detect/1`.
+  """
+  @spec detect(String.t()) :: String.t() | nil
+  def detect(text) do
+    provider = get_provider()
+
+    {:ok, text} = text |> FastSanitize.strip_tags()
+    word_count = text |> String.split(~r/\s+/) |> Enum.count()
+
+    if word_count < @words_threshold or !provider or !provider.configured? do
+      nil
+    else
+      provider.detect(text)
+    end
+  end
+
+  defp get_provider do
+    Pleroma.Config.get([__MODULE__, :provider])
+  end
+end
diff --git a/lib/pleroma/language/language_detector/fasttext.ex b/lib/pleroma/language/language_detector/fasttext.ex
new file mode 100644
index 0000000000..d479d21255
--- /dev/null
+++ b/lib/pleroma/language/language_detector/fasttext.ex
@@ -0,0 +1,52 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Language.LanguageDetector.Fasttext do
+  @moduledoc """
+  Language detection provider that runs the `fasttext` command line tool
+  with the model configured under
+  `[Pleroma.Language.LanguageDetector.Fasttext, :model]`.
+  """
+
+  import Pleroma.Web.Utils.Guards, only: [not_empty_string: 1]
+
+  alias Pleroma.Language.LanguageDetector.Provider
+
+  @behaviour Provider
+
+  @impl Provider
+  def missing_dependencies do
+    if Pleroma.Utils.command_available?("fasttext") do
+      []
+    else
+      ["fasttext"]
+    end
+  end
+
+  @impl Provider
+  def configured?, do: not_empty_string(get_model())
+
+  @impl Provider
+  def detect(text) do
+    text_path = Path.join(System.tmp_dir!(), "fasttext-#{Ecto.UUID.generate()}")
+
+    # `fasttext predict` prints one label per input line, so collapse all
+    # whitespace (including newlines) to get a single prediction for the text.
+    File.write(text_path, String.replace(text, ~r/\s+/, " "))
+
+    try do
+      case System.cmd("fasttext", ["predict", get_model(), text_path]) do
+        {"__label__" <> language, _} -> String.trim(language)
+        _ -> nil
+      end
+    after
+      # Remove the temporary file even when running `fasttext` raised.
+      File.rm(text_path)
+    end
+  end
+
+  defp get_model do
+    Pleroma.Config.get([__MODULE__, :model])
+  end
+end
diff --git a/lib/pleroma/language/language_detector/provider.ex b/lib/pleroma/language/language_detector/provider.ex
new file mode 100644
index 0000000000..08e7c8eef6
--- /dev/null
+++ b/lib/pleroma/language/language_detector/provider.ex
@@ -0,0 +1,16 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Language.LanguageDetector.Provider do
+  @moduledoc "Behaviour implemented by language detection providers."
+
+  @doc "Returns the names of the dependencies the provider needs that are missing."
+  @callback missing_dependencies() :: [String.t()]
+
+  @doc "Returns whether the provider has the configuration it needs."
+  @callback configured?() :: boolean()
+
+  @doc "Returns the detected language of `text`, or `nil` when none is detected."
+  @callback detect(text :: String.t()) :: String.t() | nil
+end
diff --git a/lib/pleroma/language/translation.ex b/lib/pleroma/language/translation.ex
index c9cd9d2dd5..c812385258 100644
--- a/lib/pleroma/language/translation.ex
+++ b/lib/pleroma/language/translation.ex
@@ -6,9 +6,9 @@ defmodule Pleroma.Language.Translation do
   @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
 
   def configured? do
-    service = get_service()
+    provider = get_provider()
 
-    !!service and service.configured?
+    !!provider and provider.configured?
   end
 
   def translate(text, source_language, target_language) do
@@ -16,13 +16,13 @@ def translate(text, source_language, target_language) do
 
     case @cachex.get(:translations_cache, cache_key) do
       {:ok, nil} ->
-        service = get_service()
+        provider = get_provider()
 
         result =
-          if !service or !service.configured? do
+          if !configured?() do
             {:error, :not_found}
           else
-            service.translate(text, source_language, target_language)
+            provider.translate(text, source_language, target_language)
           end
 
         store_result(result, cache_key)
@@ -37,7 +37,7 @@ def translate(text, source_language, target_language) do
     end
   end
 
-  defp get_service, do: Pleroma.Config.get([__MODULE__, :provider])
+  defp get_provider, do: Pleroma.Config.get([__MODULE__, :provider])
 
   defp get_cache_key(text, source_language, target_language) do
     "#{source_language}/#{target_language}/#{content_hash(text)}"
diff --git a/lib/pleroma/web/common_api/activity_draft.ex b/lib/pleroma/web/common_api/activity_draft.ex
index 9d4283b7fd..91be325380 100644
--- a/lib/pleroma/web/common_api/activity_draft.ex
+++ b/lib/pleroma/web/common_api/activity_draft.ex
@@ -5,6 +5,7 @@
 defmodule Pleroma.Web.CommonAPI.ActivityDraft do
   alias Pleroma.Activity
   alias Pleroma.Conversation.Participation
+  alias Pleroma.Language.LanguageDetector
   alias Pleroma.Object
   alias Pleroma.Web.ActivityPub.Builder
   alias Pleroma.Web.ActivityPub.Visibility
@@ -226,17 +227,13 @@ defp sensitive(draft) do
     %__MODULE__{draft | sensitive: sensitive}
   end
 
-  defp language(%{params: %{language: language}} = draft) when not_empty_string(language) do
-    case Utils.get_valid_language(language) do
-      language when is_binary(language) ->
-        %__MODULE__{draft | language: language}
+  defp language(draft) do
+    language =
+      Utils.get_valid_language(draft.params[:language]) ||
+        LanguageDetector.detect(draft.full_payload)
 
-      _ ->
-        draft
-    end
+    %__MODULE__{draft | language: language}
   end
 
-  defp language(draft), do: draft
-
   defp object(draft) do
     emoji = Map.merge(Pleroma.Emoji.Formatter.get_emoji_map(draft.full_payload), draft.emoji)