Merge branch 'fasttext-improvements' into 'develop'

Fasttext improvements

See merge request soapbox-pub/rebased!210
This commit is contained in:
Alex Gleason 2022-11-08 23:59:59 +00:00
commit aa926ce3d3
4 changed files with 16 additions and 6 deletions

View file

@ -30,7 +30,7 @@ LABEL maintainer="hello@soapbox.pub" \
org.opencontainers.image.description="Rebased" \
org.opencontainers.image.authors="hello@soapbox.pub" \
org.opencontainers.image.vendor="soapbox.pub" \
org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/soapbox-be" \
org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/rebased" \
org.opencontainers.image.licenses="AGPL-3.0" \
org.opencontainers.image.url="https://soapbox.pub" \
org.opencontainers.image.revision=$VCS_REF \
@ -48,8 +48,8 @@ RUN apt-get update &&\
mkdir -p /etc/pleroma &&\
chown -R pleroma /etc/pleroma &&\
mkdir -p /usr/share/fasttext &&\
curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o /usr/share/fasttext/lid.176.bin &&\
chmod 0644 /usr/share/fasttext/lid.176.bin
curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz -o /usr/share/fasttext/lid.176.ftz &&\
chmod 0644 /usr/share/fasttext/lid.176.ftz
USER pleroma

View file

@ -36,7 +36,7 @@
provider: Pleroma.Language.LanguageDetector.Fasttext
config :pleroma, Pleroma.Language.LanguageDetector.Fasttext,
model: "/usr/share/fasttext/lid.176.bin"
model: "/usr/share/fasttext/lid.176.ftz"
# We can't store the secrets in this file, since this is baked into the docker image
if not File.exists?("/var/lib/pleroma/secret.exs") do

View file

@ -15,10 +15,20 @@ def missing_dependencies do
end
end
# Turns an HTML fragment into plain text for detect/1: the fragment is parsed,
# every element matching the selector list below is dropped, and the text of
# the remaining nodes is returned.
# NOTE(review): Floki.text/1 is called with its default options; confirm that
# text from adjacent elements being joined without a separator is acceptable
# for the word count performed by the caller.
defp prepare_text(text) do
  ignored_selectors =
    ".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre"

  fragment = Floki.parse_fragment!(text)
  remaining_nodes = Floki.filter_out(fragment, ignored_selectors)

  Floki.text(remaining_nodes)
end
def detect(text) do
provider = get_provider()
{:ok, text} = text |> FastSanitize.strip_tags()
text = prepare_text(text)
word_count = text |> String.split(~r/\s+/) |> Enum.count()
if word_count < @words_threshold or !provider or !provider.configured? do

View file

@ -236,7 +236,7 @@ defp sensitive(draft) do
# Fills in the draft's :language field. The value comes from
# Utils.get_valid_language/1 applied to the :language request param; when that
# returns nil/false, the result of LanguageDetector.detect/1 is used instead.
# Returns the draft struct with :language updated.
# NOTE(review): this span is a diff hunk, so the two consecutive
# LanguageDetector.detect lines appear to be the old and new side of one
# changed line, not two calls.
# NOTE(review): `<>` raises on non-binary operands; confirm that
# draft.content_html and draft.summary can never be nil at this point.
defp language(draft) do
language =
Utils.get_valid_language(draft.params[:language]) ||
LanguageDetector.detect(draft.full_payload)
LanguageDetector.detect(draft.content_html <> " " <> draft.summary)
%__MODULE__{draft | language: language}
end