Merge branch 'fasttext-improvements' into 'develop'
Fasttext improvements. See merge request soapbox-pub/rebased!210
This commit is contained in:
commit
aa926ce3d3
4 changed files with 16 additions and 6 deletions
|
@ -30,7 +30,7 @@ LABEL maintainer="hello@soapbox.pub" \
|
|||
org.opencontainers.image.description="Rebased" \
|
||||
org.opencontainers.image.authors="hello@soapbox.pub" \
|
||||
org.opencontainers.image.vendor="soapbox.pub" \
|
||||
org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/soapbox-be" \
|
||||
org.opencontainers.image.documentation="https://gitlab.com/soapbox-pub/rebased" \
|
||||
org.opencontainers.image.licenses="AGPL-3.0" \
|
||||
org.opencontainers.image.url="https://soapbox.pub" \
|
||||
org.opencontainers.image.revision=$VCS_REF \
|
||||
|
@ -48,8 +48,8 @@ RUN apt-get update &&\
|
|||
mkdir -p /etc/pleroma &&\
|
||||
chown -R pleroma /etc/pleroma &&\
|
||||
mkdir -p /usr/share/fasttext &&\
|
||||
curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o /usr/share/fasttext/lid.176.bin &&\
|
||||
chmod 0644 /usr/share/fasttext/lid.176.bin
|
||||
curl -L https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz -o /usr/share/fasttext/lid.176.ftz &&\
|
||||
chmod 0644 /usr/share/fasttext/lid.176.ftz
|
||||
|
||||
USER pleroma
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
provider: Pleroma.Language.LanguageDetector.Fasttext
|
||||
|
||||
config :pleroma, Pleroma.Language.LanguageDetector.Fasttext,
|
||||
model: "/usr/share/fasttext/lid.176.bin"
|
||||
model: "/usr/share/fasttext/lid.176.ftz"
|
||||
|
||||
# We can't store the secrets in this file, since this is baked into the docker image
|
||||
if not File.exists?("/var/lib/pleroma/secret.exs") do
|
||||
|
|
|
@ -15,10 +15,20 @@ def missing_dependencies do
|
|||
end
|
||||
end
|
||||
|
||||
# Prepare `text` for language detection.
#
# Parses `text` as an HTML fragment, removes every element matching the
# selector list below (the `.h-card`, `.mention`, `.hashtag`, `.u-url`,
# `.quote-inline` and `.recipients-inline` classes, plus `code` and `pre`
# elements), and returns the text content of what remains.
|
||||
defp prepare_text(text) do
|
||||
text
|
||||
|> Floki.parse_fragment!()
|
||||
|> Floki.filter_out(
|
||||
".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre"
|
||||
)
|
||||
|> Floki.text()
|
||||
end
|
||||
|
||||
def detect(text) do
|
||||
provider = get_provider()
|
||||
|
||||
{:ok, text} = text |> FastSanitize.strip_tags()
|
||||
text = prepare_text(text)
|
||||
word_count = text |> String.split(~r/\s+/) |> Enum.count()
|
||||
|
||||
if word_count < @words_threshold or !provider or !provider.configured? do
|
||||
|
|
|
@ -236,7 +236,7 @@ defp sensitive(draft) do
|
|||
# Sets the draft's `language` field and returns the updated draft.
#
# The result of `Utils.get_valid_language/1` on `draft.params[:language]` is
# used when it is truthy; otherwise the value comes from
# `LanguageDetector.detect/1` on the draft's text.
#
# NOTE(review): `<>` only accepts binaries, so the concatenation below assumes
# both `draft.content_html` and `draft.summary` are strings at this point —
# confirm neither can be nil when this step runs.
defp language(draft) do
|
||||
language =
|
||||
Utils.get_valid_language(draft.params[:language]) ||
|
||||
LanguageDetector.detect(draft.full_payload)
|
||||
LanguageDetector.detect(draft.content_html <> " " <> draft.summary)
|
||||
|
||||
%__MODULE__{draft | language: language}
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue