mirror of
https://git.pleroma.social/pleroma/pleroma.git
synced 2025-04-24 13:57:23 -04:00
LanguageDetector: strip non-language text to (hopefully) improve accuracy
This commit is contained in:
parent
17d885fed8
commit
8bec926beb
1 changed file with 9 additions and 1 deletion
|
@ -15,10 +15,18 @@ defmodule Pleroma.Language.LanguageDetector do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Reduces HTML input to the plain text that is handed to language detection.
# Parses `text` as an HTML fragment, drops every node matching the selector
# list below (elements classed .h-card, .mention, .hashtag, .u-url,
# .quote-inline or .recipients-inline, plus <code> and <pre> elements), and
# returns the text content of whatever remains.
# NOTE(review): the bang variant of parse_fragment presumably raises on
# unparseable input instead of returning an error tuple — confirm against the
# Floki version in use.
|
||||||
|
defp prepare_text(text) do
|
||||||
|
text
|
||||||
|
|> Floki.parse_fragment!()
|
||||||
|
|> Floki.filter_out(".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre")
|
||||||
|
|> Floki.text()
|
||||||
|
end
|
||||||
|
|
||||||
def detect(text) do
|
def detect(text) do
|
||||||
provider = get_provider()
|
provider = get_provider()
|
||||||
|
|
||||||
{:ok, text} = text |> FastSanitize.strip_tags()
|
text = prepare_text(text)
|
||||||
word_count = text |> String.split(~r/\s+/) |> Enum.count()
|
word_count = text |> String.split(~r/\s+/) |> Enum.count()
|
||||||
|
|
||||||
if word_count < @words_threshold or !provider or !provider.configured? do
|
if word_count < @words_threshold or !provider or !provider.configured? do
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue