mirror of
https://git.pleroma.social/pleroma/pleroma.git
synced 2025-04-24 13:57:23 -04:00
LanguageDetector: strip non-language text to (hopefully) improve accuracy
This commit is contained in:
parent
17d885fed8
commit
8bec926beb
1 changed files with 9 additions and 1 deletions
|
@ -15,10 +15,18 @@ defmodule Pleroma.Language.LanguageDetector do
|
|||
end
|
||||
end
|
||||
|
||||
# Strip tags from text, etc.
|
||||
# Reduces an HTML fragment to the plain text handed to the language detector.
#
# Elements matching the selector below (mention/hashtag/URL/quote/recipient
# classes, plus `code` and `pre`) are removed before text extraction; the
# intent is to drop non-language text so it does not skew detection.
#
# Raises if `Floki.parse_fragment!/1` cannot parse the input.
defp prepare_text(text) do
  text
  |> Floki.parse_fragment!()
  |> Floki.filter_out(
    ".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre"
  )
  # Join text nodes with a space. With the default empty separator, words on
  # either side of a tag boundary (e.g. `</p><p>` or `<br>`) are fused into a
  # single token, which distorts both the word count and the detector input.
  |> Floki.text(sep: " ")
end
|
||||
|
||||
def detect(text) do
|
||||
provider = get_provider()
|
||||
|
||||
{:ok, text} = text |> FastSanitize.strip_tags()
|
||||
text = prepare_text(text)
|
||||
word_count = text |> String.split(~r/\s+/) |> Enum.count()
|
||||
|
||||
if word_count < @words_threshold or !provider or !provider.configured? do
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue