bazarr/libs/textdistance/algorithms/phonetic.py
JayZed eb296e13c1
Improved global search function
* Use Hamming textdistance library

Used Hamming textdistance to sort by closest match.

* Global search UI improvements

Increased dropdown height to show more results initially (and which can also be scrolled into view).
Scrollbars will appear automatically as needed.
Remove dropdown when Search box is cleared.

* Added textdistance 4.6.2 library
2024-06-08 06:14:39 -04:00

179 lines
6 KiB
Python

from __future__ import annotations
# built-in
from collections import defaultdict
from itertools import groupby, zip_longest
from typing import Any, Iterator, Sequence, TypeVar
# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
try:
# external
import numpy
except ImportError:
numpy = None # type: ignore[assignment]
__all__ = [
'MRA', 'Editex',
'mra', 'editex',
]
T = TypeVar('T')
class MRA(_BaseSimilarity):
"""Western Airlines Surname Match Rating Algorithm comparison rating
https://en.wikipedia.org/wiki/Match_rating_approach
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mra.js
"""
def maximum(self, *sequences: str) -> int:
sequences = [list(self._calc_mra(s)) for s in sequences]
return max(map(len, sequences))
def _calc_mra(self, word: str) -> str:
if not word:
return word
word = word.upper()
word = word[0] + ''.join(c for c in word[1:] if c not in 'AEIOU')
# remove repeats like an UNIX uniq
word = ''.join(char for char, _ in groupby(word))
if len(word) > 6:
return word[:3] + word[-3:]
return word
def __call__(self, *sequences: str) -> int:
if not all(sequences):
return 0
sequences = [list(self._calc_mra(s)) for s in sequences]
lengths = list(map(len, sequences))
count = len(lengths)
max_length = max(lengths)
if abs(max_length - min(lengths)) > count:
return 0
for _ in range(count):
new_sequences = []
minlen = min(lengths)
for chars in zip(*sequences):
if not self._ident(*chars):
new_sequences.append(chars)
new_sequences = map(list, zip(*new_sequences))
# update sequences
ss: Iterator[tuple[Any, Any]]
ss = zip_longest(new_sequences, sequences, fillvalue=list())
sequences = [s1 + s2[minlen:] for s1, s2 in ss]
# update lengths
lengths = list(map(len, sequences))
if not lengths:
return max_length
return max_length - max(lengths)
class Editex(_Base):
"""
https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.14.3856&rep=rep1&type=pdf
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.18.2138&rep=rep1&type=pdf
https://github.com/chrislit/blob/master/abydos/distance/_editex.py
https://habr.com/ru/post/331174/ (RUS)
"""
groups: tuple[frozenset[str], ...] = (
frozenset('AEIOUY'),
frozenset('BP'),
frozenset('CKQ'),
frozenset('DT'),
frozenset('LR'),
frozenset('MN'),
frozenset('GJ'),
frozenset('FPV'),
frozenset('SXZ'),
frozenset('CSZ'),
)
ungrouped = frozenset('HW') # all letters in alphabet that not presented in `grouped`
def __init__(
self,
local: bool = False,
match_cost: int = 0,
group_cost: int = 1,
mismatch_cost: int = 2,
groups: tuple[frozenset[str], ...] = None,
ungrouped: frozenset[str] = None,
external: bool = True,
) -> None:
# Ensure that match_cost <= group_cost <= mismatch_cost
self.match_cost = match_cost
self.group_cost = max(group_cost, self.match_cost)
self.mismatch_cost = max(mismatch_cost, self.group_cost)
self.local = local
self.external = external
if groups is not None:
if ungrouped is None:
raise ValueError('`ungrouped` argument required with `groups`')
self.groups = groups
self.ungrouped = ungrouped
self.grouped = frozenset.union(*self.groups)
def maximum(self, *sequences: Sequence) -> int:
return max(map(len, sequences)) * self.mismatch_cost
def r_cost(self, *elements: str) -> int:
if self._ident(*elements):
return self.match_cost
if any(map(lambda x: x not in self.grouped, elements)):
return self.mismatch_cost
for group in self.groups:
if all(map(lambda x: x in group, elements)):
return self.group_cost
return self.mismatch_cost
def d_cost(self, *elements: str) -> int:
if not self._ident(*elements) and elements[0] in self.ungrouped:
return self.group_cost
return self.r_cost(*elements)
def __call__(self, s1: str, s2: str) -> float:
result = self.quick_answer(s1, s2)
if result is not None:
return result
# must do `upper` before getting length because some one-char lowercase glyphs
# are represented as two chars in uppercase.
# This might result in a distance that is greater than the maximum
# input sequence length, though, so we save that maximum first.
max_length = self.maximum(s1, s2)
s1 = ' ' + s1.upper()
s2 = ' ' + s2.upper()
len_s1 = len(s1) - 1
len_s2 = len(s2) - 1
d_mat: Any
if numpy:
d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=int)
else:
d_mat = defaultdict(lambda: defaultdict(int))
if not self.local:
for i in range(1, len_s1 + 1):
d_mat[i][0] = d_mat[i - 1][0] + self.d_cost(s1[i - 1], s1[i])
for j in range(1, len_s2 + 1):
d_mat[0][j] = d_mat[0][j - 1] + self.d_cost(s2[j - 1], s2[j])
for i, (cs1_prev, cs1_curr) in enumerate(zip(s1, s1[1:]), start=1):
for j, (cs2_prev, cs2_curr) in enumerate(zip(s2, s2[1:]), start=1):
d_mat[i][j] = min(
d_mat[i - 1][j] + self.d_cost(cs1_prev, cs1_curr),
d_mat[i][j - 1] + self.d_cost(cs2_prev, cs2_curr),
d_mat[i - 1][j - 1] + self.r_cost(cs1_curr, cs2_curr),
)
distance = d_mat[len_s1][len_s2]
return min(distance, max_length)
mra = MRA()
editex = Editex()