mirror of
https://github.com/morpheus65535/bazarr.git
synced 2025-06-29 01:43:10 -04:00
* Use Hamming textdistance library Used Hamming textdistance to sort by closest match. * Global search UI improvements Increased dropdown height to show more results initially (and which can also be scrolled into view). Scrollbars will appear automatically as needed. Remove dropdown when Search box is cleared. * Added textdistance 4.6.2 library
179 lines
6 KiB
Python
179 lines
6 KiB
Python
from __future__ import annotations
|
|
|
|
# built-in
|
|
from collections import defaultdict
|
|
from itertools import groupby, zip_longest
|
|
from typing import Any, Iterator, Sequence, TypeVar
|
|
|
|
# app
|
|
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
|
|
|
|
|
|
try:
|
|
# external
|
|
import numpy
|
|
except ImportError:
|
|
numpy = None # type: ignore[assignment]
|
|
|
|
|
|
__all__ = [
|
|
'MRA', 'Editex',
|
|
'mra', 'editex',
|
|
]
|
|
T = TypeVar('T')
|
|
|
|
|
|
class MRA(_BaseSimilarity):
|
|
"""Western Airlines Surname Match Rating Algorithm comparison rating
|
|
https://en.wikipedia.org/wiki/Match_rating_approach
|
|
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mra.js
|
|
"""
|
|
|
|
def maximum(self, *sequences: str) -> int:
|
|
sequences = [list(self._calc_mra(s)) for s in sequences]
|
|
return max(map(len, sequences))
|
|
|
|
def _calc_mra(self, word: str) -> str:
|
|
if not word:
|
|
return word
|
|
word = word.upper()
|
|
word = word[0] + ''.join(c for c in word[1:] if c not in 'AEIOU')
|
|
# remove repeats like an UNIX uniq
|
|
word = ''.join(char for char, _ in groupby(word))
|
|
if len(word) > 6:
|
|
return word[:3] + word[-3:]
|
|
return word
|
|
|
|
def __call__(self, *sequences: str) -> int:
|
|
if not all(sequences):
|
|
return 0
|
|
sequences = [list(self._calc_mra(s)) for s in sequences]
|
|
lengths = list(map(len, sequences))
|
|
count = len(lengths)
|
|
max_length = max(lengths)
|
|
if abs(max_length - min(lengths)) > count:
|
|
return 0
|
|
|
|
for _ in range(count):
|
|
new_sequences = []
|
|
minlen = min(lengths)
|
|
for chars in zip(*sequences):
|
|
if not self._ident(*chars):
|
|
new_sequences.append(chars)
|
|
new_sequences = map(list, zip(*new_sequences))
|
|
# update sequences
|
|
ss: Iterator[tuple[Any, Any]]
|
|
ss = zip_longest(new_sequences, sequences, fillvalue=list())
|
|
sequences = [s1 + s2[minlen:] for s1, s2 in ss]
|
|
# update lengths
|
|
lengths = list(map(len, sequences))
|
|
|
|
if not lengths:
|
|
return max_length
|
|
return max_length - max(lengths)
|
|
|
|
|
|
class Editex(_Base):
|
|
"""
|
|
https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
|
|
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.14.3856&rep=rep1&type=pdf
|
|
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.18.2138&rep=rep1&type=pdf
|
|
https://github.com/chrislit/blob/master/abydos/distance/_editex.py
|
|
https://habr.com/ru/post/331174/ (RUS)
|
|
"""
|
|
groups: tuple[frozenset[str], ...] = (
|
|
frozenset('AEIOUY'),
|
|
frozenset('BP'),
|
|
frozenset('CKQ'),
|
|
frozenset('DT'),
|
|
frozenset('LR'),
|
|
frozenset('MN'),
|
|
frozenset('GJ'),
|
|
frozenset('FPV'),
|
|
frozenset('SXZ'),
|
|
frozenset('CSZ'),
|
|
)
|
|
ungrouped = frozenset('HW') # all letters in alphabet that not presented in `grouped`
|
|
|
|
def __init__(
|
|
self,
|
|
local: bool = False,
|
|
match_cost: int = 0,
|
|
group_cost: int = 1,
|
|
mismatch_cost: int = 2,
|
|
groups: tuple[frozenset[str], ...] = None,
|
|
ungrouped: frozenset[str] = None,
|
|
external: bool = True,
|
|
) -> None:
|
|
# Ensure that match_cost <= group_cost <= mismatch_cost
|
|
self.match_cost = match_cost
|
|
self.group_cost = max(group_cost, self.match_cost)
|
|
self.mismatch_cost = max(mismatch_cost, self.group_cost)
|
|
self.local = local
|
|
self.external = external
|
|
|
|
if groups is not None:
|
|
if ungrouped is None:
|
|
raise ValueError('`ungrouped` argument required with `groups`')
|
|
self.groups = groups
|
|
self.ungrouped = ungrouped
|
|
self.grouped = frozenset.union(*self.groups)
|
|
|
|
def maximum(self, *sequences: Sequence) -> int:
|
|
return max(map(len, sequences)) * self.mismatch_cost
|
|
|
|
def r_cost(self, *elements: str) -> int:
|
|
if self._ident(*elements):
|
|
return self.match_cost
|
|
if any(map(lambda x: x not in self.grouped, elements)):
|
|
return self.mismatch_cost
|
|
for group in self.groups:
|
|
if all(map(lambda x: x in group, elements)):
|
|
return self.group_cost
|
|
return self.mismatch_cost
|
|
|
|
def d_cost(self, *elements: str) -> int:
|
|
if not self._ident(*elements) and elements[0] in self.ungrouped:
|
|
return self.group_cost
|
|
return self.r_cost(*elements)
|
|
|
|
def __call__(self, s1: str, s2: str) -> float:
|
|
result = self.quick_answer(s1, s2)
|
|
if result is not None:
|
|
return result
|
|
|
|
# must do `upper` before getting length because some one-char lowercase glyphs
|
|
# are represented as two chars in uppercase.
|
|
# This might result in a distance that is greater than the maximum
|
|
# input sequence length, though, so we save that maximum first.
|
|
max_length = self.maximum(s1, s2)
|
|
s1 = ' ' + s1.upper()
|
|
s2 = ' ' + s2.upper()
|
|
len_s1 = len(s1) - 1
|
|
len_s2 = len(s2) - 1
|
|
d_mat: Any
|
|
if numpy:
|
|
d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=int)
|
|
else:
|
|
d_mat = defaultdict(lambda: defaultdict(int))
|
|
|
|
if not self.local:
|
|
for i in range(1, len_s1 + 1):
|
|
d_mat[i][0] = d_mat[i - 1][0] + self.d_cost(s1[i - 1], s1[i])
|
|
for j in range(1, len_s2 + 1):
|
|
d_mat[0][j] = d_mat[0][j - 1] + self.d_cost(s2[j - 1], s2[j])
|
|
|
|
for i, (cs1_prev, cs1_curr) in enumerate(zip(s1, s1[1:]), start=1):
|
|
for j, (cs2_prev, cs2_curr) in enumerate(zip(s2, s2[1:]), start=1):
|
|
d_mat[i][j] = min(
|
|
d_mat[i - 1][j] + self.d_cost(cs1_prev, cs1_curr),
|
|
d_mat[i][j - 1] + self.d_cost(cs2_prev, cs2_curr),
|
|
d_mat[i - 1][j - 1] + self.r_cost(cs1_curr, cs2_curr),
|
|
)
|
|
|
|
distance = d_mat[len_s1][len_s2]
|
|
return min(distance, max_length)
|
|
|
|
|
|
mra = MRA()
|
|
editex = Editex()
|