bazarr/libs/textdistance/benchmark.py
JayZed eb296e13c1
Improved global search function
* Use Hamming textdistance library

Used Hamming textdistance to sort by closest match.

* Global search UI improvements

Increased the dropdown height to show more results initially; the remaining results can be scrolled into view.
Scrollbars will appear automatically as needed.
Remove dropdown when Search box is cleared.

* Added textdistance 4.6.2 library
2024-06-08 06:14:39 -04:00

139 lines
3.7 KiB
Python

from __future__ import annotations
# built-in
import json
import math
from collections import defaultdict
from timeit import timeit
from typing import Iterable, Iterator, NamedTuple
# external
from tabulate import tabulate
# app
from .libraries import LIBRARIES_PATH, prototype
# Run with: python3 -m textdistance.benchmark
# Module-level registry obtained by calling ``clone()`` on the imported
# ``prototype``. The Benchmark class queries it through ``get_algorithms()``
# and ``get_libs(alg)`` to discover which implementations to time.
libraries = prototype.clone()
class Lib(NamedTuple):
    """One benchmark record: an algorithm paired with a library implementing it."""

    algorithm: str  # name of the algorithm being benchmarked
    library: str    # module name of the implementation (may be dotted)
    function: str   # name of the function inside that module
    time: float     # measured time; infinity marks "not measured yet"
    setup: str      # setup code handed to timeit before the statement runs

    @property
    def row(self) -> tuple[str, ...]:
        """Return the (algorithm, top-level package, time) cells for the table.

        The time cell is left empty when no measurement exists (infinite
        time); otherwise it is rendered with five decimal places.
        """
        top_level_package, _, _ = self.library.partition('.')
        if math.isinf(self.time):
            shown_time = ''
        else:
            shown_time = f'{self.time:0.05f}'
        return (self.algorithm, top_level_package, shown_time)
# Setup-code template that imports a textdistance algorithm class and
# instantiates it with ``external=False``; the ``{}`` placeholder is presumably
# filled with the algorithm name via ``str.format``.
# NOTE(review): not referenced in the visible part of this file --
# ``Benchmark.get_internal_benchmark`` builds its own setup string instead.
INTERNAL_SETUP = """
from textdistance import {} as cls
func = cls(external=False)
"""
# Statement timed for every candidate: three calls to ``func`` (which the
# setup code must define) on short string pairs.
STMT = """
func('text', 'test')
func('qwer', 'asdf')
func('a' * 15, 'b' * 15)
"""
# Value passed as ``number=`` to ``timeit``: how many times STMT is executed
# per measurement.
RUNS = 4000
class Benchmark:
    """Time external implementations against textdistance's own ones.

    ``run`` is the entry point: it lists the usable libraries, times them
    together with the internal implementations, prints both tables and stores
    the external libraries that beat the internal implementation.
    """

    @staticmethod
    def get_installed() -> Iterator[Lib]:
        """Yield an untimed ``Lib`` for every registered library that loads.

        Libraries whose function cannot be obtained are reported with a
        warning on stdout and skipped.
        """
        for algorithm in libraries.get_algorithms():
            for candidate in libraries.get_libs(algorithm):
                if candidate.get_function():
                    # Infinite time marks the entry as "not measured yet".
                    yield Lib(
                        algorithm=algorithm,
                        library=candidate.module_name,
                        function=candidate.func_name,
                        time=float('Inf'),
                        setup=candidate.setup,
                    )
                else:
                    print(f'WARNING: cannot get func for {candidate}')

    @staticmethod
    def get_external_benchmark(installed: Iterable[Lib]) -> Iterator[Lib]:
        """Yield a copy of each installed ``Lib`` with its measured time."""
        for lib in installed:
            elapsed = timeit(stmt=STMT, setup=lib.setup, number=RUNS)
            yield lib._replace(time=elapsed)

    @staticmethod
    def get_internal_benchmark() -> Iterator[Lib]:
        """Yield a timed ``Lib`` for textdistance's own version of each algorithm."""
        for algorithm in libraries.get_algorithms():
            setup_code = f'func = __import__("textdistance").{algorithm}(external=False)'
            elapsed = timeit(stmt=STMT, setup=setup_code, number=RUNS)
            yield Lib(
                algorithm=algorithm,
                library='**textdistance**',
                function=algorithm,
                time=elapsed,
                setup=setup_code,
            )

    @staticmethod
    def filter_benchmark(
        external: Iterable[Lib],
        internal: Iterable[Lib],
    ) -> Iterator[Lib]:
        """Keep only the entries strictly faster than the internal baseline.

        ``internal`` supplies one reference time per algorithm; an entry of
        ``external`` survives when its time is below the reference time of
        its algorithm.
        """
        # Built eagerly so that ``internal`` is consumed at call time.
        baseline = {entry.algorithm: entry.time for entry in internal}
        return (lib for lib in external if lib.time < baseline[lib.algorithm])

    @staticmethod
    def get_table(libs: list[Lib]) -> str:
        """Render ``libs`` as a GitHub-flavoured table followed by a total line."""
        rows = [lib.row for lib in libs]
        rendered = tabulate(
            rows,
            headers=['algorithm', 'library', 'time'],
            tablefmt='github',
        )
        return rendered + f'\nTotal: {len(libs)} libs.\n\n'

    @staticmethod
    def save(libs: Iterable[Lib]) -> None:
        """Write ``libs`` to ``LIBRARIES_PATH`` as JSON, grouped by algorithm.

        Each algorithm maps to a list of ``[library, function]`` pairs.
        """
        grouped: dict = {}
        for lib in libs:
            grouped.setdefault(lib.algorithm, []).append([lib.library, lib.function])
        with LIBRARIES_PATH.open('w', encoding='utf8') as stream:
            json.dump(obj=grouped, fp=stream, indent=2, sort_keys=True)

    @classmethod
    def run(cls) -> None:
        """Print the installed-library and benchmark tables, then save the winners."""
        print('# Installed libraries:\n')
        installed = sorted(cls.get_installed())
        print(cls.get_table(installed))

        print('# Benchmarks (with textdistance):\n')
        # External libraries are timed first, then the internal implementations.
        external_results = list(cls.get_external_benchmark(installed))
        internal_results = list(cls.get_internal_benchmark())
        combined = sorted(
            external_results + internal_results,
            key=lambda item: (item.algorithm, item.time),
        )
        print(cls.get_table(combined))

        # Internal entries never beat their own time, so only faster
        # external libraries remain after filtering.
        faster = list(cls.filter_benchmark(combined, internal_results))
        cls.save(faster)
# Script entry point: run the full benchmark when the module is executed
# directly (e.g. ``python3 -m textdistance.benchmark``).
if __name__ == '__main__':
    Benchmark.run()