Replaced chardet with charamel to improve character encoding detection.

This commit is contained in:
Louis Vézina 2020-09-23 10:35:16 -04:00
parent efafe4a126
commit 29671a4aff
104 changed files with 347 additions and 0 deletions

20
libs/charamel/__init__.py Normal file
View file

@ -0,0 +1,20 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Usage:
>>> import charamel
>>> detector = charamel.Detector()
>>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
>>> encoding = detector.detect(content)
>>> encoding
<Encoding.ISO_8859_14: 'iso8859_14'>
>>> content.decode(encoding)
'El español o castellano del latín hablado'
Licensed under Apache 2.0
"""
from .detector import Detector # noqa: F401
from .encoding import Encoding # noqa: F401
__version__ = '1.0.0'

133
libs/charamel/detector.py Normal file
View file

@ -0,0 +1,133 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import itertools
import math
from typing import Dict, List, Optional, Sequence, Set, Tuple
from charamel.encoding import Encoding
from charamel.resources import load_biases, load_features, load_weights
def _get_features(content: bytes) -> Set[int]:
"""
Extract unique byte uni-grams and bi-grams
Args:
content: Encoded text
Returns:
Set of integers that represent byte n-grams
"""
pairs = zip(content, itertools.islice(content, 1, None))
return set(content).union(x * 256 + y for x, y in pairs)
def _apply_sigmoid(value: float) -> float:
"""
Apply sigmoid function to given value
"""
return 1 / (1 + math.exp(-value))
class Detector:
    """
    Universal encoding detector

    Scores each candidate encoding with a linear model over byte n-gram
    features and converts raw scores to confidences with a sigmoid.
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Create universal encoding detector for given encodings

        Args:
            encodings: Encodings that will be supported by this Detector instance,
                less encodings lead to faster runtime
            min_confidence: Minimum confidence threshold for encodings

        Raises:
            ValueError: If no encodings are given or the threshold is outside [0, 1]

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')
        if not 0.0 <= min_confidence <= 1.0:
            raise ValueError('min_confidence must be in range [0, 1]')
        self._min_confidence = min_confidence
        # Feature -> column index into each encoding's weight vector
        self._features = load_features()
        self._biases = load_biases(encodings)
        self._weights = load_weights(encodings)

    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute how likely each encoding is able to decode the content

        Args:
            content: Encoded text

        Returns:
            Real-valued score for each encoding
        """
        # Only n-grams the model was trained on contribute to the score
        known = _get_features(content).intersection(self._features)
        columns = [self._features[feature] for feature in known]
        scores = dict(self._biases)
        for encoding, weights in self._weights.items():
            scores[encoding] += sum(weights[column] for column in columns)
        return scores

    def detect(self, content: bytes) -> Optional[Encoding]:
        """
        Detect the most probable encoding for given byte content

        Args:
            content: Encoded text

        Returns:
            Encoding or `None` if not confident enough

        Example:
            >>> detector = Detector()
            >>> detector.detect(b'\xc4\xe3\xba\xc3')
            <Encoding.GB_K: 'gbk'>
        """
        scores = self._score(content)
        if not scores:
            return None
        best = max(scores, key=scores.get)
        if _apply_sigmoid(scores[best]) >= self._min_confidence:
            return best
        return None

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        """
        Detect `top` probable encodings with confidences

        Args:
            content: Encoded text
            top: How many of the most likely encodings to return

        Returns:
            Encoding/confidence pairs, best first, filtered by the
            detector's minimum confidence threshold

        Example:
            >>> detector = Detector()
            >>> detector.probe(b'\xc4\xe3\xba\xc3')
            [(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
            (<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
            (<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
        """
        ranked = sorted(
            self._score(content).items(), key=lambda pair: pair[1], reverse=True
        )
        results = []
        for encoding, raw_score in ranked[:top]:
            confidence = _apply_sigmoid(raw_score)
            if confidence >= self._min_confidence:
                results.append((encoding, confidence))
        return results

122
libs/charamel/encoding.py Normal file
View file

@ -0,0 +1,122 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import encodings.aliases
import enum
@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings

    Member values are canonical Python codec names, so a member can be
    passed directly to ``bytes.decode`` / ``str.encode``. Because the
    enum mixes in ``str``, each member also compares equal to its codec
    name string (e.g. ``Encoding.UTF_8 == 'utf_8'``).
    """

    ASCII = 'ascii'
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    # cp* code pages
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    # Japanese / Korean / Chinese multi-byte codecs
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    # ISO 8859 family (note: 8859-1 is exposed under its 'latin_1' codec name)
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    # Classic Mac OS codecs
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    # Unicode transformation formats
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    @classmethod
    def _missing_(cls, value):
        """
        Resolve codec aliases (e.g. 'UTF8', 'latin-1') to a canonical member

        Normalizes the lookup value with the stdlib ``encodings`` machinery
        and the codec alias table, then retries the enum lookup once if
        normalization changed the value; otherwise defers to the default
        enum behavior (which reports the value as invalid).
        """
        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)
        if value != normalized:
            return cls(normalized)
        return super()._missing_(value)

View file

@ -0,0 +1,72 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import gzip
import pathlib
import struct
from typing import Any, Dict, List, Sequence
from charamel.encoding import Encoding
# Directory holding the bundled model resources (features.gzip, biases.gzip, ...)
RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
# Per-encoding weight files live in the 'weights' subdirectory
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'
def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
"""
Unpack struct values from file
Args:
file: File that stores struct-packed values
pattern: Struct pattern
Returns:
List of unpacked values
"""
with gzip.open(file, 'rb') as data:
return [values[0] for values in struct.iter_unpack(pattern, data.read())]
def load_features() -> Dict[int, int]:
    """
    Load byte-level feature names and indices

    Returns:
        Mapping from features to their indices in weight matrix
    """
    mapping: Dict[int, int] = {}
    # File order defines each feature's column position in the weight vectors
    for index, feature in enumerate(
        _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
    ):
        mapping[feature] = index
    return mapping
def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Load linear model bias values for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their biases
    """
    # Each line of the file is '<codec-name> <bias>'
    table = {}
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as stream:
        for row in stream:
            name, raw_bias = row.decode().split()
            table[name] = float(raw_bias)
    # Encoding mixes in str, so members index the str-keyed table directly
    return {encoding: table[encoding] for encoding in encodings}
def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model feature weights for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their feature weight vectors
    """
    # Each encoding's weights are stored as big-endian half-precision
    # floats ('>e') in a gzip file named after the codec value
    return {
        encoding: _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
        for encoding in encodings
    }

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show more