mirror of https://github.com/morpheus65535/bazarr.git (synced 2025-06-28 01:15:09 -04:00)

Updated vendored dependencies.

This commit is contained in:
parent 708fbfcd8e
commit bbe2483e21

1750 changed files with 53887 additions and 34406 deletions
@@ -18,6 +18,9 @@ from .app import create_app

 app = create_app()
 ui_bp.register_blueprint(api_bp, url_prefix='/api')
+# Mute UserWarning with flask-restx and Flask >= 2.2.0. Will be raised as an exception in 2.3.0
+# https://github.com/python-restx/flask-restx/issues/485
+warnings.filterwarnings('ignore', message='The setup method ')
 app.register_blueprint(ui_bp, url_prefix=base_url.rstrip('/'))

@@ -106,7 +106,7 @@ class MongoDBJobStore(BaseJobStore):
             raise JobLookupError(job_id)

     def remove_all_jobs(self):
-        self.collection.remove()
+        self.collection.delete_many({})

     def shutdown(self):
         self.client.close()
@@ -133,7 +133,7 @@ class MongoDBJobStore(BaseJobStore):

         # Remove all the jobs we failed to restore
         if failed_job_ids:
-            self.collection.remove({'_id': {'$in': failed_job_ids}})
+            self.collection.delete_many({'_id': {'$in': failed_job_ids}})

         return jobs

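These two MongoDBJobStore hunks track PyMongo's API: Collection.remove() was deprecated in PyMongo 3.0 and removed entirely in PyMongo 4.0, with delete_many() / delete_one() as the replacements. A minimal sketch of the modern call (the client and collection names here are hypothetical, not from this diff):

from pymongo import MongoClient

client = MongoClient()          # hypothetical connection
jobs = client.scheduler.jobs    # hypothetical collection

failed_ids = ['job1', 'job2']   # example data
# PyMongo < 4 (now removed): jobs.remove({'_id': {'$in': failed_ids}})
result = jobs.delete_many({'_id': {'$in': failed_ids}})
print(result.deleted_count)     # DeleteResult reports how many documents were removed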
@@ -191,12 +191,11 @@ class BaseScheduler(six.with_metaclass(ABCMeta)):
         self.state = STATE_STOPPED

         # Shut down all executors
-        with self._executors_lock:
+        with self._executors_lock, self._jobstores_lock:
             for executor in six.itervalues(self._executors):
                 executor.shutdown(wait)

             # Shut down all job stores
-        with self._jobstores_lock:
             for jobstore in six.itervalues(self._jobstores):
                 jobstore.shutdown()

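Note on this hunk: the fix acquires both locks in a single with statement, so the job store lock is now held while the executors shut down too, instead of being taken separately afterwards. A minimal sketch of the language semantics being relied on (illustrative names, not apscheduler's):

import threading

a_lock, b_lock = threading.Lock(), threading.Lock()

# `with a, b:` is equivalent to the nested form below: b_lock is acquired
# after a_lock and released before it, so both are held for the whole block.
with a_lock, b_lock:
    pass  # critical section guarded by both locks

with a_lock:
    with b_lock:
        pass  # equivalent nested form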
@@ -7,6 +7,9 @@ try:
 except (ImportError, RuntimeError):  # pragma: nocover
     try:
         from PyQt4.QtCore import QObject, QTimer
+    except ImportError:
+        try:
+            from PySide6.QtCore import QObject, QTimer  # noqa
     except ImportError:
         try:
             from PySide2.QtCore import QObject, QTimer  # noqa
@@ -14,7 +17,7 @@ except (ImportError, RuntimeError):  # pragma: nocover
             try:
                 from PySide.QtCore import QObject, QTimer  # noqa
             except ImportError:
-                raise ImportError('QtScheduler requires either PyQt5, PyQt4, PySide2 '
+                raise ImportError('QtScheduler requires either PyQt5, PyQt4, PySide6, PySide2 '
                                   'or PySide installed')

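The surrounding file selects the first available Qt binding via a chain of try/except ImportError blocks; this release adds PySide6 to the chain ahead of PySide2. A reduced sketch of the pattern (only two bindings shown, illustrative rather than apscheduler's exact code):

try:
    from PySide6.QtCore import QTimer  # preferred binding, if installed
except ImportError:
    try:
        from PySide2.QtCore import QTimer  # older binding as a fallback
    except ImportError:
        raise ImportError('no supported Qt binding is installed')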
@@ -6,7 +6,8 @@ import six
 from apscheduler.triggers.base import BaseTrigger
 from apscheduler.triggers.cron.fields import (
     BaseField, MonthField, WeekField, DayOfMonthField, DayOfWeekField, DEFAULT_VALUES)
-from apscheduler.util import datetime_ceil, convert_to_datetime, datetime_repr, astimezone
+from apscheduler.util import (
+    datetime_ceil, convert_to_datetime, datetime_repr, astimezone, localize, normalize)


 class CronTrigger(BaseTrigger):
@@ -143,7 +144,7 @@ class CronTrigger(BaseTrigger):
             i += 1

         difference = datetime(**values) - dateval.replace(tzinfo=None)
-        return self.timezone.normalize(dateval + difference), fieldnum
+        return normalize(dateval + difference), fieldnum

     def _set_field_value(self, dateval, fieldnum, new_value):
         values = {}
@@ -156,7 +157,7 @@ class CronTrigger(BaseTrigger):
             else:
                 values[field.name] = new_value

-        return self.timezone.localize(datetime(**values))
+        return localize(datetime(**values), self.timezone)

     def get_next_fire_time(self, previous_fire_time, now):
         if previous_fire_time:
@@ -4,7 +4,9 @@ from math import ceil
 from tzlocal import get_localzone

 from apscheduler.triggers.base import BaseTrigger
-from apscheduler.util import convert_to_datetime, timedelta_seconds, datetime_repr, astimezone
+from apscheduler.util import (
+    convert_to_datetime, normalize, timedelta_seconds, datetime_repr,
+    astimezone)


 class IntervalTrigger(BaseTrigger):
@@ -63,7 +65,7 @@ class IntervalTrigger(BaseTrigger):
             next_fire_time = self._apply_jitter(next_fire_time, self.jitter, now)

         if not self.end_date or next_fire_time <= self.end_date:
-            return self.timezone.normalize(next_fire_time)
+            return normalize(next_fire_time)

     def __getstate__(self):
         return {
@@ -34,7 +34,7 @@ except ImportError:
 __all__ = ('asint', 'asbool', 'astimezone', 'convert_to_datetime', 'datetime_to_utc_timestamp',
            'utc_timestamp_to_datetime', 'timedelta_seconds', 'datetime_ceil', 'get_callable_name',
            'obj_to_ref', 'ref_to_obj', 'maybe_ref', 'repr_escape', 'check_callable_args',
-           'TIMEOUT_MAX')
+           'normalize', 'localize', 'TIMEOUT_MAX')


 class _Undefined(object):
@@ -90,9 +90,7 @@ def astimezone(obj):
     if isinstance(obj, six.string_types):
         return timezone(obj)
     if isinstance(obj, tzinfo):
-        if not hasattr(obj, 'localize') or not hasattr(obj, 'normalize'):
-            raise TypeError('Only timezones from the pytz library are supported')
-        if obj.zone == 'local':
+        if obj.tzname(None) == 'local':
             raise ValueError(
                 'Unable to determine the name of the local timezone -- you must explicitly '
                 'specify the name of the local timezone. Please refrain from using timezones like '
@@ -162,11 +160,7 @@ def convert_to_datetime(input, tz, arg_name):
     if isinstance(tz, six.string_types):
         tz = timezone(tz)

-    try:
-        return tz.localize(datetime_, is_dst=None)
-    except AttributeError:
-        raise TypeError(
-            'Only pytz timezones are supported (need the localize() and normalize() methods)')
+    return localize(datetime_, tz)


 def datetime_to_utc_timestamp(timeval):
@@ -431,3 +425,14 @@ def iscoroutinefunction_partial(f):
     # The asyncio version of iscoroutinefunction includes testing for @coroutine
     # decorations vs. the inspect version which does not.
     return iscoroutinefunction(f)
+
+
+def normalize(dt):
+    return datetime.fromtimestamp(dt.timestamp(), dt.tzinfo)
+
+
+def localize(dt, tzinfo):
+    if hasattr(tzinfo, 'localize'):
+        return tzinfo.localize(dt)
+
+    return normalize(dt.replace(tzinfo=tzinfo))
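The apscheduler hunks above all serve one change: direct calls to the pytz-only tzinfo.localize()/tzinfo.normalize() methods are replaced with these new module-level localize()/normalize() helpers, which duck-type on the timezone object. pytz zones still get their localize() method; any other tzinfo (e.g. the stdlib zoneinfo) takes the plain replace(tzinfo=...) path plus a timestamp round trip to resolve the UTC offset. A usage sketch (assuming Python 3.9+ for zoneinfo, and an unambiguous wall time):

from datetime import datetime
from zoneinfo import ZoneInfo

naive = datetime(2022, 3, 27, 12, 0)
tz = ZoneInfo('Europe/Helsinki')     # has no .localize()/.normalize() methods

aware = naive.replace(tzinfo=tz)     # what localize() does for non-pytz zones
roundtripped = datetime.fromtimestamp(aware.timestamp(), aware.tzinfo)  # what normalize() does
assert roundtripped == aware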
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,15 +9,15 @@
 # * Welcome to the bidict source code *
 #==============================================================================

-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
+# Reading through the code? You'll find a "Code review nav" comment like the one
+# below at the top and bottom of the key source files. Follow these cues to take
+# a path through the code that's optimized for familiarizing yourself with it.
 #
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
+# If you're not reading this on https://github.com/jab/bidict already, go there
+# to ensure you have the latest version of the code. While there, you can also
+# star the project, watch it for updates, fork the code, and submit an issue or
+# pull request with any proposed changes. More information can be found linked
+# from README.rst, which is also shown on https://github.com/jab/bidict.

 # * Code review nav *
 #==============================================================================
@@ -28,6 +27,8 @@

 """The bidirectional mapping library for Python.

+----
+
 bidict by example:

 .. code-block:: python
@@ -44,8 +45,9 @@ Please see https://github.com/jab/bidict for the most up-to-date code and
 https://bidict.readthedocs.io for the most up-to-date documentation
 if you are reading this elsewhere.

+----

-.. :copyright: (c) 2009-2021 Joshua Bronson.
+.. :copyright: (c) 2009-2022 Joshua Bronson.
 .. :license: MPLv2. See LICENSE for details.
 """

|
@ -53,35 +55,41 @@ if you are reading this elsewhere.
|
||||||
from sys import version_info as _version_info
|
from sys import version_info as _version_info
|
||||||
|
|
||||||
|
|
||||||
if _version_info < (3, 6): # pragma: no cover
|
if _version_info < (3, 7): # pragma: no cover
|
||||||
raise ImportError('Python 3.6+ is required.')
|
raise ImportError('Python 3.7+ is required.')
|
||||||
|
|
||||||
from ._abc import BidirectionalMapping, MutableBidirectionalMapping
|
from ._abc import BidirectionalMapping as BidirectionalMapping, MutableBidirectionalMapping as MutableBidirectionalMapping
|
||||||
from ._base import BidictBase
|
from ._base import BidictBase as BidictBase, GeneratedBidictInverse as GeneratedBidictInverse, BidictKeysView as BidictKeysView
|
||||||
from ._mut import MutableBidict
|
from ._bidict import MutableBidict as MutableBidict, bidict as bidict
|
||||||
from ._bidict import bidict
|
from ._frozenbidict import frozenbidict as frozenbidict
|
||||||
from ._frozenbidict import frozenbidict
|
from ._frozenordered import FrozenOrderedBidict as FrozenOrderedBidict
|
||||||
from ._frozenordered import FrozenOrderedBidict
|
from ._named import NamedBidictBase as NamedBidictBase, namedbidict as namedbidict
|
||||||
from ._named import namedbidict
|
from ._orderedbase import OrderedBidictBase as OrderedBidictBase
|
||||||
from ._orderedbase import OrderedBidictBase
|
from ._orderedbidict import OrderedBidict as OrderedBidict
|
||||||
from ._orderedbidict import OrderedBidict
|
from ._dup import ON_DUP_DEFAULT as ON_DUP_DEFAULT, ON_DUP_RAISE as ON_DUP_RAISE, ON_DUP_DROP_OLD as ON_DUP_DROP_OLD
|
||||||
from ._dup import ON_DUP_DEFAULT, ON_DUP_RAISE, ON_DUP_DROP_OLD, RAISE, DROP_OLD, DROP_NEW, OnDup, OnDupAction
|
from ._dup import RAISE as RAISE, DROP_OLD as DROP_OLD, DROP_NEW as DROP_NEW, OnDup as OnDup, OD as OD
|
||||||
from ._exc import BidictException, DuplicationError, KeyDuplicationError, ValueDuplicationError, KeyAndValueDuplicationError
|
from ._exc import BidictException as BidictException, DuplicationError as DuplicationError
|
||||||
from ._iter import inverted
|
from ._exc import KeyDuplicationError as KeyDuplicationError, ValueDuplicationError as ValueDuplicationError, KeyAndValueDuplicationError as KeyAndValueDuplicationError
|
||||||
|
from ._iter import inverted as inverted
|
||||||
from .metadata import (
|
from .metadata import (
|
||||||
__author__, __maintainer__, __copyright__, __email__, __credits__, __url__,
|
__author__ as __author__, __maintainer__ as __maintainer__, __copyright__ as __copyright__, __email__ as __email__,
|
||||||
__license__, __status__, __description__, __keywords__, __version__,
|
__url__ as __url__, __license__ as __license__, __status__ as __status__, __description__ as __description__,
|
||||||
|
__keywords__ as __keywords__, __version__ as __version__, __project_urls__ as __project_urls__,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set __module__ of re-exported classes to the 'bidict' top-level module name
|
|
||||||
# so that private/internal submodules are not exposed to users e.g. in repr strings.
|
#: Alias
|
||||||
_locals = tuple(locals().items())
|
OnDupAction = OD
|
||||||
for _name, _obj in _locals: # pragma: no cover
|
|
||||||
|
|
||||||
|
# Set __module__ of re-exported classes to the 'bidict' top-level module, so that e.g.
|
||||||
|
# 'bidict.bidict' shows up as 'bidict.bidict` rather than 'bidict._bidict.bidict'.
|
||||||
|
for _obj in tuple(locals().values()): # pragma: no cover
|
||||||
if not getattr(_obj, '__module__', '').startswith('bidict.'):
|
if not getattr(_obj, '__module__', '').startswith('bidict.'):
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
_obj.__module__ = 'bidict'
|
_obj.__module__ = 'bidict'
|
||||||
except AttributeError: # raised when __module__ is read-only (as in OnDup)
|
except AttributeError: # __module__ is read-only (as in namedtuples like `OnDup`)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
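The `X as X` spelling throughout this hunk is the explicit re-export convention from PEP 484: under mypy's --no-implicit-reexport (and pyright's equivalent rule), a name imported into __init__.py is only treated as part of the public API if it is re-exported explicitly, e.g. via a redundant alias. A minimal sketch with a hypothetical package:

# pkg/__init__.py -- hypothetical package illustrating the idiom above.
# A bare `from ._impl import helper` would not re-export `helper` under
# strict type checkers; the redundant alias marks it as public API.
from ._impl import helper as helper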
@@ -1,26 +1,12 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.

 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
 # ← Prev: __init__.py         Current: _abc.py          Next: _base.py →
 #==============================================================================
@@ -28,14 +14,14 @@

 """Provide the :class:`BidirectionalMapping` abstract base class."""

-import typing as _t
+import typing as t
 from abc import abstractmethod

 from ._typing import KT, VT


-class BidirectionalMapping(_t.Mapping[KT, VT]):
-    """Abstract base class (ABC) for bidirectional mapping types.
+class BidirectionalMapping(t.Mapping[KT, VT]):
+    """Abstract base class for bidirectional mapping types.

     Extends :class:`collections.abc.Mapping` primarily by adding the
     (abstract) :attr:`inverse` property,
@@ -55,14 +41,13 @@ class BidirectionalMapping(_t.Mapping[KT, VT]):

         :raises NotImplementedError: Meant to be overridden in subclasses.
         """
-        # The @abstractproperty decorator prevents BidirectionalMapping subclasses from being
-        # instantiated unless they override this method. So users shouldn't be able to get to the
-        # point where they can unintentionally call this implementation of .inverse on something
-        # anyway. Could leave the method body empty, but raise NotImplementedError so it's extra
-        # clear there's no reason to call this implementation (e.g. via super() after overriding).
+        # The @abstractmethod decorator prevents BidirectionalMapping subclasses from being
+        # instantiated unless they override ``.inverse``. So this implementation of ``.inverse``
+        # should never be unintentionally resolved from subclass instances. But raise here
+        # anyway, so it's extra clear that this implementation should never be called.
         raise NotImplementedError

-    def __inverted__(self) -> _t.Iterator[_t.Tuple[VT, KT]]:
+    def __inverted__(self) -> t.Iterator[t.Tuple[VT, KT]]:
         """Get an iterator over the items in :attr:`inverse`.

         This is functionally equivalent to iterating over the items in the
@@ -78,23 +63,9 @@ class BidirectionalMapping(_t.Mapping[KT, VT]):
         """
         return iter(self.inverse.items())

-    def values(self) -> _t.KeysView[VT]:  # type: ignore [override]  # https://github.com/python/typeshed/issues/4435
-        """A set-like object providing a view on the contained values.
-
-        Override the implementation inherited from
-        :class:`~collections.abc.Mapping`.
-        Because the values of a :class:`~bidict.BidirectionalMapping`
-        are the keys of its inverse,
-        this returns a :class:`~collections.abc.KeysView`
-        rather than a :class:`~collections.abc.ValuesView`,
-        which has the advantages of constant-time containment checks
-        and supporting set operations.
-        """
-        return self.inverse.keys()  # type: ignore [return-value]
-
-
-class MutableBidirectionalMapping(BidirectionalMapping[KT, VT], _t.MutableMapping[KT, VT]):
-    """Abstract base class (ABC) for mutable bidirectional mapping types."""

+class MutableBidirectionalMapping(BidirectionalMapping[KT, VT], t.MutableMapping[KT, VT]):
+    """Abstract base class for mutable bidirectional mapping types."""

     __slots__ = ()

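Although the values() override is deleted here from _abc.py (it moves to _base.py later in this diff), a bidict's values view is still backed by the keys of its inverse, so it remains set-like. A small usage sketch of what that buys you:

from bidict import bidict

element_by_symbol = bidict({'H': 'hydrogen', 'He': 'helium'})

assert 'hydrogen' in element_by_symbol.values()  # constant-time containment check
assert element_by_symbol.values() & {'helium', 'neon'} == {'helium'}  # set ops work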
@@ -1,26 +1,12 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.

 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
 # ← Prev: _abc.py             Current: _base.py         Next: _frozenbidict.py →
 #==============================================================================
@@ -28,156 +14,265 @@

 """Provide :class:`BidictBase`."""

-import typing as _t
-from collections import namedtuple
-from copy import copy
-from weakref import ref
+import typing as t
+import weakref
+from functools import partial
+from itertools import starmap
+from operator import eq
+from types import MappingProxyType

 from ._abc import BidirectionalMapping
 from ._dup import ON_DUP_DEFAULT, RAISE, DROP_OLD, DROP_NEW, OnDup
 from ._exc import DuplicationError, KeyDuplicationError, ValueDuplicationError, KeyAndValueDuplicationError
-from ._iter import _iteritems_args_kw
-from ._typing import _NONE, KT, VT, OKT, OVT, IterItems, MapOrIterItems
+from ._iter import iteritems, inverted
+from ._typing import KT, VT, MISSING, OKT, OVT, IterItems, MapOrIterItems


-_WriteResult = namedtuple('_WriteResult', 'key val oldkey oldval')
-_DedupResult = namedtuple('_DedupResult', 'isdupkey isdupval invbyval fwdbykey')
-_NODUP = _DedupResult(False, False, _NONE, _NONE)
+# Disable pyright strict diagnostics that are causing many false positives or are just not helpful in this file:
+# pyright: reportPrivateUsage=false, reportUnknownArgumentType=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnnecessaryIsInstance=false

-BT = _t.TypeVar('BT', bound='BidictBase')  # typevar for BidictBase.copy
+OldKV = t.Tuple[OKT[KT], OVT[VT]]
+DedupResult = t.Optional[OldKV[KT, VT]]
+Write = t.List[t.Callable[[], None]]
+Unwrite = Write
+PreparedWrite = t.Tuple[Write, Unwrite]
+BT = t.TypeVar('BT', bound='BidictBase[t.Any, t.Any]')
+
+
+class BidictKeysView(t.KeysView[KT], t.ValuesView[KT]):
+    """Since the keys of a bidict are the values of its inverse (and vice versa),
+    the :class:`~collections.abc.ValuesView` result of calling *bi.values()*
+    is also a :class:`~collections.abc.KeysView` of *bi.inverse*.
+    """
+
+
+dict_keys: t.Type[t.KeysView[t.Any]] = type({}.keys())
+BidictKeysView.register(dict_keys)
+
+
+def get_arg(*args: MapOrIterItems[KT, VT]) -> MapOrIterItems[KT, VT]:
+    """Ensure there's only a single arg in *args*, then return it."""
+    if len(args) > 1:
+        raise TypeError(f'Expected at most 1 positional argument, got {len(args)}')
+    return args[0] if args else ()


 class BidictBase(BidirectionalMapping[KT, VT]):
     """Base class implementing :class:`BidirectionalMapping`."""

-    __slots__ = ['_fwdm', '_invm', '_inv', '_invweak', '__weakref__']
-
     #: The default :class:`~bidict.OnDup`
     #: that governs behavior when a provided item
     #: duplicates the key or value of other item(s).
     #:
-    #: *See also* :ref:`basic-usage:Values Must Be Unique`, :doc:`extending`
+    #: *See also*
+    #: :ref:`basic-usage:Values Must Be Unique` (https://bidict.rtfd.io/basic-usage.html#values-must-be-unique),
+    #: :doc:`extending` (https://bidict.rtfd.io/extending.html)
     on_dup = ON_DUP_DEFAULT

-    _fwdm_cls: _t.Type[_t.MutableMapping[KT, VT]] = dict  #: class of the backing forward mapping
-    _invm_cls: _t.Type[_t.MutableMapping[VT, KT]] = dict  #: class of the backing inverse mapping
+    _fwdm: t.MutableMapping[KT, VT]  #: the backing forward mapping (*key* → *val*)
+    _invm: t.MutableMapping[VT, KT]  #: the backing inverse mapping (*val* → *key*)

-    #: The object used by :meth:`__repr__` for printing the contained items.
-    _repr_delegate: _t.Callable = dict
+    # Use Any rather than KT/VT in the following to avoid "ClassVar cannot contain type variables" errors:
+    _fwdm_cls: t.ClassVar[t.Type[t.MutableMapping[t.Any, t.Any]]] = dict  #: class of the backing forward mapping
+    _invm_cls: t.ClassVar[t.Type[t.MutableMapping[t.Any, t.Any]]] = dict  #: class of the backing inverse mapping

-    _inv: 'BidictBase[VT, KT]'
-    _inv_cls: '_t.Type[BidictBase[VT, KT]]'
+    #: The class of the inverse bidict instance.
+    _inv_cls: 't.ClassVar[t.Type[BidictBase[t.Any, t.Any]]]'

-    def __init_subclass__(cls, **kw):
-        super().__init_subclass__(**kw)
-        # Compute and set _inv_cls, the inverse of this bidict class.
-        if '_inv_cls' in cls.__dict__:
-            return
-        if cls._fwdm_cls is cls._invm_cls:
-            cls._inv_cls = cls
-            return
-        inv_cls = type(cls.__name__ + 'Inv', cls.__bases__, {
-            **cls.__dict__,
-            '_inv_cls': cls,
-            '_fwdm_cls': cls._invm_cls,
-            '_invm_cls': cls._fwdm_cls,
-        })
-        cls._inv_cls = inv_cls
+    #: Used by :meth:`__repr__` for the contained items.
+    _repr_delegate: t.ClassVar[t.Any] = dict
+
+    def __init_subclass__(cls) -> None:
+        super().__init_subclass__()
+        cls._init_class()
+
+    @classmethod
+    def _init_class(cls) -> None:
+        cls._ensure_inv_cls()
+        cls._set_reversed()
+
+    __reversed__: t.Any
+
+    @classmethod
+    def _set_reversed(cls) -> None:
+        """Set __reversed__ for subclasses that do not set it explicitly
+        according to whether backing mappings are reversible.
+        """
+        if cls is not BidictBase:
+            resolved = cls.__reversed__
+            overridden = resolved is not BidictBase.__reversed__
+            if overridden:  # E.g. OrderedBidictBase, OrderedBidict, FrozenOrderedBidict
+                return
+        # The following will be False for MutableBidict, bidict, and frozenbidict on Python < 3.8,
+        # and True for them on 3.8+ (where dicts are reversible). Will also be True for custom
+        # subclasses like SortedBidict (see https://bidict.rtfd.io/extending.html#sortedbidict-recipes).
+        backing_reversible = all(issubclass(i, t.Reversible) for i in (cls._fwdm_cls, cls._invm_cls))
+        cls.__reversed__ = _fwdm_reversed if backing_reversible else None
+
+    @classmethod
+    def _ensure_inv_cls(cls) -> None:
+        """Ensure :attr:`_inv_cls` is set, computing it dynamically if necessary.
+
+        See: :ref:`extending:Dynamic Inverse Class Generation`
+        (https://bidict.rtfd.io/extending.html#dynamic-inverse-class-generation)
+
+        Most subclasses will be their own inverse classes, but some
+        (e.g. those created via namedbidict) will have distinct inverse classes.
+        """
+        if cls.__dict__.get('_inv_cls'):
+            return  # Already set, nothing to do.
+        cls._inv_cls = cls._make_inv_cls()
+
+    @classmethod
+    def _make_inv_cls(cls: t.Type[BT], _miss: t.Any = object()) -> 't.Type[BT]':
+        diff = cls._inv_cls_dict_diff()
+        cls_is_own_inv = all(getattr(cls, k, _miss) == v for (k, v) in diff.items())
+        if cls_is_own_inv:
+            return cls
+        # Suppress auto-calculation of _inv_cls's _inv_cls since we know it already.
+        # Works with the guard in BidictBase._ensure_inv_cls() to prevent infinite recursion.
+        diff['_inv_cls'] = cls
+        inv_cls = type(f'{cls.__name__}Inv', (cls, GeneratedBidictInverse), diff)
+        inv_cls.__module__ = cls.__module__
+        return t.cast(t.Type[BT], inv_cls)
+
+    @classmethod
+    def _inv_cls_dict_diff(cls) -> t.Dict[str, t.Any]:
+        return {
+            '_fwdm_cls': cls._invm_cls,
+            '_invm_cls': cls._fwdm_cls,
+        }

-    @_t.overload
-    def __init__(self, __arg: _t.Mapping[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
-    def __init__(self, __arg: IterItems[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
+    @t.overload
     def __init__(self, **kw: VT) -> None: ...
+    @t.overload
+    def __init__(self, __m: t.Mapping[KT, VT], **kw: VT) -> None: ...
+    @t.overload
+    def __init__(self, __i: IterItems[KT, VT], **kw: VT) -> None: ...
+
     def __init__(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
-        """Make a new bidirectional dictionary.
+        """Make a new bidirectional mapping.
         The signature behaves like that of :class:`dict`.
         Items passed in are added in the order they are passed,
         respecting the :attr:`on_dup` class attribute in the process.
         """
-        #: The backing :class:`~collections.abc.Mapping`
-        #: storing the forward mapping data (*key* → *value*).
-        self._fwdm: _t.MutableMapping[KT, VT] = self._fwdm_cls()
-        #: The backing :class:`~collections.abc.Mapping`
-        #: storing the inverse mapping data (*value* → *key*).
-        self._invm: _t.MutableMapping[VT, KT] = self._invm_cls()
-        self._init_inv()
+        self._fwdm = self._fwdm_cls()
+        self._invm = self._invm_cls()
         if args or kw:
-            self._update(True, self.on_dup, *args, **kw)
+            self._update(get_arg(*args), kw, rbof=False)

-    def _init_inv(self) -> None:
-        # Create the inverse bidict instance via __new__, bypassing its __init__ so that its
-        # _fwdm and _invm can be assigned to this bidict's _invm and _fwdm. Store it in self._inv,
-        # which holds a strong reference to a bidict's inverse, if one is available.
-        self._inv = inv = self._inv_cls.__new__(self._inv_cls)
-        inv._fwdm = self._invm
-        inv._invm = self._fwdm
-        # Only give the inverse a weak reference to this bidict to avoid creating a reference cycle,
-        # stored in the _invweak attribute. See also the docs in
-        # :ref:`addendum:Bidict Avoids Reference Cycles`
-        inv._inv = None
-        inv._invweak = ref(self)
-        # Since this bidict has a strong reference to its inverse already, set its _invweak to None.
-        self._invweak = None
-
-    @property
-    def _isinv(self) -> bool:
-        return self._inv is None
-
+    # If Python ever adds support for higher-kinded types, `inverse` could use them, e.g.
+    #     def inverse(self: BT[KT, VT]) -> BT[VT, KT]:
+    # Ref: https://github.com/python/typing/issues/548#issuecomment-621571821
     @property
     def inverse(self) -> 'BidictBase[VT, KT]':
-        """The inverse of this bidict."""
-        # Resolve and return a strong reference to the inverse bidict.
-        # One may be stored in self._inv already.
-        if self._inv is not None:
-            return self._inv
-        # Otherwise a weakref is stored in self._invweak. Try to get a strong ref from it.
-        assert self._invweak is not None
-        inv = self._invweak()
+        """The inverse of this bidirectional mapping instance."""
+        # When `bi.inverse` is called for the first time, this method
+        # computes the inverse instance, stores it for subsequent use, and then
+        # returns it. It also stores a reference on `bi.inverse` back to `bi`,
+        # but uses a weakref to avoid creating a reference cycle. Strong references
+        # to inverse instances are stored in ._inv, and weak references are stored
+        # in ._invweak.
+
+        # First check if a strong reference is already stored.
+        inv: 't.Optional[BidictBase[VT, KT]]' = getattr(self, '_inv', None)
         if inv is not None:
             return inv
-        # Refcount of referent must have dropped to zero, as in `bidict().inv.inv`. Init a new one.
-        self._init_inv()  # Now this bidict will retain a strong ref to its inverse.
-        return self._inv
+        # Next check if a weak reference is already stored.
+        invweak = getattr(self, '_invweak', None)
+        if invweak is not None:
+            inv = invweak()  # Try to resolve a strong reference and return it.
+            if inv is not None:
+                return inv
+        # No luck. Compute the inverse reference and store it for subsequent use.
+        inv = self._make_inverse()
+        self._inv: 't.Optional[BidictBase[VT, KT]]' = inv
+        self._invweak: 't.Optional[weakref.ReferenceType[BidictBase[VT, KT]]]' = None
+        # Also store a weak reference back to `instance` on its inverse instance, so that
+        # the second `.inverse` access in `bi.inverse.inverse` hits the cached weakref.
+        inv._inv = None
+        inv._invweak = weakref.ref(self)
+        # In e.g. `bidict().inverse.inverse`, this design ensures that a strong reference
+        # back to the original instance is retained before its refcount drops to zero,
+        # avoiding an unintended potential deallocation.
+        return inv

-    #: Alias for :attr:`inverse`.
-    inv = inverse
-
-    def __getstate__(self) -> dict:
-        """Needed to enable pickling due to use of :attr:`__slots__` and weakrefs.
-
-        *See also* :meth:`object.__getstate__`
-        """
-        state = {}
-        for cls in self.__class__.__mro__:
-            slots = getattr(cls, '__slots__', ())
-            for slot in slots:
-                if hasattr(self, slot):
-                    state[slot] = getattr(self, slot)
-        # weakrefs can't be pickled.
-        state.pop('_invweak', None)  # Added back in __setstate__ via _init_inv call.
-        state.pop('__weakref__', None)  # Not added back in __setstate__. Python manages this one.
-        return state
+    def _make_inverse(self) -> 'BidictBase[VT, KT]':
+        inv: 'BidictBase[VT, KT]' = self._inv_cls()
+        inv._fwdm = self._invm
+        inv._invm = self._fwdm
+        return inv

-    def __setstate__(self, state: dict) -> None:
-        """Implemented because use of :attr:`__slots__` would prevent unpickling otherwise.
-
-        *See also* :meth:`object.__setstate__`
-        """
-        for slot, value in state.items():
-            setattr(self, slot, value)
-        self._init_inv()
+    @property
+    def inv(self) -> 'BidictBase[VT, KT]':
+        """Alias for :attr:`inverse`."""
+        return self.inverse

     def __repr__(self) -> str:
         """See :func:`repr`."""
         clsname = self.__class__.__name__
-        if not self:
-            return f'{clsname}()'
-        return f'{clsname}({self._repr_delegate(self.items())})'
+        items = self._repr_delegate(self.items()) if self else ''
+        return f'{clsname}({items})'

-    # The inherited Mapping.__eq__ implementation would work, but it's implemented in terms of an
-    # inefficient ``dict(self.items()) == dict(other.items())`` comparison, so override it with a
+    def values(self) -> BidictKeysView[VT]:
+        """A set-like object providing a view on the contained values.
+
+        Since the values of a bidict are equivalent to the keys of its inverse,
+        this method returns a set-like object for this bidict's values
+        rather than just a collections.abc.ValuesView.
+        This object supports set operations like union and difference,
+        and constant- rather than linear-time containment checks,
+        and is no more expensive to provide than the less capable
+        collections.abc.ValuesView would be.
+
+        See :meth:`keys` for more information.
+        """
+        return t.cast(BidictKeysView[VT], self.inverse.keys())
+
+    def keys(self) -> t.KeysView[KT]:
+        """A set-like object providing a view on the contained keys.
+
+        When *b._fwdm* is a :class:`dict`, *b.keys()* returns a
+        *dict_keys* object that behaves exactly the same as
+        *collections.abc.KeysView(b)*, except for
+
+          - offering better performance
+
+          - being reversible on Python 3.8+
+
+          - having a .mapping attribute in Python 3.10+
+            that exposes a mappingproxy to *b._fwdm*.
+        """
+        fwdm = self._fwdm
+        kv = fwdm.keys() if isinstance(fwdm, dict) else BidictKeysView(self)
+        return kv
+
+    def items(self) -> t.ItemsView[KT, VT]:
+        """A set-like object providing a view on the contained items.
+
+        When *b._fwdm* is a :class:`dict`, *b.items()* returns a
+        *dict_items* object that behaves exactly the same as
+        *collections.abc.ItemsView(b)*, except for:
+
+          - offering better performance
+
+          - being reversible on Python 3.8+
+
+          - having a .mapping attribute in Python 3.10+
+            that exposes a mappingproxy to *b._fwdm*.
+        """
+        return self._fwdm.items() if isinstance(self._fwdm, dict) else super().items()
+
+    # The inherited collections.abc.Mapping.__contains__() method is implemented by doing a `try`
+    # `except KeyError` around `self[key]`. The following implementation is much faster,
+    # especially in the missing case.
+    def __contains__(self, key: t.Any) -> bool:
+        """True if the mapping contains the specified key, else False."""
+        return key in self._fwdm
+
+    # The inherited collections.abc.Mapping.__eq__() method is implemented in terms of an inefficient
+    # `dict(self.items()) == dict(other.items())` comparison, so override it with a
     # more efficient implementation.
     def __eq__(self, other: object) -> bool:
         """*x.__eq__(other) ⟺ x == other*
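The rewritten inverse property above keeps the same observable contract as the old _init_inv() design: the inverse is computed lazily, cached, and points back to the original through a weakref so no reference cycle is created. A short demonstration of that contract:

from bidict import bidict

bi = bidict(one=1)

assert bi.inverse[1] == 'one'
assert bi.inverse.inverse is bi  # round-trips to the same instance
assert bi.inv is bi.inverse      # .inv is now a property aliasing .inverse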
@@ -189,67 +284,51 @@ class BidictBase(BidirectionalMapping[KT, VT]):
         is inherited by subclasses,
         in particular by the ordered bidict subclasses,
         so even with ordered bidicts,
-        :ref:`== comparison is order-insensitive <eq-order-insensitive>`.
+        :ref:`== comparison is order-insensitive <eq-order-insensitive>`
+        (https://bidict.rtfd.io/other-bidict-types.html#eq-is-order-insensitive).

-        *See also* :meth:`bidict.FrozenOrderedBidict.equals_order_sensitive`
+        *See also* :meth:`equals_order_sensitive`
         """
-        if not isinstance(other, _t.Mapping) or len(self) != len(other):
-            return False
-        selfget = self.get
-        return all(selfget(k, _NONE) == v for (k, v) in other.items())  # type: ignore [arg-type]
+        if isinstance(other, t.Mapping):
+            return self._fwdm.items() == other.items()
+        # Ref: https://docs.python.org/3/library/constants.html#NotImplemented
+        return NotImplemented

     def equals_order_sensitive(self, other: object) -> bool:
         """Order-sensitive equality check.

         *See also* :ref:`eq-order-insensitive`
+        (https://bidict.rtfd.io/other-bidict-types.html#eq-is-order-insensitive)
         """
-        # Same short-circuit as in __eq__ above. Factoring out not worth function call overhead.
-        if not isinstance(other, _t.Mapping) or len(self) != len(other):
+        if not isinstance(other, t.Mapping) or len(self) != len(other):
             return False
-        return all(i == j for (i, j) in zip(self.items(), other.items()))
+        return all(starmap(eq, zip(self.items(), other.items())))

-    # The following methods are mutating and so are not public. But they are implemented in this
-    # non-mutable base class (rather than the mutable `bidict` subclass) because they are used here
-    # during initialization (starting with the `_update` method). (Why is this? Because `__init__`
-    # and `update` share a lot of the same behavior (inserting the provided items while respecting
-    # `on_dup`), so it makes sense for them to share implementation too.)
-    def _pop(self, key: KT) -> VT:
-        val = self._fwdm.pop(key)
-        del self._invm[val]
-        return val
-
-    def _put(self, key: KT, val: VT, on_dup: OnDup) -> None:
-        dedup_result = self._dedup_item(key, val, on_dup)
-        if dedup_result is not None:
-            self._write_item(key, val, dedup_result)
-
-    def _dedup_item(self, key: KT, val: VT, on_dup: OnDup) -> _t.Optional[_DedupResult]:
+    def _dedup(self, key: KT, val: VT, on_dup: OnDup) -> DedupResult[KT, VT]:
         """Check *key* and *val* for any duplication in self.

         Handle any duplication as per the passed in *on_dup*.

-        (key, val) already present is construed as a no-op, not a duplication.
+        If (key, val) is already present, return None
+        since writing (key, val) would be a no-op.

         If duplication is found and the corresponding :class:`~bidict.OnDupAction` is
         :attr:`~bidict.DROP_NEW`, return None.

         If duplication is found and the corresponding :class:`~bidict.OnDupAction` is
-        :attr:`~bidict.RAISE`, raise the appropriate error.
+        :attr:`~bidict.RAISE`, raise the appropriate exception.

         If duplication is found and the corresponding :class:`~bidict.OnDupAction` is
-        :attr:`~bidict.DROP_OLD`,
-        or if no duplication is found,
-        return the :class:`_DedupResult` *(isdupkey, isdupval, oldkey, oldval)*.
+        :attr:`~bidict.DROP_OLD`, or if no duplication is found,
+        return *(oldkey, oldval)*.
         """
-        fwdm = self._fwdm
-        invm = self._invm
-        oldval: OVT = fwdm.get(key, _NONE)
-        oldkey: OKT = invm.get(val, _NONE)
-        isdupkey = oldval is not _NONE
-        isdupval = oldkey is not _NONE
-        dedup_result = _DedupResult(isdupkey, isdupval, oldkey, oldval)
+        fwdm, invm = self._fwdm, self._invm
+        oldval: OVT[VT] = fwdm.get(key, MISSING)
+        oldkey: OKT[KT] = invm.get(val, MISSING)
+        isdupkey, isdupval = oldval is not MISSING, oldkey is not MISSING
         if isdupkey and isdupval:
-            if self._already_have(key, val, oldkey, oldval):
+            if key == oldkey:
+                assert val == oldval
                 # (key, val) duplicates an existing item -> no-op.
                 return None
             # key and val each duplicate a different existing item.
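For readers unfamiliar with the OnDup machinery that _dedup() implements: each insert consults a policy saying what to do when the new item duplicates an existing key and/or value. A small usage sketch (behavior as of bidict 0.22, using constants imported in the __init__.py hunk above):

from bidict import bidict, ON_DUP_DROP_OLD, ValueDuplicationError

b = bidict({'H': 'hydrogen'})

# The default policy raises on value duplication rather than silently remapping:
try:
    b['X'] = 'hydrogen'
except ValueDuplicationError:
    pass

# put() accepts an explicit OnDup policy; ON_DUP_DROP_OLD overwrites instead:
b.put('X', 'hydrogen', ON_DUP_DROP_OLD)
assert dict(b) == {'X': 'hydrogen'}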
@ -274,129 +353,205 @@ class BidictBase(BidirectionalMapping[KT, VT]):
|
||||||
assert on_dup.val is DROP_OLD
|
assert on_dup.val is DROP_OLD
|
||||||
# Fall through to the return statement on the last line.
|
# Fall through to the return statement on the last line.
|
||||||
# else neither isdupkey nor isdupval.
|
# else neither isdupkey nor isdupval.
|
||||||
return dedup_result
|
return oldkey, oldval
|
||||||
|
|
||||||
@staticmethod
|
def _prep_write(self, newkey: KT, newval: VT, oldkey: OKT[KT], oldval: OVT[VT], save_unwrite: bool) -> PreparedWrite:
|
||||||
def _already_have(key: KT, val: VT, oldkey: OKT, oldval: OVT) -> bool:
|
"""Given (newkey, newval) to insert, return the list of operations necessary to perform the write.
|
||||||
# Overridden by _orderedbase.OrderedBidictBase.
|
|
||||||
isdup = oldkey == key
|
|
||||||
assert isdup == (oldval == val), f'{key} {val} {oldkey} {oldval}'
|
|
||||||
return isdup
|
|
||||||
|
|
||||||
def _write_item(self, key: KT, val: VT, dedup_result: _DedupResult) -> _WriteResult:
|
*oldkey* and *oldval* are as returned by :meth:`_dedup`.
|
||||||
# Overridden by _orderedbase.OrderedBidictBase.
|
|
||||||
isdupkey, isdupval, oldkey, oldval = dedup_result
|
|
||||||
fwdm = self._fwdm
|
|
||||||
invm = self._invm
|
|
||||||
fwdm[key] = val
|
|
||||||
invm[val] = key
|
|
||||||
if isdupkey:
|
|
||||||
del invm[oldval]
|
|
||||||
if isdupval:
|
|
||||||
del fwdm[oldkey]
|
|
||||||
return _WriteResult(key, val, oldkey, oldval)
|
|
||||||
|
|
||||||
def _update(self, init: bool, on_dup: OnDup, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
|
If *save_unwrite* is true, also return the list of inverse operations necessary to undo the write.
|
||||||
# args[0] may be a generator that yields many items, so process input in a single pass.
|
This design allows :meth:`_update` to roll back a partially applied update that fails part-way through
|
||||||
if not args and not kw:
|
when necessary. This design also allows subclasses that require additional operations to complete
|
||||||
return
|
a write to easily extend this implementation. For example, :class:`bidict.OrderedBidictBase` calls this
|
||||||
can_skip_dup_check = not self and not kw and isinstance(args[0], BidirectionalMapping)
|
inherited implementation, and then extends the list of ops returned with additional operations
|
||||||
if can_skip_dup_check:
|
needed to keep its internal linked list nodes consistent with its items' order as changes are made.
|
||||||
self._update_no_dup_check(args[0]) # type: ignore [arg-type]
|
"""
|
||||||
return
|
fwdm, invm = self._fwdm, self._invm
|
||||||
can_skip_rollback = init or RAISE not in on_dup
|
write: t.List[t.Callable[[], None]] = [
|
||||||
if can_skip_rollback:
|
partial(fwdm.__setitem__, newkey, newval),
|
||||||
self._update_no_rollback(on_dup, *args, **kw)
|
partial(invm.__setitem__, newval, newkey),
|
||||||
|
]
|
||||||
|
unwrite: t.List[t.Callable[[], None]]
|
||||||
|
if oldval is MISSING and oldkey is MISSING: # no key or value duplication
|
||||||
|
# {0: 1, 2: 3} + (4, 5) => {0: 1, 2: 3, 4: 5}
|
||||||
|
unwrite = [
|
||||||
|
partial(fwdm.__delitem__, newkey),
|
||||||
|
partial(invm.__delitem__, newval),
|
||||||
|
] if save_unwrite else []
|
||||||
|
elif oldval is not MISSING and oldkey is not MISSING: # key and value duplication across two different items
|
||||||
|
# {0: 1, 2: 3} + (0, 3) => {0: 3}
|
||||||
|
write.extend((
|
||||||
|
partial(fwdm.__delitem__, oldkey),
|
||||||
|
partial(invm.__delitem__, oldval),
|
||||||
|
))
|
||||||
|
unwrite = [
|
||||||
|
partial(fwdm.__setitem__, newkey, oldval),
|
||||||
|
partial(invm.__setitem__, oldval, newkey),
|
||||||
|
partial(fwdm.__setitem__, oldkey, newval),
|
||||||
|
partial(invm.__setitem__, newval, oldkey),
|
||||||
|
] if save_unwrite else []
|
||||||
|
elif oldval is not MISSING: # just key duplication
|
||||||
|
# {0: 1, 2: 3} + (2, 4) => {0: 1, 2: 4}
|
||||||
|
write.append(partial(invm.__delitem__, oldval))
|
||||||
|
unwrite = [
|
||||||
|
partial(fwdm.__setitem__, newkey, oldval),
|
||||||
|
partial(invm.__setitem__, oldval, newkey),
|
||||||
|
partial(invm.__delitem__, newval),
|
||||||
|
] if save_unwrite else []
|
||||||
else:
|
else:
|
||||||
self._update_with_rollback(on_dup, *args, **kw)
|
assert oldkey is not MISSING # just value duplication
|
||||||
|
# {0: 1, 2: 3} + (4, 3) => {0: 1, 4: 3}
|
||||||
|
+        write.append(partial(fwdm.__delitem__, oldkey))
+        unwrite = [
+            partial(fwdm.__setitem__, oldkey, newval),
+            partial(invm.__setitem__, newval, oldkey),
+            partial(fwdm.__delitem__, newkey),
+        ] if save_unwrite else []
+        return write, unwrite
+
-    def _update_no_dup_check(self, other: BidirectionalMapping[KT, VT]) -> None:
-        write_item = self._write_item
-        for (key, val) in other.items():
-            write_item(key, val, _NODUP)
-
-    def _update_no_rollback(self, on_dup: OnDup, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
-        put = self._put
-        for (key, val) in _iteritems_args_kw(*args, **kw):
-            put(key, val, on_dup)
-
-    def _update_with_rollback(self, on_dup: OnDup, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
-        """Update, rolling back on failure."""
-        writes: _t.List[_t.Tuple[_DedupResult, _WriteResult]] = []
-        append_write = writes.append
-        dedup_item = self._dedup_item
-        write_item = self._write_item
-        for (key, val) in _iteritems_args_kw(*args, **kw):
-            try:
-                dedup_result = dedup_item(key, val, on_dup)
-            except DuplicationError:
-                undo_write = self._undo_write
-                for dedup_result, write_result in reversed(writes):
-                    undo_write(dedup_result, write_result)
-                raise
-            if dedup_result is not None:
-                write_result = write_item(key, val, dedup_result)
-                append_write((dedup_result, write_result))
-
-    def _undo_write(self, dedup_result: _DedupResult, write_result: _WriteResult) -> None:
-        isdupkey, isdupval, _, _ = dedup_result
-        key, val, oldkey, oldval = write_result
-        if not isdupkey and not isdupval:
-            self._pop(key)
-            return
-        fwdm = self._fwdm
-        invm = self._invm
-        if isdupkey:
-            fwdm[key] = oldval
-            invm[oldval] = key
-            if not isdupval:
-                del invm[val]
-        if isdupval:
-            invm[val] = oldkey
-            fwdm[oldkey] = val
-            if not isdupkey:
-                del fwdm[key]
+    def _update(
+        self,
+        arg: MapOrIterItems[KT, VT],
+        kw: t.Mapping[str, VT] = MappingProxyType({}),
+        *,
+        rbof: t.Optional[bool] = None,
+        on_dup: t.Optional[OnDup] = None,
+    ) -> None:
+        """Update, possibly rolling back on failure as per *rbof*."""
+        # Must process input in a single pass, since arg may be a generator.
+        if not arg and not kw:
+            return
+        if on_dup is None:
+            on_dup = self.on_dup
+        if rbof is None:
+            rbof = RAISE in on_dup
+        if not self and not kw:
+            if isinstance(arg, BidictBase):  # can skip dup check
+                self._init_from(arg)
+                return
+            # If arg is not a BidictBase, fall through to the general treatment below,
+            # which includes duplication checking. (If arg is some BidirectionalMapping
+            # that does not inherit from BidictBase, it's a foreign implementation, so we
+            # perform duplication checking to err on the safe side.)
+
+        # If we roll back on failure and we know that there are more updates to process than
+        # already-contained items, our rollback strategy is to update a copy of self (without
+        # rolling back on failure), and then to become the copy if all updates succeed.
+        if rbof and isinstance(arg, t.Sized) and len(arg) + len(kw) > len(self):
+            target = self.copy()
+            target._update(arg, kw, rbof=False, on_dup=on_dup)
+            self._init_from(target)
+            return
+
+        # There are more already-contained items than updates to process, or we don't know
+        # how many updates there are to process. If we need to roll back on failure,
+        # save a log of Unwrites as we update so we can undo changes if the update fails.
+        unwrites: t.List[Unwrite] = []
+        append_unwrite = unwrites.append
+        prep_write = self._prep_write
+        for (key, val) in iteritems(arg, **kw):
+            try:
+                dedup_result = self._dedup(key, val, on_dup)
+            except DuplicationError:
+                if rbof:
+                    while unwrites:  # apply saved unwrites
+                        unwrite = unwrites.pop()
+                        for unwriteop in unwrite:
+                            unwriteop()
+                raise
+            if dedup_result is None:  # no-op
+                continue
+            write, unwrite = prep_write(key, val, *dedup_result, save_unwrite=rbof)
+            for writeop in write:  # apply the write
+                writeop()
+            if rbof and unwrite:  # save the unwrite for later application if needed
+                append_unwrite(unwrite)
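Aside (not part of the diff): the rollback machinery above is what gives bidict its documented fail-clean guarantee for bulk updates. A minimal sketch using only the public bidict API, where the exception names come from the package's top-level exports:

from bidict import bidict, DuplicationError

b = bidict({1: 'one', 2: 'two'})
try:
    # The second item's value collides with an existing item, so the whole
    # update is rolled back, including the already-applied (3, 'three').
    b.putall([(3, 'three'), (4, 'one')])
except DuplicationError:
    pass
assert b == bidict({1: 'one', 2: 'two'})  # unchanged: no partial update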
     def copy(self: BT) -> BT:
-        """A shallow copy."""
-        # Could just ``return self.__class__(self)`` here instead, but the below is faster. It uses
-        # __new__ to create a copy instance while bypassing its __init__, which would result
-        # in copying this bidict's items into the copy instance one at a time. Instead, make whole
-        # copies of each of the backing mappings, and make them the backing mappings of the copy,
-        # avoiding copying items one at a time.
-        cp: BT = self.__class__.__new__(self.__class__)
-        cp._fwdm = copy(self._fwdm)
-        cp._invm = copy(self._invm)
-        cp._init_inv()
-        return cp
+        """Make a (shallow) copy of this bidict."""
+        # Could just `return self.__class__(self)` here, but the below is faster. The former
+        # would copy this bidict's items into a new instance one at a time (checking for duplication
+        # for each item), whereas the below copies from the backing mappings all at once, and foregoes
+        # item-by-item duplication checking since the backing mappings have been checked already.
+        return self._from_other(self.__class__, self)
+
+    @staticmethod
+    def _from_other(bt: t.Type[BT], other: MapOrIterItems[KT, VT], inv: bool = False) -> BT:
+        """Fast, private constructor based on :meth:`_init_from`.
+
+        If *inv* is true, return the inverse of the instance instead of the instance itself.
+        (Useful for pickling with dynamically-generated inverse classes -- see :meth:`__reduce__`.)
+        """
+        inst = bt()
+        inst._init_from(other)
+        return t.cast(BT, inst.inverse) if inv else inst
+
+    def _init_from(self, other: MapOrIterItems[KT, VT]) -> None:
+        """Fast init from *other*, bypassing item-by-item duplication checking."""
+        self._fwdm.clear()
+        self._invm.clear()
+        self._fwdm.update(other)
+        # If other is a bidict, use its existing backing inverse mapping, otherwise
+        # other could be a generator that's now exhausted, so invert self._fwdm on the fly.
+        inv = other.inverse if isinstance(other, BidictBase) else inverted(self._fwdm)
+        self._invm.update(inv)  # pyright: ignore # https://github.com/jab/bidict/pull/242#discussion_r824223403
 
     #: Used for the copy protocol.
     #: *See also* the :mod:`copy` module
     __copy__ = copy
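Aside (not part of the diff): since __copy__ is aliased to copy(), both spellings of a shallow copy take the fast path above. A quick sketch:

import copy
from bidict import bidict

b = bidict(one=1, two=2)
c1 = b.copy()
c2 = copy.copy(b)  # __copy__ is the same method, so this is equally fast
assert c1 == c2 == b and c1 is not b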
+    def __or__(self: BT, other: t.Mapping[KT, VT]) -> BT:
+        """Return self|other."""
+        if not isinstance(other, t.Mapping):
+            return NotImplemented
+        new = self.copy()
+        new._update(other, rbof=False)
+        return new
+
+    def __ror__(self: BT, other: t.Mapping[KT, VT]) -> BT:
+        """Return other|self."""
+        if not isinstance(other, t.Mapping):
+            return NotImplemented
+        new = self.__class__(other)
+        new._update(self, rbof=False)
+        return new
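Aside (not part of the diff): these operators make bidicts participate in PEP 584-style mapping merges. A sketch, assuming no duplicate values are introduced by the merge (duplication is still checked as usual):

from bidict import bidict

b = bidict({1: 'one'})
merged = b | {2: 'two'}     # __or__: bidict on the left, result is a bidict
assert merged == bidict({1: 'one', 2: 'two'})
rmerged = {0: 'zero'} | b   # __ror__: plain dict on the left, still a bidict
assert isinstance(rmerged, bidict)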
     def __len__(self) -> int:
         """The number of contained items."""
         return len(self._fwdm)
 
-    def __iter__(self) -> _t.Iterator[KT]:
+    def __iter__(self) -> t.Iterator[KT]:
         """Iterator over the contained keys."""
         return iter(self._fwdm)
 
     def __getitem__(self, key: KT) -> VT:
         """*x.__getitem__(key) ⟺ x[key]*"""
         return self._fwdm[key]
 
-    # On Python 3.8+, dicts are reversible, so even non-Ordered bidicts can provide an efficient
-    # __reversed__ implementation. (On Python < 3.8, they cannot.) Once support is dropped for
-    # Python < 3.8, can remove the following if statement to provide __reversed__ unconditionally.
-    if hasattr(_fwdm_cls, '__reversed__'):
-        def __reversed__(self) -> _t.Iterator[KT]:
-            """Iterator over the contained keys in reverse order."""
-            return reversed(self._fwdm)  # type: ignore [no-any-return,call-overload]
+    def __reduce__(self) -> t.Tuple[t.Any, ...]:
+        """Return state information for pickling."""
+        # If this bidict's class is dynamically generated, pickle the inverse instead, whose
+        # (presumably not dynamically generated) class the caller is more likely to have a reference to
+        # somewhere in sys.modules that pickle can discover.
+        should_invert = isinstance(self, GeneratedBidictInverse)
+        cls, init_from = (self._inv_cls, self.inverse) if should_invert else (self.__class__, self)
+        return self._from_other, (cls, dict(init_from), should_invert)  # type: ignore [call-overload]
+
+
+# See BidictBase._set_reversed() above.
+def _fwdm_reversed(self: BidictBase[KT, t.Any]) -> t.Iterator[KT]:
+    """Iterator over the contained keys in reverse order."""
+    assert isinstance(self._fwdm, t.Reversible)
+    return reversed(self._fwdm)
 
 
-# Work around weakref slot with Generics bug on Python 3.6 (https://bugs.python.org/issue41451):
-BidictBase.__slots__.remove('__weakref__')
+BidictBase._init_class()
+
+
+class GeneratedBidictInverse:
+    """Base class for dynamically-generated inverse bidict classes."""
 
 
 # * Code review nav *
 #==============================================================================
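Aside (not part of the diff): the __reduce__ hook above is what keeps pickling working, including for inverses whose classes may be generated at runtime. A quick round-trip check:

import pickle
from bidict import bidict

b = bidict(one=1)
assert pickle.loads(pickle.dumps(b)) == b
# Inverses round-trip too; when an inverse class is dynamically generated,
# __reduce__ pickles via the non-generated class instead.
assert pickle.loads(pickle.dumps(b.inverse)) == b.inverse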
@@ -1,51 +1,197 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
-# ← Prev: _mut.py             Current: _bidict.py    Next: _orderedbase.py →
+# ← Prev: _frozenbidict.py    Current: _bidict.py    Next: _orderedbase.py →
 #==============================================================================
 
 
-"""Provide :class:`bidict`."""
+"""Provide :class:`MutableBidict`."""
 
-import typing as _t
+import typing as t
 
-from ._delegating import _DelegatingBidict
-from ._mut import MutableBidict
-from ._typing import KT, VT
+from ._abc import MutableBidirectionalMapping
+from ._base import BidictBase, get_arg
+from ._dup import OnDup, ON_DUP_RAISE, ON_DUP_DROP_OLD
+from ._typing import KT, VT, DT, ODT, MISSING, IterItems, MapOrIterItems
 
 
-class bidict(_DelegatingBidict[KT, VT], MutableBidict[KT, VT]):
+class MutableBidict(BidictBase[KT, VT], MutableBidirectionalMapping[KT, VT]):
     """Base class for mutable bidirectional mappings."""
 
-    __slots__ = ()
-
-    if _t.TYPE_CHECKING:
+    if t.TYPE_CHECKING:
         @property
-        def inverse(self) -> 'bidict[VT, KT]': ...
+        def inverse(self) -> 'MutableBidict[VT, KT]': ...
+
+    def _pop(self, key: KT) -> VT:
+        val = self._fwdm.pop(key)
+        del self._invm[val]
+        return val
+
+    def __delitem__(self, key: KT) -> None:
+        """*x.__delitem__(y) ⟺ del x[y]*"""
+        self._pop(key)
+
+    def __setitem__(self, key: KT, val: VT) -> None:
+        """Set the value for *key* to *val*.
+
+        If *key* is already associated with *val*, this is a no-op.
+
+        If *key* is already associated with a different value,
+        the old value will be replaced with *val*,
+        as with dict's :meth:`__setitem__`.
+
+        If *val* is already associated with a different key,
+        an exception is raised
+        to protect against accidental removal of the key
+        that's currently associated with *val*.
+
+        Use :meth:`put` instead if you want to specify different behavior in
+        the case that the provided key or value duplicates an existing one.
+        Or use :meth:`forceput` to unconditionally associate *key* with *val*,
+        replacing any existing items as necessary to preserve uniqueness.
+
+        :raises bidict.ValueDuplicationError: if *val* duplicates that of an
+            existing item.
+
+        :raises bidict.KeyAndValueDuplicationError: if *key* duplicates the key of an
+            existing item and *val* duplicates the value of a different
+            existing item.
+        """
+        self.put(key, val, on_dup=self.on_dup)
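Aside (not part of the diff): the __setitem__ semantics documented above in a runnable form, using the package's exported exception type:

from bidict import bidict, ValueDuplicationError

b = bidict({'H': 'hydrogen'})
b['H'] = 'hydrogen'                    # no-op: the exact item is already present
try:
    b['deuterium'] = 'hydrogen'        # value collision -> raises, b unchanged
except ValueDuplicationError:
    pass
b.forceput('deuterium', 'hydrogen')    # unconditionally overwrite instead
assert b == bidict({'deuterium': 'hydrogen'})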
+    def put(self, key: KT, val: VT, on_dup: OnDup = ON_DUP_RAISE) -> None:
+        """Associate *key* with *val*, honoring the :class:`OnDup` given in *on_dup*.
+
+        For example, if *on_dup* is :attr:`~bidict.ON_DUP_RAISE`,
+        then *key* will be associated with *val* if and only if
+        *key* is not already associated with an existing value and
+        *val* is not already associated with an existing key,
+        otherwise an exception will be raised.
+
+        If *key* is already associated with *val*, this is a no-op.
+
+        :raises bidict.KeyDuplicationError: if attempting to insert an item
+            whose key only duplicates an existing item's, and *on_dup.key* is
+            :attr:`~bidict.RAISE`.
+
+        :raises bidict.ValueDuplicationError: if attempting to insert an item
+            whose value only duplicates an existing item's, and *on_dup.val* is
+            :attr:`~bidict.RAISE`.
+
+        :raises bidict.KeyAndValueDuplicationError: if attempting to insert an
+            item whose key duplicates one existing item's, and whose value
+            duplicates another existing item's, and *on_dup.kv* is
+            :attr:`~bidict.RAISE`.
+        """
+        self._update([(key, val)], on_dup=on_dup)
+
+    def forceput(self, key: KT, val: VT) -> None:
+        """Associate *key* with *val* unconditionally.
+
+        Replace any existing mappings containing key *key* or value *val*
+        as necessary to preserve uniqueness.
+        """
+        self.put(key, val, on_dup=ON_DUP_DROP_OLD)
+
+    def clear(self) -> None:
+        """Remove all items."""
+        self._fwdm.clear()
+        self._invm.clear()
+
+    @t.overload
+    def pop(self, __key: KT) -> VT: ...
+    @t.overload
+    def pop(self, __key: KT, __default: DT = ...) -> t.Union[VT, DT]: ...
+
+    def pop(self, key: KT, default: ODT[DT] = MISSING) -> t.Union[VT, DT]:
+        """*x.pop(k[, d]) → v*
+
+        Remove specified key and return the corresponding value.
+
+        :raises KeyError: if *key* is not found and no *default* is provided.
+        """
+        try:
+            return self._pop(key)
+        except KeyError:
+            if default is MISSING:
+                raise
+            return default
+
+    def popitem(self) -> t.Tuple[KT, VT]:
+        """*x.popitem() → (k, v)*
+
+        Remove and return some item as a (key, value) pair.
+
+        :raises KeyError: if *x* is empty.
+        """
+        key, val = self._fwdm.popitem()
+        del self._invm[val]
+        return key, val
+
+    @t.overload  # type: ignore [override]  # https://github.com/jab/bidict/pull/242#discussion_r825464731
+    def update(self, __m: t.Mapping[KT, VT], **kw: VT) -> None: ...
+    @t.overload
+    def update(self, __i: IterItems[KT, VT], **kw: VT) -> None: ...
+    @t.overload
+    def update(self, **kw: VT) -> None: ...
+
+    def update(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
+        """Like calling :meth:`putall` with *self.on_dup* passed for *on_dup*."""
+        if args or kw:
+            self._update(get_arg(*args), kw)
+
+    @t.overload
+    def forceupdate(self, __m: t.Mapping[KT, VT], **kw: VT) -> None: ...
+    @t.overload
+    def forceupdate(self, __i: IterItems[KT, VT], **kw: VT) -> None: ...
+    @t.overload
+    def forceupdate(self, **kw: VT) -> None: ...
+
+    def forceupdate(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
+        """Like a bulk :meth:`forceput`."""
+        if args or kw:
+            self._update(get_arg(*args), kw, on_dup=ON_DUP_DROP_OLD)
+
+    def __ior__(self, other: t.Mapping[KT, VT]) -> 'MutableBidict[KT, VT]':
+        """Return self|=other."""
+        self.update(other)
+        return self
+
+    @t.overload
+    def putall(self, items: t.Mapping[KT, VT], on_dup: OnDup) -> None: ...
+    @t.overload
+    def putall(self, items: IterItems[KT, VT], on_dup: OnDup = ...) -> None: ...
+
+    def putall(self, items: MapOrIterItems[KT, VT], on_dup: OnDup = ON_DUP_RAISE) -> None:
+        """Like a bulk :meth:`put`.
+
+        If one of the given items causes an exception to be raised,
+        none of the items is inserted.
+        """
+        if items:
+            self._update(items, on_dup=on_dup)
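Aside (not part of the diff): putall accepts a custom OnDup, so a bulk insert can, for example, keep existing items and silently drop colliding new ones. A sketch built only from the exported OnDup and DROP_NEW names:

from bidict import bidict, OnDup, DROP_NEW

b = bidict({1: 'one'})
keep_existing = OnDup(key=DROP_NEW, val=DROP_NEW, kv=DROP_NEW)
b.putall([(1, 'uno'), (2, 'two')], on_dup=keep_existing)
assert b == bidict({1: 'one', 2: 'two'})  # (1, 'uno') was dropped, not an error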
+
+class bidict(MutableBidict[KT, VT]):
+    """The main bidirectional mapping type.
+
+    See :ref:`intro:Introduction` and :ref:`basic-usage:Basic Usage`
+    to get started (also available at https://bidict.rtfd.io).
+    """
+
+    if t.TYPE_CHECKING:
         @property
         def inverse(self) -> 'bidict[VT, KT]': ...
 
 
 # * Code review nav *
 #==============================================================================
-# ← Prev: _mut.py             Current: _bidict.py    Next: _orderedbase.py →
+# ← Prev: _frozenbidict.py    Current: _bidict.py    Next: _orderedbase.py →
 #==============================================================================
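Aside (not part of the diff): the basic contract of the bidict type defined above, per the library's own introduction:

from bidict import bidict

element_by_symbol = bidict({'H': 'hydrogen'})
assert element_by_symbol['H'] == 'hydrogen'
assert element_by_symbol.inverse['hydrogen'] == 'H'  # inverse stays in sync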
@@ -1,39 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-"""Provide :class:`_DelegatingBidict`."""
-
-import typing as _t
-
-from ._base import BidictBase
-from ._typing import KT, VT
-
-
-class _DelegatingBidict(BidictBase[KT, VT]):
-    """Provide optimized implementations of several methods by delegating to backing dicts.
-
-    Used to override less efficient implementations inherited by :class:`~collections.abc.Mapping`.
-    """
-
-    __slots__ = ()
-
-    def __iter__(self) -> _t.Iterator[KT]:
-        """Iterator over the contained keys."""
-        return iter(self._fwdm)
-
-    def keys(self) -> _t.KeysView[KT]:
-        """A set-like object providing a view on the contained keys."""
-        return self._fwdm.keys()  # type: ignore [return-value]
-
-    def values(self) -> _t.KeysView[VT]:  # type: ignore [override]  # https://github.com/python/typeshed/issues/4435
-        """A set-like object providing a view on the contained values."""
-        return self._invm.keys()  # type: ignore [return-value]
-
-    def items(self) -> _t.ItemsView[KT, VT]:
-        """A set-like object providing a view on the contained items."""
-        return self._fwdm.items()  # type: ignore [return-value]
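Aside (not part of the diff): the user-visible effect of this delegation (kept in the new layout, just relocated) is that even values() is a set-like view, because values are the keys of the backing inverse mapping:

from bidict import bidict

b = bidict({'H': 'hydrogen', 'He': 'helium'})
assert b.keys() == {'H', 'He'}
assert b.values() & {'hydrogen', 'oxygen'} == {'hydrogen'}  # set ops work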
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,11 +8,11 @@
 
 """Provide :class:`OnDup` and related functionality."""
 
 
-from collections import namedtuple
 from enum import Enum
+import typing as t
 
 
-class OnDupAction(Enum):
+class OD(Enum):
     """An action to take to prevent duplication from occurring."""
 
     #: Raise a :class:`~bidict.DuplicationError`.
@@ -24,25 +23,26 @@ class OnDupAction(Enum):
     DROP_NEW = 'DROP_NEW'
 
     def __repr__(self) -> str:
-        return f'<{self.name}>'
+        return f'{self.__class__.__name__}.{self.name}'
 
 
-RAISE = OnDupAction.RAISE
-DROP_OLD = OnDupAction.DROP_OLD
-DROP_NEW = OnDupAction.DROP_NEW
+RAISE = OD.RAISE
+DROP_OLD = OD.DROP_OLD
+DROP_NEW = OD.DROP_NEW
 
 
-class OnDup(namedtuple('_OnDup', 'key val kv')):
-    r"""A 3-tuple of :class:`OnDupAction`\s specifying how to handle the 3 kinds of duplication.
+class OnDup(t.NamedTuple('_OnDup', [('key', OD), ('val', OD), ('kv', OD)])):
+    r"""A 3-tuple of :class:`OD`\s specifying how to handle the 3 kinds of duplication.
 
     *See also* :ref:`basic-usage:Values Must Be Unique`
+    (https://bidict.rtfd.io/basic-usage.html#values-must-be-unique)
 
     If *kv* is not specified, *val* will be used for *kv*.
     """
 
     __slots__ = ()
 
-    def __new__(cls, key: OnDupAction = DROP_OLD, val: OnDupAction = RAISE, kv: OnDupAction = RAISE) -> 'OnDup':
+    def __new__(cls, key: OD = DROP_OLD, val: OD = RAISE, kv: t.Optional[OD] = None) -> 'OnDup':
         """Override to provide user-friendly default values."""
         return super().__new__(cls, key, val, kv or val)
@@ -51,7 +51,7 @@ class OnDup(namedtuple('_OnDup', 'key val kv')):
 #: :meth:`~bidict.bidict.__init__`,
 #: :meth:`~bidict.bidict.__setitem__`, and
 #: :meth:`~bidict.bidict.update` methods.
-ON_DUP_DEFAULT = OnDup()
+ON_DUP_DEFAULT = OnDup(key=DROP_OLD, val=RAISE, kv=RAISE)
 #: An :class:`OnDup` whose members are all :obj:`RAISE`.
 ON_DUP_RAISE = OnDup(key=RAISE, val=RAISE, kv=RAISE)
 #: An :class:`OnDup` whose members are all :obj:`DROP_OLD`.
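Aside (not part of the diff): the defaulting behavior of OnDup.__new__ above (kv falls back to val when not given), exercised directly:

from bidict import OnDup, RAISE, DROP_OLD

od = OnDup(key=DROP_OLD)               # val and kv default to RAISE
assert od.val is RAISE and od.kv is RAISE
od2 = OnDup(key=RAISE, val=DROP_OLD)   # kv not given -> falls back to val
assert od2.kv is DROP_OLD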
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -1,60 +1,45 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
-# ← Prev: _base.py         Current: _frozenbidict.py      Next: _mut.py →
+# ← Prev: _base.py         Current: _frozenbidict.py      Next: _bidict.py →
 #==============================================================================
 
 """Provide :class:`frozenbidict`, an immutable, hashable bidirectional mapping type."""
 
-import typing as _t
+import typing as t
 
-from ._delegating import _DelegatingBidict
+from ._base import BidictBase
 from ._typing import KT, VT
 
 
-class frozenbidict(_DelegatingBidict[KT, VT]):
+class frozenbidict(BidictBase[KT, VT]):
     """Immutable, hashable bidict type."""
 
-    __slots__ = ('_hash',)
-
     _hash: int
 
-    # Work around lack of support for higher-kinded types in mypy.
+    # Work around lack of support for higher-kinded types in Python.
     # Ref: https://github.com/python/typing/issues/548#issuecomment-621571821
-    # Remove this and similar type stubs from other classes if support is ever added.
-    if _t.TYPE_CHECKING:
+    if t.TYPE_CHECKING:
         @property
         def inverse(self) -> 'frozenbidict[VT, KT]': ...
 
     def __hash__(self) -> int:
         """The hash of this bidict as determined by its items."""
         if getattr(self, '_hash', None) is None:
-            self._hash = _t.ItemsView(self)._hash()  # type: ignore [attr-defined]
+            # The following is like hash(frozenset(self.items()))
+            # but more memory efficient. See also: https://bugs.python.org/issue46684
+            self._hash = t.ItemsView(self)._hash()  # type: ignore [attr-defined]  # https://github.com/python/typeshed/pull/7153
         return self._hash
 
 
 # * Code review nav *
 #==============================================================================
-# ← Prev: _base.py         Current: _frozenbidict.py      Next: _mut.py →
+# ← Prev: _base.py         Current: _frozenbidict.py      Next: _bidict.py →
 #==============================================================================
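Aside (not part of the diff): being hashable, frozenbidict instances can serve as dict keys or set members; the hash is computed lazily from the items (equivalent to hashing the item set) and cached in _hash:

from bidict import frozenbidict

fb = frozenbidict({'H': 'hydrogen'})
registry = {fb: 'periodic table fragment'}   # usable as a dict key
assert fb in registry
assert hash(fb) == hash(frozenbidict({'H': 'hydrogen'}))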
@@ -1,33 +1,19 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
 #← Prev: _orderedbase.py  Current: _frozenordered.py  Next: _orderedbidict.py →
 #==============================================================================
 
 """Provide :class:`FrozenOrderedBidict`, an immutable, hashable, ordered bidict."""
 
-import typing as _t
+import typing as t
 
 from ._frozenbidict import frozenbidict
 from ._orderedbase import OrderedBidictBase
@@ -46,41 +32,16 @@ class FrozenOrderedBidict(OrderedBidictBase[KT, VT]):
 
     If you are using Python 3.8+, frozenbidict gives you everything that
     FrozenOrderedBidict gives you, but with less space overhead.
+    On the other hand, using FrozenOrderedBidict when you are depending on
+    the ordering of the items can make the ordering dependence more explicit.
     """
 
-    __slots__ = ('_hash',)
-    __hash__ = frozenbidict.__hash__
+    __hash__: t.Callable[[t.Any], int] = frozenbidict.__hash__  # pyright: ignore
 
-    if _t.TYPE_CHECKING:
+    if t.TYPE_CHECKING:
         @property
         def inverse(self) -> 'FrozenOrderedBidict[VT, KT]': ...
 
-    # Delegate to backing dicts for more efficient implementations of keys() and values().
-    # Possible with FrozenOrderedBidict but not OrderedBidict since FrozenOrderedBidict
-    # is immutable, i.e. these can't get out of sync after initialization due to mutation.
-    def keys(self) -> _t.KeysView[KT]:
-        """A set-like object providing a view on the contained keys."""
-        return self._fwdm._fwdm.keys()  # type: ignore [return-value]
-
-    def values(self) -> _t.KeysView[VT]:  # type: ignore [override]
-        """A set-like object providing a view on the contained values."""
-        return self._invm._fwdm.keys()  # type: ignore [return-value]
-
-    # Can't delegate for items() because values in _fwdm and _invm are nodes.
-
-    # On Python 3.8+, delegate to backing dicts for a more efficient implementation
-    # of __iter__ and __reversed__ (both of which call this _iter() method):
-    if hasattr(dict, '__reversed__'):
-        def _iter(self, *, reverse: bool = False) -> _t.Iterator[KT]:
-            itfn = reversed if reverse else iter
-            return itfn(self._fwdm._fwdm)  # type: ignore [operator,no-any-return]
-    else:
-        # On Python < 3.8, just optimize __iter__:
-        def _iter(self, *, reverse: bool = False) -> _t.Iterator[KT]:
-            if not reverse:
-                return iter(self._fwdm._fwdm)
-            return super()._iter(reverse=True)
-
 
 # * Code review nav *
 #==============================================================================
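Aside (not part of the diff): since FrozenOrderedBidict reuses frozenbidict.__hash__, its hash is derived from the item set and is therefore insensitive to item order, even though iteration order is preserved. A sketch:

from bidict import FrozenOrderedBidict

fob = FrozenOrderedBidict([('H', 'hydrogen'), ('He', 'helium')])
assert list(fob) == ['H', 'He']   # insertion order preserved on iteration
assert hash(fob) == hash(FrozenOrderedBidict([('He', 'helium'), ('H', 'hydrogen')]))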
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,48 +7,26 @@
 
 """Functions for iterating over items in a mapping."""
 
-import typing as _t
-from collections.abc import Mapping
-from itertools import chain
+from operator import itemgetter
+import typing as t
 
 from ._typing import KT, VT, IterItems, MapOrIterItems
 
 
-_NULL_IT: IterItems = iter(())
-
-
-def _iteritems_mapping_or_iterable(arg: MapOrIterItems[KT, VT]) -> IterItems[KT, VT]:
-    """Yield the items in *arg*.
-
-    If *arg* is a :class:`~collections.abc.Mapping`, return an iterator over its items.
-    Otherwise return an iterator over *arg* itself.
-    """
-    return iter(arg.items() if isinstance(arg, Mapping) else arg)
-
-
-def _iteritems_args_kw(*args: MapOrIterItems[KT, VT], **kw: VT) -> IterItems[KT, VT]:
-    """Yield the items from the positional argument (if given) and then any from *kw*.
-
-    :raises TypeError: if more than one positional argument is given.
-    """
-    args_len = len(args)
-    if args_len > 1:
-        raise TypeError(f'Expected at most 1 positional argument, got {args_len}')
-    it: IterItems = ()
-    if args:
-        arg = args[0]
-        if arg:
-            it = _iteritems_mapping_or_iterable(arg)
-    if kw:
-        iterkw = iter(kw.items())
-        it = chain(it, iterkw) if it else iterkw
-    return it or _NULL_IT
-
-
-@_t.overload
-def inverted(arg: _t.Mapping[KT, VT]) -> IterItems[VT, KT]: ...
-@_t.overload
-def inverted(arg: IterItems[KT, VT]) -> IterItems[VT, KT]: ...
+def iteritems_mapping_or_iterable(arg: MapOrIterItems[KT, VT]) -> IterItems[KT, VT]:
+    """Yield the items in *arg* based on whether it's a mapping."""
+    yield from arg.items() if isinstance(arg, t.Mapping) else arg  # pyright: ignore
+
+
+def iteritems(__arg: MapOrIterItems[KT, VT], **kw: VT) -> IterItems[KT, VT]:
+    """Yield the items from *arg* and then any from *kw* in the order given."""
+    yield from iteritems_mapping_or_iterable(__arg)
+    yield from kw.items()  # type: ignore [misc]
+
+
+swap = itemgetter(1, 0)
+
+
 def inverted(arg: MapOrIterItems[KT, VT]) -> IterItems[VT, KT]:
     """Yield the inverse items of the provided object.
@@ -61,7 +38,8 @@ def inverted(arg: MapOrIterItems[KT, VT]) -> IterItems[VT, KT]:
 
     *See also* :attr:`bidict.BidirectionalMapping.__inverted__`
     """
-    inv = getattr(arg, '__inverted__', None)
-    if callable(inv):
-        return inv()  # type: ignore [no-any-return]
-    return ((val, key) for (key, val) in _iteritems_mapping_or_iterable(arg))
+    invattr = getattr(arg, '__inverted__', None)
+    if callable(invattr):
+        inv: IterItems[VT, KT] = invattr()
+        return inv
+    return map(swap, iteritems_mapping_or_iterable(arg))
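Aside (not part of the diff): inverted() works on any mapping or iterable of pairs, yielding each (key, value) as (value, key):

from bidict import inverted

assert list(inverted({'H': 'hydrogen'})) == [('hydrogen', 'H')]
assert list(inverted([(1, 'one'), (2, 'two')])) == [('one', 1), ('two', 2)]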
@@ -1,188 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
-# * Code review nav *
-#==============================================================================
-# ← Prev: _frozenbidict.py         Current: _mut.py          Next: _bidict.py →
-#==============================================================================
-
-
-"""Provide :class:`MutableBidict`."""
-
-import typing as _t
-
-from ._abc import MutableBidirectionalMapping
-from ._base import BidictBase
-from ._dup import OnDup, ON_DUP_RAISE, ON_DUP_DROP_OLD
-from ._typing import _NONE, KT, VT, VDT, IterItems, MapOrIterItems
-
-
-class MutableBidict(BidictBase[KT, VT], MutableBidirectionalMapping[KT, VT]):
-    """Base class for mutable bidirectional mappings."""
-
-    __slots__ = ()
-
-    if _t.TYPE_CHECKING:
-        @property
-        def inverse(self) -> 'MutableBidict[VT, KT]': ...
-
-    def __delitem__(self, key: KT) -> None:
-        """*x.__delitem__(y) ⟺ del x[y]*"""
-        self._pop(key)
-
-    def __setitem__(self, key: KT, val: VT) -> None:
-        """Set the value for *key* to *val*.
-
-        If *key* is already associated with *val*, this is a no-op.
-
-        If *key* is already associated with a different value,
-        the old value will be replaced with *val*,
-        as with dict's :meth:`__setitem__`.
-
-        If *val* is already associated with a different key,
-        an exception is raised
-        to protect against accidental removal of the key
-        that's currently associated with *val*.
-
-        Use :meth:`put` instead if you want to specify different behavior in
-        the case that the provided key or value duplicates an existing one.
-        Or use :meth:`forceput` to unconditionally associate *key* with *val*,
-        replacing any existing items as necessary to preserve uniqueness.
-
-        :raises bidict.ValueDuplicationError: if *val* duplicates that of an
-            existing item.
-
-        :raises bidict.KeyAndValueDuplicationError: if *key* duplicates the key of an
-            existing item and *val* duplicates the value of a different
-            existing item.
-        """
-        self._put(key, val, self.on_dup)
-
-    def put(self, key: KT, val: VT, on_dup: OnDup = ON_DUP_RAISE) -> None:
-        """Associate *key* with *val*, honoring the :class:`OnDup` given in *on_dup*.
-
-        For example, if *on_dup* is :attr:`~bidict.ON_DUP_RAISE`,
-        then *key* will be associated with *val* if and only if
-        *key* is not already associated with an existing value and
-        *val* is not already associated with an existing key,
-        otherwise an exception will be raised.
-
-        If *key* is already associated with *val*, this is a no-op.
-
-        :raises bidict.KeyDuplicationError: if attempting to insert an item
-            whose key only duplicates an existing item's, and *on_dup.key* is
-            :attr:`~bidict.RAISE`.
-
-        :raises bidict.ValueDuplicationError: if attempting to insert an item
-            whose value only duplicates an existing item's, and *on_dup.val* is
-            :attr:`~bidict.RAISE`.
-
-        :raises bidict.KeyAndValueDuplicationError: if attempting to insert an
-            item whose key duplicates one existing item's, and whose value
-            duplicates another existing item's, and *on_dup.kv* is
-            :attr:`~bidict.RAISE`.
-        """
-        self._put(key, val, on_dup)
-
-    def forceput(self, key: KT, val: VT) -> None:
-        """Associate *key* with *val* unconditionally.
-
-        Replace any existing mappings containing key *key* or value *val*
-        as necessary to preserve uniqueness.
-        """
-        self._put(key, val, ON_DUP_DROP_OLD)
-
-    def clear(self) -> None:
-        """Remove all items."""
-        self._fwdm.clear()
-        self._invm.clear()
-
-    @_t.overload
-    def pop(self, key: KT) -> VT: ...
-    @_t.overload
-    def pop(self, key: KT, default: VDT = ...) -> VDT: ...
-    def pop(self, key: KT, default: VDT = _NONE) -> VDT:
-        """*x.pop(k[, d]) → v*
-
-        Remove specified key and return the corresponding value.
-
-        :raises KeyError: if *key* is not found and no *default* is provided.
-        """
-        try:
-            return self._pop(key)
-        except KeyError:
-            if default is _NONE:
-                raise
-            return default
-
-    def popitem(self) -> _t.Tuple[KT, VT]:
-        """*x.popitem() → (k, v)*
-
-        Remove and return some item as a (key, value) pair.
-
-        :raises KeyError: if *x* is empty.
-        """
-        if not self:
-            raise KeyError('mapping is empty')
-        key, val = self._fwdm.popitem()
-        del self._invm[val]
-        return key, val
-
-    @_t.overload
-    def update(self, __arg: _t.Mapping[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
-    def update(self, __arg: IterItems[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
-    def update(self, **kw: VT) -> None: ...
-    def update(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
-        """Like calling :meth:`putall` with *self.on_dup* passed for *on_dup*."""
-        if args or kw:
-            self._update(False, self.on_dup, *args, **kw)
-
-    @_t.overload
-    def forceupdate(self, __arg: _t.Mapping[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
-    def forceupdate(self, __arg: IterItems[KT, VT], **kw: VT) -> None: ...
-    @_t.overload
-    def forceupdate(self, **kw: VT) -> None: ...
-    def forceupdate(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
-        """Like a bulk :meth:`forceput`."""
-        self._update(False, ON_DUP_DROP_OLD, *args, **kw)
-
-    @_t.overload
-    def putall(self, items: _t.Mapping[KT, VT], on_dup: OnDup) -> None: ...
-    @_t.overload
-    def putall(self, items: IterItems[KT, VT], on_dup: OnDup = ON_DUP_RAISE) -> None: ...
-    def putall(self, items: MapOrIterItems[KT, VT], on_dup: OnDup = ON_DUP_RAISE) -> None:
-        """Like a bulk :meth:`put`.
-
-        If one of the given items causes an exception to be raised,
-        none of the items is inserted.
-        """
-        if items:
-            self._update(False, on_dup, items)
-
-
-# * Code review nav *
-#==============================================================================
-# ← Prev: _frozenbidict.py         Current: _mut.py          Next: _bidict.py →
-#==============================================================================
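Aside (not part of the diff): the pop/popitem behavior deleted here survives unchanged in the merged _bidict.py above; both directions stay in sync on removal:

from bidict import bidict

b = bidict({'H': 'hydrogen'})
assert b.pop('H') == 'hydrogen'    # removed from both b and b.inverse
assert b.pop('H', None) is None    # default instead of KeyError
b['He'] = 'helium'
assert b.popitem() == ('He', 'helium') and not b.inverse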
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,11 +6,19 @@
 
 """Provide :func:`bidict.namedbidict`."""
 
-import typing as _t
+import typing as t
 from sys import _getframe
 
-from ._abc import BidirectionalMapping, KT, VT
+from ._base import BidictBase
 from ._bidict import bidict
+from ._typing import KT, VT
+
+
+# pyright: reportPrivateUsage=false, reportUnnecessaryIsInstance=false
+
+
+class NamedBidictBase:
+    """Base class that namedbidicts derive from."""
 
 
 def namedbidict(
@@ -19,8 +26,8 @@ def namedbidict(
     keyname: str,
     valname: str,
     *,
-    base_type: _t.Type[BidirectionalMapping[KT, VT]] = bidict,
-) -> _t.Type[BidirectionalMapping[KT, VT]]:
+    base_type: t.Type[BidictBase[KT, VT]] = bidict,
+) -> t.Type[BidictBase[KT, VT]]:
     r"""Create a new subclass of *base_type* with custom accessors.
 
     Like :func:`collections.namedtuple` for bidicts.
@@ -36,64 +43,57 @@ def namedbidict(
 
     *See also* the :ref:`namedbidict usage documentation
    <other-bidict-types:\:func\:\`~bidict.namedbidict\`>`
+    (https://bidict.rtfd.io/other-bidict-types.html#namedbidict)
 
     :raises ValueError: if any of the *typename*, *keyname*, or *valname*
         strings is not a valid Python identifier, or if *keyname == valname*.
 
-    :raises TypeError: if *base_type* is not a :class:`BidirectionalMapping` subclass
-        that provides ``_isinv`` and :meth:`~object.__getstate__` attributes.
-        (Any :class:`~bidict.BidictBase` subclass can be passed in, including all the
-        concrete bidict types pictured in the :ref:`other-bidict-types:Bidict Types Diagram`.
+    :raises TypeError: if *base_type* is not a :class:`bidict.BidictBase` subclass.
+        Any of the concrete bidict types pictured in the
+        :ref:`other-bidict-types:Bidict Types Diagram` may be provided
+        (https://bidict.rtfd.io/other-bidict-types.html#bidict-types-diagram).
     """
-    if not issubclass(base_type, BidirectionalMapping) or not all(hasattr(base_type, i) for i in ('_isinv', '__getstate__')):
-        raise TypeError(base_type)
+    if not issubclass(base_type, BidictBase):
+        raise TypeError(f'{base_type} is not a BidictBase subclass')
     names = (typename, keyname, valname)
     if not all(map(str.isidentifier, names)) or keyname == valname:
         raise ValueError(names)
 
-    class _Named(base_type):  # type: ignore [valid-type,misc]
-
-        __slots__ = ()
-
-        def _getfwd(self) -> '_Named':
-            return self.inverse if self._isinv else self  # type: ignore [no-any-return]
-
-        def _getinv(self) -> '_Named':
-            return self if self._isinv else self.inverse  # type: ignore [no-any-return]
-
-        @property
-        def _keyname(self) -> str:
-            return valname if self._isinv else keyname
-
-        @property
-        def _valname(self) -> str:
-            return keyname if self._isinv else valname
-
-        def __reduce__(self) -> '_t.Tuple[_t.Callable[[str, str, str, _t.Type[BidirectionalMapping]], BidirectionalMapping], _t.Tuple[str, str, str, _t.Type[BidirectionalMapping]], dict]':
-            return (_make_empty, (typename, keyname, valname, base_type), self.__getstate__())
-
-    bname = base_type.__name__
-    fname = valname + '_for'
-    iname = keyname + '_for'
-    fdoc = f'{typename} forward {bname}: {keyname} → {valname}'
-    idoc = f'{typename} inverse {bname}: {valname} → {keyname}'
-    setattr(_Named, fname, property(_Named._getfwd, doc=fdoc))
-    setattr(_Named, iname, property(_Named._getinv, doc=idoc))
-
-    _Named.__name__ = typename
-    _Named.__qualname__ = typename
-    _Named.__module__ = _getframe(1).f_globals.get('__name__')  # type: ignore [assignment]
-    return _Named
-
-
-def _make_empty(
-    typename: str,
-    keyname: str,
-    valname: str,
-    base_type: _t.Type[BidirectionalMapping] = bidict,
-) -> BidirectionalMapping:
-    """Create a named bidict with the indicated arguments and return an empty instance.
-    Used to make :func:`bidict.namedbidict` instances picklable.
-    """
-    cls = namedbidict(typename, keyname, valname, base_type=base_type)
-    return cls()
+    basename = base_type.__name__
+    get_keyname = property(lambda self: keyname, doc='The keyname of this namedbidict.')
+    get_valname = property(lambda self: valname, doc='The valname of this namedbidict.')
+    val_by_key_name = f'{valname}_for'
+    key_by_val_name = f'{keyname}_for'
+    val_by_key_doc = f'{typename} forward {basename}: {keyname} -> {valname}'
+    key_by_val_doc = f'{typename} inverse {basename}: {valname} -> {keyname}'
+    get_val_by_key = property(lambda self: self, doc=val_by_key_doc)
+    get_key_by_val = property(lambda self: self.inverse, doc=key_by_val_doc)
+
+    class NamedBidict(base_type, NamedBidictBase):  # type: ignore [valid-type,misc]  # https://github.com/python/mypy/issues/5865
+        """NamedBidict."""
+
+        keyname = get_keyname
+        valname = get_valname
+
+        @classmethod
+        def _inv_cls_dict_diff(cls) -> t.Dict[str, t.Any]:
+            base_diff = super()._inv_cls_dict_diff()
+            return {
+                **base_diff,
+                'keyname': get_valname,
+                'valname': get_keyname,
+                val_by_key_name: get_key_by_val,
+                key_by_val_name: get_val_by_key,
+            }
+
+    NamedInv = NamedBidict._inv_cls
+    assert NamedInv is not NamedBidict, 'namedbidict classes are not their own inverses'
+    setattr(NamedBidict, val_by_key_name, get_val_by_key)
+    setattr(NamedBidict, key_by_val_name, get_key_by_val)
+    NamedBidict.__name__ = NamedBidict.__qualname__ = typename
+    NamedInv.__name__ = NamedInv.__qualname__ = f'{typename}Inv'
+    NamedBidict.__doc__ = f'NamedBidict({basename}) {typename!r}: {keyname} -> {valname}'
+    NamedInv.__doc__ = f'NamedBidictInv({basename}) {typename!r}: {valname} -> {keyname}'
+    caller_module = _getframe(1).f_globals.get('__name__', '__main__')
+    NamedBidict.__module__ = NamedInv.__module__ = caller_module
+    return NamedBidict  # pyright: ignore [reportUnknownVariableType]
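Aside (not part of the diff): the generated accessors in action, following the library's documented usage of namedbidict:

from bidict import namedbidict

ElementMap = namedbidict('ElementMap', 'symbol', 'name')
noble = ElementMap(He='helium')
assert noble.name_for['He'] == 'helium'     # forward accessor: symbol -> name
assert noble.symbol_for['helium'] == 'He'   # inverse accessor: name -> symbol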
@@ -1,26 +1,12 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-#==============================================================================
-# * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
 # * Code review nav *
+# (see comments in __init__.py)
 #==============================================================================
 # ← Prev: _bidict.py  Current: _orderedbase.py  Next: _frozenordered.py →
 #==============================================================================
@@ -28,126 +14,109 @@
 
 """Provide :class:`OrderedBidictBase`."""
 
-import typing as _t
-from copy import copy
-from weakref import ref
+import typing as t
+from functools import partial
+from weakref import ref as weakref
 
-from ._abc import MutableBidirectionalMapping
-from ._base import _NONE, _DedupResult, _WriteResult, BidictBase, BT
+from ._base import BidictBase, PreparedWrite
 from ._bidict import bidict
-from ._typing import KT, VT, OKT, OVT, IterItems, MapOrIterItems
+from ._iter import iteritems
+from ._typing import KT, VT, OKT, OVT, MISSING, IterItems, MapOrIterItems
 
 
-class _Node:
+IT = t.TypeVar('IT')  # instance type
+AT = t.TypeVar('AT')  # attr type
+
+
+class WeakAttr(t.Generic[IT, AT]):
+    """Descriptor to automatically manage (de)referencing the given slot as a weakref.
+
+    See https://docs.python.org/3/howto/descriptor.html#managed-attributes
+    for an intro to using descriptors like this for managed attributes.
+    """
+
+    def __init__(self, *, slot: str) -> None:
+        self.slot = slot
+
+    def __set__(self, instance: IT, value: AT) -> None:
+        setattr(instance, self.slot, weakref(value))
+
+    def __get__(self, instance: IT, owner: t.Any) -> AT:
+        return getattr(instance, self.slot)()  # type: ignore [no-any-return]
+
+
+class Node:
     """A node in a circular doubly-linked list
     used to encode the order of items in an ordered bidict.
 
-    Only weak references to the next and previous nodes
-    are held to avoid creating strong reference cycles.
-
-    Because an ordered bidict retains two strong references
-    to each node instance (one from its backing `_fwdm` mapping
-    and one from its `_invm` mapping), a node's refcount will not
-    drop to zero (and so will not be garbage collected) as long as
-    the ordered bidict that contains it is still alive.
-    Because nodes don't have strong reference cycles,
-    once their containing bidict is freed,
-    they too are immediately freed.
+    A weak reference to the previous node is stored
+    to avoid creating strong reference cycles.
+    Referencing/dereferencing the weakref is handled automatically by :class:`WeakAttr`.
     """
 
-    __slots__ = ('_prv', '_nxt', '__weakref__')
-
-    def __init__(self, prv: '_Node' = None, nxt: '_Node' = None) -> None:
-        self._setprv(prv)
-        self._setnxt(nxt)
-
-    def __repr__(self) -> str:
-        clsname = self.__class__.__name__
-        prv = id(self.prv)
-        nxt = id(self.nxt)
-        return f'{clsname}(prv={prv}, self={id(self)}, nxt={nxt})'
-
-    def _getprv(self) -> '_t.Optional[_Node]':
-        return self._prv() if isinstance(self._prv, ref) else self._prv
-
-    def _setprv(self, prv: '_t.Optional[_Node]') -> None:
-        self._prv = prv and ref(prv)
-
-    prv = property(_getprv, _setprv)
-
-    def _getnxt(self) -> '_t.Optional[_Node]':
-        return self._nxt() if isinstance(self._nxt, ref) else self._nxt
-
-    def _setnxt(self, nxt: '_t.Optional[_Node]') -> None:
-        self._nxt = nxt and ref(nxt)
-
-    nxt = property(_getnxt, _setnxt)
-
-    def __getstate__(self) -> dict:
-        """Return the instance state dictionary
-        but with weakrefs converted to strong refs
-        so that it can be pickled.
-
-        *See also* :meth:`object.__getstate__`
-        """
-        return dict(_prv=self.prv, _nxt=self.nxt)
-
-    def __setstate__(self, state: dict) -> None:
-        """Set the instance state from *state*."""
-        self._setprv(state['_prv'])
-        self._setnxt(state['_nxt'])
+    prv: 'WeakAttr[Node, Node]' = WeakAttr(slot='_prv_weak')
+    __slots__ = ('_prv_weak', 'nxt', '__weakref__')
+
+    def __init__(self, prv: 'Node', nxt: 'Node') -> None:
+        self.prv = prv
+        self.nxt = nxt
+
+    def unlink(self) -> None:
+        """Remove self from in between prv and nxt.
+        Self's references to prv and nxt are retained so it can be relinked (see below).
+        """
+        self.prv.nxt = self.nxt
+        self.nxt.prv = self.prv
+
+    def relink(self) -> None:
+        """Restore self between prv and nxt after unlinking (see above)."""
+        self.prv.nxt = self.nxt.prv = self
class _SentinelNode(_Node):
|
class SentinelNode(Node):
|
||||||
"""Special node in a circular doubly-linked list
|
"""Special node in a circular doubly-linked list
|
||||||
that links the first node with the last node.
|
that links the first node with the last node.
|
||||||
When its next and previous references point back to itself
|
When its next and previous references point back to itself
|
||||||
it represents an empty list.
|
it represents an empty list.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__slots__ = ()
|
nxt: WeakAttr['SentinelNode', Node] = WeakAttr(slot='_nxt_weak') # type: ignore [assignment]
|
||||||
|
__slots__ = ('_nxt_weak',)
|
||||||
|
|
||||||
def __init__(self, prv: _Node = None, nxt: _Node = None) -> None:
|
def __init__(self) -> None:
|
||||||
super().__init__(prv or self, nxt or self)
|
super().__init__(self, self)
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def iternodes(self, *, reverse: bool = False) -> t.Iterator[Node]:
|
||||||
return '<SNTL>'
|
"""Iterator yielding nodes in the requested order."""
|
||||||
|
|
||||||
def __bool__(self) -> bool:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _iter(self, *, reverse: bool = False) -> _t.Iterator[_Node]:
|
|
||||||
"""Iterator yielding nodes in the requested order,
|
|
||||||
i.e. traverse the linked list via :attr:`nxt`
|
|
||||||
(or :attr:`prv` if *reverse* is truthy)
|
|
||||||
until reaching a falsy (i.e. sentinel) node.
|
|
||||||
"""
|
|
||||||
attr = 'prv' if reverse else 'nxt'
|
attr = 'prv' if reverse else 'nxt'
|
||||||
node = getattr(self, attr)
|
node = getattr(self, attr)
|
||||||
while node:
|
while node is not self:
|
||||||
yield node
|
yield node
|
||||||
node = getattr(node, attr)
|
node = getattr(node, attr)
|
||||||
|
|
||||||
|
def new_last_node(self) -> Node:
|
||||||
|
"""Create and return a new terminal node."""
|
||||||
|
old_last = self.prv
|
||||||
|
new_last = Node(old_last, self)
|
||||||
|
old_last.nxt = self.prv = new_last
|
||||||
|
return new_last
|
||||||
|
|
||||||
|
|
||||||
class OrderedBidictBase(BidictBase[KT, VT]):
|
class OrderedBidictBase(BidictBase[KT, VT]):
|
||||||
"""Base class implementing an ordered :class:`BidirectionalMapping`."""
|
"""Base class implementing an ordered :class:`BidirectionalMapping`."""
|
||||||
|
|
||||||
__slots__ = ('_sntl',)
|
_repr_delegate: t.ClassVar[t.Any] = list
|
||||||
|
|
||||||
_fwdm_cls: _t.Type[MutableBidirectionalMapping[KT, _Node]] = bidict # type: ignore [assignment]
|
_node_by_korv: bidict[t.Any, Node]
|
||||||
_invm_cls: _t.Type[MutableBidirectionalMapping[VT, _Node]] = bidict # type: ignore [assignment]
|
_bykey: bool
|
||||||
_fwdm: bidict[KT, _Node] # type: ignore [assignment]
|
|
||||||
_invm: bidict[VT, _Node] # type: ignore [assignment]
|
|
||||||
|
|
||||||
#: The object used by :meth:`__repr__` for printing the contained items.
|
@t.overload
|
||||||
_repr_delegate = list
|
def __init__(self, __m: t.Mapping[KT, VT], **kw: VT) -> None: ...
|
||||||
|
@t.overload
|
||||||
@_t.overload
|
def __init__(self, __i: IterItems[KT, VT], **kw: VT) -> None: ...
|
||||||
def __init__(self, __arg: _t.Mapping[KT, VT], **kw: VT) -> None: ...
|
@t.overload
|
||||||
@_t.overload
|
|
||||||
def __init__(self, __arg: IterItems[KT, VT], **kw: VT) -> None: ...
|
|
||||||
@_t.overload
|
|
||||||
def __init__(self, **kw: VT) -> None: ...
|
def __init__(self, **kw: VT) -> None: ...
|
||||||
|
|
||||||
def __init__(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
|
def __init__(self, *args: MapOrIterItems[KT, VT], **kw: VT) -> None:
|
||||||
"""Make a new ordered bidirectional mapping.
|
"""Make a new ordered bidirectional mapping.
|
||||||
The signature behaves like that of :class:`dict`.
|
The signature behaves like that of :class:`dict`.
|
||||||
|
@ -157,147 +126,109 @@ class OrderedBidictBase(BidictBase[KT, VT]):
|
||||||
The order in which items are inserted is remembered,
|
The order in which items are inserted is remembered,
|
||||||
similar to :class:`collections.OrderedDict`.
|
similar to :class:`collections.OrderedDict`.
|
||||||
"""
|
"""
|
||||||
self._sntl = _SentinelNode()
|
self._sntl = SentinelNode()
|
||||||
|
self._node_by_korv = bidict()
|
||||||
# Like unordered bidicts, ordered bidicts also store two backing one-directional mappings
|
self._bykey = True
|
||||||
# `_fwdm` and `_invm`. But rather than mapping `key` to `val` and `val` to `key`
|
|
||||||
# (respectively), they map `key` to `nodefwd` and `val` to `nodeinv` (respectively), where
|
|
||||||
# `nodefwd` is `nodeinv` when `key` and `val` are associated with one another.
|
|
||||||
|
|
||||||
# To effect this difference, `_write_item` and `_undo_write` are overridden. But much of the
|
|
||||||
# rest of BidictBase's implementation, including BidictBase.__init__ and BidictBase._update,
|
|
||||||
# are inherited and are able to be reused without modification.
|
|
||||||
super().__init__(*args, **kw)
|
super().__init__(*args, **kw)
|
||||||
|
|
||||||
if _t.TYPE_CHECKING:
|
if t.TYPE_CHECKING:
|
||||||
@property
|
@property
|
||||||
def inverse(self) -> 'OrderedBidictBase[VT, KT]': ...
|
def inverse(self) -> 'OrderedBidictBase[VT, KT]': ...
|
||||||
|
|
||||||
def _init_inv(self) -> None:
|
def _make_inverse(self) -> 'OrderedBidictBase[VT, KT]':
|
||||||
super()._init_inv()
|
inv = t.cast(OrderedBidictBase[VT, KT], super()._make_inverse())
|
||||||
self.inverse._sntl = self._sntl
|
inv._sntl = self._sntl
|
||||||
|
inv._node_by_korv = self._node_by_korv
|
||||||
|
inv._bykey = not self._bykey
|
||||||
|
return inv
|
||||||
|
|
||||||
# Can't reuse BidictBase.copy since ordered bidicts have different internal structure.
|
def _assoc_node(self, node: Node, key: KT, val: VT) -> None:
|
||||||
def copy(self: BT) -> BT:
|
korv = key if self._bykey else val
|
||||||
"""A shallow copy of this ordered bidict."""
|
self._node_by_korv.forceput(korv, node)
|
||||||
# Fast copy implementation bypassing __init__. See comments in :meth:`BidictBase.copy`.
|
|
||||||
cp: BT = self.__class__.__new__(self.__class__)
|
|
||||||
sntl = _SentinelNode()
|
|
||||||
fwdm = copy(self._fwdm)
|
|
||||||
invm = copy(self._invm)
|
|
||||||
cur = sntl
|
|
||||||
nxt = sntl.nxt
|
|
||||||
for (key, val) in self.items():
|
|
||||||
nxt = _Node(cur, sntl)
|
|
||||||
cur.nxt = fwdm[key] = invm[val] = nxt
|
|
||||||
cur = nxt
|
|
||||||
sntl.prv = nxt
|
|
||||||
cp._sntl = sntl # type: ignore [attr-defined]
|
|
||||||
cp._fwdm = fwdm
|
|
||||||
cp._invm = invm
|
|
||||||
cp._init_inv()
|
|
||||||
return cp
|
|
||||||
|
|
||||||
__copy__ = copy
|
def _dissoc_node(self, node: Node) -> None:
|
||||||
|
del self._node_by_korv.inverse[node]
|
||||||
|
node.unlink()
|
||||||
|
|
||||||
def __getitem__(self, key: KT) -> VT:
|
def _init_from(self, other: MapOrIterItems[KT, VT]) -> None:
|
||||||
nodefwd = self._fwdm[key]
|
"""See :meth:`BidictBase._init_from`."""
|
||||||
val = self._invm.inverse[nodefwd]
|
super()._init_from(other)
|
||||||
return val
|
bykey = self._bykey
|
||||||
|
korv_by_node = self._node_by_korv.inverse
|
||||||
|
korv_by_node.clear()
|
||||||
|
korv_by_node_set = korv_by_node.__setitem__
|
||||||
|
self._sntl.nxt = self._sntl.prv = self._sntl
|
||||||
|
new_node = self._sntl.new_last_node
|
||||||
|
for (k, v) in iteritems(other):
|
||||||
|
korv_by_node_set(new_node(), k if bykey else v)
|
||||||
|
|
||||||
def _pop(self, key: KT) -> VT:
|
def _prep_write(self, newkey: KT, newval: VT, oldkey: OKT[KT], oldval: OVT[VT], save_unwrite: bool) -> PreparedWrite:
|
||||||
nodefwd = self._fwdm.pop(key)
|
"""See :meth:`bidict.BidictBase._prep_write`."""
|
||||||
val = self._invm.inverse.pop(nodefwd)
|
write, unwrite = super()._prep_write(newkey, newval, oldkey, oldval, save_unwrite)
|
||||||
nodefwd.prv.nxt = nodefwd.nxt
|
assoc, dissoc = self._assoc_node, self._dissoc_node
|
||||||
nodefwd.nxt.prv = nodefwd.prv
|
node_by_korv, bykey = self._node_by_korv, self._bykey
|
||||||
return val
|
if oldval is MISSING and oldkey is MISSING: # no key or value duplication
|
||||||
|
# {0: 1, 2: 3} + (4, 5) => {0: 1, 2: 3, 4: 5}
|
||||||
|
newnode = self._sntl.new_last_node()
|
||||||
|
write.append(partial(assoc, newnode, newkey, newval))
|
||||||
|
if save_unwrite:
|
||||||
|
unwrite.append(partial(dissoc, newnode))
|
||||||
|
elif oldval is not MISSING and oldkey is not MISSING: # key and value duplication across two different items
|
||||||
|
# {0: 1, 2: 3} + (0, 3) => {0: 3}
|
||||||
|
# n1, n2 => n1 (collapse n1 and n2 into n1)
|
||||||
|
# oldkey: 2, oldval: 1, oldnode: n2, newkey: 0, newval: 3, newnode: n1
|
||||||
|
if bykey:
|
||||||
|
oldnode = node_by_korv[oldkey]
|
||||||
|
newnode = node_by_korv[newkey]
|
||||||
|
else:
|
||||||
|
oldnode = node_by_korv[newval]
|
||||||
|
newnode = node_by_korv[oldval]
|
||||||
|
write.extend((
|
||||||
|
partial(dissoc, oldnode),
|
||||||
|
partial(assoc, newnode, newkey, newval),
|
||||||
|
))
|
||||||
|
if save_unwrite:
|
||||||
|
unwrite.extend((
|
||||||
|
partial(assoc, newnode, newkey, oldval),
|
||||||
|
partial(assoc, oldnode, oldkey, newval),
|
||||||
|
partial(oldnode.relink,),
|
||||||
|
))
|
||||||
|
elif oldval is not MISSING: # just key duplication
|
||||||
|
# {0: 1, 2: 3} + (2, 4) => {0: 1, 2: 4}
|
||||||
|
# oldkey: MISSING, oldval: 3, newkey: 2, newval: 4
|
||||||
|
node = node_by_korv[newkey if bykey else oldval]
|
||||||
|
write.append(partial(assoc, node, newkey, newval))
|
||||||
|
if save_unwrite:
|
||||||
|
unwrite.append(partial(assoc, node, newkey, oldval))
|
||||||
|
else:
|
||||||
|
assert oldkey is not MISSING # just value duplication
|
||||||
|
# {0: 1, 2: 3} + (4, 3) => {0: 1, 4: 3}
|
||||||
|
# oldkey: 2, oldval: MISSING, newkey: 4, newval: 3
|
||||||
|
node = node_by_korv[oldkey if bykey else newval]
|
||||||
|
write.append(partial(assoc, node, newkey, newval))
|
||||||
|
if save_unwrite:
|
||||||
|
unwrite.append(partial(assoc, node, oldkey, newval))
|
||||||
|
return write, unwrite
|
||||||
|
|
||||||
@staticmethod
|
def __iter__(self) -> t.Iterator[KT]:
|
||||||
def _already_have(key: KT, val: VT, nodeinv: _Node, nodefwd: _Node) -> bool: # type: ignore [override]
|
|
||||||
# Overrides _base.BidictBase.
|
|
||||||
return nodeinv is nodefwd
|
|
||||||
|
|
||||||
def _write_item(self, key: KT, val: VT, dedup_result: _DedupResult) -> _WriteResult:
|
|
||||||
# Overrides _base.BidictBase.
|
|
||||||
fwdm = self._fwdm # bidict mapping keys to nodes
|
|
||||||
invm = self._invm # bidict mapping vals to nodes
|
|
||||||
isdupkey, isdupval, nodeinv, nodefwd = dedup_result
|
|
||||||
if not isdupkey and not isdupval:
|
|
||||||
# No key or value duplication -> create and append a new node.
|
|
||||||
sntl = self._sntl
|
|
||||||
last = sntl.prv
|
|
||||||
node = _Node(last, sntl)
|
|
||||||
last.nxt = sntl.prv = fwdm[key] = invm[val] = node
|
|
||||||
oldkey: OKT = _NONE
|
|
||||||
oldval: OVT = _NONE
|
|
||||||
elif isdupkey and isdupval:
|
|
||||||
# Key and value duplication across two different nodes.
|
|
||||||
assert nodefwd is not nodeinv
|
|
||||||
oldval = invm.inverse[nodefwd]
|
|
||||||
oldkey = fwdm.inverse[nodeinv]
|
|
||||||
assert oldkey != key
|
|
||||||
assert oldval != val
|
|
||||||
# We have to collapse nodefwd and nodeinv into a single node, i.e. drop one of them.
|
|
||||||
# Drop nodeinv, so that the item with the same key is the one overwritten in place.
|
|
||||||
nodeinv.prv.nxt = nodeinv.nxt
|
|
||||||
nodeinv.nxt.prv = nodeinv.prv
|
|
||||||
# Don't remove nodeinv's references to its neighbors since
|
|
||||||
# if the update fails, we'll need them to undo this write.
|
|
||||||
# Update fwdm and invm.
|
|
||||||
tmp = fwdm.pop(oldkey)
|
|
||||||
assert tmp is nodeinv
|
|
||||||
tmp = invm.pop(oldval)
|
|
||||||
assert tmp is nodefwd
|
|
||||||
fwdm[key] = invm[val] = nodefwd
|
|
||||||
elif isdupkey:
|
|
||||||
oldval = invm.inverse[nodefwd]
|
|
||||||
oldkey = _NONE
|
|
||||||
oldnodeinv = invm.pop(oldval)
|
|
||||||
assert oldnodeinv is nodefwd
|
|
||||||
invm[val] = nodefwd
|
|
||||||
else: # isdupval
|
|
||||||
oldkey = fwdm.inverse[nodeinv]
|
|
||||||
oldval = _NONE
|
|
||||||
oldnodefwd = fwdm.pop(oldkey)
|
|
||||||
assert oldnodefwd is nodeinv
|
|
||||||
fwdm[key] = nodeinv
|
|
||||||
return _WriteResult(key, val, oldkey, oldval)
|
|
||||||
|
|
||||||
def _undo_write(self, dedup_result: _DedupResult, write_result: _WriteResult) -> None:
|
|
||||||
fwdm = self._fwdm
|
|
||||||
invm = self._invm
|
|
||||||
isdupkey, isdupval, nodeinv, nodefwd = dedup_result
|
|
||||||
key, val, oldkey, oldval = write_result
|
|
||||||
if not isdupkey and not isdupval:
|
|
||||||
self._pop(key)
|
|
||||||
elif isdupkey and isdupval:
|
|
||||||
# Restore original items.
|
|
||||||
nodeinv.prv.nxt = nodeinv.nxt.prv = nodeinv
|
|
||||||
fwdm[oldkey] = invm[val] = nodeinv
|
|
||||||
invm[oldval] = fwdm[key] = nodefwd
|
|
||||||
elif isdupkey:
|
|
||||||
tmp = invm.pop(val)
|
|
||||||
assert tmp is nodefwd
|
|
||||||
invm[oldval] = nodefwd
|
|
||||||
assert fwdm[key] is nodefwd
|
|
||||||
else: # isdupval
|
|
||||||
tmp = fwdm.pop(key)
|
|
||||||
assert tmp is nodeinv
|
|
||||||
fwdm[oldkey] = nodeinv
|
|
||||||
assert invm[val] is nodeinv
|
|
||||||
|
|
||||||
def __iter__(self) -> _t.Iterator[KT]:
|
|
||||||
"""Iterator over the contained keys in insertion order."""
|
"""Iterator over the contained keys in insertion order."""
|
||||||
return self._iter()
|
return self._iter(reverse=False)
|
||||||
|
|
||||||
def _iter(self, *, reverse: bool = False) -> _t.Iterator[KT]:
|
def __reversed__(self: 'OrderedBidictBase[KT, VT]') -> t.Iterator[KT]:
|
||||||
fwdm_inv = self._fwdm.inverse
|
|
||||||
for node in self._sntl._iter(reverse=reverse):
|
|
||||||
yield fwdm_inv[node]
|
|
||||||
|
|
||||||
def __reversed__(self) -> _t.Iterator[KT]:
|
|
||||||
"""Iterator over the contained keys in reverse insertion order."""
|
"""Iterator over the contained keys in reverse insertion order."""
|
||||||
yield from self._iter(reverse=True)
|
return self._iter(reverse=True)
|
||||||
|
|
||||||
|
def _iter(self, *, reverse: bool = False) -> t.Iterator[KT]:
|
||||||
|
nodes = self._sntl.iternodes(reverse=reverse)
|
||||||
|
korv_by_node = self._node_by_korv.inverse
|
||||||
|
if self._bykey:
|
||||||
|
for node in nodes:
|
||||||
|
yield korv_by_node[node]
|
||||||
|
else:
|
||||||
|
key_by_val = self._invm
|
||||||
|
for node in nodes:
|
||||||
|
val = korv_by_node[node]
|
||||||
|
yield key_by_val[val]
|
||||||
|
|
||||||
|
|
||||||
# * Code review nav *
|
# * Code review nav *
|
||||||
|
|
|
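Note on the rewritten node classes above: order is encoded in a circular doubly-linked list whose backward links (and the sentinel's links) are weakrefs, so dropping an ordered bidict frees its nodes by refcount alone, without relying on cycle collection. A minimal self-contained sketch of the same pattern, using toy names (WeakSlot, ToyNode, ToySentinel) rather than the vendored API:

    import weakref

    class WeakSlot:
        """Descriptor that stores a weakref in a slot and derefs on access."""
        def __init__(self, slot):
            self.slot = slot
        def __set__(self, instance, value):
            setattr(instance, self.slot, weakref.ref(value))
        def __get__(self, instance, owner):
            return getattr(instance, self.slot)()  # None once the target is dead

    class ToyNode:
        prv = WeakSlot('_prv_weak')                      # weak link backward
        __slots__ = ('_prv_weak', 'nxt', '__weakref__')  # strong link forward
        def __init__(self, prv, nxt):
            self.prv = prv
            self.nxt = nxt

    class ToySentinel(ToyNode):
        nxt = WeakSlot('_nxt_weak')  # the sentinel holds both links weakly
        __slots__ = ('_nxt_weak',)
        def __init__(self):
            super().__init__(self, self)

    sntl = ToySentinel()
    node = ToyNode(sntl, sntl)   # kept alive only by this variable...
    sntl.prv = sntl.nxt = node
    del node                     # ...so on CPython it is freed right here,
    assert sntl.nxt is None      # despite the circular list structure.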
bidict/_orderedbidict.py
@@ -1,26 +1,12 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.


-#==============================================================================
-#                    * Welcome to the bidict source code *
-#==============================================================================
-
-# Doing a code review? You'll find a "Code review nav" comment like the one
-# below at the top and bottom of the most important source files. This provides
-# a suggested initial path through the source when reviewing.
-#
-# Note: If you aren't reading this on https://github.com/jab/bidict, you may be
-# viewing an outdated version of the code. Please head to GitHub to review the
-# latest version, which contains important improvements over older versions.
-#
-# Thank you for reading and for any feedback you provide.
-
 #                             * Code review nav *
+#                        (see comments in __init__.py)
 #==============================================================================
 # ← Prev: _frozenordered.py   Current: _orderedbidict.py                 <FIN>
 #==============================================================================
@@ -28,52 +14,61 @@

 """Provide :class:`OrderedBidict`."""

-import typing as _t
+from collections.abc import Set
+import typing as t

-from ._mut import MutableBidict
+from ._base import BidictKeysView
+from ._bidict import MutableBidict
 from ._orderedbase import OrderedBidictBase
 from ._typing import KT, VT


+# pyright: reportPrivateUsage=false
+
+
 class OrderedBidict(OrderedBidictBase[KT, VT], MutableBidict[KT, VT]):
     """Mutable bidict type that maintains items in insertion order."""

-    __slots__ = ()
-
-    if _t.TYPE_CHECKING:
+    if t.TYPE_CHECKING:
         @property
         def inverse(self) -> 'OrderedBidict[VT, KT]': ...

     def clear(self) -> None:
         """Remove all items."""
-        self._fwdm.clear()
-        self._invm.clear()
+        super().clear()
+        self._node_by_korv.clear()
         self._sntl.nxt = self._sntl.prv = self._sntl

-    def popitem(self, last: bool = True) -> _t.Tuple[KT, VT]:
-        """*x.popitem() → (k, v)*
+    def _pop(self, key: KT) -> VT:
+        val = super()._pop(key)
+        node = self._node_by_korv[key if self._bykey else val]
+        self._dissoc_node(node)
+        return val

-        Remove and return the most recently added item as a (key, value) pair
-        if *last* is True, else the least recently added item.
+    def popitem(self, last: bool = True) -> t.Tuple[KT, VT]:
+        """*b.popitem() → (k, v)*

-        :raises KeyError: if *x* is empty.
+        If *last* is true,
+        remove and return the most recently added item as a (key, value) pair.
+        Otherwise, remove and return the least recently added item.
+
+        :raises KeyError: if *b* is empty.
         """
         if not self:
-            raise KeyError('mapping is empty')
-        itfn: _t.Callable = reversed if last else iter  # type: ignore [assignment]
-        it = itfn(self)
-        key = next(it)
-        val = self._pop(key)
-        return key, val
+            raise KeyError('OrderedBidict is empty')
+        node = getattr(self._sntl, 'prv' if last else 'nxt')
+        korv = self._node_by_korv.inverse[node]
+        if self._bykey:
+            return korv, self._pop(korv)
+        return self.inverse._pop(korv), korv  # pyright: ignore [reportGeneralTypeIssues]

     def move_to_end(self, key: KT, last: bool = True) -> None:
-        """Move an existing key to the beginning or end of this ordered bidict.
+        """Move the item with the given key to the end if *last* is true, else to the beginning.

-        The item is moved to the end if *last* is True, else to the beginning.
+        :raises KeyError: if *key* is missing

-        :raises KeyError: if the key does not exist
         """
-        node = self._fwdm[key]
+        korv = key if self._bykey else self._fwdm[key]
+        node = self._node_by_korv[korv]
         node.prv.nxt = node.nxt
         node.nxt.prv = node.prv
         sntl = self._sntl
@@ -88,6 +83,77 @@ class OrderedBidict(OrderedBidictBase[KT, VT], MutableBidict[KT, VT]):
             node.nxt = firstnode
             sntl.nxt = firstnode.prv = node

+    # Override the keys() and items() implementations inherited from BidictBase,
+    # which may delegate to the backing _fwdm dict, since this is a mutable ordered bidict,
+    # and therefore the ordering of items can get out of sync with the backing mappings
+    # after mutation. (Need not override values() because it delegates to .inverse.keys().)
+    def keys(self) -> t.KeysView[KT]:
+        """A set-like object providing a view on the contained keys."""
+        return _OrderedBidictKeysView(self)
+
+    def items(self) -> t.ItemsView[KT, VT]:
+        """A set-like object providing a view on the contained items."""
+        return _OrderedBidictItemsView(self)
+
+
+# The following MappingView implementations use the __iter__ implementations
+# inherited from their superclass counterparts in collections.abc, so they
+# continue to yield items in the correct order even after an OrderedBidict
+# is mutated. They also provide a __reversed__ implementation, which is not
+# provided by the collections.abc superclasses.
+class _OrderedBidictKeysView(BidictKeysView[KT]):
+    _mapping: OrderedBidict[KT, t.Any]
+
+    def __reversed__(self) -> t.Iterator[KT]:
+        return reversed(self._mapping)
+
+
+class _OrderedBidictItemsView(t.ItemsView[KT, VT]):
+    _mapping: OrderedBidict[KT, VT]
+
+    def __reversed__(self) -> t.Iterator[t.Tuple[KT, VT]]:
+        ob = self._mapping
+        for key in reversed(ob):
+            yield key, ob[key]
+
+
+# For better performance, make _OrderedBidictKeysView and _OrderedBidictItemsView delegate
+# to backing dicts for the methods they inherit from collections.abc.Set. (Cannot delegate
+# for __iter__ and __reversed__ since they are order-sensitive.) See also: https://bugs.python.org/issue46713
+def _override_set_methods_to_use_backing_dict(
+    cls: t.Union[t.Type[_OrderedBidictKeysView[KT]], t.Type[_OrderedBidictItemsView[KT, t.Any]]],
+    viewname: str,
+    _setmethodnames: t.Iterable[str] = (
+        '__lt__', '__le__', '__gt__', '__ge__', '__eq__', '__ne__', '__sub__', '__rsub__',
+        '__or__', '__ror__', '__xor__', '__rxor__', '__and__', '__rand__', 'isdisjoint',
+    )
+) -> None:
+    def make_proxy_method(methodname: str) -> t.Any:
+        def method(self: t.Union[_OrderedBidictKeysView[KT], _OrderedBidictItemsView[KT, t.Any]], *args: t.Any) -> t.Any:
+            fwdm = self._mapping._fwdm
+            if not isinstance(fwdm, dict):  # dict view speedup not available, fall back to Set's implementation.
+                return getattr(Set, methodname)(self, *args)
+            fwdm_dict_view = getattr(fwdm, viewname)()
+            fwdm_dict_view_method = getattr(fwdm_dict_view, methodname)
+            if len(args) != 1 or not isinstance(args[0], self.__class__) or not isinstance(args[0]._mapping._fwdm, dict):
+                return fwdm_dict_view_method(*args)
+            # self and arg are both _OrderedBidictKeysViews or _OrderedBidictItemsViews whose bidicts are backed by a dict.
+            # Use arg's backing dict's corresponding view instead of arg. Otherwise, e.g. `ob1.keys() < ob2.keys()` would give
+            # "TypeError: '<' not supported between instances of '_OrderedBidictKeysView' and '_OrderedBidictKeysView'", because
+            # both `dict_keys(ob1).__lt__(ob2.keys()) is NotImplemented` and `dict_keys(ob2).__gt__(ob1.keys()) is NotImplemented`.
+            arg_dict_view = getattr(args[0]._mapping._fwdm, viewname)()
+            return fwdm_dict_view_method(arg_dict_view)
+        method.__name__ = methodname
+        method.__qualname__ = f'{cls.__qualname__}.{methodname}'
+        return method
+
+    for name in _setmethodnames:
+        setattr(cls, name, make_proxy_method(name))
+
+
+_override_set_methods_to_use_backing_dict(_OrderedBidictKeysView, 'keys')
+_override_set_methods_to_use_backing_dict(_OrderedBidictItemsView, 'items')
+

 #                             * Code review nav *
 #==============================================================================
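For reference, the user-facing behavior that the rewritten internals above must preserve; a quick sketch against the bidict 0.22.0 API shown in this diff:

    from bidict import OrderedBidict

    ob = OrderedBidict([('one', 1), ('two', 2), ('three', 3)])
    ob.move_to_end('one')                          # order: two, three, one
    assert list(ob) == ['two', 'three', 'one']
    assert ob.popitem(last=True) == ('one', 1)     # most recently added/moved
    assert ob.popitem(last=False) == ('two', 2)    # least recently added
    assert list(reversed(ob.keys())) == ['three']  # views support reversed()
    assert ob.inverse[3] == 'three'                # the inverse shares ordering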
bidict/_typing.py
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,26 +7,28 @@

 """Provide typing-related objects."""

-import typing as _t
+import typing as t
+from enum import Enum


-KT = _t.TypeVar('KT')
-VT = _t.TypeVar('VT')
-IterItems = _t.Iterable[_t.Tuple[KT, VT]]
-MapOrIterItems = _t.Union[_t.Mapping[KT, VT], IterItems[KT, VT]]
+KT = t.TypeVar('KT')
+VT = t.TypeVar('VT')
+IterItems = t.Iterable[t.Tuple[KT, VT]]
+MapOrIterItems = t.Union[t.Mapping[KT, VT], IterItems[KT, VT]]

-DT = _t.TypeVar('DT')  #: for default arguments
-VDT = _t.Union[VT, DT]

-
-class _BareReprMeta(type):
-    def __repr__(cls) -> str:
-        return f'<{cls.__name__}>'
+class MissingT(Enum):
+    """Sentinel used to represent none/missing when None itself can't be used."""
+
+    MISSING = 'MISSING'
+
+    def __repr__(self) -> str:
+        return '<MISSING>'


-class _NONE(metaclass=_BareReprMeta):
-    """Sentinel type used to represent 'missing'."""
+MISSING = MissingT.MISSING
+OKT = t.Union[KT, MissingT]  #: optional key type
+OVT = t.Union[VT, MissingT]  #: optional value type

-OKT = _t.Union[KT, _NONE]  #: optional key type
-OVT = _t.Union[VT, _NONE]  #: optional value type
+DT = t.TypeVar('DT')  #: for default arguments
+ODT = t.Union[DT, MissingT]
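The switch from the `_NONE` class to an `Enum`-based sentinel is the standard trick for a typed singleton: it gets its own static type (`MissingT`), round-trips through pickle by identity, and stays distinguishable from a legitimately stored `None`. A small demonstration of the pattern, with a toy `lookup` helper that is not part of bidict:

    import pickle
    import typing as t
    from enum import Enum

    class MissingT(Enum):
        MISSING = 'MISSING'
        def __repr__(self) -> str:
            return '<MISSING>'

    MISSING = MissingT.MISSING

    def lookup(d: t.Dict[str, t.Any], k: str) -> t.Any:
        return d.get(k, MISSING)

    assert lookup({'a': None}, 'a') is None     # None is a real stored value
    assert lookup({'a': None}, 'b') is MISSING  # absence stays distinguishable
    assert pickle.loads(pickle.dumps(MISSING)) is MISSING  # singleton survives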
bidict/metadata.py
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-# Copyright 2009-2021 Joshua Bronson. All Rights Reserved.
+# Copyright 2009-2022 Joshua Bronson. All rights reserved.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,22 +7,23 @@
 """Define bidict package metadata."""


-__version__ = '0.21.4'
+__version__ = '0.22.0'
 __author__ = 'Joshua Bronson'
 __maintainer__ = 'Joshua Bronson'
-__copyright__ = 'Copyright 2009-2021 Joshua Bronson'
+__copyright__ = 'Copyright 2009-2022 Joshua Bronson'
 __email__ = 'jabronson@gmail.com'

-# See: ../docs/thanks.rst
-__credits__ = [i.strip() for i in """
-Joshua Bronson, Michael Arntzenius, Francis Carr, Gregory Ewing, Raymond Hettinger, Jozef Knaperek,
-Daniel Pope, Terry Reedy, David Turner, Tom Viner, Richard Sanger, Zeyi Wang
-""".split(',')]
-
 __description__ = 'The bidirectional mapping library for Python.'
 __keywords__ = 'dict dictionary mapping datastructure bimap bijection bijective ' \
     'injective inverse reverse bidirectional two-way 2-way'

 __license__ = 'MPL 2.0'
 __status__ = 'Beta'
 __url__ = 'https://bidict.readthedocs.io'
+__project_urls__ = {
+    'Donate': 'https://github.com/sponsors/jab',
+    'Documentation': 'https://bidict.readthedocs.io',
+    'Enterprise Support': 'https://bidict.readthedocs.io/#enterprise-support',
+    'Changelog': 'https://bidict.readthedocs.io/changelog.html',
+    'Source Code': 'https://github.com/jab/bidict',
+    'Issue Tracker': 'https://github.com/jab/bidict/issues',
+    'Chat': 'https://gitter.im/jab/bidict',
+}
bidict/py.typed
@@ -0,0 +1 @@
+PEP-561 marker.
bs4/__init__.py
@@ -15,14 +15,13 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.10.0"
-__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson"
+__version__ = "4.11.1"
+__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']


 from collections import Counter
 import os
 import re
@@ -35,7 +34,11 @@ import warnings
 if sys.version_info.major < 3:
     raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')

-from .builder import builder_registry, ParserRejectedMarkup
+from .builder import (
+    builder_registry,
+    ParserRejectedMarkup,
+    XMLParsedAsHTMLWarning,
+)
 from .dammit import UnicodeDammit
 from .element import (
     CData,
@@ -207,10 +210,10 @@ class BeautifulSoup(Tag):
             if old_name in kwargs:
                 warnings.warn(
                     'The "%s" argument to the BeautifulSoup constructor '
-                    'has been renamed to "%s."' % (old_name, new_name))
-                value = kwargs[old_name]
-                del kwargs[old_name]
-                return value
+                    'has been renamed to "%s."' % (old_name, new_name),
+                    DeprecationWarning
+                )
+                return kwargs.pop(old_name)
             return None

         parse_only = parse_only or deprecated_argument(
@@ -305,51 +308,18 @@ class BeautifulSoup(Tag):
         self._namespaces = dict()
         self.parse_only = parse_only

-        self.builder.initialize_soup(self)
-
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         elif len(markup) <= 256 and (
                 (isinstance(markup, bytes) and not b'<' in markup)
                 or (isinstance(markup, str) and not '<' in markup)
         ):
-            # Print out warnings for a couple beginner problems
+            # Issue warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
-            # just in case that's what the user really wants.
-            if (isinstance(markup, str)
-                and not os.path.supports_unicode_filenames):
-                possible_filename = markup.encode("utf8")
-            else:
-                possible_filename = markup
-            is_file = False
-            is_directory = False
-            try:
-                is_file = os.path.exists(possible_filename)
-                if is_file:
-                    is_directory = os.path.isdir(possible_filename)
-            except Exception as e:
-                # This is almost certainly a problem involving
-                # characters not valid in filenames on this
-                # system. Just let it go.
-                pass
-            if is_directory:
-                warnings.warn(
-                    '"%s" looks like a directory name, not markup. You may'
-                    ' want to open a file found in this directory and pass'
-                    ' the filehandle into Beautiful Soup.' % (
-                        self._decode_markup(markup)
-                    ),
-                    MarkupResemblesLocatorWarning
-                )
-            elif is_file:
-                warnings.warn(
-                    '"%s" looks like a filename, not markup. You should'
-                    ' probably open this file and pass the filehandle into'
-                    ' Beautiful Soup.' % self._decode_markup(markup),
-                    MarkupResemblesLocatorWarning
-                )
-            self._check_markup_is_url(markup)
+            # since that is sometimes the intended behavior.
+            if not self._markup_is_url(markup):
+                self._markup_resembles_filename(markup)

         rejections = []
         success = False
@@ -358,6 +328,7 @@ class BeautifulSoup(Tag):
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
             self.reset()
+            self.builder.initialize_soup(self)
             try:
                 self._feed()
                 success = True
@@ -393,7 +364,7 @@ class BeautifulSoup(Tag):
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
-        if 'builder' in d and not self.builder.picklable:
+        if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
             d['builder'] = None
         return d

@@ -411,11 +382,13 @@ class BeautifulSoup(Tag):
         return decoded

     @classmethod
-    def _check_markup_is_url(cls, markup):
+    def _markup_is_url(cls, markup):
         """Error-handling method to raise a warning if incoming markup looks
         like a URL.

         :param markup: A string.
+        :return: Whether or not the markup resembles a URL
+            closely enough to justify a warning.
         """
         if isinstance(markup, bytes):
             space = b' '
@@ -424,19 +397,49 @@ class BeautifulSoup(Tag):
             space = ' '
             cant_start_with = ("http:", "https:")
         else:
-            return
+            return False

         if any(markup.startswith(prefix) for prefix in cant_start_with):
             if not space in markup:
                 warnings.warn(
-                    '"%s" looks like a URL. Beautiful Soup is not an'
-                    ' HTTP client. You should probably use an HTTP client like'
-                    ' requests to get the document behind the URL, and feed'
-                    ' that document to Beautiful Soup.' % cls._decode_markup(
-                        markup
-                    ),
+                    'The input looks more like a URL than markup. You may want to use'
+                    ' an HTTP client like requests to get the document behind'
+                    ' the URL, and feed that document to Beautiful Soup.',
                     MarkupResemblesLocatorWarning
                 )
+                return True
+        return False
+
+    @classmethod
+    def _markup_resembles_filename(cls, markup):
+        """Error-handling method to raise a warning if incoming markup
+        resembles a filename.
+
+        :param markup: A bytestring or string.
+        :return: Whether or not the markup resembles a filename
+            closely enough to justify a warning.
+        """
+        path_characters = '/\\'
+        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
+        if isinstance(markup, bytes):
+            path_characters = path_characters.encode("utf8")
+            extensions = [x.encode('utf8') for x in extensions]
+        filelike = False
+        if any(x in markup for x in path_characters):
+            filelike = True
+        else:
+            lower = markup.lower()
+            if any(lower.endswith(ext) for ext in extensions):
+                filelike = True
+        if filelike:
+            warnings.warn(
+                'The input looks more like a filename than markup. You may'
+                ' want to open this file and pass the filehandle into'
+                ' Beautiful Soup.',
+                MarkupResemblesLocatorWarning
+            )
+            return True
+        return False

     def _feed(self):
         """Internal method that parses previously set markup, creating a large
@@ -689,7 +692,7 @@ class BeautifulSoup(Tag):
         return most_recently_popped

     def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
-                        sourcepos=None):
+                        sourcepos=None, namespaces=None):
         """Called by the tree builder when a new tag is encountered.

         :param name: Name of the tag.
@@ -699,6 +702,8 @@ class BeautifulSoup(Tag):
             source document.
         :param sourcepos: The character position within `sourceline` where this
             tag was found.
+        :param namespaces: A dictionary of all namespace prefix mappings
+            currently in scope in the document.

         If this method returns None, the tag was rejected by an active
         SoupStrainer. You should proceed as if the tag had not occurred
@@ -716,7 +721,8 @@ class BeautifulSoup(Tag):
         tag = self.element_classes.get(Tag, Tag)(
             self, self.builder, name, namespace, nsprefix, attrs,
             self.currentTag, self._most_recent_element,
-            sourceline=sourceline, sourcepos=sourcepos
+            sourceline=sourceline, sourcepos=sourcepos,
+            namespaces=namespaces
         )
         if tag is None:
             return tag
@@ -782,7 +788,9 @@ class BeautifulStoneSoup(BeautifulSoup):
         kwargs['features'] = 'xml'
         warnings.warn(
             'The BeautifulStoneSoup class is deprecated. Instead of using '
-            'it, pass features="xml" into the BeautifulSoup constructor.')
+            'it, pass features="xml" into the BeautifulSoup constructor.',
+            DeprecationWarning
+        )
         super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
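The reworked checks above still parse locator-looking input as markup; they just warn first. A sketch of observing or silencing the warning (assuming `MarkupResemblesLocatorWarning` is importable from the top-level `bs4` package in 4.11.x, as it is defined in this module):

    import warnings
    from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        BeautifulSoup('index.html', 'html.parser')  # ends in .html, so it warns
    assert any(issubclass(w.category, MarkupResemblesLocatorWarning)
               for w in caught)

    # If passing such strings is intentional, silence the warning globally:
    warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)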
bs4/builder/__init__.py
@@ -3,10 +3,14 @@ __license__ = "MIT"

 from collections import defaultdict
 import itertools
+import re
+import warnings
 import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    RubyParenthesisString,
+    RubyTextString,
     Stylesheet,
     Script,
     TemplateString,
@@ -28,6 +32,12 @@ XML = 'xml'
 HTML = 'html'
 HTML_5 = 'html5'

+class XMLParsedAsHTMLWarning(UserWarning):
+    """The warning issued when an HTML parser is used to parse
+    XML that is not XHTML.
+    """
+    MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
+
+
 class TreeBuilderRegistry(object):
     """A way of looking up TreeBuilder subclasses by their name or by desired
@@ -390,17 +400,25 @@ class HTMLTreeBuilder(TreeBuilder):
     # you need to use it.
     block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])

-    # The HTML standard defines an unusual content model for these tags.
-    # We represent this by using a string class other than NavigableString
-    # inside these tags.
+    # These HTML tags need special treatment so they can be
+    # represented by a string class other than NavigableString.
     #
-    # I made this list by going through the HTML spec
+    # For some of these tags, it's because the HTML standard defines
+    # an unusual content model for them. I made this list by going
+    # through the HTML spec
     # (https://html.spec.whatwg.org/#metadata-content) and looking for
     # "metadata content" elements that can contain strings.
     #
+    # The Ruby tags (<rt> and <rp>) are here despite being normal
+    # "phrasing content" tags, because the content they contain is
+    # qualitatively different from other text in the document, and it
+    # can be useful to be able to distinguish it.
+    #
     # TODO: Arguably <noscript> could go here but it seems
     # qualitatively different from the other tags.
     DEFAULT_STRING_CONTAINERS = {
+        'rt' : RubyTextString,
+        'rp' : RubyParenthesisString,
         'style': Stylesheet,
         'script': Script,
         'template': TemplateString,
@@ -475,6 +493,99 @@ class HTMLTreeBuilder(TreeBuilder):

         return (meta_encoding is not None)

+class DetectsXMLParsedAsHTML(object):
+    """A mixin class for any class (a TreeBuilder, or some class used by a
+    TreeBuilder) that's in a position to detect whether an XML
+    document is being incorrectly parsed as HTML, and issue an
+    appropriate warning.
+
+    This requires being able to observe an incoming processing
+    instruction that might be an XML declaration, and also able to
+    observe tags as they're opened. If you can't do that for a given
+    TreeBuilder, there's a less reliable implementation based on
+    examining the raw markup.
+    """
+
+    # Regular expression for seeing if markup has an <html> tag.
+    LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
+    LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
+
+    XML_PREFIX = '<?xml'
+    XML_PREFIX_B = b'<?xml'
+
+    @classmethod
+    def warn_if_markup_looks_like_xml(cls, markup):
+        """Perform a check on some markup to see if it looks like XML
+        that's not XHTML. If so, issue a warning.
+
+        This is much less reliable than doing the check while parsing,
+        but some of the tree builders can't do that.
+
+        :return: True if the markup looks like non-XHTML XML, False
+            otherwise.
+        """
+        if isinstance(markup, bytes):
+            prefix = cls.XML_PREFIX_B
+            looks_like_html = cls.LOOKS_LIKE_HTML_B
+        else:
+            prefix = cls.XML_PREFIX
+            looks_like_html = cls.LOOKS_LIKE_HTML
+
+        if (markup is not None
+            and markup.startswith(prefix)
+            and not looks_like_html.search(markup[:500])
+        ):
+            cls._warn()
+            return True
+        return False
+
+    @classmethod
+    def _warn(cls):
+        """Issue a warning about XML being parsed as HTML."""
+        warnings.warn(
+            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
+        )
+
+    def _initialize_xml_detector(self):
+        """Call this method before parsing a document."""
+        self._first_processing_instruction = None
+        self._root_tag = None
+
+    def _document_might_be_xml(self, processing_instruction):
+        """Call this method when encountering an XML declaration, or a
+        "processing instruction" that might be an XML declaration.
+        """
+        if (self._first_processing_instruction is not None
+            or self._root_tag is not None):
+            # The document has already started. Don't bother checking
+            # anymore.
+            return
+
+        self._first_processing_instruction = processing_instruction
+
+        # We won't know until we encounter the first tag whether or
+        # not this is actually a problem.
+
+    def _root_tag_encountered(self, name):
+        """Call this when you encounter the document's root tag.
+
+        This is where we actually check whether an XML document is
+        being incorrectly parsed as HTML, and issue the warning.
+        """
+        if self._root_tag is not None:
+            # This method was incorrectly called multiple times. Do
+            # nothing.
+            return
+
+        self._root_tag = name
+        if (name != 'html' and self._first_processing_instruction is not None
+            and self._first_processing_instruction.lower().startswith('xml ')):
+            # We encountered an XML declaration and then a tag other
+            # than 'html'. This is a reliable indicator that a
+            # non-XHTML document is being parsed as XML.
+            self._warn()
+
+
 def register_treebuilders_from(module):
     """Copy TreeBuilders from the given module into this module."""
     this_module = sys.modules[__name__]
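The fallback classmethod on the new mixin can be exercised on raw markup alone, without a parser; a sketch against the code added above (bs4 4.11.x):

    import warnings
    from bs4.builder import DetectsXMLParsedAsHTML

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')  # the positive case emits the warning
        # XML declaration, no <html> tag in the first 500 bytes: returns True.
        assert DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
            '<?xml version="1.0"?><catalog><item/></catalog>')
        # No XML declaration at all: no warning, returns False.
        assert not DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
            '<div>plain html fragment</div>')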
bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ __all__ = [
 import warnings
 import re
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     PERMISSIVE,
     HTML,
     HTML_5,
@@ -70,6 +71,11 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         # UnicodeDammit.
         if exclude_encodings:
             warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+
+        # html5lib only parses HTML, so if it's given XML that's worth
+        # noting.
+        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
+
         yield (markup, None, None, False)

     # These methods are defined by Beautiful Soup.
@@ -242,8 +248,8 @@ class AttrList(object):
     def __setitem__(self, name, value):
         # If this attribute is a multi-valued attribute for this element,
         # turn its value into a list.
-        list_attr = self.element.cdata_list_attributes
-        if (name in list_attr['*']
+        list_attr = self.element.cdata_list_attributes or {}
+        if (name in list_attr.get('*')
             or (self.element.name in list_attr
                 and name in list_attr[self.element.name])):
             # A node that is being cloned may have already undergone
@ -44,6 +44,7 @@ from bs4.element import (
|
||||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||||
|
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
|
DetectsXMLParsedAsHTML,
|
||||||
HTML,
|
HTML,
|
||||||
HTMLTreeBuilder,
|
HTMLTreeBuilder,
|
||||||
STRICT,
|
STRICT,
|
||||||
|
@ -52,7 +53,7 @@ from bs4.builder import (
|
||||||
|
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||||
"""A subclass of the Python standard library's HTMLParser class, which
|
"""A subclass of the Python standard library's HTMLParser class, which
|
||||||
listens for HTMLParser events and translates them into calls
|
listens for HTMLParser events and translates them into calls
|
||||||
to Beautiful Soup's tree construction API.
|
to Beautiful Soup's tree construction API.
|
||||||
|
@@ -88,6 +89,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []

+        self._initialize_xml_detector()
+
     def error(self, msg):
         """In Python 3, HTMLParser subclasses must implement error(), although
         this requirement doesn't appear to be documented.
@@ -168,6 +171,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # later on. If so, we want to ignore it.
         self.already_closed_empty_element.append(name)

+        if self._root_tag is None:
+            self._root_tag_encountered(name)
+
     def handle_endtag(self, name, check_already_closed=True):
         """Handle a closing tag, e.g. '</tag>'

@@ -288,6 +294,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
         """
         self.soup.endData()
         self.soup.handle_data(data)
+        self._document_might_be_xml(data)
         self.soup.endData(ProcessingInstruction)

@@ -22,6 +22,7 @@ from bs4.element import (
     XMLProcessingInstruction,
 )
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     FAST,
     HTML,
     HTMLTreeBuilder,
@@ -79,9 +80,18 @@ class LXMLTreeBuilderForXML(TreeBuilder):

         This might be useful later on when creating CSS selectors.

+        This will track (almost) all namespaces, even ones that were
+        only in scope for part of the document. If two namespaces have
+        the same prefix, only the first one encountered will be
+        tracked. Un-prefixed namespaces are not tracked.
+
         :param mapping: A dictionary mapping namespace prefixes to URIs.
         """
         for key, value in list(mapping.items()):
+            # This is 'if key' and not 'if key is not None' because we
+            # don't track un-prefixed namespaces. Soupselect will
+            # treat an un-prefixed namespace as the default, which
+            # causes confusion in some cases.
             if key and key not in self.soup._namespaces:
                 # Let the BeautifulSoup object know about a new namespace.
                 # If there are multiple namespaces defined with the same
@@ -125,6 +135,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.empty_element_tags = set(empty_element_tags)
         self.soup = None
         self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
         super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

     def _getNsTag(self, tag):
@@ -166,12 +177,21 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         is_html = not self.is_xml
         if is_html:
             self.processing_instruction_class = ProcessingInstruction
+            # We're in HTML mode, so if we're given XML, that's worth
+            # noting.
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
         else:
             self.processing_instruction_class = XMLProcessingInstruction

         if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
             yield markup, None, document_declared_encoding, False

         if isinstance(markup, str):
@@ -240,6 +260,20 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             # mappings.
             self.nsmaps.append(_invert(nsmap))

+            # The currently active namespace prefixes have
+            # changed. Calculate the new mapping so it can be stored
+            # with all Tag objects created while these prefixes are in
+            # scope.
+            current_mapping = dict(self.active_namespace_prefixes[-1])
+            current_mapping.update(nsmap)
+
+            # We should not track un-prefixed namespaces as we can only hold one
+            # and it will be recognized as the default namespace by soupsieve,
+            # which may be confusing in some situations.
+            if '' in current_mapping:
+                del current_mapping['']
+            self.active_namespace_prefixes.append(current_mapping)
+
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
@@ -264,7 +298,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):

         namespace, name = self._getNsTag(name)
         nsprefix = self._prefix_for_namespace(namespace)
-        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
+        self.soup.handle_starttag(
+            name, namespace, nsprefix, attrs,
+            namespaces=self.active_namespace_prefixes[-1]
+        )

     def _prefix_for_namespace(self, namespace):
         """Find the currently active prefix for the given namespace."""
@@ -289,11 +326,18 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if len(self.nsmaps) > 1:
             # This tag, or one of its parents, introduced a namespace
             # mapping, so pop it off the stack.
-            self.nsmaps.pop()
+            out_of_scope_nsmap = self.nsmaps.pop()
+
+            if out_of_scope_nsmap is not None:
+                # This tag introduced a namespace mapping which is no
+                # longer in scope. Recalculate the currently active
+                # namespace prefixes.
+                self.active_namespace_prefixes.pop()

     def pi(self, target, data):
         self.soup.endData()
-        self.soup.handle_data(target + ' ' + data)
+        data = target + ' ' + data
+        self.soup.handle_data(data)
         self.soup.endData(self.processing_instruction_class)

     def data(self, content):
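
The prefix bookkeeping above exists so that namespace prefixes seen at parse time can later drive CSS selection. A hedged sketch of the soupsieve-style "prefix|name" syntax this supports; the namespaces dict argument to select() is long-standing public API, while the document and prefix below are invented for illustration:

    from bs4 import BeautifulSoup

    doc = '<root xmlns:ns="http://example.com/ns"><ns:child>text</ns:child></root>'
    soup = BeautifulSoup(doc, "xml")  # uses the lxml XML builder patched above
    print(soup.select("ns|child", namespaces={"ns": "http://example.com/ns"}))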
libs/bs4/dammit.py: 2283 changed lines (diff suppressed because it is too large)
@@ -4,7 +4,7 @@
 __license__ = "MIT"

 import cProfile
-from io import StringIO
+from io import BytesIO
 from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
@@ -103,7 +103,13 @@ def lxml_trace(data, html=True, **kwargs):
     if False, lxml's XML parser will be used.
     """
     from lxml import etree
-    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+    recover = kwargs.pop('recover', True)
+    if isinstance(data, str):
+        data = data.encode("utf8")
+    reader = BytesIO(data)
+    for event, element in etree.iterparse(
+        reader, html=html, recover=recover, **kwargs
+    ):
         print(("%s, %4s, %s" % (event, element.tag, element.text)))

 class AnnouncingParser(HTMLParser):

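With this change lxml_trace accepts either str or bytes and parses in recovery mode by default. A usage sketch (assumes lxml is installed; the markup is invented):

    from bs4.diagnose import lxml_trace

    lxml_trace("<html><body><p>hi</p></body></html>")   # str is encoded to UTF-8
    lxml_trace(b"<html><body><p>hi</p></body></html>")  # bytes are used as-is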
@@ -23,7 +23,6 @@ from bs4.formatter import (
 )

 DEFAULT_OUTPUT_ENCODING = "utf-8"
-PY3K = (sys.version_info[0] > 2)

 nonwhitespace_re = re.compile(r"\S+")

@@ -555,7 +554,7 @@ class PageElement(object):
             parent.insert(index+1+offset, successor)
             offset += 1

-    def find_next(self, name=None, attrs={}, text=None, **kwargs):
+    def find_next(self, name=None, attrs={}, string=None, **kwargs):
         """Find the first PageElement that matches the given criteria and
         appears later in the document than this PageElement.

@@ -564,15 +563,15 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :kwargs: A dictionary of filters on attribute values.
         :return: A PageElement.
         :rtype: bs4.element.Tag | bs4.element.NavigableString
         """
-        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
+        return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
     findNext = find_next  # BS3

-    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
+    def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                       **kwargs):
         """Find all PageElements that match the given criteria and appear
         later in the document than this PageElement.
@@ -582,16 +581,16 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :param limit: Stop looking after finding this many results.
         :kwargs: A dictionary of filters on attribute values.
         :return: A ResultSet containing PageElements.
         """
-        return self._find_all(name, attrs, text, limit, self.next_elements,
+        return self._find_all(name, attrs, string, limit, self.next_elements,
                               **kwargs)
     findAllNext = find_all_next  # BS3

-    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
+    def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
         """Find the closest sibling to this PageElement that matches the
         given criteria and appears later in the document.

@@ -600,16 +599,16 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :kwargs: A dictionary of filters on attribute values.
         :return: A PageElement.
         :rtype: bs4.element.Tag | bs4.element.NavigableString
         """
-        return self._find_one(self.find_next_siblings, name, attrs, text,
+        return self._find_one(self.find_next_siblings, name, attrs, string,
                               **kwargs)
     findNextSibling = find_next_sibling  # BS3

-    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
+    def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                            **kwargs):
         """Find all siblings of this PageElement that match the given criteria
         and appear later in the document.
@@ -619,18 +618,18 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :param limit: Stop looking after finding this many results.
         :kwargs: A dictionary of filters on attribute values.
         :return: A ResultSet of PageElements.
         :rtype: bs4.element.ResultSet
         """
-        return self._find_all(name, attrs, text, limit,
+        return self._find_all(name, attrs, string, limit,
                               self.next_siblings, **kwargs)
     findNextSiblings = find_next_siblings   # BS3
     fetchNextSiblings = find_next_siblings  # BS2

-    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
+    def find_previous(self, name=None, attrs={}, string=None, **kwargs):
         """Look backwards in the document from this PageElement and find the
         first PageElement that matches the given criteria.

@@ -639,16 +638,16 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :kwargs: A dictionary of filters on attribute values.
         :return: A PageElement.
         :rtype: bs4.element.Tag | bs4.element.NavigableString
         """
         return self._find_one(
-            self.find_all_previous, name, attrs, text, **kwargs)
+            self.find_all_previous, name, attrs, string, **kwargs)
     findPrevious = find_previous  # BS3

-    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
+    def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                           **kwargs):
         """Look backwards in the document from this PageElement and find all
         PageElements that match the given criteria.
@@ -658,18 +657,18 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :param limit: Stop looking after finding this many results.
         :kwargs: A dictionary of filters on attribute values.
         :return: A ResultSet of PageElements.
         :rtype: bs4.element.ResultSet
         """
-        return self._find_all(name, attrs, text, limit, self.previous_elements,
+        return self._find_all(name, attrs, string, limit, self.previous_elements,
                               **kwargs)
     findAllPrevious = find_all_previous  # BS3
     fetchPrevious = find_all_previous    # BS2

-    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
+    def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
         """Returns the closest sibling to this PageElement that matches the
         given criteria and appears earlier in the document.

@@ -678,16 +677,16 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :kwargs: A dictionary of filters on attribute values.
         :return: A PageElement.
         :rtype: bs4.element.Tag | bs4.element.NavigableString
         """
-        return self._find_one(self.find_previous_siblings, name, attrs, text,
+        return self._find_one(self.find_previous_siblings, name, attrs, string,
                               **kwargs)
     findPreviousSibling = find_previous_sibling  # BS3

-    def find_previous_siblings(self, name=None, attrs={}, text=None,
+    def find_previous_siblings(self, name=None, attrs={}, string=None,
                                limit=None, **kwargs):
         """Returns all siblings to this PageElement that match the
         given criteria and appear earlier in the document.
@@ -697,13 +696,13 @@ class PageElement(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :param limit: Stop looking after finding this many results.
         :kwargs: A dictionary of filters on attribute values.
         :return: A ResultSet of PageElements.
         :rtype: bs4.element.ResultSet
         """
-        return self._find_all(name, attrs, text, limit,
+        return self._find_all(name, attrs, string, limit,
                               self.previous_siblings, **kwargs)
     findPreviousSiblings = find_previous_siblings  # BS3
     fetchPreviousSiblings = find_previous_siblings # BS2
@@ -770,26 +769,29 @@ class PageElement(object):

     #These methods do the real heavy lifting.

-    def _find_one(self, method, name, attrs, text, **kwargs):
+    def _find_one(self, method, name, attrs, string, **kwargs):
         r = None
-        l = method(name, attrs, text, 1, **kwargs)
+        l = method(name, attrs, string, 1, **kwargs)
         if l:
             r = l[0]
         return r

-    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
+    def _find_all(self, name, attrs, string, limit, generator, **kwargs):
         "Iterates over a generator looking for things that match."

-        if text is None and 'string' in kwargs:
-            text = kwargs['string']
-            del kwargs['string']
+        if string is None and 'text' in kwargs:
+            string = kwargs.pop('text')
+            warnings.warn(
+                "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
+                DeprecationWarning
+            )

         if isinstance(name, SoupStrainer):
             strainer = name
         else:
-            strainer = SoupStrainer(name, attrs, text, **kwargs)
+            strainer = SoupStrainer(name, attrs, string, **kwargs)

-        if text is None and not limit and not attrs and not kwargs:
+        if string is None and not limit and not attrs and not kwargs:
             if name is True or name is None:
                 # Optimization to find all tags.
                 result = (element for element in generator
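
The net effect of the renames above, sketched against the stock parser: string= is now the documented filter argument, and text= still works but raises a DeprecationWarning through _find_all():

    import warnings
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>one</a><a>two</a>", "html.parser")
    print(soup.find_all("a", string="two"))      # preferred spelling

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        soup.find_all("a", text="two")           # legacy spelling, still honored
    print(any(issubclass(w.category, DeprecationWarning) for w in caught))  # True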
@@ -1013,6 +1015,11 @@ class NavigableString(str, PageElement):

         # Do nothing if the caller is looking for specific types of
         # string, and we're of a different type.
+        #
+        # We check specific types instead of using isinstance(self,
+        # types) because all of these classes subclass
+        # NavigableString. Anyone who's using this feature probably
+        # wants generic NavigableStrings but not other stuff.
         my_type = type(self)
         if types is not None:
             if isinstance(types, type):
@@ -1141,6 +1148,27 @@ class TemplateString(NavigableString):
     pass


+class RubyTextString(NavigableString):
+    """A NavigableString representing the contents of the <rt> HTML
+    element.
+
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
+
+    Can be used to distinguish such strings from the strings they're
+    annotating.
+    """
+    pass
+
+
+class RubyParenthesisString(NavigableString):
+    """A NavigableString representing the contents of the <rp> HTML
+    element.
+
+    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
+    """
+    pass
+
+
 class Tag(PageElement):
     """Represents an HTML or XML tag that is part of a parse tree, along
     with its attributes and contents.
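
A hedged sketch of what the two new string classes are for, assuming the HTML tree builders in this release route <rt> and <rp> contents to them (that wiring is not shown in this excerpt):

    from bs4 import BeautifulSoup
    from bs4.element import RubyTextString, RubyParenthesisString

    soup = BeautifulSoup("<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp></ruby>",
                         "html.parser")
    for s in soup.ruby.strings:
        print(type(s).__name__, repr(s))
    # The annotation text arrives as RubyTextString and the fallback
    # parentheses as RubyParenthesisString, so they can be filtered out.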
@@ -1155,6 +1183,7 @@ class Tag(PageElement):
                  can_be_empty_element=None, cdata_list_attributes=None,
                  preserve_whitespace_tags=None,
                  interesting_string_types=None,
+                 namespaces=None
                  ):
         """Basic constructor.

@@ -1187,6 +1216,9 @@ class Tag(PageElement):
             to be considered. The default is to consider
             NavigableString and CData the only interesting string
             subtypes.
+        :param namespaces: A dictionary mapping currently active
+            namespace prefixes to URIs. This can be used later to
+            construct CSS selectors.
         """
         if parser is None:
             self.parser_class = None
@@ -1198,6 +1230,7 @@ class Tag(PageElement):
             raise ValueError("No value provided for new tag's name.")
         self.name = name
         self.namespace = namespace
+        self._namespaces = namespaces or {}
         self.prefix = prefix
         if ((not builder or builder.store_line_numbers)
             and (sourceline is not None or sourcepos is not None)):
@@ -1524,7 +1557,8 @@ class Tag(PageElement):
             warnings.warn(
                 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                     name=tag_name
-                )
+                ),
+                DeprecationWarning
             )
             return self.find(tag_name)
         # We special case contents to avoid recursion.
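
The warning category change is observable from user code; the attribute shortcut itself keeps working. A short sketch (the <b> document is invented):

    import warnings
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<b>bold</b>", "html.parser")
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tag = soup.bTag                  # old BS2 idiom for soup.find("b")
    print(tag.name)                      # 'b'
    print(any(issubclass(w.category, DeprecationWarning) for w in caught))  # True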
@@ -1559,34 +1593,17 @@ class Tag(PageElement):
         """Renders this PageElement as a string.

         :param encoding: The encoding to use (Python 2 only).
-        :return: Under Python 2, a bytestring; under Python 3,
-            a Unicode string.
+            TODO: This is now ignored and a warning should be issued
+            if a value is provided.
+        :return: A (Unicode) string.
         """
-        if PY3K:
-            # "The return value must be a string object", i.e. Unicode
-            return self.decode()
-        else:
-            # "The return value must be a string object", i.e. a bytestring.
-            # By convention, the return value of __repr__ should also be
-            # an ASCII string.
-            return self.encode(encoding)
+        # "The return value must be a string object", i.e. Unicode
+        return self.decode()

     def __unicode__(self):
         """Renders this PageElement as a Unicode string."""
         return self.decode()

-    def __str__(self):
-        """Renders this PageElement as a generic string.
-
-        :return: Under Python 2, a UTF-8 bytestring; under Python 3,
-            a Unicode string.
-        """
-        if PY3K:
-            return self.decode()
-        else:
-            return self.encode()
-
-    if PY3K:
-        __str__ = __repr__ = __unicode__
+    __str__ = __repr__ = __unicode__

     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
@@ -1597,8 +1614,10 @@ class Tag(PageElement):

         :param encoding: The destination encoding.
         :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.
         :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
         :param errors: An error handling strategy such as
@@ -1674,7 +1693,7 @@ class Tag(PageElement):
         space = ''
         indent_space = ''
         if indent_level is not None:
-            indent_space = (' ' * (indent_level - 1))
+            indent_space = (formatter.indent * (indent_level - 1))
         if pretty_print:
             space = indent_space
             indent_contents = indent_level + 1
@@ -1749,8 +1768,10 @@ class Tag(PageElement):
         """Renders the contents of this tag as a Unicode string.

         :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.

         :param eventual_encoding: The tag is destined to be
            encoded into this encoding. decode_contents() is _not_
@@ -1761,6 +1782,7 @@ class Tag(PageElement):

         :param formatter: A Formatter object, or a string naming one of
            the standard Formatters.
+
         """
         # First off, turn a string formatter into a Formatter object. This
         # will stop the lookup from happening over and over again.
@@ -1783,7 +1805,7 @@ class Tag(PageElement):
             text = text.strip()
             if text:
                 if pretty_print and not preserve_whitespace:
-                    s.append(" " * (indent_level - 1))
+                    s.append(formatter.indent * (indent_level - 1))
                 s.append(text)
                 if pretty_print and not preserve_whitespace:
                     s.append("\n")
@@ -1795,8 +1817,10 @@ class Tag(PageElement):
         """Renders the contents of this PageElement as a bytestring.

         :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
+           indented this many levels. (The formatter decides what a
+           'level' means in terms of spaces or other characters
+           output.) Used internally in recursive calls while
+           pretty-printing.

         :param eventual_encoding: The bytestring will be in this encoding.

@@ -1819,7 +1843,7 @@ class Tag(PageElement):

     #Soup methods

-    def find(self, name=None, attrs={}, recursive=True, text=None,
+    def find(self, name=None, attrs={}, recursive=True, string=None,
              **kwargs):
         """Look in the children of this PageElement and find the first
         PageElement that matches the given criteria.
@@ -1838,13 +1862,13 @@ class Tag(PageElement):
         :rtype: bs4.element.Tag | bs4.element.NavigableString
         """
         r = None
-        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
+        l = self.find_all(name, attrs, recursive, string, 1, **kwargs)
         if l:
             r = l[0]
         return r
     findChild = find  #BS2

-    def find_all(self, name=None, attrs={}, recursive=True, text=None,
+    def find_all(self, name=None, attrs={}, recursive=True, string=None,
                  limit=None, **kwargs):
         """Look in the children of this PageElement and find all
         PageElements that match the given criteria.
@@ -1865,7 +1889,7 @@ class Tag(PageElement):
         generator = self.descendants
         if not recursive:
             generator = self.children
-        return self._find_all(name, attrs, text, limit, generator, **kwargs)
+        return self._find_all(name, attrs, string, limit, generator, **kwargs)
     findAll = find_all       # BS3
     findChildren = find_all  # BS2

@@ -1967,8 +1991,10 @@ class Tag(PageElement):

         has_key() is gone in Python 3, anyway.
         """
-        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
-            key))
+        warnings.warn(
+            'has_key is deprecated. Use has_attr(key) instead.',
+            DeprecationWarning
+        )
         return self.has_attr(key)

 # Next, a couple classes to represent queries and their results.
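
As the new warning text says, has_attr() is the supported spelling; a quick sketch:

    from bs4 import BeautifulSoup

    tag = BeautifulSoup('<a href="/x">link</a>', "html.parser").a
    print(tag.has_attr("href"))   # True
    print(tag.has_attr("class"))  # False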
@@ -1982,7 +2008,7 @@ class SoupStrainer(object):
     document.
     """

-    def __init__(self, name=None, attrs={}, text=None, **kwargs):
+    def __init__(self, name=None, attrs={}, string=None, **kwargs):
         """Constructor.

         The SoupStrainer constructor takes the same arguments passed
@@ -1991,9 +2017,16 @@ class SoupStrainer(object):

         :param name: A filter on tag name.
         :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
+        :param string: A filter for a NavigableString with specific text.
         :kwargs: A dictionary of filters on attribute values.
         """
+        if string is None and 'text' in kwargs:
+            string = kwargs.pop('text')
+            warnings.warn(
+                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
+                DeprecationWarning
+            )
+
         self.name = self._normalize_search_value(name)
         if not isinstance(attrs, dict):
             # Treat a non-dict value for attrs as a search for the 'class'
@@ -2018,7 +2051,10 @@ class SoupStrainer(object):
             normalized_attrs[key] = self._normalize_search_value(value)

         self.attrs = normalized_attrs
-        self.text = self._normalize_search_value(text)
+        self.string = self._normalize_search_value(string)
+
+        # DEPRECATED but just in case someone is checking this.
+        self.text = self.string

     def _normalize_search_value(self, value):
         # Leave it alone if it's a Unicode string, a callable, a
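
SoupStrainer gets the same text=-to-string= treatment as the find methods. A sketch of the string filter combined with parse_only (the documents are invented):

    from bs4 import BeautifulSoup, SoupStrainer

    only_short = SoupStrainer(string=lambda s: len(s) < 10)
    soup = BeautifulSoup("<p>short</p><p>a much longer string</p>",
                         "html.parser", parse_only=only_short)
    print(soup.prettify())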
@@ -2052,8 +2088,8 @@ class SoupStrainer(object):

     def __str__(self):
         """A human-readable representation of this SoupStrainer."""
-        if self.text:
-            return self.text
+        if self.string:
+            return self.string
         else:
             return "%s|%s" % (self.name, self.attrs)

@@ -2113,7 +2149,7 @@ class SoupStrainer(object):
             found = markup
         else:
             found = markup_name
-        if found and self.text and not self._matches(found.string, self.text):
+        if found and self.string and not self._matches(found.string, self.string):
             found = None
         return found

@@ -2141,12 +2177,12 @@ class SoupStrainer(object):
         # If it's a Tag, make sure its name or attributes match.
         # Don't bother with Tags if we're searching for text.
         elif isinstance(markup, Tag):
-            if not self.text or self.name or self.attrs:
+            if not self.string or self.name or self.attrs:
                 found = self.search_tag(markup)
         # If it's text, make sure the text matches.
         elif isinstance(markup, NavigableString) or \
                  isinstance(markup, str):
-            if not self.name and not self.attrs and self._matches(markup, self.text):
+            if not self.name and not self.attrs and self._matches(markup, self.string):
                 found = markup
         else:
             raise Exception(

@@ -49,7 +49,7 @@ class Formatter(EntitySubstitution):
     def __init__(
             self, language=None, entity_substitution=None,
             void_element_close_prefix='/', cdata_containing_tags=None,
-            empty_attributes_are_booleans=False,
+            empty_attributes_are_booleans=False, indent=1,
     ):
         """Constructor.

@@ -69,6 +69,15 @@ class Formatter(EntitySubstitution):
         :param blank_attributes_are_booleans: Render attributes whose value
             is the empty string as HTML-style boolean attributes.
             (Attributes whose value is None are always rendered this way.)
+
+        :param indent: If indent is a non-negative integer or string,
+            then the contents of elements will be indented
+            appropriately when pretty-printing. An indent level of 0,
+            negative, or "" will only insert newlines. Using a
+            positive integer indent indents that many spaces per
+            level. If indent is a string (such as "\t"), that string
+            is used to indent each level. The default behavior to
+            indent one space per level.
         """
         self.language = language
         self.entity_substitution = entity_substitution
@@ -77,6 +86,17 @@ class Formatter(EntitySubstitution):
             language, cdata_containing_tags, 'cdata_containing_tags'
         )
         self.empty_attributes_are_booleans=empty_attributes_are_booleans
+        if indent is None:
+            indent = 0
+        if isinstance(indent, int):
+            if indent < 0:
+                indent = 0
+            indent = ' ' * indent
+        elif isinstance(indent, str):
+            indent = indent
+        else:
+            indent = ' '
+        self.indent = indent

     def substitute(self, ns):
         """Process a string that needs to undergo entity substitution.
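
A sketch of the new indent knob through the HTMLFormatter subclass (HTMLFormatter itself is not shown in this excerpt, so its use here is an assumption); an int means spaces per level and a string such as "\t" is used verbatim:

    from bs4 import BeautifulSoup
    from bs4.formatter import HTMLFormatter

    soup = BeautifulSoup("<div><p>hi</p></div>", "html.parser")
    print(soup.prettify(formatter=HTMLFormatter(indent=4)))
    print(soup.prettify(formatter=HTMLFormatter(indent="\t")))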
libs/bs4/testing.py: 1136 changed lines (diff suppressed because it is too large)

libs/bs4/tests/test_builder.py: 29 lines (new file)
@@ -0,0 +1,29 @@
+import pytest
+from unittest.mock import patch
+from bs4.builder import DetectsXMLParsedAsHTML
+
+class TestDetectsXMLParsedAsHTML(object):
+
+    @pytest.mark.parametrize(
+        "markup,looks_like_xml",
+        [("No xml declaration", False),
+         ("<html>obviously HTML</html", False),
+         ("<?xml ><html>Actually XHTML</html>", False),
+         ("<?xml> < html>Tricky XHTML</html>", False),
+         ("<?xml ><no-html-tag>", True),
+        ]
+    )
+    def test_warn_if_markup_looks_like_xml(self, markup, looks_like_xml):
+        # Test of our ability to guess at whether markup looks XML-ish
+        # _and_ not HTML-ish.
+        with patch('bs4.builder.DetectsXMLParsedAsHTML._warn') as mock:
+            for data in markup, markup.encode('utf8'):
+                result = DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
+                    data
+                )
+                assert result == looks_like_xml
+                if looks_like_xml:
+                    assert mock.called
+                else:
+                    assert not mock.called
+                mock.reset_mock()
@@ -1,6 +1,6 @@
 """Tests of the builder registry."""

-import unittest
+import pytest
 import warnings

 from bs4 import BeautifulSoup
@@ -26,46 +26,36 @@ except ImportError:
     LXML_PRESENT = False


-class BuiltInRegistryTest(unittest.TestCase):
+class TestBuiltInRegistry(object):
     """Test the built-in registry with the default builders registered."""

     def test_combination(self):
+        assert registry.lookup('strict', 'html') == HTMLParserTreeBuilder
         if LXML_PRESENT:
-            self.assertEqual(registry.lookup('fast', 'html'),
-                             LXMLTreeBuilder)
-
-        if LXML_PRESENT:
-            self.assertEqual(registry.lookup('permissive', 'xml'),
-                             LXMLTreeBuilderForXML)
-        self.assertEqual(registry.lookup('strict', 'html'),
-                         HTMLParserTreeBuilder)
+            assert registry.lookup('fast', 'html') == LXMLTreeBuilder
+            assert registry.lookup('permissive', 'xml') == LXMLTreeBuilderForXML
         if HTML5LIB_PRESENT:
-            self.assertEqual(registry.lookup('html5lib', 'html'),
-                             HTML5TreeBuilder)
+            assert registry.lookup('html5lib', 'html') == HTML5TreeBuilder

     def test_lookup_by_markup_type(self):
         if LXML_PRESENT:
-            self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
-            self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
+            assert registry.lookup('html') == LXMLTreeBuilder
+            assert registry.lookup('xml') == LXMLTreeBuilderForXML
         else:
-            self.assertEqual(registry.lookup('xml'), None)
+            assert registry.lookup('xml') == None
             if HTML5LIB_PRESENT:
-                self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
+                assert registry.lookup('html') == HTML5TreeBuilder
             else:
-                self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
+                assert registry.lookup('html') == HTMLParserTreeBuilder

     def test_named_library(self):
         if LXML_PRESENT:
-            self.assertEqual(registry.lookup('lxml', 'xml'),
-                             LXMLTreeBuilderForXML)
-            self.assertEqual(registry.lookup('lxml', 'html'),
-                             LXMLTreeBuilder)
+            assert registry.lookup('lxml', 'xml') == LXMLTreeBuilderForXML
+            assert registry.lookup('lxml', 'html') == LXMLTreeBuilder
         if HTML5LIB_PRESENT:
-            self.assertEqual(registry.lookup('html5lib'),
-                             HTML5TreeBuilder)
+            assert registry.lookup('html5lib') == HTML5TreeBuilder

-        self.assertEqual(registry.lookup('html.parser'),
-                         HTMLParserTreeBuilder)
+        assert registry.lookup('html.parser') == HTMLParserTreeBuilder

     def test_beautifulsoup_constructor_does_lookup(self):

@@ -77,16 +67,17 @@ class BuiltInRegistryTest(unittest.TestCase):
             BeautifulSoup("", features="html")
             # Or a list of strings.
             BeautifulSoup("", features=["html", "fast"])
+            pass

         # You'll get an exception if BS can't find an appropriate
         # builder.
-        self.assertRaises(ValueError, BeautifulSoup,
-                          "", features="no-such-feature")
+        with pytest.raises(ValueError):
+            BeautifulSoup("", features="no-such-feature")

-class RegistryTest(unittest.TestCase):
+class TestRegistry(object):
     """Test the TreeBuilderRegistry class in general."""

-    def setUp(self):
+    def setup_method(self):
         self.registry = TreeBuilderRegistry()

     def builder_for_features(self, *feature_list):
@@ -101,28 +92,28 @@ class RegistryTest(unittest.TestCase):

         # Since the builder advertises no features, you can't find it
         # by looking up features.
-        self.assertEqual(self.registry.lookup('foo'), None)
+        assert self.registry.lookup('foo') is None

         # But you can find it by doing a lookup with no features, if
         # this happens to be the only registered builder.
-        self.assertEqual(self.registry.lookup(), builder)
+        assert self.registry.lookup() == builder

     def test_register_with_features_makes_lookup_succeed(self):
         builder = self.builder_for_features('foo', 'bar')
-        self.assertEqual(self.registry.lookup('foo'), builder)
-        self.assertEqual(self.registry.lookup('bar'), builder)
+        assert self.registry.lookup('foo') is builder
+        assert self.registry.lookup('bar') is builder

     def test_lookup_fails_when_no_builder_implements_feature(self):
         builder = self.builder_for_features('foo', 'bar')
-        self.assertEqual(self.registry.lookup('baz'), None)
+        assert self.registry.lookup('baz') is None

     def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
         builder1 = self.builder_for_features('foo')
         builder2 = self.builder_for_features('bar')
-        self.assertEqual(self.registry.lookup(), builder2)
+        assert self.registry.lookup() == builder2

     def test_lookup_fails_when_no_tree_builders_registered(self):
-        self.assertEqual(self.registry.lookup(), None)
+        assert self.registry.lookup() is None

     def test_lookup_gets_most_recent_builder_supporting_all_features(self):
         has_one = self.builder_for_features('foo')
@@ -134,14 +125,12 @@ class RegistryTest(unittest.TestCase):

         # There are two builders featuring 'foo' and 'bar', but
         # the one that also features 'quux' was registered later.
-        self.assertEqual(self.registry.lookup('foo', 'bar'),
-                         has_both_late)
+        assert self.registry.lookup('foo', 'bar') == has_both_late

         # There is only one builder featuring 'foo', 'bar', and 'baz'.
-        self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
-                         has_both_early)
+        assert self.registry.lookup('foo', 'bar', 'baz') == has_both_early

     def test_lookup_fails_when_cannot_reconcile_requested_features(self):
         builder1 = self.builder_for_features('foo', 'bar')
         builder2 = self.builder_for_features('foo', 'baz')
-        self.assertEqual(self.registry.lookup('bar', 'baz'), None)
+        assert self.registry.lookup('bar', 'baz') is None

libs/bs4/tests/test_dammit.py: 371 lines (new file)
@ -0,0 +1,371 @@
|
||||||
|
# encoding: utf-8
|
||||||
|
import pytest
|
||||||
|
import logging
|
||||||
|
import bs4
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.dammit import (
|
||||||
|
EntitySubstitution,
|
||||||
|
EncodingDetector,
|
||||||
|
UnicodeDammit,
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestUnicodeDammit(object):
|
||||||
|
"""Standalone tests of UnicodeDammit."""
|
||||||
|
|
||||||
|
def test_unicode_input(self):
|
||||||
|
markup = "I'm already Unicode! \N{SNOWMAN}"
|
||||||
|
dammit = UnicodeDammit(markup)
|
||||||
|
assert dammit.unicode_markup == markup
|
||||||
|
|
||||||
|
def test_smart_quotes_to_unicode(self):
|
||||||
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
|
dammit = UnicodeDammit(markup)
|
||||||
|
assert dammit.unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
|
||||||
|
|
||||||
|
def test_smart_quotes_to_xml_entities(self):
|
||||||
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
|
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
|
||||||
|
assert dammit.unicode_markup == "<foo>‘’“”</foo>"
|
||||||
|
|
||||||
|
def test_smart_quotes_to_html_entities(self):
|
||||||
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
|
dammit = UnicodeDammit(markup, smart_quotes_to="html")
|
||||||
|
assert dammit.unicode_markup == "<foo>‘’“”</foo>"
|
||||||
|
|
||||||
|
def test_smart_quotes_to_ascii(self):
|
||||||
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
|
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
|
||||||
|
assert dammit.unicode_markup == """<foo>''""</foo>"""
|
||||||
|
|
||||||
|
def test_detect_utf8(self):
|
||||||
|
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
||||||
|
dammit = UnicodeDammit(utf8)
|
||||||
|
assert dammit.original_encoding.lower() == 'utf-8'
|
||||||
|
assert dammit.unicode_markup == 'Sacr\xe9 bleu! \N{SNOWMAN}'
|
||||||
|
|
||||||
|
def test_convert_hebrew(self):
|
||||||
|
hebrew = b"\xed\xe5\xec\xf9"
|
||||||
|
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
||||||
|
assert dammit.original_encoding.lower() == 'iso-8859-8'
|
||||||
|
assert dammit.unicode_markup == '\u05dd\u05d5\u05dc\u05e9'
|
||||||
|
|
||||||
|
def test_dont_see_smart_quotes_where_there_are_none(self):
|
||||||
|
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
||||||
|
dammit = UnicodeDammit(utf_8)
|
||||||
|
assert dammit.original_encoding.lower() == 'utf-8'
|
||||||
|
assert dammit.unicode_markup.encode("utf-8") == utf_8
|
||||||
|
|
||||||
|
def test_ignore_inappropriate_codecs(self):
|
||||||
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
||||||
|
assert dammit.original_encoding.lower() == 'utf-8'
|
||||||
|
|
||||||
|
def test_ignore_invalid_codecs(self):
|
||||||
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
||||||
|
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
||||||
|
assert dammit.original_encoding.lower() == 'utf-8'
|
||||||
|
|
||||||
|
def test_exclude_encodings(self):
|
||||||
|
# This is UTF-8.
|
||||||
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
|
||||||
|
# But if we exclude UTF-8 from consideration, the guess is
|
||||||
|
# Windows-1252.
|
||||||
|
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
|
||||||
|
assert dammit.original_encoding.lower() == 'windows-1252'
|
||||||
|
|
||||||
|
# And if we exclude that, there is no valid guess at all.
|
||||||
|
dammit = UnicodeDammit(
|
||||||
|
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
|
||||||
|
assert dammit.original_encoding == None
|
||||||
|
|
||||||
|
class TestEncodingDetector(object):

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):

        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            assert "euc-jp" == dammit.original_encoding

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            assert True == dammit.contains_replacement_characters
            assert "\ufffd" in dammit.unicode_markup

            soup = BeautifulSoup(doc, "html.parser")
            assert soup.contains_replacement_characters
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        assert "<a>áé</a>" == dammit.unicode_markup
        assert "utf-16le" == dammit.original_encoding

    def test_known_definite_versus_user_encodings(self):
        # The known_definite_encodings are used before sniffing the
        # byte-order mark; the user_encodings are used afterwards.

        # Here's a document in UTF-16LE.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)

        # We can process it as UTF-16 by passing it in as a known
        # definite encoding.
        before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
        assert "utf-16" == before.original_encoding

        # If we pass UTF-8 as a user encoding, it's not even
        # tried--the encoding sniffed from the byte-order mark takes
        # precedence.
        after = UnicodeDammit(data, user_encodings=["utf-8"])
        assert "utf-16le" == after.original_encoding
        assert ["utf-16le"] == [x[0] for x in dammit.tried_encodings]

        # Here's a document in ISO-8859-8.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, known_definite_encodings=["utf-8"],
                               user_encodings=["iso-8859-8"])

        # The known_definite_encodings don't work, BOM sniffing does
        # nothing (it only works for a few UTF encodings), but one of
        # the user_encodings does work.
        assert "iso-8859-8" == dammit.original_encoding
        assert ["utf-8", "iso-8859-8"] == [x[0] for x in dammit.tried_encodings]

    def test_deprecated_override_encodings(self):
        # override_encodings is a deprecated alias for
        # known_definite_encodings.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(
            hebrew,
            known_definite_encodings=["shift-jis"],
            override_encodings=["utf-8"],
            user_encodings=["iso-8859-8"],
        )
        assert "iso-8859-8" == dammit.original_encoding

        # known_definite_encodings and override_encodings were tried
        # before user_encodings.
        assert ["shift-jis", "utf-8", "iso-8859-8"] == (
            [x[0] for x in dammit.tried_encodings]
        )

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        with pytest.raises(UnicodeDecodeError):
            doc.decode("utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through UnicodeDammit.detwingle, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        assert "☃☃☃“Hi, I like Windows!”☃☃☃" == fixed.decode("utf8")

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            ):
            input = tricky_unicode_char.encode("utf8")
            assert input.endswith(b'\x93')
            output = UnicodeDammit.detwingle(input)
            assert output == input

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        assert m(html_unicode, is_html=False) is None
        assert "utf-8" == m(html_unicode, is_html=True)
        assert "utf-8" == m(html_bytes, is_html=True)

        assert "iso-8859-1" == m(xml_unicode)
        assert "iso-8859-1" == m(xml_bytes)

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        assert m(spacer + html_bytes) is None
        assert m(spacer + xml_bytes) is None

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        assert (
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
            == "utf-8"
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        assert m(xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b' ' + xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b'a' + xml_bytes, search_entire_document=True) is None

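For quick reference, the two EncodingDetector entry points used above, as a minimal sketch grounded only in the assertions of this class:

    from bs4.dammit import EncodingDetector

    html = b'<html><head><meta charset="utf-8"></head></html>'
    xml = b'<?xml version="1.0" encoding="ISO-8859-1" ?>'
    assert EncodingDetector.find_declared_encoding(html, is_html=True) == 'utf-8'
    assert EncodingDetector.find_declared_encoding(xml) == 'iso-8859-1'
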
class TestEntitySubstitution(object):
    """Standalone tests of the EntitySubstitution class."""
    def setup_method(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        assert self.sub.substitute_html(s) == "foo&forall;\N{SNOWMAN}&otilde;bar"

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        assert self.sub.substitute_html(dammit.markup) == "&lsquo;&rsquo;foo&ldquo;&rdquo;"

    def test_html5_entity(self):
        # Some HTML5 entities correspond to single- or multi-character
        # Unicode sequences.

        for entity, u in (
            # A few spot checks of our ability to recognize
            # special character sequences and convert them
            # to named entities.
            ('&models;', '\u22a7'),
            ('&Nfr;', '\U0001d511'),
            ('&ngeqq;', '\u2267\u0338'),
            ('&not;', '\xac'),
            ('&Not;', '\u2aec'),

            # We _could_ convert | to &verbarr;, but we don't, because
            # | is an ASCII character.
            ('|', '|'),

            # Similarly for the fj ligature, which we could convert to
            # &fjlig;, but we don't.
            ("fj", "fj"),

            # We do convert _these_ ASCII characters to HTML entities,
            # because that's required to generate valid HTML.
            ('&gt;', '>'),
            ('&lt;', '<'),
            ('&amp;', '&'),
        ):
            template = '3 %s 4'
            raw = template % u
            with_entities = template % entity
            assert self.sub.substitute_html(raw) == with_entities

    def test_html5_entity_with_variation_selector(self):
        # Some HTML5 entities correspond either to a single-character
        # Unicode sequence _or_ to the same character plus U+FE00,
        # VARIATION SELECTOR 1. We can handle this.
        data = "fjords \u2294 penguins"
        markup = "fjords &sqcup; penguins"
        assert self.sub.substitute_html(data) == markup

        data = "fjords \u2294\ufe00 penguins"
        markup = "fjords &sqcups; penguins"
        assert self.sub.substitute_html(data) == markup

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, False) == s

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        assert self.sub.substitute_xml("Welcome", True) == '"Welcome"'
        assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"'

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'"

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(s, True) == '"Welcome to &quot;Bob\'s Bar&quot;"'

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(quoted) == quoted

    def test_xml_quoting_handles_angle_brackets(self):
        assert self.sub.substitute_xml("foo<bar>") == "foo&lt;bar&gt;"

    def test_xml_quoting_handles_ampersands(self):
        assert self.sub.substitute_xml("AT&T") == "AT&amp;T"

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        assert self.sub.substitute_xml("&Aacute;T&T") == "&amp;Aacute;T&amp;T"

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        assert self.sub.substitute_xml_containing_entities("&Aacute;T&T") == "&Aacute;T&amp;T"

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        assert self.sub.substitute_html(text) == text

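A minimal sketch of the EntitySubstitution surface these tests cover; only methods and behaviors asserted above are assumed:

    from bs4.dammit import EntitySubstitution

    # HTML substitution uses named entities where they exist.
    assert EntitySubstitution.substitute_html("AT&T \u2200") == "AT&amp;T &forall;"
    # XML substitution touches only <, >, and &, and can quote attribute values.
    assert EntitySubstitution.substitute_xml('Welcome to "my bar"', True) == "'Welcome to \"my bar\"'"
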
@@ -1,5 +1,7 @@
 "Test harness for doctests."

+# TODO: Pretty sure this isn't used and should be deleted.
+
 # pylint: disable-msg=E0611,W0142

 __metaclass__ = type

74 libs/bs4/tests/test_element.py Normal file
@@ -0,0 +1,74 @@
"""Tests of classes in element.py.

The really big classes -- Tag, PageElement, and NavigableString --
are tested in separate files.
"""

from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    NamespacedAttribute,
)
from . import SoupTest


class TestNamedspacedAttribute(object):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        assert a == "xmlns"

        a = NamespacedAttribute("xmlns", "")
        assert a == "xmlns"

        a = NamespacedAttribute("xmlns")
        assert a == "xmlns"

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        assert a == "tag"

        a = NamespacedAttribute("", "tag")
        assert a == "tag"

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        assert "a:b" == a

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        assert a == b

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        assert a == c

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        assert a != d

        e = NamespacedAttribute("z", "b", "c")
        assert a != e


class TestAttributeValueWithCharsetSubstitution(object):
    """Certain attributes are designed to have the charset of the
    final document substituted into their value.
    """

    def test_charset_meta_attribute_value(self):
        # The value of a CharsetMetaAttributeValue is whatever
        # encoding the string is in.
        value = CharsetMetaAttributeValue("euc-jp")
        assert "euc-jp" == value
        assert "euc-jp" == value.original_value
        assert "utf8" == value.encode("utf8")
        assert "ascii" == value.encode("ascii")

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        assert "text/html; charset=euc-jp" == value
        assert "text/html; charset=euc-jp" == value.original_value
        assert "text/html; charset=utf8" == value.encode("utf8")
        assert "text/html; charset=ascii" == value.encode("ascii")

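A short sketch of what these attribute classes do in practice, based only on the behavior asserted above:

    from bs4.element import ContentMetaAttributeValue, NamespacedAttribute

    attr = NamespacedAttribute("xlink", "href", "http://www.w3.org/1999/xlink")
    assert attr == "xlink:href"   # compares equal to the colon-separated string

    value = ContentMetaAttributeValue("text/html; charset=euc-jp")
    assert value.encode("utf8") == "text/html; charset=utf8"
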
113 libs/bs4/tests/test_formatter.py Normal file
@@ -0,0 +1,113 @@
import pytest

from bs4.element import Tag
from bs4.formatter import (
    Formatter,
    HTMLFormatter,
    XMLFormatter,
)
from . import SoupTest


class TestFormatter(SoupTest):

    def test_default_attributes(self):
        # Test the default behavior of Formatter.attributes().
        formatter = Formatter()
        tag = Tag(name="tag")
        tag['b'] = 1
        tag['a'] = 2

        # Attributes come out sorted by name. In Python 3, attributes
        # normally come out of a dictionary in the order they were
        # added.
        assert [('a', 2), ('b', 1)] == formatter.attributes(tag)

        # This works even if Tag.attrs is None, though this shouldn't
        # normally happen.
        tag.attrs = None
        assert [] == formatter.attributes(tag)

        assert ' ' == formatter.indent

    def test_sort_attributes(self):
        # Test the ability to override Formatter.attributes() to,
        # e.g., disable the normal sorting of attributes.
        class UnsortedFormatter(Formatter):
            def attributes(self, tag):
                self.called_with = tag
                for k, v in sorted(tag.attrs.items()):
                    if k == 'ignore':
                        continue
                    yield k, v

        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
        formatter = UnsortedFormatter()
        decoded = soup.decode(formatter=formatter)

        # attributes() was called on the <p> tag. It filtered out one
        # attribute and sorted the other two.
        assert formatter.called_with == soup.p
        assert '<p aval="2" cval="1"></p>' == decoded

    def test_empty_attributes_are_booleans(self):
        # Test the behavior of empty_attributes_are_booleans as well
        # as which Formatters have it enabled.

        for name in ('html', 'minimal', None):
            formatter = HTMLFormatter.REGISTRY[name]
            assert False == formatter.empty_attributes_are_booleans

        formatter = XMLFormatter.REGISTRY[None]
        assert False == formatter.empty_attributes_are_booleans

        formatter = HTMLFormatter.REGISTRY['html5']
        assert True == formatter.empty_attributes_are_booleans

        # Verify that the constructor sets the value.
        formatter = Formatter(empty_attributes_are_booleans=True)
        assert True == formatter.empty_attributes_are_booleans

        # Now demonstrate what it does to markup.
        for markup in (
            "<option selected></option>",
            '<option selected=""></option>'
        ):
            soup = self.soup(markup)
            for formatter in ('html', 'minimal', 'xml', None):
                assert b'<option selected=""></option>' == soup.option.encode(formatter='html')
                assert b'<option selected></option>' == soup.option.encode(formatter='html5')

    @pytest.mark.parametrize(
        "indent,expect",
        [
            (None, '<a>\n<b>\ntext\n</b>\n</a>'),
            (-1, '<a>\n<b>\ntext\n</b>\n</a>'),
            (0, '<a>\n<b>\ntext\n</b>\n</a>'),
            ("", '<a>\n<b>\ntext\n</b>\n</a>'),

            (1, '<a>\n <b>\n  text\n </b>\n</a>'),
            (2, '<a>\n  <b>\n    text\n  </b>\n</a>'),

            ("\t", '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'),
            ('abc', '<a>\nabc<b>\nabcabctext\nabc</b>\n</a>'),

            # Some invalid inputs -- the default behavior is used.
            (object(), '<a>\n <b>\n  text\n </b>\n</a>'),
            (b'bytes', '<a>\n <b>\n  text\n </b>\n</a>'),
        ]
    )
    def test_indent(self, indent, expect):
        # Pretty-print a tree with a Formatter set to
        # indent in a certain way and verify the results.
        soup = self.soup("<a><b>text</b></a>")
        formatter = Formatter(indent=indent)
        assert soup.prettify(formatter=formatter) == expect

        # Pretty-printing only happens with prettify(), not
        # encode().
        assert soup.encode(formatter=formatter) != expect

    def test_default_indent_value(self):
        formatter = Formatter()
        assert formatter.indent == ' '

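The same machinery from user code, as a hedged sketch; no Formatter options beyond those exercised above are assumed:

    from bs4 import BeautifulSoup
    from bs4.formatter import Formatter

    soup = BeautifulSoup("<a><b>text</b></a>", "html.parser")
    # Indent with tabs instead of the default single space; per the
    # parametrized cases above this yields '<a>\n\t<b>\n\t\ttext\n\t</b>\n</a>'.
    print(soup.prettify(formatter=Formatter(indent="\t")))
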
@@ -8,7 +8,7 @@ try:
 except ImportError as e:
     HTML5LIB_PRESENT = False
 from bs4.element import SoupStrainer
-from bs4.testing import (
+from . import (
     HTML5TreeBuilderSmokeTest,
     SoupTest,
     skipIf,
@@ -17,7 +17,7 @@ from bs4.testing import (
 @skipIf(
     not HTML5LIB_PRESENT,
     "html5lib seems not to be present, not testing its tree builder.")
-class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+class TestHTML5LibBuilder(SoupTest, HTML5TreeBuilderSmokeTest):
     """See ``HTML5TreeBuilderSmokeTest``."""

     @property
@@ -30,12 +30,9 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         markup = "<p>A <b>bold</b> statement.</p>"
         with warnings.catch_warnings(record=True) as w:
             soup = self.soup(markup, parse_only=strainer)
-        self.assertEqual(
-            soup.decode(), self.document_for(markup))
+        assert soup.decode() == self.document_for(markup)

-        self.assertTrue(
-            "the html5lib tree builder doesn't support parse_only" in
-            str(w[0].message))
+        assert "the html5lib tree builder doesn't support parse_only" in str(w[0].message)

     def test_correctly_nested_tables(self):
         """html5lib inserts <tbody> tags where other parsers don't."""
@@ -46,13 +43,13 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
             '<tr><td>foo</td></tr>'
             '</table></td>')

-        self.assertSoupEquals(
+        self.assert_soup(
             markup,
             '<table id="1"><tbody><tr><td>Here\'s another table:'
             '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
             '</td></tr></tbody></table>')

-        self.assertSoupEquals(
+        self.assert_soup(
             "<table><thead><tr><td>Foo</td></tr></thead>"
             "<tbody><tr><td>Bar</td></tr></tbody>"
             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
@@ -69,20 +66,20 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
 </html>'''
         soup = self.soup(markup)
         # Verify that we can reach the <p> tag; this means the tree is connected.
-        self.assertEqual(b"<p>foo</p>", soup.p.encode())
+        assert b"<p>foo</p>" == soup.p.encode()

     def test_reparented_markup(self):
         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
         soup = self.soup(markup)
-        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
-        self.assertEqual(2, len(soup.find_all('p')))
+        assert "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>" == soup.body.decode()
+        assert 2 == len(soup.find_all('p'))

     def test_reparented_markup_ends_with_whitespace(self):
         markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
         soup = self.soup(markup)
-        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
-        self.assertEqual(2, len(soup.find_all('p')))
+        assert "<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>" == soup.body.decode()
+        assert 2 == len(soup.find_all('p'))

     def test_reparented_markup_containing_identical_whitespace_nodes(self):
         """Verify that we keep the two whitespace nodes in this
@@ -99,7 +96,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
         soup = self.soup(markup)
         noscript = soup.noscript
-        self.assertEqual("target", noscript.next_element)
+        assert "target" == noscript.next_element
         target = soup.find(string='target')

         # The 'aftermath' string was duplicated; we want the second one.
@@ -108,8 +105,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         # The <noscript> tag was moved beneath a copy of the <a> tag,
         # but the 'target' string within is still connected to the
         # (second) 'aftermath' string.
-        self.assertEqual(final_aftermath, target.next_element)
-        self.assertEqual(target, final_aftermath.previous_element)
+        assert final_aftermath == target.next_element
+        assert target == final_aftermath.previous_element

     def test_processing_instruction(self):
         """Processing instructions become comments."""
@@ -121,13 +118,13 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         markup = b"""<a class="my_class"><p></a>"""
         soup = self.soup(markup)
         a1, a2 = soup.find_all('a')
-        self.assertEqual(a1, a2)
+        assert a1 == a2
         assert a1 is not a2

     def test_foster_parenting(self):
         markup = b"""<table><td></tbody>A"""
         soup = self.soup(markup)
-        self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+        assert "<body>A<table><tbody><tr><td></td></tr></tbody></table></body>" == soup.body.decode()

     def test_extraction(self):
         """
@@ -145,7 +142,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         [s.extract() for s in soup('script')]
         [s.extract() for s in soup('style')]

-        self.assertEqual(len(soup.find_all("p")), 1)
+        assert len(soup.find_all("p")) == 1

     def test_empty_comment(self):
         """
@@ -167,21 +164,21 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         inputs = []
         for form in soup.find_all('form'):
             inputs.extend(form.find_all('input'))
-        self.assertEqual(len(inputs), 1)
+        assert len(inputs) == 1

     def test_tracking_line_numbers(self):
         # The html.parser TreeBuilder keeps track of line number and
         # position of each element.
         markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
         soup = self.soup(markup)
-        self.assertEqual(2, soup.p.sourceline)
-        self.assertEqual(5, soup.p.sourcepos)
-        self.assertEqual("sourceline", soup.p.find('sourceline').name)
+        assert 2 == soup.p.sourceline
+        assert 5 == soup.p.sourcepos
+        assert "sourceline" == soup.p.find('sourceline').name

         # You can deactivate this behavior.
         soup = self.soup(markup, store_line_numbers=False)
-        self.assertEqual("sourceline", soup.p.sourceline.name)
-        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+        assert "sourceline" == soup.p.sourceline.name
+        assert "sourcepos" == soup.p.sourcepos.name

     def test_special_string_containers(self):
         # The html5lib tree builder doesn't support this standard feature,
@@ -219,8 +216,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         div = self.soup(markup).div
         without_element = div.encode()
         expect = b"<div>%s</div>" % output_unicode.encode("utf8")
-        self.assertEqual(without_element, expect)
+        assert without_element == expect

         with_element = div.encode(formatter="html")
         expect = b"<div>%s</div>" % output_element
-        self.assertEqual(with_element, expect)
+        assert with_element == expect

@@ -4,11 +4,14 @@ trees."""
 from pdb import set_trace
 import pickle
 import warnings
-from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
-from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder import (
+    HTMLParserTreeBuilder,
+    XMLParsedAsHTMLWarning,
+)
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
+from . import SoupTest, HTMLTreeBuilderSmokeTest

-class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
+class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):

     default_builder = HTMLParserTreeBuilder

@@ -27,30 +30,30 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         tree = self.soup("<a><b>foo</a>")
         dumped = pickle.dumps(tree, 2)
         loaded = pickle.loads(dumped)
-        self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+        assert isinstance(loaded.builder, type(tree.builder))

     def test_redundant_empty_element_closing_tags(self):
-        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
-        self.assertSoupEquals('</br></br></br>', "")
+        self.assert_soup('<br></br><br></br><br></br>', "<br/><br/><br/>")
+        self.assert_soup('</br></br></br>', "")

     def test_empty_element(self):
         # This verifies that any buffered data present when the parser
         # finishes working is handled.
-        self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
+        self.assert_soup("foo &# bar", "foo &amp;# bar")

     def test_tracking_line_numbers(self):
         # The html.parser TreeBuilder keeps track of line number and
         # position of each element.
         markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
         soup = self.soup(markup)
-        self.assertEqual(2, soup.p.sourceline)
-        self.assertEqual(3, soup.p.sourcepos)
-        self.assertEqual("sourceline", soup.p.find('sourceline').name)
+        assert 2 == soup.p.sourceline
+        assert 3 == soup.p.sourcepos
+        assert "sourceline" == soup.p.find('sourceline').name

         # You can deactivate this behavior.
         soup = self.soup(markup, store_line_numbers=False)
-        self.assertEqual("sourceline", soup.p.sourceline.name)
-        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+        assert "sourceline" == soup.p.sourceline.name
+        assert "sourcepos" == soup.p.sourcepos.name

     def test_on_duplicate_attribute(self):
         # The html.parser tree builder has a variety of ways of
@@ -61,20 +64,20 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         # If you don't provide any particular value for
         # on_duplicate_attribute, later values replace earlier values.
         soup = self.soup(markup)
-        self.assertEqual("url3", soup.a['href'])
-        self.assertEqual(["cls"], soup.a['class'])
-        self.assertEqual("id", soup.a['id'])
+        assert "url3" == soup.a['href']
+        assert ["cls"] == soup.a['class']
+        assert "id" == soup.a['id']

         # You can also get this behavior explicitly.
         def assert_attribute(on_duplicate_attribute, expected):
             soup = self.soup(
                 markup, on_duplicate_attribute=on_duplicate_attribute
             )
-            self.assertEqual(expected, soup.a['href'])
+            assert expected == soup.a['href']

             # Verify that non-duplicate attributes are treated normally.
-            self.assertEqual(["cls"], soup.a['class'])
-            self.assertEqual("id", soup.a['id'])
+            assert ["cls"] == soup.a['class']
+            assert "id" == soup.a['id']
         assert_attribute(None, "url3")
         assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

@@ -114,12 +117,11 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         div = self.soup(markup).div
         without_element = div.encode()
         expect = b"<div>%s</div>" % output_unicode.encode("utf8")
-        self.assertEqual(without_element, expect)
+        assert without_element == expect

         with_element = div.encode(formatter="html")
         expect = b"<div>%s</div>" % output_element
-        self.assertEqual(with_element, expect)
+        assert with_element == expect


 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):

@@ -1,5 +1,6 @@
 """Tests to ensure that the lxml tree builder generates good trees."""

+import pickle
 import re
 import warnings

@@ -19,9 +20,7 @@ from bs4 import (
     BeautifulStoneSoup,
 )
 from bs4.element import Comment, Doctype, SoupStrainer
-from bs4.testing import skipIf
-from bs4.tests import test_htmlparser
-from bs4.testing import (
+from . import (
     HTML5TreeBuilderSmokeTest,
     XMLTreeBuilderSmokeTest,
     SoupTest,
@@ -31,7 +30,7 @@ from bs4.testing import (
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its tree builder.")
-class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
+class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
     """See ``HTMLTreeBuilderSmokeTest``."""

     @property
@@ -39,11 +38,11 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         return LXMLTreeBuilder

     def test_out_of_range_entity(self):
-        self.assertSoupEquals(
+        self.assert_soup(
             "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
-        self.assertSoupEquals(
+        self.assert_soup(
             "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
-        self.assertSoupEquals(
+        self.assert_soup(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

     def test_entities_in_foreign_document_encoding(self):
@@ -61,15 +60,15 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_empty_doctype(self):
         soup = self.soup("<!DOCTYPE>")
         doctype = soup.contents[0]
-        self.assertEqual("", doctype.strip())
+        assert "" == doctype.strip()

     def test_beautifulstonesoup_is_xml_parser(self):
         # Make sure that the deprecated BSS class uses an xml builder
         # if one is installed.
         with warnings.catch_warnings(record=True) as w:
             soup = BeautifulStoneSoup("<b />")
-        self.assertEqual("<b/>", str(soup.b))
-        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
+        assert "<b/>" == str(soup.b)
+        assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)

     def test_tracking_line_numbers(self):
         # The lxml TreeBuilder cannot keep track of line numbers from
@@ -83,13 +82,13 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
             store_line_numbers=True
         )
-        self.assertEqual("sourceline", soup.p.sourceline.name)
-        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+        assert "sourceline" == soup.p.sourceline.name
+        assert "sourcepos" == soup.p.sourcepos.name

 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
-class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
+class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
     """See ``HTMLTreeBuilderSmokeTest``."""

     @property
@@ -97,19 +96,104 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
         return LXMLTreeBuilderForXML

     def test_namespace_indexing(self):
-        # We should not track un-prefixed namespaces as we can only hold one
-        # and it will be recognized as the default namespace by soupsieve,
-        # which may be confusing in some situations. When no namespace is provided
-        # for a selector, the default namespace (if defined) is assumed.
-        soup = self.soup(
-            '<?xml version="1.1"?>\n'
-            '<root>'
-            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
-            '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
-            '</root>'
-        )
-        self.assertEqual(
-            soup._namespaces,
-            {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
-        )
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
+            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
+            '<subtag xmlns="http://another-unprefixed-namespace.com">'
+            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
+            '</prefix2:tag3>'
+            '</root>'
+        )
+
+        # The BeautifulSoup object includes every namespace prefix
+        # defined in the entire document. This is the default set of
+        # namespaces used by soupsieve.
+        #
+        # Un-prefixed namespaces are not included, and if a given
+        # prefix is defined twice, only the first prefix encountered
+        # in the document shows up here.
+        assert soup._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+            'prefix': 'http://prefixed-namespace.com',
+            'prefix2': 'http://another-namespace.com'
+        }
+
+        # A Tag object includes only the namespace prefixes
+        # that were in scope when it was parsed.
+
+        # We do not track un-prefixed namespaces as we can only hold
+        # one (the first one), and it will be recognized as the
+        # default namespace by soupsieve, even when operating from a
+        # tag with a different un-prefixed namespace.
+        assert soup.tag._namespaces == {
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.tag2._namespaces == {
+            'prefix': 'http://prefixed-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+        assert soup.subsubtag._namespaces == {
+            'prefix2': 'http://another-namespace.com',
+            'xml': 'http://www.w3.org/XML/1998/namespace',
+        }
+
+    def test_namespace_interaction_with_select_and_find(self):
+        # Demonstrate how namespaces interact with select* and
+        # find* methods.
+
+        soup = self.soup(
+            '<?xml version="1.1"?>\n'
+            '<root>'
+            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
+            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
+            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
+            '<prefix:tag3>'
+            '</subtag>'
+            '</root>'
+        )
+
+        # soupselect uses namespace URIs.
+        assert soup.select_one('tag').name == 'tag'
+        assert soup.select_one('prefix|tag2').name == 'tag2'
+
+        # If a prefix is declared more than once, only the first usage
+        # is registered with the BeautifulSoup object.
+        assert soup.select_one('prefix|tag3') is None
+
+        # But you can always explicitly specify a namespace dictionary.
+        assert soup.select_one(
+            'prefix|tag3', namespaces=soup.subtag._namespaces
+        ).name == 'tag3'
+
+        # And a Tag (as opposed to the BeautifulSoup object) will
+        # have a set of default namespaces scoped to that Tag.
+        assert soup.subtag.select_one('prefix|tag3').name == 'tag3'
+
+        # the find() methods aren't fully namespace-aware; they just
+        # look at prefixes.
+        assert soup.find('tag').name == 'tag'
+        assert soup.find('prefix:tag2').name == 'tag2'
+        assert soup.find('prefix:tag3').name == 'tag3'
+        assert soup.subtag.find('prefix:tag3').name == 'tag3'
+
+    def test_pickle_removes_builder(self):
+        # The lxml TreeBuilder is not picklable, so it won't be
+        # preserved in a pickle/unpickle operation.
+
+        soup = self.soup("<a>some markup</a>")
+        assert isinstance(soup.builder, self.default_builder)
+        pickled = pickle.dumps(soup)
+        unpickled = pickle.loads(pickled)
+        assert "some markup" == unpickled.a.string
+        assert unpickled.builder is None

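The namespace rules above, condensed into a standalone sketch (requires lxml for the XML builder; the URIs are the same placeholders the tests use):

    from bs4 import BeautifulSoup

    doc = (
        '<root>'
        '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
        '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
        '</root>'
    )
    soup = BeautifulSoup(doc, "xml")
    # CSS selectors address prefixed namespaces as "prefix|name"...
    assert soup.select_one("prefix|tag2").name == "tag2"
    # ...while find() just matches the literal prefix.
    assert soup.find("prefix:tag2").name == "tag2"
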
144 libs/bs4/tests/test_navigablestring.py Normal file
@@ -0,0 +1,144 @@
import pytest

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    RubyParenthesisString,
    RubyTextString,
    Script,
    Stylesheet,
    TemplateString,
)

from . import SoupTest


class TestNavigableString(SoupTest):

    def test_text_acquisition_methods(self):
        # These methods are intended for use against Tag, but they
        # work on NavigableString as well,

        s = NavigableString("fee ")
        cdata = CData("fie ")
        comment = Comment("foe ")

        assert "fee " == s.get_text()
        assert "fee" == s.get_text(strip=True)
        assert ["fee "] == list(s.strings)
        assert ["fee"] == list(s.stripped_strings)
        assert ["fee "] == list(s._all_strings())

        assert "fie " == cdata.get_text()
        assert "fie" == cdata.get_text(strip=True)
        assert ["fie "] == list(cdata.strings)
        assert ["fie"] == list(cdata.stripped_strings)
        assert ["fie "] == list(cdata._all_strings())

        # Since a Comment isn't normally considered 'text',
        # these methods generally do nothing.
        assert "" == comment.get_text()
        assert [] == list(comment.strings)
        assert [] == list(comment.stripped_strings)
        assert [] == list(comment._all_strings())

        # Unless you specifically say that comments are okay.
        assert "foe" == comment.get_text(strip=True, types=Comment)
        assert "foe " == comment.get_text(types=(Comment, NavigableString))

    def test_string_has_immutable_name_property(self):
        # string.name is defined as None and can't be modified
        string = self.soup("s").string
        assert None == string.name
        with pytest.raises(AttributeError):
            string.name = 'foo'


class TestNavigableStringSubclasses(SoupTest):

    def test_cdata(self):
        # None of the current builders turn CDATA sections into CData
        # objects, but you can create them manually.
        soup = self.soup("")
        cdata = CData("foo")
        soup.insert(1, cdata)
        assert str(soup) == "<![CDATA[foo]]>"
        assert soup.find(string="foo") == "foo"
        assert soup.contents[0] == "foo"

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0
        def increment(*args):
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        cdata = CData("<><><>")
        soup.insert(1, cdata)
        assert b"<![CDATA[<><><>]]>" == soup.encode(formatter=increment)
        assert 1 == self.count

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always ends
        # in a newline.
        doctype = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, doctype)
        assert soup.encode() == b"<!DOCTYPE foo>\n"

    def test_declaration(self):
        d = Declaration("foo")
        assert "<?foo?>" == d.output_ready()

    def test_default_string_containers(self):
        # In some cases, we use different NavigableString subclasses for
        # the same text in different tags.
        soup = self.soup(
            "<div>text</div><script>text</script><style>text</style>"
        )
        assert [NavigableString, Script, Stylesheet] == [
            x.__class__ for x in soup.find_all(string=True)
        ]

        # The TemplateString is a little unusual because it's generally found
        # _inside_ children of a <template> element, not a direct child of the
        # <template> element.
        soup = self.soup(
            "<template>Some text<p>In a tag</p></template>Some text outside"
        )
        assert all(
            isinstance(x, TemplateString)
            for x in soup.template._all_strings(types=None)
        )

        # Once the <template> tag closed, we went back to using
        # NavigableString.
        outside = soup.template.next_sibling
        assert isinstance(outside, NavigableString)
        assert not isinstance(outside, TemplateString)

        # The TemplateString is also unusual because it can contain
        # NavigableString subclasses of _other_ types, such as
        # Comment.
        markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>"
        soup = self.soup(markup)
        assert markup == soup.template.encode("utf8")

    def test_ruby_strings(self):
        markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>"
        soup = self.soup(markup)
        assert isinstance(soup.rp.string, RubyParenthesisString)
        assert isinstance(soup.rt.string, RubyTextString)

        # Just as a demo, here's what this means for get_text usage.
        assert "漢字" == soup.get_text(strip=True)
        assert "漢(kan)字(ji)" == soup.get_text(
            strip=True,
            types=(NavigableString, RubyTextString, RubyParenthesisString)
        )

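The types= filter demonstrated above also works from a Tag; a brief sketch using only behavior these tests assert:

    from bs4 import BeautifulSoup
    from bs4.element import Comment, NavigableString

    soup = BeautifulSoup("<p>text<!--a comment--></p>", "html.parser")
    assert soup.get_text() == "text"                     # comments are skipped
    assert soup.get_text(types=(NavigableString, Comment)) == "texta comment"
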
751
libs/bs4/tests/test_pageelement.py
Normal file
751
libs/bs4/tests/test_pageelement.py
Normal file
|
@@ -0,0 +1,751 @@
+"""Tests of the bs4.element.PageElement class"""
+import copy
+import pickle
+import pytest
+
+from soupsieve import SelectorSyntaxError
+
+from bs4 import BeautifulSoup
+from bs4.element import (
+    Comment,
+    SoupStrainer,
+)
+from . import SoupTest
+
+
+class TestEncoding(SoupTest):
+    """Test the ability to encode objects into strings."""
+
+    def test_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8")
+
+    def test_tag_containing_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.encode("utf-8") == html.encode("utf-8")
+
+    def test_encoding_substitutes_unrecognized_characters_by_default(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert soup.b.encode("ascii") == b"<b>&#9731;</b>"
+
+    def test_encoding_can_be_made_strict(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        with pytest.raises(UnicodeEncodeError):
+            soup.encode("ascii", errors="strict")
+
+    def test_decode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert "\N{SNOWMAN}" == soup.b.decode_contents()
+
+    def test_encode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(
+            encoding="utf8"
+        )
+
+    def test_deprecated_renderContents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()
+
+    def test_repr(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        assert html == repr(soup)
+
+
+class TestFormatters(SoupTest):
+    """Test the formatting feature, used by methods like decode() and
+    prettify(), and the formatters themselves.
+    """
+
+    def test_default_formatter_is_minimal(self):
+        markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        assert decoded == self.document_for(
+            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_html(self):
+        markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        assert decoded == self.document_for(
+            "<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_html5(self):
+        markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html5")
+        assert decoded == self.document_for(
+            "<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_minimal(self):
+        markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        assert decoded == self.document_for(
+            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        )
+
+    def test_formatter_null(self):
+        markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
+        assert decoded == self.document_for(
+            "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
+        )
+
+    def test_formatter_custom(self):
+        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter = lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        assert decoded == self.document_for("<b>&lt;FOO&gt;</b><b>BAR</b><br/>")
+
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = '<a href="http://a.com?a=b&c=é">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+        assert expect_minimal == a.decode()
+        assert expect_minimal == a.decode(formatter="minimal")
+
+        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        assert expect_html == a.decode(formatter="html")
+
+        assert markup == a.decode(formatter=None)
+        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        assert expect_upper == a.decode(formatter=lambda x: x.upper())
+
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        assert b"< < hey > >" in encoded
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        assert b"< < hey > >" in encoded
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        assert '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()
+
+    def test_prettify_accepts_formatter_function(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
+        pretty = soup.prettify(formatter = lambda x: x.upper())
+        assert "FOO" in pretty
+
+    def test_prettify_outputs_unicode_by_default(self):
+        soup = self.soup("<a></a>")
+        assert str == type(soup.prettify())
+
+    def test_prettify_can_encode_data(self):
+        soup = self.soup("<a></a>")
+        assert bytes == type(soup.prettify("utf-8"))
+
+    def test_html_entity_substitution_off_by_default(self):
+        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        assert encoded == markup.encode('utf-8')
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset appears unchanged.
+        assert soup.meta['content'] == 'text/html; charset=x-sjis'
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        assert b"charset=utf-8" in utf_8
+
+        euc_jp = soup.encode("euc_jp")
+        assert b"charset=euc_jp" in euc_jp
+
+        shift_jis = soup.encode("shift-jis")
+        assert b"charset=shift-jis" in shift_jis
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        assert "charset=utf-16" in utf_16_u
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                  'http-equiv="Content-type"/></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = self.soup(markup, parse_only=strainer)
+        assert soup.contents[0].name == 'pre'
+
+
+class TestCSSSelectors(SoupTest):
+    """Test basic CSS selector functionality.
+
+    This functionality is implemented in soupsieve, which has a much
+    more comprehensive test suite, so this is basically an extra check
+    that soupsieve works as expected.
+    """
+
+    HTML = """
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+"http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>The title</title>
+<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
+</head>
+<body>
+<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
+<div id="main" class="fancy">
+<div id="inner">
+<h1 id="header1">An H1</h1>
+<p>Some text</p>
+<p class="onep" id="p1">Some more text</p>
+<h2 id="header2">An H2</h2>
+<p class="class1 class2 class3" id="pmulti">Another</p>
+<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
+<h2 id="header3">Another H2</h2>
+<a id="me" href="http://simonwillison.net/" rel="me">me</a>
+<span class="s1">
+<a href="#" id="s1a1">span1a1</a>
+<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
+<span class="span2">
+<a href="#" id="s2a1">span2a1</a>
+</span>
+<span class="span3"></span>
+<custom-dashed-tag class="dashed" id="dash2"/>
+<div data-tag="dashedvalue" id="data1"/>
+</span>
+</div>
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
+<p lang="en" id="lang-en">English</p>
+<p lang="en-gb" id="lang-en-gb">English UK</p>
+<p lang="en-us" id="lang-en-us">English US</p>
+<p lang="fr" id="lang-fr">French</p>
+</div>
+
+<div id="footer">
+</div>
+"""
+
+    def setup_method(self):
+        self.soup = BeautifulSoup(self.HTML, 'html.parser')
+
+    def assert_selects(self, selector, expected_ids, **kwargs):
+        el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
+        el_ids.sort()
+        expected_ids.sort()
+        assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
+            selector, ', '.join(expected_ids), ', '.join(el_ids)
+        )
+
+    assertSelect = assert_selects
+
+    def assert_select_multiple(self, *tests):
+        for selector, expected_ids in tests:
+            self.assert_selects(selector, expected_ids)
+
+    def test_one_tag_one(self):
+        els = self.soup.select('title')
+        assert len(els) == 1
+        assert els[0].name == 'title'
+        assert els[0].contents == ['The title']
+
+    def test_one_tag_many(self):
+        els = self.soup.select('div')
+        assert len(els) == 4
+        for div in els:
+            assert div.name == 'div'
+
+        el = self.soup.select_one('div')
+        assert 'main' == el['id']
+
+    def test_select_one_returns_none_if_no_match(self):
+        match = self.soup.select_one('nonexistenttag')
+        assert None == match
+
+
+    def test_tag_in_tag_one(self):
+        els = self.soup.select('div div')
+        self.assert_selects('div div', ['inner', 'data1'])
+
+    def test_tag_in_tag_many(self):
+        for selector in ('html div', 'html body div', 'body div'):
+            self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])
+
+
+    def test_limit(self):
+        self.assert_selects('html div', ['main'], limit=1)
+        self.assert_selects('html body div', ['inner', 'main'], limit=2)
+        self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
+                            limit=10)
+
+    def test_tag_no_match(self):
+        assert len(self.soup.select('del')) == 0
+
+    def test_invalid_tag(self):
+        with pytest.raises(SelectorSyntaxError):
+            self.soup.select('tag%t')
+
+    def test_select_dashed_tag_ids(self):
+        self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])
+
+    def test_select_dashed_by_id(self):
+        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
+        assert dashed[0].name == 'custom-dashed-tag'
+        assert dashed[0]['id'] == 'dash2'
+
+    def test_dashed_tag_text(self):
+        assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'
+
+    def test_select_dashed_matches_find_all(self):
+        assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')
+
+    def test_header_tags(self):
+        self.assert_select_multiple(
+            ('h1', ['header1']),
+            ('h2', ['header2', 'header3']),
+        )
+
+    def test_class_one(self):
+        for selector in ('.onep', 'p.onep', 'html p.onep'):
+            els = self.soup.select(selector)
+            assert len(els) == 1
+            assert els[0].name == 'p'
+            assert els[0]['class'] == ['onep']
+
+    def test_class_mismatched_tag(self):
+        els = self.soup.select('div.onep')
+        assert len(els) == 0
+
+    def test_one_id(self):
+        for selector in ('div#inner', '#inner', 'div div#inner'):
+            self.assert_selects(selector, ['inner'])
+
+    def test_bad_id(self):
+        els = self.soup.select('#doesnotexist')
+        assert len(els) == 0
+
+    def test_items_in_id(self):
+        els = self.soup.select('div#inner p')
+        assert len(els) == 3
+        for el in els:
+            assert el.name == 'p'
+        assert els[1]['class'] == ['onep']
+        assert not els[0].has_attr('class')
+
+    def test_a_bunch_of_emptys(self):
+        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
+            assert len(self.soup.select(selector)) == 0
+
+    def test_multi_class_support(self):
+        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
+                         '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
+            self.assert_selects(selector, ['pmulti'])
+
+    def test_multi_class_selection(self):
+        for selector in ('.class1.class3', '.class3.class2',
+                         '.class1.class2.class3'):
+            self.assert_selects(selector, ['pmulti'])
+
+    def test_child_selector(self):
+        self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
+        self.assert_selects('.s1 > a span', ['s1a2s1'])
+
+    def test_child_selector_id(self):
+        self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])
+
+    def test_attribute_equals(self):
+        self.assert_select_multiple(
+            ('p[class="onep"]', ['p1']),
+            ('p[id="p1"]', ['p1']),
+            ('[class="onep"]', ['p1']),
+            ('[id="p1"]', ['p1']),
+            ('link[rel="stylesheet"]', ['l1']),
+            ('link[type="text/css"]', ['l1']),
+            ('link[href="blah.css"]', ['l1']),
+            ('link[href="no-blah.css"]', []),
+            ('[rel="stylesheet"]', ['l1']),
+            ('[type="text/css"]', ['l1']),
+            ('[href="blah.css"]', ['l1']),
+            ('[href="no-blah.css"]', []),
+            ('p[href="no-blah.css"]', []),
+            ('[href="no-blah.css"]', []),
+        )
+
+    def test_attribute_tilde(self):
+        self.assert_select_multiple(
+            ('p[class~="class1"]', ['pmulti']),
+            ('p[class~="class2"]', ['pmulti']),
+            ('p[class~="class3"]', ['pmulti']),
+            ('[class~="class1"]', ['pmulti']),
+            ('[class~="class2"]', ['pmulti']),
+            ('[class~="class3"]', ['pmulti']),
+            ('a[rel~="friend"]', ['bob']),
+            ('a[rel~="met"]', ['bob']),
+            ('[rel~="friend"]', ['bob']),
+            ('[rel~="met"]', ['bob']),
+        )
+
+    def test_attribute_startswith(self):
+        self.assert_select_multiple(
+            ('[rel^="style"]', ['l1']),
+            ('link[rel^="style"]', ['l1']),
+            ('notlink[rel^="notstyle"]', []),
+            ('[rel^="notstyle"]', []),
+            ('link[rel^="notstyle"]', []),
+            ('link[href^="bla"]', ['l1']),
+            ('a[href^="http://"]', ['bob', 'me']),
+            ('[href^="http://"]', ['bob', 'me']),
+            ('[id^="p"]', ['pmulti', 'p1']),
+            ('[id^="m"]', ['me', 'main']),
+            ('div[id^="m"]', ['main']),
+            ('a[id^="m"]', ['me']),
+            ('div[data-tag^="dashed"]', ['data1'])
+        )
+
+    def test_attribute_endswith(self):
+        self.assert_select_multiple(
+            ('[href$=".css"]', ['l1']),
+            ('link[href$=".css"]', ['l1']),
+            ('link[id$="1"]', ['l1']),
+            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id$="1"]', ['data1']),
+            ('[id$="noending"]', []),
+        )
+
+    def test_attribute_contains(self):
+        self.assert_select_multiple(
+            # From test_attribute_startswith
+            ('[rel*="style"]', ['l1']),
+            ('link[rel*="style"]', ['l1']),
+            ('notlink[rel*="notstyle"]', []),
+            ('[rel*="notstyle"]', []),
+            ('link[rel*="notstyle"]', []),
+            ('link[href*="bla"]', ['l1']),
+            ('[href*="http://"]', ['bob', 'me']),
+            ('[id*="p"]', ['pmulti', 'p1']),
+            ('div[id*="m"]', ['main']),
+            ('a[id*="m"]', ['me']),
+            # From test_attribute_endswith
+            ('[href*=".css"]', ['l1']),
+            ('link[href*=".css"]', ['l1']),
+            ('link[id*="1"]', ['l1']),
+            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id*="1"]', ['data1']),
+            ('[id*="noending"]', []),
+            # New for this test
+            ('[href*="."]', ['bob', 'me', 'l1']),
+            ('a[href*="."]', ['bob', 'me']),
+            ('link[href*="."]', ['l1']),
+            ('div[id*="n"]', ['main', 'inner']),
+            ('div[id*="nn"]', ['inner']),
+            ('div[data-tag*="edval"]', ['data1'])
+        )
+
+    def test_attribute_exact_or_hypen(self):
+        self.assert_select_multiple(
+            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('p[lang|="fr"]', ['lang-fr']),
+            ('p[lang|="gb"]', []),
+        )
+
+    def test_attribute_exists(self):
+        self.assert_select_multiple(
+            ('[rel]', ['l1', 'bob', 'me']),
+            ('link[rel]', ['l1']),
+            ('a[rel]', ['bob', 'me']),
+            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
+            ('p[class]', ['p1', 'pmulti']),
+            ('[blah]', []),
+            ('p[blah]', []),
+            ('div[data-tag]', ['data1'])
+        )
+
+    def test_quoted_space_in_selector_name(self):
+        html = """<div style="display: wrong">nope</div>
+        <div style="display: right">yes</div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        [chosen] = soup.select('div[style="display: right"]')
+        assert "yes" == chosen.string
+
+    def test_unsupported_pseudoclass(self):
+        with pytest.raises(NotImplementedError):
+            self.soup.select("a:no-such-pseudoclass")
+
+        with pytest.raises(SelectorSyntaxError):
+            self.soup.select("a:nth-of-type(a)")
+
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        assert len(els) == 1
+        assert els[0].string == 'Some text'
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        assert len(els) == 1
+        assert els[0].string == 'Another'
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        assert len(els) == 0
+
+        # Zero will select no tags.
+        els = self.soup.select('div p:nth-of-type(0)')
+        assert len(els) == 0
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        assert len(els) == 1
+        assert els[0].string == 'Some text'
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])
+
+    def test_select_on_element(self):
+        # Other tests operate on the tree; this operates on an element
+        # within the tree.
+        inner = self.soup.find("div", id="main")
+        selected = inner.select("div")
+        # The <div id="inner"> tag was selected. The <div id="footer">
+        # tag was not.
+        self.assert_selects_ids(selected, ['inner', 'data1'])
+
+    def test_overspecified_child_id(self):
+        self.assert_selects(".fancy #inner", ['inner'])
+        self.assert_selects(".normal #inner", [])
+
+    def test_adjacent_sibling_selector(self):
+        self.assert_selects('#p1 + h2', ['header2'])
+        self.assert_selects('#p1 + h2 + p', ['pmulti'])
+        self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
+        assert [] == self.soup.select('#p1 + p')
+
+    def test_general_sibling_selector(self):
+        self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
+        self.assert_selects('#p1 ~ #header2', ['header2'])
+        self.assert_selects('#p1 ~ h2 + a', ['me'])
+        self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
+        assert [] == self.soup.select('#inner ~ h2')
+
+    def test_dangling_combinator(self):
+        with pytest.raises(SelectorSyntaxError):
+            self.soup.select('h1 >')
+
+    def test_sibling_combinator_wont_select_same_tag_twice(self):
+        self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
+
+    # Test the selector grouping operator (the comma)
+    def test_multiple_select(self):
+        self.assert_selects('x, y', ['xid', 'yid'])
+
+    def test_multiple_select_with_no_space(self):
+        self.assert_selects('x,y', ['xid', 'yid'])
+
+    def test_multiple_select_with_more_space(self):
+        self.assert_selects('x,    y', ['xid', 'yid'])
+
+    def test_multiple_select_duplicated(self):
+        self.assert_selects('x, x', ['xid'])
+
+    def test_multiple_select_sibling(self):
+        self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
+
+    def test_multiple_select_tag_and_direct_descendant(self):
+        self.assert_selects('x, y > z', ['xid', 'zidb'])
+
+    def test_multiple_select_direct_descendant_and_tags(self):
+        self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
+
+    def test_multiple_select_indirect_descendant(self):
+        self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
+
+    def test_invalid_multiple_select(self):
+        with pytest.raises(SelectorSyntaxError):
+            self.soup.select(',x, y')
+        with pytest.raises(SelectorSyntaxError):
+            self.soup.select('x,,y')
+
+    def test_multiple_select_attrs(self):
+        self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
+
+    def test_multiple_select_ids(self):
+        self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
+
+    def test_multiple_select_nested(self):
+        self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])
+
+    def test_select_duplicate_elements(self):
+        # When markup contains duplicate elements, a multiple select
+        # will find all of them.
+        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
+        soup = BeautifulSoup(markup, 'html.parser')
+        selected = soup.select(".c1, .c2")
+        assert 3 == len(selected)
+
+        # Verify that find_all finds the same elements, though because
+        # of an implementation detail it finds them in a different
+        # order.
+        for element in soup.find_all(class_=['c1', 'c2']):
+            assert element in selected
+
+
+class TestPersistence(SoupTest):
+    "Testing features like pickle and deepcopy."
+
+    def setup_method(self):
+        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+"http://www.w3.org/TR/REC-html40/transitional.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
+<link rev="made" href="mailto:leonardr@segfault.org">
+<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
+<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
+<meta name="author" content="Leonard Richardson">
+</head>
+<body>
+<a href="foo">foo</a>
+<a href="foo"><b>bar</b></a>
+</body>
+</html>"""
+        self.tree = self.soup(self.page)
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        dumped = pickle.dumps(self.tree, 2)
+        loaded = pickle.loads(dumped)
+        assert loaded.__class__ == BeautifulSoup
+        assert loaded.decode() == self.tree.decode()
+
+    def test_deepcopy_identity(self):
+        # Making a deepcopy of a tree yields an identical tree.
+        copied = copy.deepcopy(self.tree)
+        assert copied.decode() == self.tree.decode()
+
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
+        encoding = soup.original_encoding
+        copy = soup.__copy__()
+        assert "<p>\N{NO-BREAK SPACE}</p>" == str(copy)
+        assert encoding == copy.original_encoding
+
+    def test_copy_preserves_builder_information(self):
+
+        tag = self.soup('<p></p>').p
+
+        # Simulate a tag obtained from a source file.
+        tag.sourceline = 10
+        tag.sourcepos = 33
+
+        copied = tag.__copy__()
+
+        # The TreeBuilder object is no longer available, but information
+        # obtained from it gets copied over to the new Tag object.
+        assert tag.sourceline == copied.sourceline
+        assert tag.sourcepos == copied.sourcepos
+        assert tag.can_be_empty_element == copied.can_be_empty_element
+        assert tag.cdata_list_attributes == copied.cdata_list_attributes
+        assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags
+
+    def test_unicode_pickle(self):
+        # A tree containing Unicode characters can be pickled.
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
+        loaded = pickle.loads(dumped)
+        assert loaded.decode() == soup.decode()
+
+    def test_copy_navigablestring_is_not_attached_to_tree(self):
+        html = "<b>Foo<a></a></b><b>Bar</b>"
+        soup = self.soup(html)
+        s1 = soup.find(string="Foo")
+        s2 = copy.copy(s1)
+        assert s1 == s2
+        assert None == s2.parent
+        assert None == s2.next_element
+        assert None != s1.next_sibling
+        assert None == s2.next_sibling
+        assert None == s2.previous_element
+
+    def test_copy_navigablestring_subclass_has_same_type(self):
+        html = "<b><!--Foo--></b>"
+        soup = self.soup(html)
+        s1 = soup.string
+        s2 = copy.copy(s1)
+        assert s1 == s2
+        assert isinstance(s2, Comment)
+
+    def test_copy_entire_soup(self):
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        soup_copy = copy.copy(soup)
+        assert soup == soup_copy
+
+    def test_copy_tag_copies_contents(self):
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        div = soup.div
+        div_copy = copy.copy(div)
+
+        # The two tags look the same, and evaluate to equal.
+        assert str(div) == str(div_copy)
+        assert div == div_copy
+
+        # But they're not the same object.
+        assert div is not div_copy
+
+        # And they don't have the same relation to the parse tree. The
+        # copy is not associated with a parse tree at all.
+        assert None == div_copy.parent
+        assert None == div_copy.previous_element
+        assert None == div_copy.find(string='Bar').next_element
+        assert None != div.find(string='Bar').next_element
|
@ -4,7 +4,8 @@
|
||||||
from pdb import set_trace
|
from pdb import set_trace
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import unittest
|
import pickle
|
||||||
|
import pytest
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
@ -13,27 +14,21 @@ from bs4 import (
|
||||||
BeautifulStoneSoup,
|
BeautifulStoneSoup,
|
||||||
GuessedAtParserWarning,
|
GuessedAtParserWarning,
|
||||||
MarkupResemblesLocatorWarning,
|
MarkupResemblesLocatorWarning,
|
||||||
|
dammit,
|
||||||
)
|
)
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
|
builder_registry,
|
||||||
TreeBuilder,
|
TreeBuilder,
|
||||||
ParserRejectedMarkup,
|
ParserRejectedMarkup,
|
||||||
)
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
|
||||||
Comment,
|
Comment,
|
||||||
ContentMetaAttributeValue,
|
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
NamespacedAttribute,
|
|
||||||
Tag,
|
Tag,
|
||||||
NavigableString,
|
NavigableString,
|
||||||
)
|
)
|
||||||
|
|
||||||
import bs4.dammit
|
from . import (
|
||||||
from bs4.dammit import (
|
|
||||||
EntitySubstitution,
|
|
||||||
UnicodeDammit,
|
|
||||||
)
|
|
||||||
from bs4.testing import (
|
|
||||||
default_builder,
|
default_builder,
|
||||||
SoupTest,
|
SoupTest,
|
||||||
skipIf,
|
skipIf,
|
||||||
|
@ -53,17 +48,17 @@ class TestConstructor(SoupTest):
|
||||||
def test_short_unicode_input(self):
|
def test_short_unicode_input(self):
|
||||||
data = "<h1>éé</h1>"
|
data = "<h1>éé</h1>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
self.assertEqual("éé", soup.h1.string)
|
assert "éé" == soup.h1.string
|
||||||
|
|
||||||
def test_embedded_null(self):
|
def test_embedded_null(self):
|
||||||
data = "<h1>foo\0bar</h1>"
|
data = "<h1>foo\0bar</h1>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
self.assertEqual("foo\0bar", soup.h1.string)
|
assert "foo\0bar" == soup.h1.string
|
||||||
|
|
||||||
def test_exclude_encodings(self):
|
def test_exclude_encodings(self):
|
||||||
utf8_data = "Räksmörgås".encode("utf-8")
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||||
self.assertEqual("windows-1252", soup.original_encoding)
|
assert "windows-1252" == soup.original_encoding
|
||||||
|
|
||||||
def test_custom_builder_class(self):
|
def test_custom_builder_class(self):
|
||||||
# Verify that you can pass in a custom Builder class and
|
# Verify that you can pass in a custom Builder class and
|
||||||
|
@ -97,8 +92,8 @@ class TestConstructor(SoupTest):
|
||||||
with warnings.catch_warnings(record=True):
|
with warnings.catch_warnings(record=True):
|
||||||
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
||||||
assert isinstance(soup.builder, Mock)
|
assert isinstance(soup.builder, Mock)
|
||||||
self.assertEqual(dict(var="value"), soup.builder.called_with)
|
assert dict(var="value") == soup.builder.called_with
|
||||||
self.assertEqual("prepared markup", soup.builder.fed)
|
assert "prepared markup" == soup.builder.fed
|
||||||
|
|
||||||
# You can also instantiate the TreeBuilder yourself. In this
|
# You can also instantiate the TreeBuilder yourself. In this
|
||||||
# case, that specific object is used and any keyword arguments
|
# case, that specific object is used and any keyword arguments
|
||||||
|
@ -110,8 +105,8 @@ class TestConstructor(SoupTest):
|
||||||
)
|
)
|
||||||
msg = str(w[0].message)
|
msg = str(w[0].message)
|
||||||
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
||||||
self.assertEqual(builder, soup.builder)
|
assert builder == soup.builder
|
||||||
self.assertEqual(kwargs, builder.called_with)
|
assert kwargs == builder.called_with
|
||||||
|
|
||||||
def test_parser_markup_rejection(self):
|
def test_parser_markup_rejection(self):
|
||||||
# If markup is completely rejected by the parser, an
|
# If markup is completely rejected by the parser, an
|
||||||
|
@ -126,12 +121,11 @@ class TestConstructor(SoupTest):
|
||||||
yield markup, None, None, False
|
yield markup, None, None, False
|
||||||
yield markup, None, None, False
|
yield markup, None, None, False
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
self.assertRaisesRegex(
|
with pytest.raises(ParserRejectedMarkup) as exc_info:
|
||||||
ParserRejectedMarkup,
|
BeautifulSoup('', builder=Mock)
|
||||||
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
|
assert "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help." in str(exc_info.value)
|
||||||
BeautifulSoup, '', builder=Mock,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_cdata_list_attributes(self):
|
def test_cdata_list_attributes(self):
|
||||||
# Most attribute values are represented as scalars, but the
|
# Most attribute values are represented as scalars, but the
|
||||||
|
@ -142,14 +136,14 @@ class TestConstructor(SoupTest):
|
||||||
|
|
||||||
# Note that the spaces are stripped for 'class' but not for 'id'.
|
# Note that the spaces are stripped for 'class' but not for 'id'.
|
||||||
a = soup.a
|
a = soup.a
|
||||||
self.assertEqual(" an id ", a['id'])
|
assert " an id " == a['id']
|
||||||
self.assertEqual(["a", "class"], a['class'])
|
assert ["a", "class"] == a['class']
|
||||||
|
|
||||||
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
||||||
# you customize or disable this. As always, you can customize the TreeBuilder
|
# you customize or disable this. As always, you can customize the TreeBuilder
|
||||||
# by passing in a keyword argument to the BeautifulSoup constructor.
|
# by passing in a keyword argument to the BeautifulSoup constructor.
|
||||||
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
||||||
self.assertEqual(" a class ", soup.a['class'])
|
assert " a class " == soup.a['class']
|
||||||
|
|
||||||
# Here are two ways of saying that `id` is a multi-valued
|
# Here are two ways of saying that `id` is a multi-valued
|
||||||
# attribute in this context, but 'class' is not.
|
# attribute in this context, but 'class' is not.
|
||||||
|
@ -159,8 +153,8 @@ class TestConstructor(SoupTest):
|
||||||
# specifying a parser, but we'll ignore it.
|
# specifying a parser, but we'll ignore it.
|
||||||
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
||||||
a = soup.a
|
a = soup.a
|
||||||
self.assertEqual(["an", "id"], a['id'])
|
assert ["an", "id"] == a['id']
|
||||||
self.assertEqual(" a class ", a['class'])
|
assert " a class " == a['class']
|
||||||
|
|
||||||
def test_replacement_classes(self):
|
def test_replacement_classes(self):
|
||||||
# Test the ability to pass in replacements for element classes
|
# Test the ability to pass in replacements for element classes
|
||||||
|
@ -221,7 +215,7 @@ class TestConstructor(SoupTest):
|
||||||
|
|
||||||
# Now that parsing was complete, the string_container_stack
|
# Now that parsing was complete, the string_container_stack
|
||||||
# (where this information was kept) has been cleared out.
|
# (where this information was kept) has been cleared out.
|
||||||
self.assertEqual([], soup.string_container_stack)
|
assert [] == soup.string_container_stack
|
||||||
|
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
class TestWarnings(SoupTest):
|
||||||
|
@ -230,14 +224,12 @@ class TestWarnings(SoupTest):
|
||||||
for w in warnings:
|
for w in warnings:
|
||||||
if isinstance(w.message, cls):
|
if isinstance(w.message, cls):
|
||||||
return w
|
return w
|
||||||
raise Exception("%s warning not found in %r" % cls, warnings)
|
raise Exception("%s warning not found in %r" % (cls, warnings))
|
||||||
|
|
||||||
def _assert_no_parser_specified(self, w):
|
def _assert_no_parser_specified(self, w):
|
||||||
warning = self._assert_warning(w, GuessedAtParserWarning)
|
warning = self._assert_warning(w, GuessedAtParserWarning)
|
||||||
message = str(warning.message)
|
message = str(warning.message)
|
||||||
self.assertTrue(
|
assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
|
||||||
message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_warning_if_no_parser_specified(self):
|
def test_warning_if_no_parser_specified(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
@ -252,91 +244,89 @@ class TestWarnings(SoupTest):
|
||||||
def test_no_warning_if_explicit_parser_specified(self):
|
def test_no_warning_if_explicit_parser_specified(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
|
soup = BeautifulSoup("<a><b></b></a>", "html.parser")
|
||||||
self.assertEqual([], w)
|
assert [] == w
|
||||||
|
|
||||||
def test_parseOnlyThese_renamed_to_parse_only(self):
|
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
||||||
msg = str(w[0].message)
|
msg = str(w[0].message)
|
||||||
self.assertTrue("parseOnlyThese" in msg)
|
assert "parseOnlyThese" in msg
|
||||||
self.assertTrue("parse_only" in msg)
|
assert "parse_only" in msg
|
||||||
self.assertEqual(b"<b></b>", soup.encode())
|
assert b"<b></b>" == soup.encode()
|
||||||
|
|
||||||
def test_fromEncoding_renamed_to_from_encoding(self):
|
def test_fromEncoding_renamed_to_from_encoding(self):
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
utf8 = b"\xc3\xa9"
|
utf8 = b"\xc3\xa9"
|
||||||
soup = self.soup(utf8, fromEncoding="utf8")
|
soup = self.soup(utf8, fromEncoding="utf8")
|
||||||
msg = str(w[0].message)
|
msg = str(w[0].message)
|
||||||
self.assertTrue("fromEncoding" in msg)
|
assert "fromEncoding" in msg
|
||||||
self.assertTrue("from_encoding" in msg)
|
assert "from_encoding" in msg
|
||||||
self.assertEqual("utf8", soup.original_encoding)
|
assert "utf8" == soup.original_encoding
|
||||||
|
|
||||||
def test_unrecognized_keyword_argument(self):
|
def test_unrecognized_keyword_argument(self):
|
||||||
self.assertRaises(
|
with pytest.raises(TypeError):
|
||||||
TypeError, self.soup, "<a>", no_such_argument=True)
|
self.soup("<a>", no_such_argument=True)
|
||||||
|
|
||||||
def test_disk_file_warning(self):
|
@pytest.mark.parametrize(
|
||||||
filehandle = tempfile.NamedTemporaryFile()
|
"extension",
|
||||||
filename = filehandle.name
|
['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt',
|
||||||
try:
|
'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\file"]
|
||||||
|
)
|
||||||
|
def test_resembles_filename_warning(self, extension):
|
||||||
|
# A warning is issued if the "markup" looks like the name of
|
||||||
|
# an HTML or text file, or a full path to a file on disk.
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup(filename)
|
soup = self.soup("markup" + extension)
|
||||||
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
|
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
|
||||||
self.assertTrue("looks like a filename" in str(warning.message))
|
assert "looks more like a filename" in str(warning.message)
|
||||||
finally:
|
|
||||||
filehandle.close()
|
|
||||||
|
|
||||||
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
@pytest.mark.parametrize(
|
||||||
|
"extension",
|
||||||
|
['markuphtml', 'markup.com', '', 'markup.js']
|
||||||
|
)
|
||||||
|
def test_resembles_filename_no_warning(self, extension):
|
||||||
|
# The 'looks more like a filename' warning is not issued if
|
||||||
|
# the markup looks like a bare string, a domain name, or a
|
||||||
|
# file that's not an HTML file.
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = self.soup(filename)
|
soup = self.soup("markup" + extension)
|
||||||
self.assertEqual([], w)
|
assert [] == w
|
||||||
|
|
||||||
def test_directory_warning(self):
|
|
||||||
try:
|
|
||||||
filename = tempfile.mkdtemp()
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup(filename)
|
|
||||||
warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
|
|
||||||
self.assertTrue("looks like a directory" in str(warning.message))
|
|
||||||
finally:
|
|
||||||
os.rmdir(filename)
|
|
||||||
|
|
||||||
# The directory no longer exists, so Beautiful Soup will no longer issue the warning.
|
|
||||||
with warnings.catch_warnings(record=True) as w:
|
|
||||||
soup = self.soup(filename)
|
|
||||||
self.assertEqual([], w)
|
|
||||||
|
|
||||||
def test_url_warning_with_bytes_url(self):
|
def test_url_warning_with_bytes_url(self):
|
||||||
|
url = b"http://www.crummybytes.com/"
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup(b"http://www.crummybytes.com/")
|
soup = self.soup(url)
|
||||||
warning = self._assert_warning(
|
warning = self._assert_warning(
|
||||||
warning_list, MarkupResemblesLocatorWarning
|
warning_list, MarkupResemblesLocatorWarning
|
||||||
)
|
)
|
||||||
self.assertTrue("looks like a URL" in str(warning.message))
|
assert "looks more like a URL" in str(warning.message)
|
||||||
|
assert url not in str(warning.message).encode("utf8")
|
||||||
|
|
||||||
def test_url_warning_with_unicode_url(self):
|
def test_url_warning_with_unicode_url(self):
|
||||||
|
url = "http://www.crummyunicode.com/"
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
# note - this url must differ from the bytes one otherwise
|
# note - this url must differ from the bytes one otherwise
|
||||||
# python's warnings system swallows the second warning
|
# python's warnings system swallows the second warning
|
||||||
soup = self.soup("http://www.crummyunicode.com/")
|
soup = self.soup(url)
|
||||||
warning = self._assert_warning(
|
warning = self._assert_warning(
|
||||||
warning_list, MarkupResemblesLocatorWarning
|
warning_list, MarkupResemblesLocatorWarning
|
||||||
)
|
)
|
||||||
self.assertTrue("looks like a URL" in str(warning.message))
|
assert "looks more like a URL" in str(warning.message)
|
||||||
|
assert url not in str(warning.message)
|
||||||
|
|
||||||
def test_url_warning_with_bytes_and_space(self):
|
def test_url_warning_with_bytes_and_space(self):
|
||||||
# Here the markup contains something besides a URL, so no warning
|
# Here the markup contains something besides a URL, so no warning
|
||||||
# is issued.
|
# is issued.
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
assert not any("looks more like a URL" in str(w.message)
|
||||||
for w in warning_list))
|
for w in warning_list)
|
||||||
|
|
||||||
def test_url_warning_with_unicode_and_space(self):
|
def test_url_warning_with_unicode_and_space(self):
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup("http://www.crummyuncode.com/ is great")
|
soup = self.soup("http://www.crummyunicode.com/ is great")
|
||||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
assert not any("looks more like a URL" in str(w.message)
|
||||||
for w in warning_list))
|
for w in warning_list)
|
||||||
|
|
||||||
|
|
||||||
class TestSelectiveParsing(SoupTest):
|
class TestSelectiveParsing(SoupTest):
|
||||||
|
@ -345,235 +335,128 @@ class TestSelectiveParsing(SoupTest):
|
||||||
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
||||||
strainer = SoupStrainer("b")
|
strainer = SoupStrainer("b")
|
||||||
soup = self.soup(markup, parse_only=strainer)
|
soup = self.soup(markup, parse_only=strainer)
|
||||||
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"
|
||||||
|
|
||||||
|
|
||||||
class TestEntitySubstitution(unittest.TestCase):
|
class TestNewTag(SoupTest):
|
||||||
"""Standalone tests of the EntitySubstitution class."""
|
"""Test the BeautifulSoup.new_tag() method."""
|
||||||
def setUp(self):
|
def test_new_tag(self):
|
||||||
self.sub = EntitySubstitution
|
soup = self.soup("")
|
||||||
|
new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
|
||||||
|
assert isinstance(new_tag, Tag)
|
||||||
|
assert "foo" == new_tag.name
|
||||||
|
assert dict(bar="baz", name="a name") == new_tag.attrs
|
||||||
|
assert None == new_tag.parent
|
||||||
|
|
||||||
def test_simple_html_substitution(self):
|
def test_tag_inherits_self_closing_rules_from_builder(self):
|
||||||
# Unicode characters corresponding to named HTML entites
|
if LXML_PRESENT:
|
||||||
# are substituted, and no others.
|
xml_soup = BeautifulSoup("", "lxml-xml")
|
||||||
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
|
xml_br = xml_soup.new_tag("br")
|
||||||
self.assertEqual(self.sub.substitute_html(s),
|
xml_p = xml_soup.new_tag("p")
|
||||||
"foo∀\N{SNOWMAN}õbar")
|
|
||||||
|
|
||||||
def test_smart_quote_substitution(self):
|
# Both the <br> and <p> tag are empty-element, just because
|
||||||
# MS smart quotes are a common source of frustration, so we
|
# they have no contents.
|
||||||
# give them a special test.
|
assert b"<br/>" == xml_br.encode()
|
||||||
quotes = b"\x91\x92foo\x93\x94"
|
assert b"<p/>" == xml_p.encode()
|
||||||
dammit = UnicodeDammit(quotes)
|
|
||||||
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
|
||||||
"‘’foo“”")
|
|
||||||
|
|
||||||
def test_html5_entity(self):
|
html_soup = BeautifulSoup("", "html.parser")
|
||||||
# Some HTML5 entities correspond to single- or multi-character
|
html_br = html_soup.new_tag("br")
|
||||||
# Unicode sequences.
|
html_p = html_soup.new_tag("p")
|
||||||
|
|
||||||
for entity, u in (
|
# The HTML builder users HTML's rules about which tags are
|
||||||
# A few spot checks of our ability to recognize
|
# empty-element tags, and the new tags reflect these rules.
|
||||||
# special character sequences and convert them
|
assert b"<br/>" == html_br.encode()
|
||||||
# to named entities.
|
assert b"<p></p>" == html_p.encode()
|
||||||
('⊧', '\u22a7'),
|
|
||||||
('𝔑', '\U0001d511'),
|
|
||||||
('≧̸', '\u2267\u0338'),
|
|
||||||
('¬', '\xac'),
|
|
||||||
('⫬', '\u2aec'),
|
|
||||||
|
|
||||||
# We _could_ convert | to &verbarr;, but we don't, because
|
class TestNewString(SoupTest):
|
||||||
# | is an ASCII character.
|
"""Test the BeautifulSoup.new_string() method."""
|
||||||
('|' '|'),
|
def test_new_string_creates_navigablestring(self):
|
||||||
|
soup = self.soup("")
|
||||||
|
s = soup.new_string("foo")
|
||||||
|
assert "foo" == s
|
||||||
|
assert isinstance(s, NavigableString)
|
||||||
|
|
||||||
# Similarly for the fj ligature, which we could convert to
|
def test_new_string_can_create_navigablestring_subclass(self):
|
||||||
# fj, but we don't.
|
soup = self.soup("")
|
||||||
("fj", "fj"),
|
s = soup.new_string("foo", Comment)
|
||||||
|
assert "foo" == s
|
||||||
|
assert isinstance(s, Comment)
|
||||||
|
|
||||||
# We do convert _these_ ASCII characters to HTML entities,
|
|
||||||
# because that's required to generate valid HTML.
|
|
||||||
('>', '>'),
|
|
||||||
('<', '<'),
|
|
||||||
('&', '&'),
|
|
||||||
):
|
|
||||||
template = '3 %s 4'
|
|
||||||
raw = template % u
|
|
||||||
with_entities = template % entity
|
|
||||||
self.assertEqual(self.sub.substitute_html(raw), with_entities)
|
|
||||||
|
|
||||||
def test_html5_entity_with_variation_selector(self):
|
class TestPickle(SoupTest):
|
||||||
# Some HTML5 entities correspond either to a single-character
|
# Test our ability to pickle the BeautifulSoup object itself.
|
||||||
# Unicode sequence _or_ to the same character plus U+FE00,
|
|
||||||
# VARIATION SELECTOR 1. We can handle this.
|
|
||||||
data = "fjords \u2294 penguins"
|
|
||||||
markup = "fjords ⊔ penguins"
|
|
||||||
self.assertEqual(self.sub.substitute_html(data), markup)
|
|
||||||
|
|
||||||
data = "fjords \u2294\ufe00 penguins"
|
def test_normal_pickle(self):
|
||||||
markup = "fjords ⊔︀ penguins"
|
soup = self.soup("<a>some markup</a>")
|
||||||
self.assertEqual(self.sub.substitute_html(data), markup)
|
pickled = pickle.dumps(soup)
|
||||||
|
unpickled = pickle.loads(pickled)
|
||||||
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
assert "some markup" == unpickled.a.string
|
||||||
s = 'Welcome to "my bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
|
||||||
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
|
||||||
'"Welcome"')
|
|
||||||
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
|
||||||
'"Bob\'s Bar"')
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
|
||||||
s = 'Welcome to "my bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(s, True),
|
|
||||||
"'Welcome to \"my bar\"'")
|
|
||||||
|
|
||||||
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
|
||||||
s = 'Welcome to "Bob\'s Bar"'
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml(s, True),
|
|
||||||
'"Welcome to "Bob\'s Bar""')
|
|
||||||
|
|
||||||
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
|
||||||
quoted = 'Welcome to "Bob\'s Bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
|
||||||
|
|
||||||
def test_xml_quoting_handles_angle_brackets(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml("foo<bar>"),
|
|
||||||
"foo<bar>")
|
|
||||||
|
|
||||||
def test_xml_quoting_handles_ampersands(self):
|
|
||||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
|
||||||
|
|
||||||
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml("ÁT&T"),
|
|
||||||
"&Aacute;T&T")
|
|
||||||
|
|
||||||
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
|
||||||
self.assertEqual(
|
|
||||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
|
||||||
"ÁT&T")
|
|
||||||
|
|
||||||
def test_quotes_not_html_substituted(self):
|
|
||||||
"""There's no need to do this except inside attribute values."""
|
|
||||||
text = 'Bob\'s "bar"'
|
|
||||||
self.assertEqual(self.sub.substitute_html(text), text)
|
|
||||||
|
|
||||||
|
def test_pickle_with_no_builder(self):
|
||||||
|
# We had a bug that prevented pickling from working if
|
||||||
|
# the builder wasn't set.
|
||||||
|
soup = self.soup("some markup")
|
||||||
|
soup.builder = None
|
||||||
|
pickled = pickle.dumps(soup)
|
||||||
|
unpickled = pickle.loads(pickled)
|
||||||
|
assert "some markup" == unpickled.string
|
||||||
|
|
||||||
class TestEncodingConversion(SoupTest):
|
class TestEncodingConversion(SoupTest):
|
||||||
# Test Beautiful Soup's ability to decode and encode from various
|
# Test Beautiful Soup's ability to decode and encode from various
|
||||||
# encodings.
|
# encodings.
|
||||||
|
|
||||||
def setUp(self):
|
def setup_method(self):
|
||||||
super(TestEncodingConversion, self).setUp()
|
|
||||||
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||||
self.utf8_data = self.unicode_data.encode("utf-8")
|
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||||
# Just so you know what it looks like.
|
# Just so you know what it looks like.
|
||||||
self.assertEqual(
|
assert self.utf8_data == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
|
||||||
self.utf8_data,
|
|
||||||
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
|
||||||
|
|
||||||
def test_ascii_in_unicode_out(self):
|
    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        chardet = dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            dammit.chardet_dammit = noop
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
            self.assertTrue(isinstance(unicode_output, str))
            assert isinstance(unicode_output, str)
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            assert unicode_output == self.document_for(ascii.decode())
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
            assert soup_from_ascii.original_encoding.lower() == "utf-8"
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet
            dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        assert soup_from_unicode.decode() == self.unicode_data
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        assert soup_from_unicode.foo.string == 'Sacr\xe9 bleu!'
        self.assertEqual(soup_from_unicode.original_encoding, None)
        assert soup_from_unicode.original_encoding == None

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        assert soup_from_utf8.decode() == self.unicode_data
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
        assert soup_from_utf8.foo.string == 'Sacr\xe9 bleu!'

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
        assert soup_from_unicode.encode('utf-8') == self.utf8_data

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
        assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")


class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns", "")
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        self.assertEqual(a, "tag")

        a = NamespacedAttribute("", "tag")
        self.assertEqual(a, "tag")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):

    def test_content_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
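The encoding tests above all funnel through bs4's UnicodeDammit detector. As a standalone illustration of the behavior they pin down (a minimal sketch, not part of the commit; UnicodeDammit and original_encoding are the documented bs4 entry points):

import bs4
from bs4 import BeautifulSoup, UnicodeDammit

# A bytestring is detected and converted to Unicode; the guessed
# encoding is recorded on the result.
dammit = UnicodeDammit(b"Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)       # Sacré bleu!
print(dammit.original_encoding)    # utf-8

# BeautifulSoup records the same information when it parses bytes.
soup = BeautifulSoup(b"<foo>a</foo>", "html.parser")
print(soup.original_encoding)      # typically 'utf-8': ASCII is reported as its superset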
221  libs/bs4/tests/test_tag.py  Normal file
@ -0,0 +1,221 @@
import warnings
from bs4.element import (
    Comment,
    NavigableString,
)
from . import SoupTest

class TestTag(SoupTest):
    """Test various methods of Tag which aren't so complicated they
    need their own classes.
    """

    def test__should_pretty_print(self):
        # Test the rules about when a tag should be pretty-printed.
        tag = self.soup("").new_tag("a_tag")

        # No list of whitespace-preserving tags -> pretty-print
        tag._preserve_whitespace_tags = None
        assert True == tag._should_pretty_print(0)

        # List exists but tag is not on the list -> pretty-print
        tag.preserve_whitespace_tags = ["some_other_tag"]
        assert True == tag._should_pretty_print(1)

        # Indent level is None -> don't pretty-print
        assert False == tag._should_pretty_print(None)

        # Tag is on the whitespace-preserving list -> don't pretty-print
        tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
        assert False == tag._should_pretty_print(1)

    def test_len(self):
        """The length of a Tag is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        assert len(soup.contents) == 1
        assert len(soup) == 1

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        assert len(soup.top) == 3
        assert len(soup.top.contents) == 3

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        assert soup.b == soup.find('b')
        assert soup.b.i == soup.find('b').find('i')
        assert soup.a == None

    def test_deprecated_member_access(self):
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            tag = soup.bTag
        assert soup.b == tag
        assert '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")' == str(w[0].message)

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from
        __in__. has_attr() checks the tag's attributes and __in__
        checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        assert soup.foo.has_attr('attr')
        assert not soup.foo.has_attr('attr2')

    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A Tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        assert soup.b.string == 'foo'

    def test_empty_tag_has_no_string(self):
        # A Tag with no children has no .string.
        soup = self.soup("<b></b>")
        assert soup.b.string == None

    def test_tag_with_multiple_children_has_no_string(self):
        # A Tag with multiple children has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        assert soup.b.string == None

        soup = self.soup("<a>foo<b></b>bar</b>")
        assert soup.b.string == None

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        assert soup.a.string == None

    def test_tag_with_recursive_string_has_string(self):
        # A Tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        assert soup.a.string == "foo"
        assert soup.string == "foo"

    def test_lack_of_string(self):
        """Only a Tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        assert soup.b.string is None

        soup = self.soup("<b></b>")
        assert soup.b.string is None

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
        assert soup.a.text == "ar t "
        assert soup.a.get_text(strip=True) == "art"
        assert soup.a.get_text(",") == "a,r, , t "
        assert soup.a.get_text(",", strip=True) == "a,r,t"

    def test_get_text_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        assert soup.get_text() == "foobar"

        assert soup.get_text(types=(NavigableString, Comment)) == "fooIGNOREbar"
        assert soup.get_text(types=None) == "fooIGNOREbar"

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        assert soup.get_text() == "foobar"

    def test_all_strings_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        assert ['foo', 'bar'] == list(soup.strings)

        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
        assert ['foo', 'bar'] == list(soup.strings)

    def test_string_methods_inside_special_string_container_tags(self):
        # Strings inside tags like <script> are generally ignored by
        # methods like get_text, because they're not what humans
        # consider 'text'. But if you call get_text on the <script>
        # tag itself, those strings _are_ considered to be 'text',
        # because there's nothing else you might be looking for.

        style = self.soup("<div>a<style>Some CSS</style></div>")
        template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
        script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")

        assert style.div.get_text() == "a"
        assert list(style.div.strings) == ["a"]
        assert style.div.style.get_text() == "Some CSS"
        assert list(style.div.style.strings) == ['Some CSS']

        # The comment is not picked up here. That's because it was
        # parsed into a Comment object, which is not considered
        # interesting by template.strings.
        assert template.div.get_text() == "a"
        assert list(template.div.strings) == ["a"]
        assert template.div.template.get_text() == "Templated text."
        assert list(template.div.template.strings) == ["Templated ", "text", "."]

        # The comment is included here, because it didn't get parsed
        # into a Comment object--it's part of the Script string.
        assert script.div.get_text() == "a"
        assert list(script.div.strings) == ["a"]
        assert script.div.script.get_text() == "<!--a comment-->Some text"
        assert list(script.div.script.strings) == ['<!--a comment-->Some text']


class TestMultiValuedAttributes(SoupTest):
    """Test the behavior of multi-valued attributes like 'class'.

    The values of such attributes are always presented as lists.
    """

    def test_single_value_becomes_list(self):
        soup = self.soup("<a class='foo'>")
        assert ["foo"] == soup.a['class']

    def test_multiple_values_becomes_list(self):
        soup = self.soup("<a class='foo bar'>")
        assert ["foo", "bar"] == soup.a['class']

    def test_multiple_values_separated_by_weird_whitespace(self):
        soup = self.soup("<a class='foo\tbar\nbaz'>")
        assert ["foo", "bar", "baz"] == soup.a['class']

    def test_attributes_joined_into_string_on_output(self):
        soup = self.soup("<a class='foo\tbar'>")
        assert b'<a class="foo bar"></a>' == soup.a.encode()

    def test_get_attribute_list(self):
        soup = self.soup("<a id='abc def'>")
        assert ['abc def'] == soup.a.get_attribute_list('id')

    def test_accept_charset(self):
        soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
        assert ['ISO-8859-1', 'UTF-8'] == soup.form['accept-charset']

    def test_cdata_attribute_applying_only_to_one_tag(self):
        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        soup = self.soup(data)
        # We saw in another test that accept-charset is a cdata-list
        # attribute for the <form> tag. But it's not a cdata-list
        # attribute for any other tag.
        assert 'ISO-8859-1 UTF-8' == soup.a['accept-charset']

    def test_customization(self):
        # It's possible to change which attributes of which tags
        # are treated as multi-valued attributes.
        #
        # Here, 'id' is a multi-valued attribute and 'class' is not.
        #
        # TODO: This code is in the builder and should be tested there.
        soup = self.soup(
            '<a class="foo" id="bar">', multi_valued_attributes={'*': 'id'}
        )
        assert soup.a['class'] == 'foo'
        assert soup.a['id'] == ['bar']
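Outside the test harness, the multi_valued_attributes hook exercised by test_customization above is passed straight through the BeautifulSoup constructor. A minimal sketch of the behaviors these tests cover (html.parser chosen arbitrarily):

from bs4 import BeautifulSoup

markup = '<a class="foo bar" id="a b">'

# Default ruleset: 'class' splits on whitespace, 'id' does not.
soup = BeautifulSoup(markup, 'html.parser')
print(soup.a['class'])   # ['foo', 'bar']
print(soup.a['id'])      # 'a b'

# multi_valued_attributes=None disables list splitting entirely.
soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes=None)
print(soup.a['class'])   # 'foo bar'

# A custom ruleset replaces the default: here only 'id' (on any tag)
# is treated as multi-valued, so 'class' stays a plain string.
soup = BeautifulSoup(markup, 'html.parser', multi_valued_attributes={'*': 'id'})
print(soup.a['class'])   # 'foo bar'
print(soup.a['id'])      # ['a', 'b']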
libs/certifi/__init__.py
@ -1,3 +1,4 @@
from .core import contents, where

__version__ = "2021.10.08"
__all__ = ["contents", "where"]
__version__ = "2022.09.24"
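For context on what this version bump ships: certifi's public surface is just contents() and where(), both of which hand back the bundled cacert.pem whose hunks follow. A minimal sketch of how a caller picks up the refreshed bundle:

import ssl
import certifi

# Filesystem path of the bundled cacert.pem modified below.
print(certifi.where())

# contents() returns the same bundle as text.
print(certifi.contents().count("BEGIN CERTIFICATE"), "root certificates")

# Typical consumption: seed an SSL context with the bundle.
context = ssl.create_default_context(cafile=certifi.where())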
@ -28,36 +28,6 @@ DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
|
||||||
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
|
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=GlobalSign O=GlobalSign OU=GlobalSign Root CA - R2
|
|
||||||
# Subject: CN=GlobalSign O=GlobalSign OU=GlobalSign Root CA - R2
|
|
||||||
# Label: "GlobalSign Root CA - R2"
|
|
||||||
# Serial: 4835703278459682885658125
|
|
||||||
# MD5 Fingerprint: 94:14:77:7e:3e:5e:fd:8f:30:bd:41:b0:cf:e7:d0:30
|
|
||||||
# SHA1 Fingerprint: 75:e0:ab:b6:13:85:12:27:1c:04:f8:5f:dd:de:38:e4:b7:24:2e:fe
|
|
||||||
# SHA256 Fingerprint: ca:42:dd:41:74:5f:d0:b8:1e:b9:02:36:2c:f9:d8:bf:71:9d:a1:bd:1b:1e:fc:94:6f:5b:4c:99:f4:2c:1b:9e
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIDujCCAqKgAwIBAgILBAAAAAABD4Ym5g0wDQYJKoZIhvcNAQEFBQAwTDEgMB4G
|
|
||||||
A1UECxMXR2xvYmFsU2lnbiBSb290IENBIC0gUjIxEzARBgNVBAoTCkdsb2JhbFNp
|
|
||||||
Z24xEzARBgNVBAMTCkdsb2JhbFNpZ24wHhcNMDYxMjE1MDgwMDAwWhcNMjExMjE1
|
|
||||||
MDgwMDAwWjBMMSAwHgYDVQQLExdHbG9iYWxTaWduIFJvb3QgQ0EgLSBSMjETMBEG
|
|
||||||
A1UEChMKR2xvYmFsU2lnbjETMBEGA1UEAxMKR2xvYmFsU2lnbjCCASIwDQYJKoZI
|
|
||||||
hvcNAQEBBQADggEPADCCAQoCggEBAKbPJA6+Lm8omUVCxKs+IVSbC9N/hHD6ErPL
|
|
||||||
v4dfxn+G07IwXNb9rfF73OX4YJYJkhD10FPe+3t+c4isUoh7SqbKSaZeqKeMWhG8
|
|
||||||
eoLrvozps6yWJQeXSpkqBy+0Hne/ig+1AnwblrjFuTosvNYSuetZfeLQBoZfXklq
|
|
||||||
tTleiDTsvHgMCJiEbKjNS7SgfQx5TfC4LcshytVsW33hoCmEofnTlEnLJGKRILzd
|
|
||||||
C9XZzPnqJworc5HGnRusyMvo4KD0L5CLTfuwNhv2GXqF4G3yYROIXJ/gkwpRl4pa
|
|
||||||
zq+r1feqCapgvdzZX99yqWATXgAByUr6P6TqBwMhAo6CygPCm48CAwEAAaOBnDCB
|
|
||||||
mTAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4EFgQUm+IH
|
|
||||||
V2ccHsBqBt5ZtJot39wZhi4wNgYDVR0fBC8wLTAroCmgJ4YlaHR0cDovL2NybC5n
|
|
||||||
bG9iYWxzaWduLm5ldC9yb290LXIyLmNybDAfBgNVHSMEGDAWgBSb4gdXZxwewGoG
|
|
||||||
3lm0mi3f3BmGLjANBgkqhkiG9w0BAQUFAAOCAQEAmYFThxxol4aR7OBKuEQLq4Gs
|
|
||||||
J0/WwbgcQ3izDJr86iw8bmEbTUsp9Z8FHSbBuOmDAGJFtqkIk7mpM0sYmsL4h4hO
|
|
||||||
291xNBrBVNpGP+DTKqttVCL1OmLNIG+6KYnX3ZHu01yiPqFbQfXf5WRDLenVOavS
|
|
||||||
ot+3i9DAgBkcRcAtjOj4LaR0VknFBbVPFd5uRHg5h6h+u/N5GJG79G+dwfCMNYxd
|
|
||||||
AfvDbbnvRG15RjF+Cv6pgsH/76tuIMRQyV+dTZsXjAzlAcmgQWpzU/qlULRuJQ/7
|
|
||||||
TBj0/VLZjmmx6BEP3ojY+x1J96relc8geMJgEtslQIxq/H5COEBkEveegeGTLg==
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=Entrust.net Certification Authority (2048) O=Entrust.net OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)/(c) 1999 Entrust.net Limited
|
# Issuer: CN=Entrust.net Certification Authority (2048) O=Entrust.net OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)/(c) 1999 Entrust.net Limited
|
||||||
# Subject: CN=Entrust.net Certification Authority (2048) O=Entrust.net OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)/(c) 1999 Entrust.net Limited
|
# Subject: CN=Entrust.net Certification Authority (2048) O=Entrust.net OU=www.entrust.net/CPS_2048 incorp. by ref. (limits liab.)/(c) 1999 Entrust.net Limited
|
||||||
# Label: "Entrust.net Premium 2048 Secure Server CA"
|
# Label: "Entrust.net Premium 2048 Secure Server CA"
|
||||||
|
@ -491,34 +461,6 @@ vEsXCS+0yx5DaMkHJ8HSXPfqIbloEpw8nL+e/IBcm2PN7EeqJSdnoDfzAIJ9VNep
|
||||||
+OkuE6N36B9K
|
+OkuE6N36B9K
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=DST Root CA X3 O=Digital Signature Trust Co.
|
|
||||||
# Subject: CN=DST Root CA X3 O=Digital Signature Trust Co.
|
|
||||||
# Label: "DST Root CA X3"
|
|
||||||
# Serial: 91299735575339953335919266965803778155
|
|
||||||
# MD5 Fingerprint: 41:03:52:dc:0f:f7:50:1b:16:f0:02:8e:ba:6f:45:c5
|
|
||||||
# SHA1 Fingerprint: da:c9:02:4f:54:d8:f6:df:94:93:5f:b1:73:26:38:ca:6a:d7:7c:13
|
|
||||||
# SHA256 Fingerprint: 06:87:26:03:31:a7:24:03:d9:09:f1:05:e6:9b:cf:0d:32:e1:bd:24:93:ff:c6:d9:20:6d:11:bc:d6:77:07:39
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIDSjCCAjKgAwIBAgIQRK+wgNajJ7qJMDmGLvhAazANBgkqhkiG9w0BAQUFADA/
|
|
||||||
MSQwIgYDVQQKExtEaWdpdGFsIFNpZ25hdHVyZSBUcnVzdCBDby4xFzAVBgNVBAMT
|
|
||||||
DkRTVCBSb290IENBIFgzMB4XDTAwMDkzMDIxMTIxOVoXDTIxMDkzMDE0MDExNVow
|
|
||||||
PzEkMCIGA1UEChMbRGlnaXRhbCBTaWduYXR1cmUgVHJ1c3QgQ28uMRcwFQYDVQQD
|
|
||||||
Ew5EU1QgUm9vdCBDQSBYMzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB
|
|
||||||
AN+v6ZdQCINXtMxiZfaQguzH0yxrMMpb7NnDfcdAwRgUi+DoM3ZJKuM/IUmTrE4O
|
|
||||||
rz5Iy2Xu/NMhD2XSKtkyj4zl93ewEnu1lcCJo6m67XMuegwGMoOifooUMM0RoOEq
|
|
||||||
OLl5CjH9UL2AZd+3UWODyOKIYepLYYHsUmu5ouJLGiifSKOeDNoJjj4XLh7dIN9b
|
|
||||||
xiqKqy69cK3FCxolkHRyxXtqqzTWMIn/5WgTe1QLyNau7Fqckh49ZLOMxt+/yUFw
|
|
||||||
7BZy1SbsOFU5Q9D8/RhcQPGX69Wam40dutolucbY38EVAjqr2m7xPi71XAicPNaD
|
|
||||||
aeQQmxkqtilX4+U9m5/wAl0CAwEAAaNCMEAwDwYDVR0TAQH/BAUwAwEB/zAOBgNV
|
|
||||||
HQ8BAf8EBAMCAQYwHQYDVR0OBBYEFMSnsaR7LHH62+FLkHX/xBVghYkQMA0GCSqG
|
|
||||||
SIb3DQEBBQUAA4IBAQCjGiybFwBcqR7uKGY3Or+Dxz9LwwmglSBd49lZRNI+DT69
|
|
||||||
ikugdB/OEIKcdBodfpga3csTS7MgROSR6cz8faXbauX+5v3gTt23ADq1cEmv8uXr
|
|
||||||
AvHRAosZy5Q6XkjEGB5YGV8eAlrwDPGxrancWYaLbumR9YbK+rlmM6pZW87ipxZz
|
|
||||||
R8srzJmwN0jP41ZL9c8PDHIyh8bwRLtTcm1D9SZImlJnt1ir/md2cXjbDaJWFBM5
|
|
||||||
JDGFoqgCWjBH4d1QB7wCCZAA62RjYJsWvIjJEubSfZGL+T0yjWW06XyxV3bqxbYo
|
|
||||||
Ob8VZRzI9neWagqNdwvYkQsEjgfbKbYK7p2CNTUQ
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=SwissSign Gold CA - G2 O=SwissSign AG
|
# Issuer: CN=SwissSign Gold CA - G2 O=SwissSign AG
|
||||||
# Subject: CN=SwissSign Gold CA - G2 O=SwissSign AG
|
# Subject: CN=SwissSign Gold CA - G2 O=SwissSign AG
|
||||||
# Label: "SwissSign Gold CA - G2"
|
# Label: "SwissSign Gold CA - G2"
|
||||||
|
@ -779,36 +721,6 @@ t0QmwCbAr1UwnjvVNioZBPRcHv/PLLf/0P2HQBHVESO7SMAhqaQoLf0V+LBOK/Qw
|
||||||
WyH8EZE0vkHve52Xdf+XlcCWWC/qu0bXu+TZLg==
|
WyH8EZE0vkHve52Xdf+XlcCWWC/qu0bXu+TZLg==
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=Cybertrust Global Root O=Cybertrust, Inc
|
|
||||||
# Subject: CN=Cybertrust Global Root O=Cybertrust, Inc
|
|
||||||
# Label: "Cybertrust Global Root"
|
|
||||||
# Serial: 4835703278459682877484360
|
|
||||||
# MD5 Fingerprint: 72:e4:4a:87:e3:69:40:80:77:ea:bc:e3:f4:ff:f0:e1
|
|
||||||
# SHA1 Fingerprint: 5f:43:e5:b1:bf:f8:78:8c:ac:1c:c7:ca:4a:9a:c6:22:2b:cc:34:c6
|
|
||||||
# SHA256 Fingerprint: 96:0a:df:00:63:e9:63:56:75:0c:29:65:dd:0a:08:67:da:0b:9c:bd:6e:77:71:4a:ea:fb:23:49:ab:39:3d:a3
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIDoTCCAomgAwIBAgILBAAAAAABD4WqLUgwDQYJKoZIhvcNAQEFBQAwOzEYMBYG
|
|
||||||
A1UEChMPQ3liZXJ0cnVzdCwgSW5jMR8wHQYDVQQDExZDeWJlcnRydXN0IEdsb2Jh
|
|
||||||
bCBSb290MB4XDTA2MTIxNTA4MDAwMFoXDTIxMTIxNTA4MDAwMFowOzEYMBYGA1UE
|
|
||||||
ChMPQ3liZXJ0cnVzdCwgSW5jMR8wHQYDVQQDExZDeWJlcnRydXN0IEdsb2JhbCBS
|
|
||||||
b290MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA+Mi8vRRQZhP/8NN5
|
|
||||||
7CPytxrHjoXxEnOmGaoQ25yiZXRadz5RfVb23CO21O1fWLE3TdVJDm71aofW0ozS
|
|
||||||
J8bi/zafmGWgE07GKmSb1ZASzxQG9Dvj1Ci+6A74q05IlG2OlTEQXO2iLb3VOm2y
|
|
||||||
HLtgwEZLAfVJrn5GitB0jaEMAs7u/OePuGtm839EAL9mJRQr3RAwHQeWP032a7iP
|
|
||||||
t3sMpTjr3kfb1V05/Iin89cqdPHoWqI7n1C6poxFNcJQZZXcY4Lv3b93TZxiyWNz
|
|
||||||
FtApD0mpSPCzqrdsxacwOUBdrsTiXSZT8M4cIwhhqJQZugRiQOwfOHB3EgZxpzAY
|
|
||||||
XSUnpQIDAQABo4GlMIGiMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8EBTADAQH/
|
|
||||||
MB0GA1UdDgQWBBS2CHsNesysIEyGVjJez6tuhS1wVzA/BgNVHR8EODA2MDSgMqAw
|
|
||||||
hi5odHRwOi8vd3d3Mi5wdWJsaWMtdHJ1c3QuY29tL2NybC9jdC9jdHJvb3QuY3Js
|
|
||||||
MB8GA1UdIwQYMBaAFLYIew16zKwgTIZWMl7Pq26FLXBXMA0GCSqGSIb3DQEBBQUA
|
|
||||||
A4IBAQBW7wojoFROlZfJ+InaRcHUowAl9B8Tq7ejhVhpwjCt2BWKLePJzYFa+HMj
|
|
||||||
Wqd8BfP9IjsO0QbE2zZMcwSO5bAi5MXzLqXZI+O4Tkogp24CJJ8iYGd7ix1yCcUx
|
|
||||||
XOl5n4BHPa2hCwcUPUf/A2kaDAtE52Mlp3+yybh2hO0j9n0Hq0V+09+zv+mKts2o
|
|
||||||
omcrUtW3ZfA5TGOgkXmTUg9U3YO7n9GPp1Nzw8v/MOx8BLjYRB+TX3EJIrduPuoc
|
|
||||||
A06dGiBh+4E37F78CkWr1+cXVdCg6mCbpvbjjFspwgZgFJ0tl0ypkxWdYcQBX0jW
|
|
||||||
WL1WMRJOEcgh4LMRkWXbtKaIOM5V
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: O=Chunghwa Telecom Co., Ltd. OU=ePKI Root Certification Authority
|
# Issuer: O=Chunghwa Telecom Co., Ltd. OU=ePKI Root Certification Authority
|
||||||
# Subject: O=Chunghwa Telecom Co., Ltd. OU=ePKI Root Certification Authority
|
# Subject: O=Chunghwa Telecom Co., Ltd. OU=ePKI Root Certification Authority
|
||||||
# Label: "ePKI Root Certification Authority"
|
# Label: "ePKI Root Certification Authority"
|
||||||
|
@ -1411,78 +1323,6 @@ t/2jioSgrGK+KwmHNPBqAbubKVY8/gA3zyNs8U6qtnRGEmyR7jTV7JqR50S+kDFy
|
||||||
SjnRBUkLp7Y3gaVdjKozXoEofKd9J+sAro03
|
SjnRBUkLp7Y3gaVdjKozXoEofKd9J+sAro03
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=EC-ACC O=Agencia Catalana de Certificacio (NIF Q-0801176-I) OU=Serveis Publics de Certificacio/Vegeu https://www.catcert.net/verarrel (c)03/Jerarquia Entitats de Certificacio Catalanes
|
|
||||||
# Subject: CN=EC-ACC O=Agencia Catalana de Certificacio (NIF Q-0801176-I) OU=Serveis Publics de Certificacio/Vegeu https://www.catcert.net/verarrel (c)03/Jerarquia Entitats de Certificacio Catalanes
|
|
||||||
# Label: "EC-ACC"
|
|
||||||
# Serial: -23701579247955709139626555126524820479
|
|
||||||
# MD5 Fingerprint: eb:f5:9d:29:0d:61:f9:42:1f:7c:c2:ba:6d:e3:15:09
|
|
||||||
# SHA1 Fingerprint: 28:90:3a:63:5b:52:80:fa:e6:77:4c:0b:6d:a7:d6:ba:a6:4a:f2:e8
|
|
||||||
# SHA256 Fingerprint: 88:49:7f:01:60:2f:31:54:24:6a:e2:8c:4d:5a:ef:10:f1:d8:7e:bb:76:62:6f:4a:e0:b7:f9:5b:a7:96:87:99
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIFVjCCBD6gAwIBAgIQ7is969Qh3hSoYqwE893EATANBgkqhkiG9w0BAQUFADCB
|
|
||||||
8zELMAkGA1UEBhMCRVMxOzA5BgNVBAoTMkFnZW5jaWEgQ2F0YWxhbmEgZGUgQ2Vy
|
|
||||||
dGlmaWNhY2lvIChOSUYgUS0wODAxMTc2LUkpMSgwJgYDVQQLEx9TZXJ2ZWlzIFB1
|
|
||||||
YmxpY3MgZGUgQ2VydGlmaWNhY2lvMTUwMwYDVQQLEyxWZWdldSBodHRwczovL3d3
|
|
||||||
dy5jYXRjZXJ0Lm5ldC92ZXJhcnJlbCAoYykwMzE1MDMGA1UECxMsSmVyYXJxdWlh
|
|
||||||
IEVudGl0YXRzIGRlIENlcnRpZmljYWNpbyBDYXRhbGFuZXMxDzANBgNVBAMTBkVD
|
|
||||||
LUFDQzAeFw0wMzAxMDcyMzAwMDBaFw0zMTAxMDcyMjU5NTlaMIHzMQswCQYDVQQG
|
|
||||||
EwJFUzE7MDkGA1UEChMyQWdlbmNpYSBDYXRhbGFuYSBkZSBDZXJ0aWZpY2FjaW8g
|
|
||||||
KE5JRiBRLTA4MDExNzYtSSkxKDAmBgNVBAsTH1NlcnZlaXMgUHVibGljcyBkZSBD
|
|
||||||
ZXJ0aWZpY2FjaW8xNTAzBgNVBAsTLFZlZ2V1IGh0dHBzOi8vd3d3LmNhdGNlcnQu
|
|
||||||
bmV0L3ZlcmFycmVsIChjKTAzMTUwMwYDVQQLEyxKZXJhcnF1aWEgRW50aXRhdHMg
|
|
||||||
ZGUgQ2VydGlmaWNhY2lvIENhdGFsYW5lczEPMA0GA1UEAxMGRUMtQUNDMIIBIjAN
|
|
||||||
BgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAsyLHT+KXQpWIR4NA9h0X84NzJB5R
|
|
||||||
85iKw5K4/0CQBXCHYMkAqbWUZRkiFRfCQ2xmRJoNBD45b6VLeqpjt4pEndljkYRm
|
|
||||||
4CgPukLjbo73FCeTae6RDqNfDrHrZqJyTxIThmV6PttPB/SnCWDaOkKZx7J/sxaV
|
|
||||||
HMf5NLWUhdWZXqBIoH7nF2W4onW4HvPlQn2v7fOKSGRdghST2MDk/7NQcvJ29rNd
|
|
||||||
QlB50JQ+awwAvthrDk4q7D7SzIKiGGUzE3eeml0aE9jD2z3Il3rucO2n5nzbcc8t
|
|
||||||
lGLfbdb1OL4/pYUKGbio2Al1QnDE6u/LDsg0qBIimAy4E5S2S+zw0JDnJwIDAQAB
|
|
||||||
o4HjMIHgMB0GA1UdEQQWMBSBEmVjX2FjY0BjYXRjZXJ0Lm5ldDAPBgNVHRMBAf8E
|
|
||||||
BTADAQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUoMOLRKo3pUW/l4Ba0fF4
|
|
||||||
opvpXY0wfwYDVR0gBHgwdjB0BgsrBgEEAfV4AQMBCjBlMCwGCCsGAQUFBwIBFiBo
|
|
||||||
dHRwczovL3d3dy5jYXRjZXJ0Lm5ldC92ZXJhcnJlbDA1BggrBgEFBQcCAjApGidW
|
|
||||||
ZWdldSBodHRwczovL3d3dy5jYXRjZXJ0Lm5ldC92ZXJhcnJlbCAwDQYJKoZIhvcN
|
|
||||||
AQEFBQADggEBAKBIW4IB9k1IuDlVNZyAelOZ1Vr/sXE7zDkJlF7W2u++AVtd0x7Y
|
|
||||||
/X1PzaBB4DSTv8vihpw3kpBWHNzrKQXlxJ7HNd+KDM3FIUPpqojlNcAZQmNaAl6k
|
|
||||||
SBg6hW/cnbw/nZzBh7h6YQjpdwt/cKt63dmXLGQehb+8dJahw3oS7AwaboMMPOhy
|
|
||||||
Rp/7SNVel+axofjk70YllJyJ22k4vuxcDlbHZVHlUIiIv0LVKz3l+bqeLrPK9HOS
|
|
||||||
Agu+TGbrIP65y7WZf+a2E/rKS03Z7lNGBjvGTq2TWoF+bCpLagVFjPIhpDGQh2xl
|
|
||||||
nJ2lYJU6Un/10asIbvPuW/mIPX64b24D5EI=
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=Hellenic Academic and Research Institutions RootCA 2011 O=Hellenic Academic and Research Institutions Cert. Authority
|
|
||||||
# Subject: CN=Hellenic Academic and Research Institutions RootCA 2011 O=Hellenic Academic and Research Institutions Cert. Authority
|
|
||||||
# Label: "Hellenic Academic and Research Institutions RootCA 2011"
|
|
||||||
# Serial: 0
|
|
||||||
# MD5 Fingerprint: 73:9f:4c:4b:73:5b:79:e9:fa:ba:1c:ef:6e:cb:d5:c9
|
|
||||||
# SHA1 Fingerprint: fe:45:65:9b:79:03:5b:98:a1:61:b5:51:2e:ac:da:58:09:48:22:4d
|
|
||||||
# SHA256 Fingerprint: bc:10:4f:15:a4:8b:e7:09:dc:a5:42:a7:e1:d4:b9:df:6f:05:45:27:e8:02:ea:a9:2d:59:54:44:25:8a:fe:71
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIEMTCCAxmgAwIBAgIBADANBgkqhkiG9w0BAQUFADCBlTELMAkGA1UEBhMCR1Ix
|
|
||||||
RDBCBgNVBAoTO0hlbGxlbmljIEFjYWRlbWljIGFuZCBSZXNlYXJjaCBJbnN0aXR1
|
|
||||||
dGlvbnMgQ2VydC4gQXV0aG9yaXR5MUAwPgYDVQQDEzdIZWxsZW5pYyBBY2FkZW1p
|
|
||||||
YyBhbmQgUmVzZWFyY2ggSW5zdGl0dXRpb25zIFJvb3RDQSAyMDExMB4XDTExMTIw
|
|
||||||
NjEzNDk1MloXDTMxMTIwMTEzNDk1MlowgZUxCzAJBgNVBAYTAkdSMUQwQgYDVQQK
|
|
||||||
EztIZWxsZW5pYyBBY2FkZW1pYyBhbmQgUmVzZWFyY2ggSW5zdGl0dXRpb25zIENl
|
|
||||||
cnQuIEF1dGhvcml0eTFAMD4GA1UEAxM3SGVsbGVuaWMgQWNhZGVtaWMgYW5kIFJl
|
|
||||||
c2VhcmNoIEluc3RpdHV0aW9ucyBSb290Q0EgMjAxMTCCASIwDQYJKoZIhvcNAQEB
|
|
||||||
BQADggEPADCCAQoCggEBAKlTAOMupvaO+mDYLZU++CwqVE7NuYRhlFhPjz2L5EPz
|
|
||||||
dYmNUeTDN9KKiE15HrcS3UN4SoqS5tdI1Q+kOilENbgH9mgdVc04UfCMJDGFr4PJ
|
|
||||||
fel3r+0ae50X+bOdOFAPplp5kYCvN66m0zH7tSYJnTxa71HFK9+WXesyHgLacEns
|
|
||||||
bgzImjeN9/E2YEsmLIKe0HjzDQ9jpFEw4fkrJxIH2Oq9GGKYsFk3fb7u8yBRQlqD
|
|
||||||
75O6aRXxYp2fmTmCobd0LovUxQt7L/DICto9eQqakxylKHJzkUOap9FNhYS5qXSP
|
|
||||||
FEDH3N6sQWRstBmbAmNtJGSPRLIl6s5ddAxjMlyNh+UCAwEAAaOBiTCBhjAPBgNV
|
|
||||||
HRMBAf8EBTADAQH/MAsGA1UdDwQEAwIBBjAdBgNVHQ4EFgQUppFC/RNhSiOeCKQp
|
|
||||||
5dgTBCPuQSUwRwYDVR0eBEAwPqA8MAWCAy5ncjAFggMuZXUwBoIELmVkdTAGggQu
|
|
||||||
b3JnMAWBAy5ncjAFgQMuZXUwBoEELmVkdTAGgQQub3JnMA0GCSqGSIb3DQEBBQUA
|
|
||||||
A4IBAQAf73lB4XtuP7KMhjdCSk4cNx6NZrokgclPEg8hwAOXhiVtXdMiKahsog2p
|
|
||||||
6z0GW5k6x8zDmjR/qw7IThzh+uTczQ2+vyT+bOdrwg3IBp5OjWEopmr95fZi6hg8
|
|
||||||
TqBTnbI6nOulnJEWtk2C4AwFSKls9cz4y51JtPACpf1wA+2KIaWuE4ZJwzNzvoc7
|
|
||||||
dIsXRSZMFpGD/md9zU1jZ/rzAxKWeAaNsWftjj++n08C9bMJL/NMh98qy5V8Acys
|
|
||||||
Nnq/onN694/BtZqhFLKPM58N7yLcZnuEvUUXBj08yrl3NI/K6s8/MT7jiOOASSXI
|
|
||||||
l7WdmplNsDz4SgCbZN2fOUvRJ9e4
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=Actalis Authentication Root CA O=Actalis S.p.A./03358520967
|
# Issuer: CN=Actalis Authentication Root CA O=Actalis S.p.A./03358520967
|
||||||
# Subject: CN=Actalis Authentication Root CA O=Actalis S.p.A./03358520967
|
# Subject: CN=Actalis Authentication Root CA O=Actalis S.p.A./03358520967
|
||||||
# Label: "Actalis Authentication Root CA"
|
# Label: "Actalis Authentication Root CA"
|
||||||
|
@ -2342,27 +2182,6 @@ zzuqQhFkoJ2UOQIReVx7Hfpkue4WQrO/isIJxOzksU0CMQDpKmFHjFJKS04YcPbW
|
||||||
RNZu9YO6bVi9JNlWSOrvxKJGgYhqOkbRqZtNyWHa0V1Xahg=
|
RNZu9YO6bVi9JNlWSOrvxKJGgYhqOkbRqZtNyWHa0V1Xahg=
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R4
|
|
||||||
# Subject: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R4
|
|
||||||
# Label: "GlobalSign ECC Root CA - R4"
|
|
||||||
# Serial: 14367148294922964480859022125800977897474
|
|
||||||
# MD5 Fingerprint: 20:f0:27:68:d1:7e:a0:9d:0e:e6:2a:ca:df:5c:89:8e
|
|
||||||
# SHA1 Fingerprint: 69:69:56:2e:40:80:f4:24:a1:e7:19:9f:14:ba:f3:ee:58:ab:6a:bb
|
|
||||||
# SHA256 Fingerprint: be:c9:49:11:c2:95:56:76:db:6c:0a:55:09:86:d7:6e:3b:a0:05:66:7c:44:2c:97:62:b4:fb:b7:73:de:22:8c
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIB4TCCAYegAwIBAgIRKjikHJYKBN5CsiilC+g0mAIwCgYIKoZIzj0EAwIwUDEk
|
|
||||||
MCIGA1UECxMbR2xvYmFsU2lnbiBFQ0MgUm9vdCBDQSAtIFI0MRMwEQYDVQQKEwpH
|
|
||||||
bG9iYWxTaWduMRMwEQYDVQQDEwpHbG9iYWxTaWduMB4XDTEyMTExMzAwMDAwMFoX
|
|
||||||
DTM4MDExOTAzMTQwN1owUDEkMCIGA1UECxMbR2xvYmFsU2lnbiBFQ0MgUm9vdCBD
|
|
||||||
QSAtIFI0MRMwEQYDVQQKEwpHbG9iYWxTaWduMRMwEQYDVQQDEwpHbG9iYWxTaWdu
|
|
||||||
MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEuMZ5049sJQ6fLjkZHAOkrprlOQcJ
|
|
||||||
FspjsbmG+IpXwVfOQvpzofdlQv8ewQCybnMO/8ch5RikqtlxP6jUuc6MHaNCMEAw
|
|
||||||
DgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFFSwe61F
|
|
||||||
uOJAf/sKbvu+M8k8o4TVMAoGCCqGSM49BAMCA0gAMEUCIQDckqGgE6bPA7DmxCGX
|
|
||||||
kPoUVy0D7O48027KqGx2vKLeuwIgJ6iFJzWbVsaj8kfSt24bAgAXqmemFZHe+pTs
|
|
||||||
ewv4n4Q=
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R5
|
# Issuer: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R5
|
||||||
# Subject: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R5
|
# Subject: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R5
|
||||||
# Label: "GlobalSign ECC Root CA - R5"
|
# Label: "GlobalSign ECC Root CA - R5"
|
||||||
|
@ -3337,126 +3156,6 @@ rYy0UGYwEAYJKwYBBAGCNxUBBAMCAQAwCgYIKoZIzj0EAwMDaAAwZQIwJsdpW9zV
|
||||||
Mgj/mkkCtojeFK9dbJlxjRo/i9fgojaGHAeCOnZT/cKi7e97sIBPWA9LUzm9
|
Mgj/mkkCtojeFK9dbJlxjRo/i9fgojaGHAeCOnZT/cKi7e97sIBPWA9LUzm9
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
# Issuer: CN=GTS Root R1 O=Google Trust Services LLC
|
|
||||||
# Subject: CN=GTS Root R1 O=Google Trust Services LLC
|
|
||||||
# Label: "GTS Root R1"
|
|
||||||
# Serial: 146587175971765017618439757810265552097
|
|
||||||
# MD5 Fingerprint: 82:1a:ef:d4:d2:4a:f2:9f:e2:3d:97:06:14:70:72:85
|
|
||||||
# SHA1 Fingerprint: e1:c9:50:e6:ef:22:f8:4c:56:45:72:8b:92:20:60:d7:d5:a7:a3:e8
|
|
||||||
# SHA256 Fingerprint: 2a:57:54:71:e3:13:40:bc:21:58:1c:bd:2c:f1:3e:15:84:63:20:3e:ce:94:bc:f9:d3:cc:19:6b:f0:9a:54:72
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIFWjCCA0KgAwIBAgIQbkepxUtHDA3sM9CJuRz04TANBgkqhkiG9w0BAQwFADBH
|
|
||||||
MQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExM
|
|
||||||
QzEUMBIGA1UEAxMLR1RTIFJvb3QgUjEwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIy
|
|
||||||
MDAwMDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNl
|
|
||||||
cnZpY2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjEwggIiMA0GCSqGSIb3DQEB
|
|
||||||
AQUAA4ICDwAwggIKAoICAQC2EQKLHuOhd5s73L+UPreVp0A8of2C+X0yBoJx9vaM
|
|
||||||
f/vo27xqLpeXo4xL+Sv2sfnOhB2x+cWX3u+58qPpvBKJXqeqUqv4IyfLpLGcY9vX
|
|
||||||
mX7wCl7raKb0xlpHDU0QM+NOsROjyBhsS+z8CZDfnWQpJSMHobTSPS5g4M/SCYe7
|
|
||||||
zUjwTcLCeoiKu7rPWRnWr4+wB7CeMfGCwcDfLqZtbBkOtdh+JhpFAz2weaSUKK0P
|
|
||||||
fyblqAj+lug8aJRT7oM6iCsVlgmy4HqMLnXWnOunVmSPlk9orj2XwoSPwLxAwAtc
|
|
||||||
vfaHszVsrBhQf4TgTM2S0yDpM7xSma8ytSmzJSq0SPly4cpk9+aCEI3oncKKiPo4
|
|
||||||
Zor8Y/kB+Xj9e1x3+naH+uzfsQ55lVe0vSbv1gHR6xYKu44LtcXFilWr06zqkUsp
|
|
||||||
zBmkMiVOKvFlRNACzqrOSbTqn3yDsEB750Orp2yjj32JgfpMpf/VjsPOS+C12LOO
|
|
||||||
Rc92wO1AK/1TD7Cn1TsNsYqiA94xrcx36m97PtbfkSIS5r762DL8EGMUUXLeXdYW
|
|
||||||
k70paDPvOmbsB4om3xPXV2V4J95eSRQAogB/mqghtqmxlbCluQ0WEdrHbEg8QOB+
|
|
||||||
DVrNVjzRlwW5y0vtOUucxD/SVRNuJLDWcfr0wbrM7Rv1/oFB2ACYPTrIrnqYNxgF
|
|
||||||
lQIDAQABo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNV
|
|
||||||
HQ4EFgQU5K8rJnEaK0gnhS9SZizv8IkTcT4wDQYJKoZIhvcNAQEMBQADggIBADiW
|
|
||||||
Cu49tJYeX++dnAsznyvgyv3SjgofQXSlfKqE1OXyHuY3UjKcC9FhHb8owbZEKTV1
|
|
||||||
d5iyfNm9dKyKaOOpMQkpAWBz40d8U6iQSifvS9efk+eCNs6aaAyC58/UEBZvXw6Z
|
|
||||||
XPYfcX3v73svfuo21pdwCxXu11xWajOl40k4DLh9+42FpLFZXvRq4d2h9mREruZR
|
|
||||||
gyFmxhE+885H7pwoHyXa/6xmld01D1zvICxi/ZG6qcz8WpyTgYMpl0p8WnK0OdC3
|
|
||||||
d8t5/Wk6kjftbjhlRn7pYL15iJdfOBL07q9bgsiG1eGZbYwE8na6SfZu6W0eX6Dv
|
|
||||||
J4J2QPim01hcDyxC2kLGe4g0x8HYRZvBPsVhHdljUEn2NIVq4BjFbkerQUIpm/Zg
|
|
||||||
DdIx02OYI5NaAIFItO/Nis3Jz5nu2Z6qNuFoS3FJFDYoOj0dzpqPJeaAcWErtXvM
|
|
||||||
+SUWgeExX6GjfhaknBZqlxi9dnKlC54dNuYvoS++cJEPqOba+MSSQGwlfnuzCdyy
|
|
||||||
F62ARPBopY+Udf90WuioAnwMCeKpSwughQtiue+hMZL77/ZRBIls6Kl0obsXs7X9
|
|
||||||
SQ98POyDGCBDTtWTurQ0sR8WNh8M5mQ5Fkzc4P4dyKliPUDqysU0ArSuiYgzNdws
|
|
||||||
E3PYJ/HQcu51OyLemGhmW/HGY0dVHLqlCFF1pkgl
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=GTS Root R2 O=Google Trust Services LLC
|
|
||||||
# Subject: CN=GTS Root R2 O=Google Trust Services LLC
|
|
||||||
# Label: "GTS Root R2"
|
|
||||||
# Serial: 146587176055767053814479386953112547951
|
|
||||||
# MD5 Fingerprint: 44:ed:9a:0e:a4:09:3b:00:f2:ae:4c:a3:c6:61:b0:8b
|
|
||||||
# SHA1 Fingerprint: d2:73:96:2a:2a:5e:39:9f:73:3f:e1:c7:1e:64:3f:03:38:34:fc:4d
|
|
||||||
# SHA256 Fingerprint: c4:5d:7b:b0:8e:6d:67:e6:2e:42:35:11:0b:56:4e:5f:78:fd:92:ef:05:8c:84:0a:ea:4e:64:55:d7:58:5c:60
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIIFWjCCA0KgAwIBAgIQbkepxlqz5yDFMJo/aFLybzANBgkqhkiG9w0BAQwFADBH
|
|
||||||
MQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExM
|
|
||||||
QzEUMBIGA1UEAxMLR1RTIFJvb3QgUjIwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIy
|
|
||||||
MDAwMDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNl
|
|
||||||
cnZpY2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjIwggIiMA0GCSqGSIb3DQEB
|
|
||||||
AQUAA4ICDwAwggIKAoICAQDO3v2m++zsFDQ8BwZabFn3GTXd98GdVarTzTukk3Lv
|
|
||||||
CvptnfbwhYBboUhSnznFt+4orO/LdmgUud+tAWyZH8QiHZ/+cnfgLFuv5AS/T3Kg
|
|
||||||
GjSY6Dlo7JUle3ah5mm5hRm9iYz+re026nO8/4Piy33B0s5Ks40FnotJk9/BW9Bu
|
|
||||||
XvAuMC6C/Pq8tBcKSOWIm8Wba96wyrQD8Nr0kLhlZPdcTK3ofmZemde4wj7I0BOd
|
|
||||||
re7kRXuJVfeKH2JShBKzwkCX44ofR5GmdFrS+LFjKBC4swm4VndAoiaYecb+3yXu
|
|
||||||
PuWgf9RhD1FLPD+M2uFwdNjCaKH5wQzpoeJ/u1U8dgbuak7MkogwTZq9TwtImoS1
|
|
||||||
mKPV+3PBV2HdKFZ1E66HjucMUQkQdYhMvI35ezzUIkgfKtzra7tEscszcTJGr61K
|
|
||||||
8YzodDqs5xoic4DSMPclQsciOzsSrZYuxsN2B6ogtzVJV+mSSeh2FnIxZyuWfoqj
|
|
||||||
x5RWIr9qS34BIbIjMt/kmkRtWVtd9QCgHJvGeJeNkP+byKq0rxFROV7Z+2et1VsR
|
|
||||||
nTKaG73VululycslaVNVJ1zgyjbLiGH7HrfQy+4W+9OmTN6SpdTi3/UGVN4unUu0
|
|
||||||
kzCqgc7dGtxRcw1PcOnlthYhGXmy5okLdWTK1au8CcEYof/UVKGFPP0UJAOyh9Ok
|
|
||||||
twIDAQABo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNV
|
|
||||||
HQ4EFgQUu//KjiOfT5nK2+JopqUVJxce2Q4wDQYJKoZIhvcNAQEMBQADggIBALZp
|
|
||||||
8KZ3/p7uC4Gt4cCpx/k1HUCCq+YEtN/L9x0Pg/B+E02NjO7jMyLDOfxA325BS0JT
|
|
||||||
vhaI8dI4XsRomRyYUpOM52jtG2pzegVATX9lO9ZY8c6DR2Dj/5epnGB3GFW1fgiT
|
|
||||||
z9D2PGcDFWEJ+YF59exTpJ/JjwGLc8R3dtyDovUMSRqodt6Sm2T4syzFJ9MHwAiA
|
|
||||||
pJiS4wGWAqoC7o87xdFtCjMwc3i5T1QWvwsHoaRc5svJXISPD+AVdyx+Jn7axEvb
|
|
||||||
pxZ3B7DNdehyQtaVhJ2Gg/LkkM0JR9SLA3DaWsYDQvTtN6LwG1BUSw7YhN4ZKJmB
|
|
||||||
R64JGz9I0cNv4rBgF/XuIwKl2gBbbZCr7qLpGzvpx0QnRY5rn/WkhLx3+WuXrD5R
|
|
||||||
RaIRpsyF7gpo8j5QOHokYh4XIDdtak23CZvJ/KRY9bb7nE4Yu5UC56GtmwfuNmsk
|
|
||||||
0jmGwZODUNKBRqhfYlcsu2xkiAhu7xNUX90txGdj08+JN7+dIPT7eoOboB6BAFDC
|
|
||||||
5AwiWVIQ7UNWhwD4FFKnHYuTjKJNRn8nxnGbJN7k2oaLDX5rIMHAnuFl2GqjpuiF
|
|
||||||
izoHCBy69Y9Vmhh1fuXsgWbRIXOhNUQLgD1bnF5vKheW0YMjiGZt5obicDIvUiLn
|
|
||||||
yOd/xCxgXS/Dr55FBcOEArf9LAhST4Ldo/DUhgkC
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=GTS Root R3 O=Google Trust Services LLC
|
|
||||||
# Subject: CN=GTS Root R3 O=Google Trust Services LLC
|
|
||||||
# Label: "GTS Root R3"
|
|
||||||
# Serial: 146587176140553309517047991083707763997
|
|
||||||
# MD5 Fingerprint: 1a:79:5b:6b:04:52:9c:5d:c7:74:33:1b:25:9a:f9:25
|
|
||||||
# SHA1 Fingerprint: 30:d4:24:6f:07:ff:db:91:89:8a:0b:e9:49:66:11:eb:8c:5e:46:e5
|
|
||||||
# SHA256 Fingerprint: 15:d5:b8:77:46:19:ea:7d:54:ce:1c:a6:d0:b0:c4:03:e0:37:a9:17:f1:31:e8:a0:4e:1e:6b:7a:71:ba:bc:e5
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIICDDCCAZGgAwIBAgIQbkepx2ypcyRAiQ8DVd2NHTAKBggqhkjOPQQDAzBHMQsw
|
|
||||||
CQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEU
|
|
||||||
MBIGA1UEAxMLR1RTIFJvb3QgUjMwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAw
|
|
||||||
MDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZp
|
|
||||||
Y2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjMwdjAQBgcqhkjOPQIBBgUrgQQA
|
|
||||||
IgNiAAQfTzOHMymKoYTey8chWEGJ6ladK0uFxh1MJ7x/JlFyb+Kf1qPKzEUURout
|
|
||||||
736GjOyxfi//qXGdGIRFBEFVbivqJn+7kAHjSxm65FSWRQmx1WyRRK2EE46ajA2A
|
|
||||||
DDL24CejQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8EBTADAQH/MB0GA1Ud
|
|
||||||
DgQWBBTB8Sa6oC2uhYHP0/EqEr24Cmf9vDAKBggqhkjOPQQDAwNpADBmAjEAgFuk
|
|
||||||
fCPAlaUs3L6JbyO5o91lAFJekazInXJ0glMLfalAvWhgxeG4VDvBNhcl2MG9AjEA
|
|
||||||
njWSdIUlUfUk7GRSJFClH9voy8l27OyCbvWFGFPouOOaKaqW04MjyaR7YbPMAuhd
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=GTS Root R4 O=Google Trust Services LLC
|
|
||||||
# Subject: CN=GTS Root R4 O=Google Trust Services LLC
|
|
||||||
# Label: "GTS Root R4"
|
|
||||||
# Serial: 146587176229350439916519468929765261721
|
|
||||||
# MD5 Fingerprint: 5d:b6:6a:c4:60:17:24:6a:1a:99:a8:4b:ee:5e:b4:26
|
|
||||||
# SHA1 Fingerprint: 2a:1d:60:27:d9:4a:b1:0a:1c:4d:91:5c:cd:33:a0:cb:3e:2d:54:cb
|
|
||||||
# SHA256 Fingerprint: 71:cc:a5:39:1f:9e:79:4b:04:80:25:30:b3:63:e1:21:da:8a:30:43:bb:26:66:2f:ea:4d:ca:7f:c9:51:a4:bd
|
|
||||||
-----BEGIN CERTIFICATE-----
|
|
||||||
MIICCjCCAZGgAwIBAgIQbkepyIuUtui7OyrYorLBmTAKBggqhkjOPQQDAzBHMQsw
|
|
||||||
CQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEU
|
|
||||||
MBIGA1UEAxMLR1RTIFJvb3QgUjQwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAw
|
|
||||||
MDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZp
|
|
||||||
Y2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjQwdjAQBgcqhkjOPQIBBgUrgQQA
|
|
||||||
IgNiAATzdHOnaItgrkO4NcWBMHtLSZ37wWHO5t5GvWvVYRg1rkDdc/eJkTBa6zzu
|
|
||||||
hXyiQHY7qca4R9gq55KRanPpsXI5nymfopjTX15YhmUPoYRlBtHci8nHc8iMai/l
|
|
||||||
xKvRHYqjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8EBTADAQH/MB0GA1Ud
|
|
||||||
DgQWBBSATNbrdP9JNqPV2Py1PsVq8JQdjDAKBggqhkjOPQQDAwNnADBkAjBqUFJ0
|
|
||||||
CMRw3J5QdCHojXohw0+WbhXRIjVhLfoIN+4Zba3bssx9BzT1YBkstTTZbyACMANx
|
|
||||||
sbqjYAuG7ZoIapVon+Kz4ZNkfF6Tpt95LY2F45TPI11xzPKwTdb+mciUqXWi4w==
|
|
||||||
-----END CERTIFICATE-----
|
|
||||||
|
|
||||||
# Issuer: CN=UCA Global G2 Root O=UniTrust
|
# Issuer: CN=UCA Global G2 Root O=UniTrust
|
||||||
# Subject: CN=UCA Global G2 Root O=UniTrust
|
# Subject: CN=UCA Global G2 Root O=UniTrust
|
||||||
# Label: "UCA Global G2 Root"
|
# Label: "UCA Global G2 Root"
|
||||||
|
@ -4360,3 +4059,650 @@ AgGGMAoGCCqGSM49BAMDA2cAMGQCMBHervjcToiwqfAircJRQO9gcS3ujwLEXQNw
|
||||||
SaSS6sUUiHCm0w2wqsosQJz76YJumgIwK0eaB8bRwoF8yguWGEEbo/QwCZ61IygN
|
SaSS6sUUiHCm0w2wqsosQJz76YJumgIwK0eaB8bRwoF8yguWGEEbo/QwCZ61IygN
|
||||||
nxS2PFOiTAZpffpskcYqSUXm7LcT4Tps
|
nxS2PFOiTAZpffpskcYqSUXm7LcT4Tps
|
||||||
-----END CERTIFICATE-----
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=Autoridad de Certificacion Firmaprofesional CIF A62634068
|
||||||
|
# Subject: CN=Autoridad de Certificacion Firmaprofesional CIF A62634068
|
||||||
|
# Label: "Autoridad de Certificacion Firmaprofesional CIF A62634068"
|
||||||
|
# Serial: 1977337328857672817
|
||||||
|
# MD5 Fingerprint: 4e:6e:9b:54:4c:ca:b7:fa:48:e4:90:b1:15:4b:1c:a3
|
||||||
|
# SHA1 Fingerprint: 0b:be:c2:27:22:49:cb:39:aa:db:35:5c:53:e3:8c:ae:78:ff:b6:fe
|
||||||
|
# SHA256 Fingerprint: 57:de:05:83:ef:d2:b2:6e:03:61:da:99:da:9d:f4:64:8d:ef:7e:e8:44:1c:3b:72:8a:fa:9b:cd:e0:f9:b2:6a
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIGFDCCA/ygAwIBAgIIG3Dp0v+ubHEwDQYJKoZIhvcNAQELBQAwUTELMAkGA1UE
|
||||||
|
BhMCRVMxQjBABgNVBAMMOUF1dG9yaWRhZCBkZSBDZXJ0aWZpY2FjaW9uIEZpcm1h
|
||||||
|
cHJvZmVzaW9uYWwgQ0lGIEE2MjYzNDA2ODAeFw0xNDA5MjMxNTIyMDdaFw0zNjA1
|
||||||
|
MDUxNTIyMDdaMFExCzAJBgNVBAYTAkVTMUIwQAYDVQQDDDlBdXRvcmlkYWQgZGUg
|
||||||
|
Q2VydGlmaWNhY2lvbiBGaXJtYXByb2Zlc2lvbmFsIENJRiBBNjI2MzQwNjgwggIi
|
||||||
|
MA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKlmuO6vj78aI14H9M2uDDUtd9
|
||||||
|
thDIAl6zQyrET2qyyhxdKJp4ERppWVevtSBC5IsP5t9bpgOSL/UR5GLXMnE42QQM
|
||||||
|
cas9UX4PB99jBVzpv5RvwSmCwLTaUbDBPLutN0pcyvFLNg4kq7/DhHf9qFD0sefG
|
||||||
|
L9ItWY16Ck6WaVICqjaY7Pz6FIMMNx/Jkjd/14Et5cS54D40/mf0PmbR0/RAz15i
|
||||||
|
NA9wBj4gGFrO93IbJWyTdBSTo3OxDqqHECNZXyAFGUftaI6SEspd/NYrspI8IM/h
|
||||||
|
X68gvqB2f3bl7BqGYTM+53u0P6APjqK5am+5hyZvQWyIplD9amML9ZMWGxmPsu2b
|
||||||
|
m8mQ9QEM3xk9Dz44I8kvjwzRAv4bVdZO0I08r0+k8/6vKtMFnXkIoctXMbScyJCy
|
||||||
|
Z/QYFpM6/EfY0XiWMR+6KwxfXZmtY4laJCB22N/9q06mIqqdXuYnin1oKaPnirja
|
||||||
|
EbsXLZmdEyRG98Xi2J+Of8ePdG1asuhy9azuJBCtLxTa/y2aRnFHvkLfuwHb9H/T
|
||||||
|
KI8xWVvTyQKmtFLKbpf7Q8UIJm+K9Lv9nyiqDdVF8xM6HdjAeI9BZzwelGSuewvF
|
||||||
|
6NkBiDkal4ZkQdU7hwxu+g/GvUgUvzlN1J5Bto+WHWOWk9mVBngxaJ43BjuAiUVh
|
||||||
|
OSPHG0SjFeUc+JIwuwIDAQABo4HvMIHsMB0GA1UdDgQWBBRlzeurNR4APn7VdMAc
|
||||||
|
tHNHDhpkLzASBgNVHRMBAf8ECDAGAQH/AgEBMIGmBgNVHSAEgZ4wgZswgZgGBFUd
|
||||||
|
IAAwgY8wLwYIKwYBBQUHAgEWI2h0dHA6Ly93d3cuZmlybWFwcm9mZXNpb25hbC5j
|
||||||
|
b20vY3BzMFwGCCsGAQUFBwICMFAeTgBQAGEAcwBlAG8AIABkAGUAIABsAGEAIABC
|
||||||
|
AG8AbgBhAG4AbwB2AGEAIAA0ADcAIABCAGEAcgBjAGUAbABvAG4AYQAgADAAOAAw
|
||||||
|
ADEANzAOBgNVHQ8BAf8EBAMCAQYwDQYJKoZIhvcNAQELBQADggIBAHSHKAIrdx9m
|
||||||
|
iWTtj3QuRhy7qPj4Cx2Dtjqn6EWKB7fgPiDL4QjbEwj4KKE1soCzC1HA01aajTNF
|
||||||
|
Sa9J8OA9B3pFE1r/yJfY0xgsfZb43aJlQ3CTkBW6kN/oGbDbLIpgD7dvlAceHabJ
|
||||||
|
hfa9NPhAeGIQcDq+fUs5gakQ1JZBu/hfHAsdCPKxsIl68veg4MSPi3i1O1ilI45P
|
||||||
|
Vf42O+AMt8oqMEEgtIDNrvx2ZnOorm7hfNoD6JQg5iKj0B+QXSBTFCZX2lSX3xZE
|
||||||
|
EAEeiGaPcjiT3SC3NL7X8e5jjkd5KAb881lFJWAiMxujX6i6KtoaPc1A6ozuBRWV
|
||||||
|
1aUsIC+nmCjuRfzxuIgALI9C2lHVnOUTaHFFQ4ueCyE8S1wF3BqfmI7avSKecs2t
|
||||||
|
CsvMo2ebKHTEm9caPARYpoKdrcd7b/+Alun4jWq9GJAd/0kakFI3ky88Al2CdgtR
|
||||||
|
5xbHV/g4+afNmyJU72OwFW1TZQNKXkqgsqeOSQBZONXH9IBk9W6VULgRfhVwOEqw
|
||||||
|
f9DEMnDAGf/JOC0ULGb0QkTmVXYbgBVX/8Cnp6o5qtjTcNAuuuuUavpfNIbnYrX9
|
||||||
|
ivAwhZTJryQCL2/W3Wf+47BVTwSYT6RBVuKT0Gro1vP7ZeDOdcQxWQzugsgMYDNK
|
||||||
|
GbqEZycPvEJdvSRUDewdcAZfpLz6IHxV
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=vTrus ECC Root CA O=iTrusChina Co.,Ltd.
|
||||||
|
# Subject: CN=vTrus ECC Root CA O=iTrusChina Co.,Ltd.
|
||||||
|
# Label: "vTrus ECC Root CA"
|
||||||
|
# Serial: 630369271402956006249506845124680065938238527194
|
||||||
|
# MD5 Fingerprint: de:4b:c1:f5:52:8c:9b:43:e1:3e:8f:55:54:17:8d:85
|
||||||
|
# SHA1 Fingerprint: f6:9c:db:b0:fc:f6:02:13:b6:52:32:a6:a3:91:3f:16:70:da:c3:e1
|
||||||
|
# SHA256 Fingerprint: 30:fb:ba:2c:32:23:8e:2a:98:54:7a:f9:79:31:e5:50:42:8b:9b:3f:1c:8e:eb:66:33:dc:fa:86:c5:b2:7d:d3
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIICDzCCAZWgAwIBAgIUbmq8WapTvpg5Z6LSa6Q75m0c1towCgYIKoZIzj0EAwMw
|
||||||
|
RzELMAkGA1UEBhMCQ04xHDAaBgNVBAoTE2lUcnVzQ2hpbmEgQ28uLEx0ZC4xGjAY
|
||||||
|
BgNVBAMTEXZUcnVzIEVDQyBSb290IENBMB4XDTE4MDczMTA3MjY0NFoXDTQzMDcz
|
||||||
|
MTA3MjY0NFowRzELMAkGA1UEBhMCQ04xHDAaBgNVBAoTE2lUcnVzQ2hpbmEgQ28u
|
||||||
|
LEx0ZC4xGjAYBgNVBAMTEXZUcnVzIEVDQyBSb290IENBMHYwEAYHKoZIzj0CAQYF
|
||||||
|
K4EEACIDYgAEZVBKrox5lkqqHAjDo6LN/llWQXf9JpRCux3NCNtzslt188+cToL0
|
||||||
|
v/hhJoVs1oVbcnDS/dtitN9Ti72xRFhiQgnH+n9bEOf+QP3A2MMrMudwpremIFUd
|
||||||
|
e4BdS49nTPEQo0IwQDAdBgNVHQ4EFgQUmDnNvtiyjPeyq+GtJK97fKHbH88wDwYD
|
||||||
|
VR0TAQH/BAUwAwEB/zAOBgNVHQ8BAf8EBAMCAQYwCgYIKoZIzj0EAwMDaAAwZQIw
|
||||||
|
V53dVvHH4+m4SVBrm2nDb+zDfSXkV5UTQJtS0zvzQBm8JsctBp61ezaf9SXUY2sA
|
||||||
|
AjEA6dPGnlaaKsyh2j/IZivTWJwghfqrkYpwcBE4YGQLYgmRWAD5Tfs0aNoJrSEG
|
||||||
|
GJTO
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=vTrus Root CA O=iTrusChina Co.,Ltd.
|
||||||
|
# Subject: CN=vTrus Root CA O=iTrusChina Co.,Ltd.
|
||||||
|
# Label: "vTrus Root CA"
|
||||||
|
# Serial: 387574501246983434957692974888460947164905180485
|
||||||
|
# MD5 Fingerprint: b8:c9:37:df:fa:6b:31:84:64:c5:ea:11:6a:1b:75:fc
|
||||||
|
# SHA1 Fingerprint: 84:1a:69:fb:f5:cd:1a:25:34:13:3d:e3:f8:fc:b8:99:d0:c9:14:b7
|
||||||
|
# SHA256 Fingerprint: 8a:71:de:65:59:33:6f:42:6c:26:e5:38:80:d0:0d:88:a1:8d:a4:c6:a9:1f:0d:cb:61:94:e2:06:c5:c9:63:87
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIFVjCCAz6gAwIBAgIUQ+NxE9izWRRdt86M/TX9b7wFjUUwDQYJKoZIhvcNAQEL
|
||||||
|
BQAwQzELMAkGA1UEBhMCQ04xHDAaBgNVBAoTE2lUcnVzQ2hpbmEgQ28uLEx0ZC4x
|
||||||
|
FjAUBgNVBAMTDXZUcnVzIFJvb3QgQ0EwHhcNMTgwNzMxMDcyNDA1WhcNNDMwNzMx
|
||||||
|
MDcyNDA1WjBDMQswCQYDVQQGEwJDTjEcMBoGA1UEChMTaVRydXNDaGluYSBDby4s
|
||||||
|
THRkLjEWMBQGA1UEAxMNdlRydXMgUm9vdCBDQTCCAiIwDQYJKoZIhvcNAQEBBQAD
|
||||||
|
ggIPADCCAgoCggIBAL1VfGHTuB0EYgWgrmy3cLRB6ksDXhA/kFocizuwZotsSKYc
|
||||||
|
IrrVQJLuM7IjWcmOvFjai57QGfIvWcaMY1q6n6MLsLOaXLoRuBLpDLvPbmyAhykU
|
||||||
|
AyyNJJrIZIO1aqwTLDPxn9wsYTwaP3BVm60AUn/PBLn+NvqcwBauYv6WTEN+VRS+
|
||||||
|
GrPSbcKvdmaVayqwlHeFXgQPYh1jdfdr58tbmnDsPmcF8P4HCIDPKNsFxhQnL4Z9
|
||||||
|
8Cfe/+Z+M0jnCx5Y0ScrUw5XSmXX+6KAYPxMvDVTAWqXcoKv8R1w6Jz1717CbMdH
|
||||||
|
flqUhSZNO7rrTOiwCcJlwp2dCZtOtZcFrPUGoPc2BX70kLJrxLT5ZOrpGgrIDajt
|
||||||
|
J8nU57O5q4IikCc9Kuh8kO+8T/3iCiSn3mUkpF3qwHYw03dQ+A0Em5Q2AXPKBlim
|
||||||
|
0zvc+gRGE1WKyURHuFE5Gi7oNOJ5y1lKCn+8pu8fA2dqWSslYpPZUxlmPCdiKYZN
|
||||||
|
pGvu/9ROutW04o5IWgAZCfEF2c6Rsffr6TlP9m8EQ5pV9T4FFL2/s1m02I4zhKOQ
|
||||||
|
UqqzApVg+QxMaPnu1RcN+HFXtSXkKe5lXa/R7jwXC1pDxaWG6iSe4gUH3DRCEpHW
|
||||||
|
OXSuTEGC2/KmSNGzm/MzqvOmwMVO9fSddmPmAsYiS8GVP1BkLFTltvA8Kc9XAgMB
|
||||||
|
AAGjQjBAMB0GA1UdDgQWBBRUYnBj8XWEQ1iO0RYgscasGrz2iTAPBgNVHRMBAf8E
|
||||||
|
BTADAQH/MA4GA1UdDwEB/wQEAwIBBjANBgkqhkiG9w0BAQsFAAOCAgEAKbqSSaet
|
||||||
|
8PFww+SX8J+pJdVrnjT+5hpk9jprUrIQeBqfTNqK2uwcN1LgQkv7bHbKJAs5EhWd
|
||||||
|
nxEt/Hlk3ODg9d3gV8mlsnZwUKT+twpw1aA08XXXTUm6EdGz2OyC/+sOxL9kLX1j
|
||||||
|
bhd47F18iMjrjld22VkE+rxSH0Ws8HqA7Oxvdq6R2xCOBNyS36D25q5J08FsEhvM
|
||||||
|
Kar5CKXiNxTKsbhm7xqC5PD48acWabfbqWE8n/Uxy+QARsIvdLGx14HuqCaVvIiv
|
||||||
|
TDUHKgLKeBRtRytAVunLKmChZwOgzoy8sHJnxDHO2zTlJQNgJXtxmOTAGytfdELS
|
||||||
|
S8VZCAeHvsXDf+eW2eHcKJfWjwXj9ZtOyh1QRwVTsMo554WgicEFOwE30z9J4nfr
|
||||||
|
I8iIZjs9OXYhRvHsXyO466JmdXTBQPfYaJqT4i2pLr0cox7IdMakLXogqzu4sEb9
|
||||||
|
b91fUlV1YvCXoHzXOP0l382gmxDPi7g4Xl7FtKYCNqEeXxzP4padKar9mK5S4fNB
|
||||||
|
UvupLnKWnyfjqnN9+BojZns7q2WwMgFLFT49ok8MKzWixtlnEjUwzXYuFrOZnk1P
|
||||||
|
Ti07NEPhmg4NpGaXutIcSkwsKouLgU9xGqndXHt7CMUADTdA43x7VF8vhV929ven
|
||||||
|
sBxXVsFy6K2ir40zSbofitzmdHxghm+Hl3s=
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=ISRG Root X2 O=Internet Security Research Group
|
||||||
|
# Subject: CN=ISRG Root X2 O=Internet Security Research Group
|
||||||
|
# Label: "ISRG Root X2"
|
||||||
|
# Serial: 87493402998870891108772069816698636114
|
||||||
|
# MD5 Fingerprint: d3:9e:c4:1e:23:3c:a6:df:cf:a3:7e:6d:e0:14:e6:e5
|
||||||
|
# SHA1 Fingerprint: bd:b1:b9:3c:d5:97:8d:45:c6:26:14:55:f8:db:95:c7:5a:d1:53:af
|
||||||
|
# SHA256 Fingerprint: 69:72:9b:8e:15:a8:6e:fc:17:7a:57:af:b7:17:1d:fc:64:ad:d2:8c:2f:ca:8c:f1:50:7e:34:45:3c:cb:14:70
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIICGzCCAaGgAwIBAgIQQdKd0XLq7qeAwSxs6S+HUjAKBggqhkjOPQQDAzBPMQsw
|
||||||
|
CQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJuZXQgU2VjdXJpdHkgUmVzZWFyY2gg
|
||||||
|
R3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBYMjAeFw0yMDA5MDQwMDAwMDBaFw00
|
||||||
|
MDA5MTcxNjAwMDBaME8xCzAJBgNVBAYTAlVTMSkwJwYDVQQKEyBJbnRlcm5ldCBT
|
||||||
|
ZWN1cml0eSBSZXNlYXJjaCBHcm91cDEVMBMGA1UEAxMMSVNSRyBSb290IFgyMHYw
|
||||||
|
EAYHKoZIzj0CAQYFK4EEACIDYgAEzZvVn4CDCuwJSvMWSj5cz3es3mcFDR0HttwW
|
||||||
|
+1qLFNvicWDEukWVEYmO6gbf9yoWHKS5xcUy4APgHoIYOIvXRdgKam7mAHf7AlF9
|
||||||
|
ItgKbppbd9/w+kHsOdx1ymgHDB/qo0IwQDAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0T
|
||||||
|
AQH/BAUwAwEB/zAdBgNVHQ4EFgQUfEKWrt5LSDv6kviejM9ti6lyN5UwCgYIKoZI
|
||||||
|
zj0EAwMDaAAwZQIwe3lORlCEwkSHRhtFcP9Ymd70/aTSVaYgLXTWNLxBo1BfASdW
|
||||||
|
tL4ndQavEi51mI38AjEAi/V3bNTIZargCyzuFJ0nN6T5U6VR5CmD1/iQMVtCnwr1
|
||||||
|
/q4AaOeMSQ+2b1tbFfLn
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=HiPKI Root CA - G1 O=Chunghwa Telecom Co., Ltd.
|
||||||
|
# Subject: CN=HiPKI Root CA - G1 O=Chunghwa Telecom Co., Ltd.
|
||||||
|
# Label: "HiPKI Root CA - G1"
|
||||||
|
# Serial: 60966262342023497858655262305426234976
|
||||||
|
# MD5 Fingerprint: 69:45:df:16:65:4b:e8:68:9a:8f:76:5f:ff:80:9e:d3
|
||||||
|
# SHA1 Fingerprint: 6a:92:e4:a8:ee:1b:ec:96:45:37:e3:29:57:49:cd:96:e3:e5:d2:60
|
||||||
|
# SHA256 Fingerprint: f0:15:ce:3c:c2:39:bf:ef:06:4b:e9:f1:d2:c4:17:e1:a0:26:4a:0a:94:be:1f:0c:8d:12:18:64:eb:69:49:cc
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIFajCCA1KgAwIBAgIQLd2szmKXlKFD6LDNdmpeYDANBgkqhkiG9w0BAQsFADBP
|
||||||
|
MQswCQYDVQQGEwJUVzEjMCEGA1UECgwaQ2h1bmdod2EgVGVsZWNvbSBDby4sIEx0
|
||||||
|
ZC4xGzAZBgNVBAMMEkhpUEtJIFJvb3QgQ0EgLSBHMTAeFw0xOTAyMjIwOTQ2MDRa
|
||||||
|
Fw0zNzEyMzExNTU5NTlaME8xCzAJBgNVBAYTAlRXMSMwIQYDVQQKDBpDaHVuZ2h3
|
||||||
|
YSBUZWxlY29tIENvLiwgTHRkLjEbMBkGA1UEAwwSSGlQS0kgUm9vdCBDQSAtIEcx
|
||||||
|
MIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEA9B5/UnMyDHPkvRN0o9Qw
|
||||||
|
qNCuS9i233VHZvR85zkEHmpwINJaR3JnVfSl6J3VHiGh8Ge6zCFovkRTv4354twv
|
||||||
|
Vcg3Px+kwJyz5HdcoEb+d/oaoDjq7Zpy3iu9lFc6uux55199QmQ5eiY29yTw1S+6
|
||||||
|
lZgRZq2XNdZ1AYDgr/SEYYwNHl98h5ZeQa/rh+r4XfEuiAU+TCK72h8q3VJGZDnz
|
||||||
|
Qs7ZngyzsHeXZJzA9KMuH5UHsBffMNsAGJZMoYFL3QRtU6M9/Aes1MU3guvklQgZ
|
||||||
|
KILSQjqj2FPseYlgSGDIcpJQ3AOPgz+yQlda22rpEZfdhSi8MEyr48KxRURHH+CK
|
||||||
|
FgeW0iEPU8DtqX7UTuybCeyvQqww1r/REEXgphaypcXTT3OUM3ECoWqj1jOXTyFj
|
||||||
|
HluP2cFeRXF3D4FdXyGarYPM+l7WjSNfGz1BryB1ZlpK9p/7qxj3ccC2HTHsOyDr
|
||||||
|
y+K49a6SsvfhhEvyovKTmiKe0xRvNlS9H15ZFblzqMF8b3ti6RZsR1pl8w4Rm0bZ
|
||||||
|
/W3c1pzAtH2lsN0/Vm+h+fbkEkj9Bn8SV7apI09bA8PgcSojt/ewsTu8mL3WmKgM
|
||||||
|
a/aOEmem8rJY5AIJEzypuxC00jBF8ez3ABHfZfjcK0NVvxaXxA/VLGGEqnKG/uY6
|
||||||
|
fsI/fe78LxQ+5oXdUG+3Se0CAwEAAaNCMEAwDwYDVR0TAQH/BAUwAwEB/zAdBgNV
|
||||||
|
HQ4EFgQU8ncX+l6o/vY9cdVouslGDDjYr7AwDgYDVR0PAQH/BAQDAgGGMA0GCSqG
|
||||||
|
SIb3DQEBCwUAA4ICAQBQUfB13HAE4/+qddRxosuej6ip0691x1TPOhwEmSKsxBHi
|
||||||
|
7zNKpiMdDg1H2DfHb680f0+BazVP6XKlMeJ45/dOlBhbQH3PayFUhuaVevvGyuqc
|
||||||
|
SE5XCV0vrPSltJczWNWseanMX/mF+lLFjfiRFOs6DRfQUsJ748JzjkZ4Bjgs6Fza
|
||||||
|
ZsT0pPBWGTMpWmWSBUdGSquEwx4noR8RkpkndZMPvDY7l1ePJlsMu5wP1G4wB9Tc
|
||||||
|
XzZoZjmDlicmisjEOf6aIW/Vcobpf2Lll07QJNBAsNB1CI69aO4I1258EHBGG3zg
|
||||||
|
iLKecoaZAeO/n0kZtCW+VmWuF2PlHt/o/0elv+EmBYTksMCv5wiZqAxeJoBF1Pho
|
||||||
|
L5aPruJKHJwWDBNvOIf2u8g0X5IDUXlwpt/L9ZlNec1OvFefQ05rLisY+GpzjLrF
|
||||||
|
Ne85akEez3GoorKGB1s6yeHvP2UEgEcyRHCVTjFnanRbEEV16rCf0OY1/k6fi8wr
|
||||||
|
kkVbbiVghUbN0aqwdmaTd5a+g744tiROJgvM7XpWGuDpWsZkrUx6AEhEL7lAuxM+
|
||||||
|
vhV4nYWBSipX3tUZQ9rbyltHhoMLP7YNdnhzeSJesYAfz77RP1YQmCuVh6EfnWQU
|
||||||
|
YDksswBVLuT1sw5XxJFBAJw/6KXf6vb/yPCtbVKoF6ubYfwSUTXkJf2vqmqGOQ==
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R4
|
||||||
|
# Subject: CN=GlobalSign O=GlobalSign OU=GlobalSign ECC Root CA - R4
|
||||||
|
# Label: "GlobalSign ECC Root CA - R4"
|
||||||
|
# Serial: 159662223612894884239637590694
|
||||||
|
# MD5 Fingerprint: 26:29:f8:6d:e1:88:bf:a2:65:7f:aa:c4:cd:0f:7f:fc
|
||||||
|
# SHA1 Fingerprint: 6b:a0:b0:98:e1:71:ef:5a:ad:fe:48:15:80:77:10:f4:bd:6f:0b:28
|
||||||
|
# SHA256 Fingerprint: b0:85:d7:0b:96:4f:19:1a:73:e4:af:0d:54:ae:7a:0e:07:aa:fd:af:9b:71:dd:08:62:13:8a:b7:32:5a:24:a2
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIB3DCCAYOgAwIBAgINAgPlfvU/k/2lCSGypjAKBggqhkjOPQQDAjBQMSQwIgYD
|
||||||
|
VQQLExtHbG9iYWxTaWduIEVDQyBSb290IENBIC0gUjQxEzARBgNVBAoTCkdsb2Jh
|
||||||
|
bFNpZ24xEzARBgNVBAMTCkdsb2JhbFNpZ24wHhcNMTIxMTEzMDAwMDAwWhcNMzgw
|
||||||
|
MTE5MDMxNDA3WjBQMSQwIgYDVQQLExtHbG9iYWxTaWduIEVDQyBSb290IENBIC0g
|
||||||
|
UjQxEzARBgNVBAoTCkdsb2JhbFNpZ24xEzARBgNVBAMTCkdsb2JhbFNpZ24wWTAT
|
||||||
|
BgcqhkjOPQIBBggqhkjOPQMBBwNCAAS4xnnTj2wlDp8uORkcA6SumuU5BwkWymOx
|
||||||
|
uYb4ilfBV85C+nOh92VC/x7BALJucw7/xyHlGKSq2XE/qNS5zowdo0IwQDAOBgNV
|
||||||
|
HQ8BAf8EBAMCAYYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4EFgQUVLB7rUW44kB/
|
||||||
|
+wpu+74zyTyjhNUwCgYIKoZIzj0EAwIDRwAwRAIgIk90crlgr/HmnKAWBVBfw147
|
||||||
|
bmF0774BxL4YSFlhgjICICadVGNA3jdgUM/I2O2dgq43mLyjj0xMqTQrbO/7lZsm
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
|
||||||
|
# Issuer: CN=GTS Root R1 O=Google Trust Services LLC
|
||||||
|
# Subject: CN=GTS Root R1 O=Google Trust Services LLC
|
||||||
|
# Label: "GTS Root R1"
|
||||||
|
# Serial: 159662320309726417404178440727
|
||||||
|
# MD5 Fingerprint: 05:fe:d0:bf:71:a8:a3:76:63:da:01:e0:d8:52:dc:40
|
||||||
|
# SHA1 Fingerprint: e5:8c:1c:c4:91:3b:38:63:4b:e9:10:6e:e3:ad:8e:6b:9d:d9:81:4a
|
||||||
|
# SHA256 Fingerprint: d9:47:43:2a:bd:e7:b7:fa:90:fc:2e:6b:59:10:1b:12:80:e0:e1:c7:e4:e4:0f:a3:c6:88:7f:ff:57:a7:f4:cf
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIFVzCCAz+gAwIBAgINAgPlk28xsBNJiGuiFzANBgkqhkiG9w0BAQwFADBHMQsw
|
||||||
|
CQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEU
|
||||||
|
MBIGA1UEAxMLR1RTIFJvb3QgUjEwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAw
|
||||||
|
MDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZp
Y2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjEwggIiMA0GCSqGSIb3DQEBAQUA
A4ICDwAwggIKAoICAQC2EQKLHuOhd5s73L+UPreVp0A8of2C+X0yBoJx9vaMf/vo
27xqLpeXo4xL+Sv2sfnOhB2x+cWX3u+58qPpvBKJXqeqUqv4IyfLpLGcY9vXmX7w
Cl7raKb0xlpHDU0QM+NOsROjyBhsS+z8CZDfnWQpJSMHobTSPS5g4M/SCYe7zUjw
TcLCeoiKu7rPWRnWr4+wB7CeMfGCwcDfLqZtbBkOtdh+JhpFAz2weaSUKK0Pfybl
qAj+lug8aJRT7oM6iCsVlgmy4HqMLnXWnOunVmSPlk9orj2XwoSPwLxAwAtcvfaH
szVsrBhQf4TgTM2S0yDpM7xSma8ytSmzJSq0SPly4cpk9+aCEI3oncKKiPo4Zor8
Y/kB+Xj9e1x3+naH+uzfsQ55lVe0vSbv1gHR6xYKu44LtcXFilWr06zqkUspzBmk
MiVOKvFlRNACzqrOSbTqn3yDsEB750Orp2yjj32JgfpMpf/VjsPOS+C12LOORc92
wO1AK/1TD7Cn1TsNsYqiA94xrcx36m97PtbfkSIS5r762DL8EGMUUXLeXdYWk70p
aDPvOmbsB4om3xPXV2V4J95eSRQAogB/mqghtqmxlbCluQ0WEdrHbEg8QOB+DVrN
VjzRlwW5y0vtOUucxD/SVRNuJLDWcfr0wbrM7Rv1/oFB2ACYPTrIrnqYNxgFlQID
AQABo0IwQDAOBgNVHQ8BAf8EBAMCAYYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4E
FgQU5K8rJnEaK0gnhS9SZizv8IkTcT4wDQYJKoZIhvcNAQEMBQADggIBAJ+qQibb
C5u+/x6Wki4+omVKapi6Ist9wTrYggoGxval3sBOh2Z5ofmmWJyq+bXmYOfg6LEe
QkEzCzc9zolwFcq1JKjPa7XSQCGYzyI0zzvFIoTgxQ6KfF2I5DUkzps+GlQebtuy
h6f88/qBVRRiClmpIgUxPoLW7ttXNLwzldMXG+gnoot7TiYaelpkttGsN/H9oPM4
7HLwEXWdyzRSjeZ2axfG34arJ45JK3VmgRAhpuo+9K4l/3wV3s6MJT/KYnAK9y8J
ZgfIPxz88NtFMN9iiMG1D53Dn0reWVlHxYciNuaCp+0KueIHoI17eko8cdLiA6Ef
MgfdG+RCzgwARWGAtQsgWSl4vflVy2PFPEz0tv/bal8xa5meLMFrUKTX5hgUvYU/
Z6tGn6D/Qqc6f1zLXbBwHSs09dR2CQzreExZBfMzQsNhFRAbd03OIozUhfJFfbdT
6u9AWpQKXCBfTkBdYiJ23//OYb2MI3jSNwLgjt7RETeJ9r/tSQdirpLsQBqvFAnZ
0E6yove+7u7Y/9waLd64NnHi/Hm3lCXRSHNboTXns5lndcEZOitHTtNCjv0xyBZm
2tIMPNuzjsmhDYAPexZ3FL//2wmUspO8IFgV6dtxQ/PeEMMA3KgqlbbC1j+Qa3bb
bP6MvPJwNQzcmRk13NfIRmPVNnGuV/u3gm3c
-----END CERTIFICATE-----

# Issuer: CN=GTS Root R2 O=Google Trust Services LLC
# Subject: CN=GTS Root R2 O=Google Trust Services LLC
# Label: "GTS Root R2"
# Serial: 159662449406622349769042896298
# MD5 Fingerprint: 1e:39:c0:53:e6:1e:29:82:0b:ca:52:55:36:5d:57:dc
# SHA1 Fingerprint: 9a:44:49:76:32:db:de:fa:d0:bc:fb:5a:7b:17:bd:9e:56:09:24:94
# SHA256 Fingerprint: 8d:25:cd:97:22:9d:bf:70:35:6b:da:4e:b3:cc:73:40:31:e2:4c:f0:0f:af:cf:d3:2d:c7:6e:b5:84:1c:7e:a8
-----BEGIN CERTIFICATE-----
MIIFVzCCAz+gAwIBAgINAgPlrsWNBCUaqxElqjANBgkqhkiG9w0BAQwFADBHMQsw
CQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEU
MBIGA1UEAxMLR1RTIFJvb3QgUjIwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAw
MDAwWjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZp
Y2VzIExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjIwggIiMA0GCSqGSIb3DQEBAQUA
A4ICDwAwggIKAoICAQDO3v2m++zsFDQ8BwZabFn3GTXd98GdVarTzTukk3LvCvpt
nfbwhYBboUhSnznFt+4orO/LdmgUud+tAWyZH8QiHZ/+cnfgLFuv5AS/T3KgGjSY
6Dlo7JUle3ah5mm5hRm9iYz+re026nO8/4Piy33B0s5Ks40FnotJk9/BW9BuXvAu
MC6C/Pq8tBcKSOWIm8Wba96wyrQD8Nr0kLhlZPdcTK3ofmZemde4wj7I0BOdre7k
RXuJVfeKH2JShBKzwkCX44ofR5GmdFrS+LFjKBC4swm4VndAoiaYecb+3yXuPuWg
f9RhD1FLPD+M2uFwdNjCaKH5wQzpoeJ/u1U8dgbuak7MkogwTZq9TwtImoS1mKPV
+3PBV2HdKFZ1E66HjucMUQkQdYhMvI35ezzUIkgfKtzra7tEscszcTJGr61K8Yzo
dDqs5xoic4DSMPclQsciOzsSrZYuxsN2B6ogtzVJV+mSSeh2FnIxZyuWfoqjx5RW
Ir9qS34BIbIjMt/kmkRtWVtd9QCgHJvGeJeNkP+byKq0rxFROV7Z+2et1VsRnTKa
G73VululycslaVNVJ1zgyjbLiGH7HrfQy+4W+9OmTN6SpdTi3/UGVN4unUu0kzCq
gc7dGtxRcw1PcOnlthYhGXmy5okLdWTK1au8CcEYof/UVKGFPP0UJAOyh9OktwID
AQABo0IwQDAOBgNVHQ8BAf8EBAMCAYYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4E
FgQUu//KjiOfT5nK2+JopqUVJxce2Q4wDQYJKoZIhvcNAQEMBQADggIBAB/Kzt3H
vqGf2SdMC9wXmBFqiN495nFWcrKeGk6c1SuYJF2ba3uwM4IJvd8lRuqYnrYb/oM8
0mJhwQTtzuDFycgTE1XnqGOtjHsB/ncw4c5omwX4Eu55MaBBRTUoCnGkJE+M3DyC
B19m3H0Q/gxhswWV7uGugQ+o+MePTagjAiZrHYNSVc61LwDKgEDg4XSsYPWHgJ2u
NmSRXbBoGOqKYcl3qJfEycel/FVL8/B/uWU9J2jQzGv6U53hkRrJXRqWbTKH7QMg
yALOWr7Z6v2yTcQvG99fevX4i8buMTolUVVnjWQye+mew4K6Ki3pHrTgSAai/Gev
HyICc/sgCq+dVEuhzf9gR7A/Xe8bVr2XIZYtCtFenTgCR2y59PYjJbigapordwj6
xLEokCZYCDzifqrXPW+6MYgKBesntaFJ7qBFVHvmJ2WZICGoo7z7GJa7Um8M7YNR
TOlZ4iBgxcJlkoKM8xAfDoqXvneCbT+PHV28SSe9zE8P4c52hgQjxcCMElv924Sg
JPFI/2R80L5cFtHvma3AH/vLrrw4IgYmZNralw4/KBVEqE8AyvCazM90arQ+POuV
7LXTWtiBmelDGDfrs7vRWGJB82bSj6p4lVQgw1oudCvV0b4YacCs1aTPObpRhANl
6WLAYv7YTVWW4tAR+kg0Eeye7QUd5MjWHYbL
-----END CERTIFICATE-----

# Issuer: CN=GTS Root R3 O=Google Trust Services LLC
# Subject: CN=GTS Root R3 O=Google Trust Services LLC
# Label: "GTS Root R3"
# Serial: 159662495401136852707857743206
# MD5 Fingerprint: 3e:e7:9d:58:02:94:46:51:94:e5:e0:22:4a:8b:e7:73
# SHA1 Fingerprint: ed:e5:71:80:2b:c8:92:b9:5b:83:3c:d2:32:68:3f:09:cd:a0:1e:46
# SHA256 Fingerprint: 34:d8:a7:3e:e2:08:d9:bc:db:0d:95:65:20:93:4b:4e:40:e6:94:82:59:6e:8b:6f:73:c8:42:6b:01:0a:6f:48
-----BEGIN CERTIFICATE-----
MIICCTCCAY6gAwIBAgINAgPluILrIPglJ209ZjAKBggqhkjOPQQDAzBHMQswCQYD
VQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEUMBIG
A1UEAxMLR1RTIFJvb3QgUjMwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAwMDAw
WjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2Vz
IExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjMwdjAQBgcqhkjOPQIBBgUrgQQAIgNi
AAQfTzOHMymKoYTey8chWEGJ6ladK0uFxh1MJ7x/JlFyb+Kf1qPKzEUURout736G
jOyxfi//qXGdGIRFBEFVbivqJn+7kAHjSxm65FSWRQmx1WyRRK2EE46ajA2ADDL2
4CejQjBAMA4GA1UdDwEB/wQEAwIBhjAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQW
BBTB8Sa6oC2uhYHP0/EqEr24Cmf9vDAKBggqhkjOPQQDAwNpADBmAjEA9uEglRR7
VKOQFhG/hMjqb2sXnh5GmCCbn9MN2azTL818+FsuVbu/3ZL3pAzcMeGiAjEA/Jdm
ZuVDFhOD3cffL74UOO0BzrEXGhF16b0DjyZ+hOXJYKaV11RZt+cRLInUue4X
-----END CERTIFICATE-----

# Issuer: CN=GTS Root R4 O=Google Trust Services LLC
# Subject: CN=GTS Root R4 O=Google Trust Services LLC
# Label: "GTS Root R4"
# Serial: 159662532700760215368942768210
# MD5 Fingerprint: 43:96:83:77:19:4d:76:b3:9d:65:52:e4:1d:22:a5:e8
# SHA1 Fingerprint: 77:d3:03:67:b5:e0:0c:15:f6:0c:38:61:df:7c:e1:3b:92:46:4d:47
# SHA256 Fingerprint: 34:9d:fa:40:58:c5:e2:63:12:3b:39:8a:e7:95:57:3c:4e:13:13:c8:3f:e6:8f:93:55:6c:d5:e8:03:1b:3c:7d
-----BEGIN CERTIFICATE-----
MIICCTCCAY6gAwIBAgINAgPlwGjvYxqccpBQUjAKBggqhkjOPQQDAzBHMQswCQYD
VQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEUMBIG
A1UEAxMLR1RTIFJvb3QgUjQwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAwMDAw
WjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2Vz
IExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjQwdjAQBgcqhkjOPQIBBgUrgQQAIgNi
AATzdHOnaItgrkO4NcWBMHtLSZ37wWHO5t5GvWvVYRg1rkDdc/eJkTBa6zzuhXyi
QHY7qca4R9gq55KRanPpsXI5nymfopjTX15YhmUPoYRlBtHci8nHc8iMai/lxKvR
HYqjQjBAMA4GA1UdDwEB/wQEAwIBhjAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQW
BBSATNbrdP9JNqPV2Py1PsVq8JQdjDAKBggqhkjOPQQDAwNpADBmAjEA6ED/g94D
9J+uHXqnLrmvT/aDHQ4thQEd0dlq7A/Cr8deVl5c1RxYIigL9zC2L7F8AjEA8GE8
p/SgguMh1YQdc4acLa/KNJvxn7kjNuK8YAOdgLOaVsjh4rsUecrNIdSUtUlD
-----END CERTIFICATE-----

# Issuer: CN=Telia Root CA v2 O=Telia Finland Oyj
# Subject: CN=Telia Root CA v2 O=Telia Finland Oyj
# Label: "Telia Root CA v2"
# Serial: 7288924052977061235122729490515358
# MD5 Fingerprint: 0e:8f:ac:aa:82:df:85:b1:f4:dc:10:1c:fc:99:d9:48
# SHA1 Fingerprint: b9:99:cd:d1:73:50:8a:c4:47:05:08:9c:8c:88:fb:be:a0:2b:40:cd
# SHA256 Fingerprint: 24:2b:69:74:2f:cb:1e:5b:2a:bf:98:89:8b:94:57:21:87:54:4e:5b:4d:99:11:78:65:73:62:1f:6a:74:b8:2c
-----BEGIN CERTIFICATE-----
MIIFdDCCA1ygAwIBAgIPAWdfJ9b+euPkrL4JWwWeMA0GCSqGSIb3DQEBCwUAMEQx
CzAJBgNVBAYTAkZJMRowGAYDVQQKDBFUZWxpYSBGaW5sYW5kIE95ajEZMBcGA1UE
AwwQVGVsaWEgUm9vdCBDQSB2MjAeFw0xODExMjkxMTU1NTRaFw00MzExMjkxMTU1
NTRaMEQxCzAJBgNVBAYTAkZJMRowGAYDVQQKDBFUZWxpYSBGaW5sYW5kIE95ajEZ
MBcGA1UEAwwQVGVsaWEgUm9vdCBDQSB2MjCCAiIwDQYJKoZIhvcNAQEBBQADggIP
ADCCAgoCggIBALLQPwe84nvQa5n44ndp586dpAO8gm2h/oFlH0wnrI4AuhZ76zBq
AMCzdGh+sq/H1WKzej9Qyow2RCRj0jbpDIX2Q3bVTKFgcmfiKDOlyzG4OiIjNLh9
vVYiQJ3q9HsDrWj8soFPmNB06o3lfc1jw6P23pLCWBnglrvFxKk9pXSW/q/5iaq9
lRdU2HhE8Qx3FZLgmEKnpNaqIJLNwaCzlrI6hEKNfdWV5Nbb6WLEWLN5xYzTNTOD
n3WhUidhOPFZPY5Q4L15POdslv5e2QJltI5c0BE0312/UqeBAMN/mUWZFdUXyApT
7GPzmX3MaRKGwhfwAZ6/hLzRUssbkmbOpFPlob/E2wnW5olWK8jjfN7j/4nlNW4o
6GwLI1GpJQXrSPjdscr6bAhR77cYbETKJuFzxokGgeWKrLDiKca5JLNrRBH0pUPC
TEPlcDaMtjNXepUugqD0XBCzYYP2AgWGLnwtbNwDRm41k9V6lS/eINhbfpSQBGq6
WT0EBXWdN6IOLj3rwaRSg/7Qa9RmjtzG6RJOHSpXqhC8fF6CfaamyfItufUXJ63R
DolUK5X6wK0dmBR4M0KGCqlztft0DbcbMBnEWg4cJ7faGND/isgFuvGqHKI3t+ZI
pEYslOqodmJHixBTB0hXbOKSTbauBcvcwUpej6w9GU7C7WB1K9vBykLVAgMBAAGj
YzBhMB8GA1UdIwQYMBaAFHKs5DN5qkWH9v2sHZ7Wxy+G2CQ5MB0GA1UdDgQWBBRy
rOQzeapFh/b9rB2e1scvhtgkOTAOBgNVHQ8BAf8EBAMCAQYwDwYDVR0TAQH/BAUw
AwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAoDtZpwmUPjaE0n4vOaWWl/oRrfxn83EJ
8rKJhGdEr7nv7ZbsnGTbMjBvZ5qsfl+yqwE2foH65IRe0qw24GtixX1LDoJt0nZi
0f6X+J8wfBj5tFJ3gh1229MdqfDBmgC9bXXYfef6xzijnHDoRnkDry5023X4blMM
A8iZGok1GTzTyVR8qPAs5m4HeW9q4ebqkYJpCh3DflminmtGFZhb069GHWLIzoBS
SRE/yQQSwxN8PzuKlts8oB4KtItUsiRnDe+Cy748fdHif64W1lZYudogsYMVoe+K
TTJvQS8TUoKU1xrBeKJR3Stwbbca+few4GeXVtt8YVMJAygCQMez2P2ccGrGKMOF
6eLtGpOg3kuYooQ+BXcBlj37tCAPnHICehIv1aO6UXivKitEZU61/Qrowc15h2Er
3oBXRb9n8ZuRXqWk7FlIEA04x7D6w0RtBPV4UBySllva9bguulvP5fBqnUsvWHMt
Ty3EHD70sz+rFQ47GUGKpMFXEmZxTPpT41frYpUJnlTd0cI8Vzy9OK2YZLe4A5pT
VmBds9hCG1xLEooc6+t9xnppxyd/pPiL8uSUZodL6ZQHCRJ5irLrdATczvREWeAW
ysUsWNc8e89ihmpQfTU2Zqf7N+cox9jQraVplI/owd8k+BsHMYeB2F326CjYSlKA
rBPuUBQemMc=
-----END CERTIFICATE-----

# Issuer: CN=D-TRUST BR Root CA 1 2020 O=D-Trust GmbH
# Subject: CN=D-TRUST BR Root CA 1 2020 O=D-Trust GmbH
# Label: "D-TRUST BR Root CA 1 2020"
# Serial: 165870826978392376648679885835942448534
# MD5 Fingerprint: b5:aa:4b:d5:ed:f7:e3:55:2e:8f:72:0a:f3:75:b8:ed
# SHA1 Fingerprint: 1f:5b:98:f0:e3:b5:f7:74:3c:ed:e6:b0:36:7d:32:cd:f4:09:41:67
# SHA256 Fingerprint: e5:9a:aa:81:60:09:c2:2b:ff:5b:25:ba:d3:7d:f3:06:f0:49:79:7c:1f:81:d8:5a:b0:89:e6:57:bd:8f:00:44
-----BEGIN CERTIFICATE-----
MIIC2zCCAmCgAwIBAgIQfMmPK4TX3+oPyWWa00tNljAKBggqhkjOPQQDAzBIMQsw
CQYDVQQGEwJERTEVMBMGA1UEChMMRC1UcnVzdCBHbWJIMSIwIAYDVQQDExlELVRS
VVNUIEJSIFJvb3QgQ0EgMSAyMDIwMB4XDTIwMDIxMTA5NDUwMFoXDTM1MDIxMTA5
NDQ1OVowSDELMAkGA1UEBhMCREUxFTATBgNVBAoTDEQtVHJ1c3QgR21iSDEiMCAG
A1UEAxMZRC1UUlVTVCBCUiBSb290IENBIDEgMjAyMDB2MBAGByqGSM49AgEGBSuB
BAAiA2IABMbLxyjR+4T1mu9CFCDhQ2tuda38KwOE1HaTJddZO0Flax7mNCq7dPYS
zuht56vkPE4/RAiLzRZxy7+SmfSk1zxQVFKQhYN4lGdnoxwJGT11NIXe7WB9xwy0
QVK5buXuQqOCAQ0wggEJMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFHOREKv/
VbNafAkl1bK6CKBrqx9tMA4GA1UdDwEB/wQEAwIBBjCBxgYDVR0fBIG+MIG7MD6g
PKA6hjhodHRwOi8vY3JsLmQtdHJ1c3QubmV0L2NybC9kLXRydXN0X2JyX3Jvb3Rf
Y2FfMV8yMDIwLmNybDB5oHegdYZzbGRhcDovL2RpcmVjdG9yeS5kLXRydXN0Lm5l
dC9DTj1ELVRSVVNUJTIwQlIlMjBSb290JTIwQ0ElMjAxJTIwMjAyMCxPPUQtVHJ1
c3QlMjBHbWJILEM9REU/Y2VydGlmaWNhdGVyZXZvY2F0aW9ubGlzdDAKBggqhkjO
PQQDAwNpADBmAjEAlJAtE/rhY/hhY+ithXhUkZy4kzg+GkHaQBZTQgjKL47xPoFW
wKrY7RjEsK70PvomAjEA8yjixtsrmfu3Ubgko6SUeho/5jbiA1czijDLgsfWFBHV
dWNbFJWcHwHP2NVypw87
-----END CERTIFICATE-----

# Issuer: CN=D-TRUST EV Root CA 1 2020 O=D-Trust GmbH
# Subject: CN=D-TRUST EV Root CA 1 2020 O=D-Trust GmbH
# Label: "D-TRUST EV Root CA 1 2020"
# Serial: 126288379621884218666039612629459926992
# MD5 Fingerprint: 8c:2d:9d:70:9f:48:99:11:06:11:fb:e9:cb:30:c0:6e
# SHA1 Fingerprint: 61:db:8c:21:59:69:03:90:d8:7c:9c:12:86:54:cf:9d:3d:f4:dd:07
# SHA256 Fingerprint: 08:17:0d:1a:a3:64:53:90:1a:2f:95:92:45:e3:47:db:0c:8d:37:ab:aa:bc:56:b8:1a:a1:00:dc:95:89:70:db
-----BEGIN CERTIFICATE-----
MIIC2zCCAmCgAwIBAgIQXwJB13qHfEwDo6yWjfv/0DAKBggqhkjOPQQDAzBIMQsw
CQYDVQQGEwJERTEVMBMGA1UEChMMRC1UcnVzdCBHbWJIMSIwIAYDVQQDExlELVRS
VVNUIEVWIFJvb3QgQ0EgMSAyMDIwMB4XDTIwMDIxMTEwMDAwMFoXDTM1MDIxMTA5
NTk1OVowSDELMAkGA1UEBhMCREUxFTATBgNVBAoTDEQtVHJ1c3QgR21iSDEiMCAG
A1UEAxMZRC1UUlVTVCBFViBSb290IENBIDEgMjAyMDB2MBAGByqGSM49AgEGBSuB
BAAiA2IABPEL3YZDIBnfl4XoIkqbz52Yv7QFJsnL46bSj8WeeHsxiamJrSc8ZRCC
/N/DnU7wMyPE0jL1HLDfMxddxfCxivnvubcUyilKwg+pf3VlSSowZ/Rk99Yad9rD
wpdhQntJraOCAQ0wggEJMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFH8QARY3
OqQo5FD4pPfsazK2/umLMA4GA1UdDwEB/wQEAwIBBjCBxgYDVR0fBIG+MIG7MD6g
PKA6hjhodHRwOi8vY3JsLmQtdHJ1c3QubmV0L2NybC9kLXRydXN0X2V2X3Jvb3Rf
Y2FfMV8yMDIwLmNybDB5oHegdYZzbGRhcDovL2RpcmVjdG9yeS5kLXRydXN0Lm5l
dC9DTj1ELVRSVVNUJTIwRVYlMjBSb290JTIwQ0ElMjAxJTIwMjAyMCxPPUQtVHJ1
c3QlMjBHbWJILEM9REU/Y2VydGlmaWNhdGVyZXZvY2F0aW9ubGlzdDAKBggqhkjO
PQQDAwNpADBmAjEAyjzGKnXCXnViOTYAYFqLwZOZzNnbQTs7h5kXO9XMT8oi96CA
y/m0sRtW9XLS/BnRAjEAkfcwkz8QRitxpNA7RJvAKQIFskF3UfN5Wp6OFKBOQtJb
gfM0agPnIjhQW+0ZT0MW
-----END CERTIFICATE-----

# Issuer: CN=DigiCert TLS ECC P384 Root G5 O=DigiCert, Inc.
# Subject: CN=DigiCert TLS ECC P384 Root G5 O=DigiCert, Inc.
# Label: "DigiCert TLS ECC P384 Root G5"
# Serial: 13129116028163249804115411775095713523
# MD5 Fingerprint: d3:71:04:6a:43:1c:db:a6:59:e1:a8:a3:aa:c5:71:ed
# SHA1 Fingerprint: 17:f3:de:5e:9f:0f:19:e9:8e:f6:1f:32:26:6e:20:c4:07:ae:30:ee
# SHA256 Fingerprint: 01:8e:13:f0:77:25:32:cf:80:9b:d1:b1:72:81:86:72:83:fc:48:c6:e1:3b:e9:c6:98:12:85:4a:49:0c:1b:05
-----BEGIN CERTIFICATE-----
MIICGTCCAZ+gAwIBAgIQCeCTZaz32ci5PhwLBCou8zAKBggqhkjOPQQDAzBOMQsw
CQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xJjAkBgNVBAMTHURp
Z2lDZXJ0IFRMUyBFQ0MgUDM4NCBSb290IEc1MB4XDTIxMDExNTAwMDAwMFoXDTQ2
MDExNDIzNTk1OVowTjELMAkGA1UEBhMCVVMxFzAVBgNVBAoTDkRpZ2lDZXJ0LCBJ
bmMuMSYwJAYDVQQDEx1EaWdpQ2VydCBUTFMgRUNDIFAzODQgUm9vdCBHNTB2MBAG
ByqGSM49AgEGBSuBBAAiA2IABMFEoc8Rl1Ca3iOCNQfN0MsYndLxf3c1TzvdlHJS
7cI7+Oz6e2tYIOyZrsn8aLN1udsJ7MgT9U7GCh1mMEy7H0cKPGEQQil8pQgO4CLp
0zVozptjn4S1mU1YoI71VOeVyaNCMEAwHQYDVR0OBBYEFMFRRVBZqz7nLFr6ICIS
B4CIfBFqMA4GA1UdDwEB/wQEAwIBhjAPBgNVHRMBAf8EBTADAQH/MAoGCCqGSM49
BAMDA2gAMGUCMQCJao1H5+z8blUD2WdsJk6Dxv3J+ysTvLd6jLRl0mlpYxNjOyZQ
LgGheQaRnUi/wr4CMEfDFXuxoJGZSZOoPHzoRgaLLPIxAJSdYsiJvRmEFOml+wG4
DXZDjC5Ty3zfDBeWUA==
-----END CERTIFICATE-----

# Issuer: CN=DigiCert TLS RSA4096 Root G5 O=DigiCert, Inc.
# Subject: CN=DigiCert TLS RSA4096 Root G5 O=DigiCert, Inc.
# Label: "DigiCert TLS RSA4096 Root G5"
# Serial: 11930366277458970227240571539258396554
# MD5 Fingerprint: ac:fe:f7:34:96:a9:f2:b3:b4:12:4b:e4:27:41:6f:e1
# SHA1 Fingerprint: a7:88:49:dc:5d:7c:75:8c:8c:de:39:98:56:b3:aa:d0:b2:a5:71:35
# SHA256 Fingerprint: 37:1a:00:dc:05:33:b3:72:1a:7e:eb:40:e8:41:9e:70:79:9d:2b:0a:0f:2c:1d:80:69:31:65:f7:ce:c4:ad:75
-----BEGIN CERTIFICATE-----
MIIFZjCCA06gAwIBAgIQCPm0eKj6ftpqMzeJ3nzPijANBgkqhkiG9w0BAQwFADBN
MQswCQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xJTAjBgNVBAMT
HERpZ2lDZXJ0IFRMUyBSU0E0MDk2IFJvb3QgRzUwHhcNMjEwMTE1MDAwMDAwWhcN
NDYwMTE0MjM1OTU5WjBNMQswCQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNlcnQs
IEluYy4xJTAjBgNVBAMTHERpZ2lDZXJ0IFRMUyBSU0E0MDk2IFJvb3QgRzUwggIi
MA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCz0PTJeRGd/fxmgefM1eS87IE+
ajWOLrfn3q/5B03PMJ3qCQuZvWxX2hhKuHisOjmopkisLnLlvevxGs3npAOpPxG0
2C+JFvuUAT27L/gTBaF4HI4o4EXgg/RZG5Wzrn4DReW+wkL+7vI8toUTmDKdFqgp
wgscONyfMXdcvyej/Cestyu9dJsXLfKB2l2w4SMXPohKEiPQ6s+d3gMXsUJKoBZM
pG2T6T867jp8nVid9E6P/DsjyG244gXazOvswzH016cpVIDPRFtMbzCe88zdH5RD
nU1/cHAN1DrRN/BsnZvAFJNY781BOHW8EwOVfH/jXOnVDdXifBBiqmvwPXbzP6Po
sMH976pXTayGpxi0KcEsDr9kvimM2AItzVwv8n/vFfQMFawKsPHTDU9qTXeXAaDx
Zre3zu/O7Oyldcqs4+Fj97ihBMi8ez9dLRYiVu1ISf6nL3kwJZu6ay0/nTvEF+cd
Lvvyz6b84xQslpghjLSR6Rlgg/IwKwZzUNWYOwbpx4oMYIwo+FKbbuH2TbsGJJvX
KyY//SovcfXWJL5/MZ4PbeiPT02jP/816t9JXkGPhvnxd3lLG7SjXi/7RgLQZhNe
XoVPzthwiHvOAbWWl9fNff2C+MIkwcoBOU+NosEUQB+cZtUMCUbW8tDRSHZWOkPL
tgoRObqME2wGtZ7P6wIDAQABo0IwQDAdBgNVHQ4EFgQUUTMc7TZArxfTJc1paPKv
TiM+s0EwDgYDVR0PAQH/BAQDAgGGMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcN
AQEMBQADggIBAGCmr1tfV9qJ20tQqcQjNSH/0GEwhJG3PxDPJY7Jv0Y02cEhJhxw
GXIeo8mH/qlDZJY6yFMECrZBu8RHANmfGBg7sg7zNOok992vIGCukihfNudd5N7H
PNtQOa27PShNlnx2xlv0wdsUpasZYgcYQF+Xkdycx6u1UQ3maVNVzDl92sURVXLF
O4uJ+DQtpBflF+aZfTCIITfNMBc9uPK8qHWgQ9w+iUuQrm0D4ByjoJYJu32jtyoQ
REtGBzRj7TG5BO6jm5qu5jF49OokYTurWGT/u4cnYiWB39yhL/btp/96j1EuMPik
AdKFOV8BmZZvWltwGUb+hmA+rYAQCd05JS9Yf7vSdPD3Rh9GOUrYU9DzLjtxpdRv
/PNn5AeP3SYZ4Y1b+qOTEZvpyDrDVWiakuFSdjjo4bq9+0/V77PnSIMx8IIh47a+
p6tv75/fTM8BuGJqIz3nCU2AG3swpMPdB380vqQmsvZB6Akd4yCYqjdP//fx4ilw
MUc/dNAUFvohigLVigmUdy7yWSiLfFCSCmZ4OIN1xLVaqBHG5cGdZlXPU8Sv13WF
qUITVuwhd4GTWgzqltlJyqEI8pc7bZsEGCREjnwB8twl2F6GmrE52/WRMmrRpnCK
ovfepEWFJqgejF0pW8hL2JpqA15w8oVPbEtoL8pU9ozaMv7Da4M/OMZ+
-----END CERTIFICATE-----

# Issuer: CN=Certainly Root R1 O=Certainly
# Subject: CN=Certainly Root R1 O=Certainly
# Label: "Certainly Root R1"
# Serial: 188833316161142517227353805653483829216
# MD5 Fingerprint: 07:70:d4:3e:82:87:a0:fa:33:36:13:f4:fa:33:e7:12
# SHA1 Fingerprint: a0:50:ee:0f:28:71:f4:27:b2:12:6d:6f:50:96:25:ba:cc:86:42:af
# SHA256 Fingerprint: 77:b8:2c:d8:64:4c:43:05:f7:ac:c5:cb:15:6b:45:67:50:04:03:3d:51:c6:0c:62:02:a8:e0:c3:34:67:d3:a0
-----BEGIN CERTIFICATE-----
MIIFRzCCAy+gAwIBAgIRAI4P+UuQcWhlM1T01EQ5t+AwDQYJKoZIhvcNAQELBQAw
PTELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCUNlcnRhaW5seTEaMBgGA1UEAxMRQ2Vy
dGFpbmx5IFJvb3QgUjEwHhcNMjEwNDAxMDAwMDAwWhcNNDYwNDAxMDAwMDAwWjA9
MQswCQYDVQQGEwJVUzESMBAGA1UEChMJQ2VydGFpbmx5MRowGAYDVQQDExFDZXJ0
YWlubHkgUm9vdCBSMTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBANA2
1B/q3avk0bbm+yLA3RMNansiExyXPGhjZjKcA7WNpIGD2ngwEc/csiu+kr+O5MQT
vqRoTNoCaBZ0vrLdBORrKt03H2As2/X3oXyVtwxwhi7xOu9S98zTm/mLvg7fMbed
aFySpvXl8wo0tf97ouSHocavFwDvA5HtqRxOcT3Si2yJ9HiG5mpJoM610rCrm/b0
1C7jcvk2xusVtyWMOvwlDbMicyF0yEqWYZL1LwsYpfSt4u5BvQF5+paMjRcCMLT5
r3gajLQ2EBAHBXDQ9DGQilHFhiZ5shGIXsXwClTNSaa/ApzSRKft43jvRl5tcdF5
cBxGX1HpyTfcX35pe0HfNEXgO4T0oYoKNp43zGJS4YkNKPl6I7ENPT2a/Z2B7yyQ
wHtETrtJ4A5KVpK8y7XdeReJkd5hiXSSqOMyhb5OhaRLWcsrxXiOcVTQAjeZjOVJ
6uBUcqQRBi8LjMFbvrWhsFNunLhgkR9Za/kt9JQKl7XsxXYDVBtlUrpMklZRNaBA
2CnbrlJ2Oy0wQJuK0EJWtLeIAaSHO1OWzaMWj/Nmqhexx2DgwUMFDO6bW2BvBlyH
Wyf5QBGenDPBt+U1VwV/J84XIIwc/PH72jEpSe31C4SnT8H2TsIonPru4K8H+zMR
eiFPCyEQtkA6qyI6BJyLm4SGcprSp6XEtHWRqSsjAgMBAAGjQjBAMA4GA1UdDwEB
/wQEAwIBBjAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQWBBTgqj8ljZ9EXME66C6u
d0yEPmcM9DANBgkqhkiG9w0BAQsFAAOCAgEAuVevuBLaV4OPaAszHQNTVfSVcOQr
PbA56/qJYv331hgELyE03fFo8NWWWt7CgKPBjcZq91l3rhVkz1t5BXdm6ozTaw3d
8VkswTOlMIAVRQdFGjEitpIAq5lNOo93r6kiyi9jyhXWx8bwPWz8HA2YEGGeEaIi
1wrykXprOQ4vMMM2SZ/g6Q8CRFA3lFV96p/2O7qUpUzpvD5RtOjKkjZUbVwlKNrd
rRT90+7iIgXr0PK3aBLXWopBGsaSpVo7Y0VPv+E6dyIvXL9G+VoDhRNCX8reU9di
taY1BMJH/5n9hN9czulegChB8n3nHpDYT3Y+gjwN/KUD+nsa2UUeYNrEjvn8K8l7
lcUq/6qJ34IxD3L/DCfXCh5WAFAeDJDBlrXYFIW7pw0WwfgHJBu6haEaBQmAupVj
yTrsJZ9/nbqkRxWbRHDxakvWOF5D8xh+UG7pWijmZeZ3Gzr9Hb4DJqPb1OG7fpYn
Kx3upPvaJVQTA945xsMfTZDsjxtK0hzthZU4UHlG1sGQUDGpXJpuHfUzVounmdLy
yCwzk5Iwx06MZTMQZBf9JBeW0Y3COmor6xOLRPIh80oat3df1+2IpHLlOR+Vnb5n
wXARPbv0+Em34yaXOp/SX3z7wJl8OSngex2/DaeP0ik0biQVy96QXr8axGbqwua6
OV+KmalBWQewLK8=
-----END CERTIFICATE-----

# Issuer: CN=Certainly Root E1 O=Certainly
# Subject: CN=Certainly Root E1 O=Certainly
# Label: "Certainly Root E1"
# Serial: 8168531406727139161245376702891150584
# MD5 Fingerprint: 0a:9e:ca:cd:3e:52:50:c6:36:f3:4b:a3:ed:a7:53:e9
# SHA1 Fingerprint: f9:e1:6d:dc:01:89:cf:d5:82:45:63:3e:c5:37:7d:c2:eb:93:6f:2b
# SHA256 Fingerprint: b4:58:5f:22:e4:ac:75:6a:4e:86:12:a1:36:1c:5d:9d:03:1a:93:fd:84:fe:bb:77:8f:a3:06:8b:0f:c4:2d:c2
-----BEGIN CERTIFICATE-----
MIIB9zCCAX2gAwIBAgIQBiUzsUcDMydc+Y2aub/M+DAKBggqhkjOPQQDAzA9MQsw
CQYDVQQGEwJVUzESMBAGA1UEChMJQ2VydGFpbmx5MRowGAYDVQQDExFDZXJ0YWlu
bHkgUm9vdCBFMTAeFw0yMTA0MDEwMDAwMDBaFw00NjA0MDEwMDAwMDBaMD0xCzAJ
BgNVBAYTAlVTMRIwEAYDVQQKEwlDZXJ0YWlubHkxGjAYBgNVBAMTEUNlcnRhaW5s
eSBSb290IEUxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAE3m/4fxzf7flHh4axpMCK
+IKXgOqPyEpeKn2IaKcBYhSRJHpcnqMXfYqGITQYUBsQ3tA3SybHGWCA6TS9YBk2
QNYphwk8kXr2vBMj3VlOBF7PyAIcGFPBMdjaIOlEjeR2o0IwQDAOBgNVHQ8BAf8E
BAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAdBgNVHQ4EFgQU8ygYy2R17ikq6+2uI1g4
hevIIgcwCgYIKoZIzj0EAwMDaAAwZQIxALGOWiDDshliTd6wT99u0nCK8Z9+aozm
ut6Dacpps6kFtZaSF4fC0urQe87YQVt8rgIwRt7qy12a7DLCZRawTDBcMPPaTnOG
BtjOiQRINzf43TNRnXCve1XYAS59BWQOhriR
-----END CERTIFICATE-----

# Issuer: CN=E-Tugra Global Root CA RSA v3 O=E-Tugra EBG A.S. OU=E-Tugra Trust Center
# Subject: CN=E-Tugra Global Root CA RSA v3 O=E-Tugra EBG A.S. OU=E-Tugra Trust Center
# Label: "E-Tugra Global Root CA RSA v3"
# Serial: 75951268308633135324246244059508261641472512052
# MD5 Fingerprint: 22:be:10:f6:c2:f8:03:88:73:5f:33:29:47:28:47:a4
# SHA1 Fingerprint: e9:a8:5d:22:14:52:1c:5b:aa:0a:b4:be:24:6a:23:8a:c9:ba:e2:a9
# SHA256 Fingerprint: ef:66:b0:b1:0a:3c:db:9f:2e:36:48:c7:6b:d2:af:18:ea:d2:bf:e6:f1:17:65:5e:28:c4:06:0d:a1:a3:f4:c2
-----BEGIN CERTIFICATE-----
MIIF8zCCA9ugAwIBAgIUDU3FzRYilZYIfrgLfxUGNPt5EDQwDQYJKoZIhvcNAQEL
BQAwgYAxCzAJBgNVBAYTAlRSMQ8wDQYDVQQHEwZBbmthcmExGTAXBgNVBAoTEEUt
VHVncmEgRUJHIEEuUy4xHTAbBgNVBAsTFEUtVHVncmEgVHJ1c3QgQ2VudGVyMSYw
JAYDVQQDEx1FLVR1Z3JhIEdsb2JhbCBSb290IENBIFJTQSB2MzAeFw0yMDAzMTgw
OTA3MTdaFw00NTAzMTIwOTA3MTdaMIGAMQswCQYDVQQGEwJUUjEPMA0GA1UEBxMG
QW5rYXJhMRkwFwYDVQQKExBFLVR1Z3JhIEVCRyBBLlMuMR0wGwYDVQQLExRFLVR1
Z3JhIFRydXN0IENlbnRlcjEmMCQGA1UEAxMdRS1UdWdyYSBHbG9iYWwgUm9vdCBD
QSBSU0EgdjMwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCiZvCJt3J7
7gnJY9LTQ91ew6aEOErxjYG7FL1H6EAX8z3DeEVypi6Q3po61CBxyryfHUuXCscx
uj7X/iWpKo429NEvx7epXTPcMHD4QGxLsqYxYdE0PD0xesevxKenhOGXpOhL9hd8
7jwH7eKKV9y2+/hDJVDqJ4GohryPUkqWOmAalrv9c/SF/YP9f4RtNGx/ardLAQO/
rWm31zLZ9Vdq6YaCPqVmMbMWPcLzJmAy01IesGykNz709a/r4d+ABs8qQedmCeFL
l+d3vSFtKbZnwy1+7dZ5ZdHPOrbRsV5WYVB6Ws5OUDGAA5hH5+QYfERaxqSzO8bG
wzrwbMOLyKSRBfP12baqBqG3q+Sx6iEUXIOk/P+2UNOMEiaZdnDpwA+mdPy70Bt4
znKS4iicvObpCdg604nmvi533wEKb5b25Y08TVJ2Glbhc34XrD2tbKNSEhhw5oBO
M/J+JjKsBY04pOZ2PJ8QaQ5tndLBeSBrW88zjdGUdjXnXVXHt6woq0bM5zshtQoK
5EpZ3IE1S0SVEgpnpaH/WwAH0sDM+T/8nzPyAPiMbIedBi3x7+PmBvrFZhNb/FAH
nnGGstpvdDDPk1Po3CLW3iAfYY2jLqN4MpBs3KwytQXk9TwzDdbgh3cXTJ2w2Amo
DVf3RIXwyAS+XF1a4xeOVGNpf0l0ZAWMowIDAQABo2MwYTAPBgNVHRMBAf8EBTAD
AQH/MB8GA1UdIwQYMBaAFLK0ruYt9ybVqnUtdkvAG1Mh0EjvMB0GA1UdDgQWBBSy
tK7mLfcm1ap1LXZLwBtTIdBI7zAOBgNVHQ8BAf8EBAMCAQYwDQYJKoZIhvcNAQEL
BQADggIBAImocn+M684uGMQQgC0QDP/7FM0E4BQ8Tpr7nym/Ip5XuYJzEmMmtcyQ
6dIqKe6cLcwsmb5FJ+Sxce3kOJUxQfJ9emN438o2Fi+CiJ+8EUdPdk3ILY7r3y18
Tjvarvbj2l0Upq7ohUSdBm6O++96SmotKygY/r+QLHUWnw/qln0F7psTpURs+APQ
3SPh/QMSEgj0GDSz4DcLdxEBSL9htLX4GdnLTeqjjO/98Aa1bZL0SmFQhO3sSdPk
vmjmLuMxC1QLGpLWgti2omU8ZgT5Vdps+9u1FGZNlIM7zR6mK7L+d0CGq+ffCsn9
9t2HVhjYsCxVYJb6CH5SkPVLpi6HfMsg2wY+oF0Dd32iPBMbKaITVaA9FCKvb7jQ
mhty3QUBjYZgv6Rn7rWlDdF/5horYmbDB7rnoEgcOMPpRfunf/ztAmgayncSd6YA
VSgU7NbHEqIbZULpkejLPoeJVF3Zr52XnGnnCv8PWniLYypMfUeUP95L6VPQMPHF
9p5J3zugkaOj/s1YzOrfr28oO6Bpm4/srK4rVJ2bBLFHIK+WEj5jlB0E5y67hscM
moi/dkfv97ALl2bSRM9gUgfh1SxKOidhd8rXj+eHDjD/DLsE4mHDosiXYY60MGo8
bcIHX0pzLz/5FooBZu+6kcpSV3uu1OYP3Qt6f4ueJiDPO++BcYNZ
-----END CERTIFICATE-----

# Issuer: CN=E-Tugra Global Root CA ECC v3 O=E-Tugra EBG A.S. OU=E-Tugra Trust Center
# Subject: CN=E-Tugra Global Root CA ECC v3 O=E-Tugra EBG A.S. OU=E-Tugra Trust Center
# Label: "E-Tugra Global Root CA ECC v3"
# Serial: 218504919822255052842371958738296604628416471745
# MD5 Fingerprint: 46:bc:81:bb:f1:b5:1e:f7:4b:96:bc:14:e2:e7:27:64
# SHA1 Fingerprint: 8a:2f:af:57:53:b1:b0:e6:a1:04:ec:5b:6a:69:71:6d:f6:1c:e2:84
# SHA256 Fingerprint: 87:3f:46:85:fa:7f:56:36:25:25:2e:6d:36:bc:d7:f1:6f:c2:49:51:f2:64:e4:7e:1b:95:4f:49:08:cd:ca:13
-----BEGIN CERTIFICATE-----
MIICpTCCAiqgAwIBAgIUJkYZdzHhT28oNt45UYbm1JeIIsEwCgYIKoZIzj0EAwMw
gYAxCzAJBgNVBAYTAlRSMQ8wDQYDVQQHEwZBbmthcmExGTAXBgNVBAoTEEUtVHVn
cmEgRUJHIEEuUy4xHTAbBgNVBAsTFEUtVHVncmEgVHJ1c3QgQ2VudGVyMSYwJAYD
VQQDEx1FLVR1Z3JhIEdsb2JhbCBSb290IENBIEVDQyB2MzAeFw0yMDAzMTgwOTQ2
NThaFw00NTAzMTIwOTQ2NThaMIGAMQswCQYDVQQGEwJUUjEPMA0GA1UEBxMGQW5r
YXJhMRkwFwYDVQQKExBFLVR1Z3JhIEVCRyBBLlMuMR0wGwYDVQQLExRFLVR1Z3Jh
IFRydXN0IENlbnRlcjEmMCQGA1UEAxMdRS1UdWdyYSBHbG9iYWwgUm9vdCBDQSBF
Q0MgdjMwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASOmCm/xxAeJ9urA8woLNheSBkQ
KczLWYHMjLiSF4mDKpL2w6QdTGLVn9agRtwcvHbB40fQWxPa56WzZkjnIZpKT4YK
fWzqTTKACrJ6CZtpS5iB4i7sAnCWH/31Rs7K3IKjYzBhMA8GA1UdEwEB/wQFMAMB
Af8wHwYDVR0jBBgwFoAU/4Ixcj75xGZsrTie0bBRiKWQzPUwHQYDVR0OBBYEFP+C
MXI++cRmbK04ntGwUYilkMz1MA4GA1UdDwEB/wQEAwIBBjAKBggqhkjOPQQDAwNp
ADBmAjEA5gVYaWHlLcoNy/EZCL3W/VGSGn5jVASQkZo1kTmZ+gepZpO6yGjUij/6
7W4WAie3AjEA3VoXK3YdZUKWpqxdinlW2Iob35reX8dQj7FbcQwm32pAAOwzkSFx
vmjkI6TZraE3
-----END CERTIFICATE-----

# Issuer: CN=Security Communication RootCA3 O=SECOM Trust Systems CO.,LTD.
# Subject: CN=Security Communication RootCA3 O=SECOM Trust Systems CO.,LTD.
# Label: "Security Communication RootCA3"
# Serial: 16247922307909811815
# MD5 Fingerprint: 1c:9a:16:ff:9e:5c:e0:4d:8a:14:01:f4:35:5d:29:26
# SHA1 Fingerprint: c3:03:c8:22:74:92:e5:61:a2:9c:5f:79:91:2b:1e:44:13:91:30:3a
# SHA256 Fingerprint: 24:a5:5c:2a:b0:51:44:2d:06:17:76:65:41:23:9a:4a:d0:32:d7:c5:51:75:aa:34:ff:de:2f:bc:4f:5c:52:94
-----BEGIN CERTIFICATE-----
MIIFfzCCA2egAwIBAgIJAOF8N0D9G/5nMA0GCSqGSIb3DQEBDAUAMF0xCzAJBgNV
BAYTAkpQMSUwIwYDVQQKExxTRUNPTSBUcnVzdCBTeXN0ZW1zIENPLixMVEQuMScw
JQYDVQQDEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTMwHhcNMTYwNjE2
MDYxNzE2WhcNMzgwMTE4MDYxNzE2WjBdMQswCQYDVQQGEwJKUDElMCMGA1UEChMc
U0VDT00gVHJ1c3QgU3lzdGVtcyBDTy4sTFRELjEnMCUGA1UEAxMeU2VjdXJpdHkg
Q29tbXVuaWNhdGlvbiBSb290Q0EzMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIIC
CgKCAgEA48lySfcw3gl8qUCBWNO0Ot26YQ+TUG5pPDXC7ltzkBtnTCHsXzW7OT4r
CmDvu20rhvtxosis5FaU+cmvsXLUIKx00rgVrVH+hXShuRD+BYD5UpOzQD11EKzA
lrenfna84xtSGc4RHwsENPXY9Wk8d/Nk9A2qhd7gCVAEF5aEt8iKvE1y/By7z/MG
TfmfZPd+pmaGNXHIEYBMwXFAWB6+oHP2/D5Q4eAvJj1+XCO1eXDe+uDRpdYMQXF7
9+qMHIjH7Iv10S9VlkZ8WjtYO/u62C21Jdp6Ts9EriGmnpjKIG58u4iFW/vAEGK7
8vknR+/RiTlDxN/e4UG/VHMgly1s2vPUB6PmudhvrvyMGS7TZ2crldtYXLVqAvO4
g160a75BflcJdURQVc1aEWEhCmHCqYj9E7wtiS/NYeCVvsq1e+F7NGcLH7YMx3we
GVPKp7FKFSBWFHA9K4IsD50VHUeAR/94mQ4xr28+j+2GaR57GIgUssL8gjMunEst
+3A7caoreyYn8xrC3PsXuKHqy6C0rtOUfnrQq8PsOC0RLoi/1D+tEjtCrI8Cbn3M
0V9hvqG8OmpI6iZVIhZdXw3/JzOfGAN0iltSIEdrRU0id4xVJ/CvHozJgyJUt5rQ
T9nO/NkuHJYosQLTA70lUhw0Zk8jq/R3gpYd0VcwCBEF/VfR2ccCAwEAAaNCMEAw
HQYDVR0OBBYEFGQUfPxYchamCik0FW8qy7z8r6irMA4GA1UdDwEB/wQEAwIBBjAP
BgNVHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBDAUAA4ICAQDcAiMI4u8hOscNtybS
YpOnpSNyByCCYN8Y11StaSWSntkUz5m5UoHPrmyKO1o5yGwBQ8IibQLwYs1OY0PA
FNr0Y/Dq9HHuTofjcan0yVflLl8cebsjqodEV+m9NU1Bu0soo5iyG9kLFwfl9+qd
9XbXv8S2gVj/yP9kaWJ5rW4OH3/uHWnlt3Jxs/6lATWUVCvAUm2PVcTJ0rjLyjQI
UYWg9by0F1jqClx6vWPGOi//lkkZhOpn2ASxYfQAW0q3nHE3GYV5v4GwxxMOdnE+
OoAGrgYWp421wsTL/0ClXI2lyTrtcoHKXJg80jQDdwj98ClZXSEIx2C/pHF7uNke
gr4Jr2VvKKu/S7XuPghHJ6APbw+LP6yVGPO5DtxnVW5inkYO0QR4ynKudtml+LLf
iAlhi+8kTtFZP1rUPcmTPCtk9YENFpb3ksP+MW/oKjJ0DvRMmEoYDjBU1cXrvMUV
nuiZIesnKwkK2/HmcBhWuwzkvvnoEKQTkrgc4NtnHVMDpCKn3F2SEDzq//wbEBrD
2NCcnWXL0CsnMQMeNuE9dnUM/0Umud1RvCPHX9jYhxBAEg09ODfnRDwYwFMJZI//
1ZqmfHAuc1Uh6N//g7kdPjIe1qZ9LPFm6Vwdp6POXiUyK+OVrCoHzrQoeIY8Laad
TdJ0MN1kURXbg4NR16/9M51NZg==
-----END CERTIFICATE-----

# Issuer: CN=Security Communication ECC RootCA1 O=SECOM Trust Systems CO.,LTD.
# Subject: CN=Security Communication ECC RootCA1 O=SECOM Trust Systems CO.,LTD.
# Label: "Security Communication ECC RootCA1"
# Serial: 15446673492073852651
# MD5 Fingerprint: 7e:43:b0:92:68:ec:05:43:4c:98:ab:5d:35:2e:7e:86
# SHA1 Fingerprint: b8:0e:26:a9:bf:d2:b2:3b:c0:ef:46:c9:ba:c7:bb:f6:1d:0d:41:41
# SHA256 Fingerprint: e7:4f:bd:a5:5b:d5:64:c4:73:a3:6b:44:1a:a7:99:c8:a6:8e:07:74:40:e8:28:8b:9f:a1:e5:0e:4b:ba:ca:11
-----BEGIN CERTIFICATE-----
MIICODCCAb6gAwIBAgIJANZdm7N4gS7rMAoGCCqGSM49BAMDMGExCzAJBgNVBAYT
AkpQMSUwIwYDVQQKExxTRUNPTSBUcnVzdCBTeXN0ZW1zIENPLixMVEQuMSswKQYD
VQQDEyJTZWN1cml0eSBDb21tdW5pY2F0aW9uIEVDQyBSb290Q0ExMB4XDTE2MDYx
NjA1MTUyOFoXDTM4MDExODA1MTUyOFowYTELMAkGA1UEBhMCSlAxJTAjBgNVBAoT
HFNFQ09NIFRydXN0IFN5c3RlbXMgQ08uLExURC4xKzApBgNVBAMTIlNlY3VyaXR5
IENvbW11bmljYXRpb24gRUNDIFJvb3RDQTEwdjAQBgcqhkjOPQIBBgUrgQQAIgNi
AASkpW9gAwPDvTH00xecK4R1rOX9PVdu12O/5gSJko6BnOPpR27KkBLIE+Cnnfdl
dB9sELLo5OnvbYUymUSxXv3MdhDYW72ixvnWQuRXdtyQwjWpS4g8EkdtXP9JTxpK
ULGjQjBAMB0GA1UdDgQWBBSGHOf+LaVKiwj+KBH6vqNm+GBZLzAOBgNVHQ8BAf8E
BAMCAQYwDwYDVR0TAQH/BAUwAwEB/zAKBggqhkjOPQQDAwNoADBlAjAVXUI9/Lbu
9zuxNuie9sRGKEkz0FhDKmMpzE2xtHqiuQ04pV1IKv3LsnNdo4gIxwwCMQDAqy0O
be0YottT6SXbVQjgUMzfRGEWgqtJsLKB7HOHeLRMsmIbEvoWTSVLY70eN9k=
-----END CERTIFICATE-----
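
The fingerprint comments above can be checked against the PEM payloads with nothing but the Python standard library; a minimal sketch (the helper name is ours, not part of the bundle):

import hashlib
import ssl

def sha256_fingerprint(pem_cert: str) -> str:
    # Convert the PEM block to DER, hash it, and format the digest
    # colon-separated to match the "# SHA256 Fingerprint:" comments.
    der = ssl.PEM_cert_to_DER_cert(pem_cert)
    digest = hashlib.sha256(der).hexdigest()
    return ":".join(digest[i:i + 2] for i in range(0, len(digest), 2))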
@ -1,20 +1,20 @@
-# -*- coding: utf-8 -*-
-
 """
 certifi.py
 ~~~~~~~~~~
 
 This module returns the installation location of cacert.pem or its contents.
 """
-import os
+import sys
 
-try:
-    from importlib.resources import path as get_path, read_text
+
+if sys.version_info >= (3, 11):
+
+    from importlib.resources import as_file, files
 
     _CACERT_CTX = None
     _CACERT_PATH = None
 
-    def where():
+    def where() -> str:
         # This is slightly terrible, but we want to delay extracting the file
         # in cases where we're inside of a zipimport situation until someone
         # actually calls where(), but we don't want to re-extract the file
@ -33,28 +33,76 @@ try:
             # We also have to hold onto the actual context manager, because
             # it will do the cleanup whenever it gets garbage collected, so
             # we will also store that at the global level as well.
+            _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
+            _CACERT_PATH = str(_CACERT_CTX.__enter__())
+
+        return _CACERT_PATH
+
+    def contents() -> str:
+        return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
+
+elif sys.version_info >= (3, 7):
+
+    from importlib.resources import path as get_path, read_text
+
+    _CACERT_CTX = None
+    _CACERT_PATH = None
+
+    def where() -> str:
+        # This is slightly terrible, but we want to delay extracting the
+        # file in cases where we're inside of a zipimport situation until
+        # someone actually calls where(), but we don't want to re-extract
+        # the file on every call of where(), so we'll do it once then store
+        # it in a global variable.
+        global _CACERT_CTX
+        global _CACERT_PATH
+        if _CACERT_PATH is None:
+            # This is slightly janky, the importlib.resources API wants you
+            # to manage the cleanup of this file, so it doesn't actually
+            # return a path, it returns a context manager that will give
+            # you the path when you enter it and will do any cleanup when
+            # you leave it. In the common case of not needing a temporary
+            # file, it will just return the file system location and the
+            # __exit__() is a no-op.
+            #
+            # We also have to hold onto the actual context manager, because
+            # it will do the cleanup whenever it gets garbage collected, so
+            # we will also store that at the global level as well.
             _CACERT_CTX = get_path("certifi", "cacert.pem")
             _CACERT_PATH = str(_CACERT_CTX.__enter__())
 
         return _CACERT_PATH
 
+    def contents() -> str:
+        return read_text("certifi", "cacert.pem", encoding="ascii")
+
+else:
+    import os
+    import types
+    from typing import Union
+
+    Package = Union[types.ModuleType, str]
+    Resource = Union[str, "os.PathLike"]
+
-except ImportError:
     # This fallback will work for Python versions prior to 3.7 that lack the
     # importlib.resources module but relies on the existing `where` function
     # so won't address issues with environments like PyOxidizer that don't set
     # __file__ on modules.
-    def read_text(_module, _path, encoding="ascii"):
-        with open(where(), "r", encoding=encoding) as data:
+    def read_text(
+        package: Package,
+        resource: Resource,
+        encoding: str = 'utf-8',
+        errors: str = 'strict'
+    ) -> str:
+        with open(where(), encoding=encoding) as data:
             return data.read()
 
     # If we don't have importlib.resources, then we will just do the old logic
     # of assuming we're on the filesystem and munge the path directly.
-    def where():
+    def where() -> str:
         f = os.path.dirname(__file__)
 
         return os.path.join(f, "cacert.pem")
 
-def contents():
-    return read_text("certifi", "cacert.pem", encoding="ascii")
+    def contents() -> str:
+        return read_text("certifi", "cacert.pem", encoding="ascii")
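
Whichever branch is taken, the rewritten module keeps the same two-call public API, so downstream use is unchanged; a short usage sketch (assuming the vendored copy is importable as certifi):

import ssl

import certifi

# where() hands back a filesystem path to the extracted bundle;
# contents() returns the bundle text itself.
ctx = ssl.create_default_context(cafile=certifi.where())
print(certifi.contents().count("BEGIN CERTIFICATE"))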
0  libs/certifi/py.typed  Normal file
@ -15,13 +15,11 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
-
-from .universaldetector import UniversalDetector
 from .enums import InputState
-from .version import __version__, VERSION
+from .universaldetector import UniversalDetector
+from .version import VERSION, __version__
 
-
-__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
+__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
 
 
 def detect(byte_str):
@ -33,51 +31,63 @@ def detect(byte_str):
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
-            raise TypeError('Expected object of type bytes or bytearray, got: '
-                            '{}'.format(type(byte_str)))
-        else:
-            byte_str = bytearray(byte_str)
+            raise TypeError(
+                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
+            )
+        byte_str = bytearray(byte_str)
     detector = UniversalDetector()
     detector.feed(byte_str)
     return detector.close()
 
 
-def detect_all(byte_str):
+def detect_all(byte_str, ignore_threshold=False):
     """
     Detect all the possible encodings of the given byte string.
 
     :param byte_str: The byte sequence to examine.
     :type byte_str: ``bytes`` or ``bytearray``
+    :param ignore_threshold: Include encodings that are below
+                             ``UniversalDetector.MINIMUM_THRESHOLD``
+                             in results.
+    :type ignore_threshold: ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
-            raise TypeError('Expected object of type bytes or bytearray, got: '
-                            '{}'.format(type(byte_str)))
-        else:
-            byte_str = bytearray(byte_str)
+            raise TypeError(
+                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
+            )
+        byte_str = bytearray(byte_str)
 
     detector = UniversalDetector()
     detector.feed(byte_str)
     detector.close()
 
-    if detector._input_state == InputState.HIGH_BYTE:
+    if detector.input_state == InputState.HIGH_BYTE:
         results = []
-        for prober in detector._charset_probers:
-            if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
-                charset_name = prober.charset_name
-                lower_charset_name = prober.charset_name.lower()
+        probers = []
+        for prober in detector.charset_probers:
+            if hasattr(prober, "probers"):
+                probers.extend(p for p in prober.probers)
+            else:
+                probers.append(prober)
+        for prober in probers:
+            if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
+                charset_name = prober.charset_name or ""
+                lower_charset_name = charset_name.lower()
                 # Use Windows encoding name instead of ISO-8859 if we saw any
                 # extra Windows-specific bytes
-                if lower_charset_name.startswith('iso-8859'):
-                    if detector._has_win_bytes:
-                        charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
                                                                charset_name)
+                if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
+                    charset_name = detector.ISO_WIN_MAP.get(
+                        lower_charset_name, charset_name
+                    )
-                results.append({
-                    'encoding': charset_name,
-                    'confidence': prober.get_confidence(),
-                    'language': prober.language,
-                })
+                results.append(
+                    {
+                        "encoding": charset_name,
+                        "confidence": prober.get_confidence(),
+                        "language": prober.language,
+                    }
+                )
         if len(results) > 0:
-            return sorted(results, key=lambda result: -result['confidence'])
+            return sorted(results, key=lambda result: -result["confidence"])
 
     return [detector.result]
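
A brief usage sketch of the keyword argument that detect_all() gains above (output values are illustrative):

import chardet

raw = "Привет, мир".encode("koi8-r")
# ignore_threshold=True surfaces candidates below MINIMUM_THRESHOLD that
# the previous detect_all() silently dropped.
for guess in chardet.detect_all(raw, ignore_threshold=True):
    print(guess["encoding"], guess["confidence"], guess["language"])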
@ -44,7 +44,7 @@ BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
 
 # Char to FreqOrder table
 BIG5_TABLE_SIZE = 5376
+# fmt: off
 BIG5_CHAR_TO_FREQ_ORDER = (
     1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
 3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
@ -383,4 +383,4 @@ BIG5_CHAR_TO_FREQ_ORDER = (
 890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
 2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
 )
+# fmt: on
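
The # fmt: off / # fmt: on pair wrapped around the table above are Black's format-control pragmas (the assumption being that this vendored chardet is formatted with Black); a generic illustration with made-up names:

# fmt: off
IDENTITY = (
    1, 0,
    0, 1,
)  # Black leaves everything between the pragmas exactly as written
# fmt: on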
@ -25,15 +25,15 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import Big5DistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import BIG5_SM_MODEL
 
 
 class Big5Prober(MultiByteCharSetProber):
     def __init__(self):
-        super(Big5Prober, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
         self.distribution_analyzer = Big5DistributionAnalysis()
         self.reset()
@ -25,19 +25,35 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
-from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
-                        EUCTW_TYPICAL_DISTRIBUTION_RATIO)
-from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
-                        EUCKR_TYPICAL_DISTRIBUTION_RATIO)
-from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
-                         GB2312_TYPICAL_DISTRIBUTION_RATIO)
-from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
-                       BIG5_TYPICAL_DISTRIBUTION_RATIO)
-from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
-                      JIS_TYPICAL_DISTRIBUTION_RATIO)
+from .big5freq import (
+    BIG5_CHAR_TO_FREQ_ORDER,
+    BIG5_TABLE_SIZE,
+    BIG5_TYPICAL_DISTRIBUTION_RATIO,
+)
+from .euckrfreq import (
+    EUCKR_CHAR_TO_FREQ_ORDER,
+    EUCKR_TABLE_SIZE,
+    EUCKR_TYPICAL_DISTRIBUTION_RATIO,
+)
+from .euctwfreq import (
+    EUCTW_CHAR_TO_FREQ_ORDER,
+    EUCTW_TABLE_SIZE,
+    EUCTW_TYPICAL_DISTRIBUTION_RATIO,
+)
+from .gb2312freq import (
+    GB2312_CHAR_TO_FREQ_ORDER,
+    GB2312_TABLE_SIZE,
+    GB2312_TYPICAL_DISTRIBUTION_RATIO,
+)
+from .jisfreq import (
+    JIS_CHAR_TO_FREQ_ORDER,
+    JIS_TABLE_SIZE,
+    JIS_TYPICAL_DISTRIBUTION_RATIO,
+)
+from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
 
 
-class CharDistributionAnalysis(object):
+class CharDistributionAnalysis:
     ENOUGH_DATA_THRESHOLD = 1024
     SURE_YES = 0.99
     SURE_NO = 0.01
@ -46,7 +62,7 @@ class CharDistributionAnalysis(object):
     def __init__(self):
         # Mapping table to get frequency order from char order (get from
         # GetOrder())
-        self._char_to_freq_order = None
+        self._char_to_freq_order = tuple()
         self._table_size = None  # Size of above table
         # This is a constant value which varies from language to language,
         # used in calculating confidence. See
@ -89,8 +105,9 @@ class CharDistributionAnalysis(object):
             return self.SURE_NO
 
         if self._total_chars != self._freq_chars:
-            r = (self._freq_chars / ((self._total_chars - self._freq_chars)
-                 * self.typical_distribution_ratio))
+            r = self._freq_chars / (
+                (self._total_chars - self._freq_chars) * self.typical_distribution_ratio
+            )
             if r < self.SURE_YES:
                 return r
 
@ -102,7 +119,7 @@ class CharDistributionAnalysis(object):
         # For charset detection, certain amount of data is enough
         return self._total_chars > self.ENOUGH_DATA_THRESHOLD
 
-    def get_order(self, byte_str):
+    def get_order(self, _):
         # We do not handle characters based on the original encoding string,
         # but convert this encoding string to a number, here called order.
         # This allows multiple encodings of a language to share one frequency
@ -112,7 +129,7 @@ class CharDistributionAnalysis(object):
 
 class EUCTWDistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(EUCTWDistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
         self._table_size = EUCTW_TABLE_SIZE
         self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
@ -125,13 +142,12 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
         first_char = byte_str[0]
         if first_char >= 0xC4:
             return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
-        else:
-            return -1
+        return -1
 
 
 class EUCKRDistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(EUCKRDistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
@ -144,13 +160,27 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
         first_char = byte_str[0]
         if first_char >= 0xB0:
             return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
-        else:
-            return -1
+        return -1
+
+
+class JOHABDistributionAnalysis(CharDistributionAnalysis):
+    def __init__(self):
+        super().__init__()
+        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
+        self._table_size = EUCKR_TABLE_SIZE
+        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
+
+    def get_order(self, byte_str):
+        first_char = byte_str[0]
+        if 0x88 <= first_char < 0xD4:
+            code = first_char * 256 + byte_str[1]
+            return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
+        return -1
 
 
 class GB2312DistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(GB2312DistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
         self._table_size = GB2312_TABLE_SIZE
         self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
@ -163,13 +193,12 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
         first_char, second_char = byte_str[0], byte_str[1]
         if (first_char >= 0xB0) and (second_char >= 0xA1):
             return 94 * (first_char - 0xB0) + second_char - 0xA1
-        else:
-            return -1
+        return -1
 
 
 class Big5DistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(Big5DistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
         self._table_size = BIG5_TABLE_SIZE
         self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
@ -183,15 +212,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
         if first_char >= 0xA4:
             if second_char >= 0xA1:
                 return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
-            else:
-                return 157 * (first_char - 0xA4) + second_char - 0x40
-        else:
-            return -1
+            return 157 * (first_char - 0xA4) + second_char - 0x40
+        return -1
 
 
 class SJISDistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(SJISDistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
@ -202,9 +229,9 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
         # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
         # no validation needed here. State machine has done that
         first_char, second_char = byte_str[0], byte_str[1]
-        if (first_char >= 0x81) and (first_char <= 0x9F):
+        if 0x81 <= first_char <= 0x9F:
            order = 188 * (first_char - 0x81)
-        elif (first_char >= 0xE0) and (first_char <= 0xEF):
+        elif 0xE0 <= first_char <= 0xEF:
            order = 188 * (first_char - 0xE0 + 31)
         else:
            return -1
@ -216,7 +243,7 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
 
 class EUCJPDistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
-        super(EUCJPDistributionAnalysis, self).__init__()
+        super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
@ -228,6 +255,5 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
         # no validation needed here. State machine has done that
         char = byte_str[0]
         if char >= 0xA0:
-            return 94 * (char - 0xA1) + byte_str[1] - 0xa1
-        else:
-            return -1
+            return 94 * (char - 0xA1) + byte_str[1] - 0xA1
+        return -1
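
The JOHAB analyzer added above flattens a two-byte sequence into a single 16-bit code before the table lookup; a stand-alone illustration (the table entry here is made up, real values come from johabfreq):

johab_table = {0x88A1: 42}  # stand-in for JOHAB_TO_EUCKR_ORDER_TABLE

def johab_order(first: int, second: int) -> int:
    # The lead byte must fall in the JOHAB range [0x88, 0xD4).
    if 0x88 <= first < 0xD4:
        return johab_table.get(first * 256 + second, -1)
    return -1

print(johab_order(0x88, 0xA1))  # 42
print(johab_order(0x20, 0x41))  # -1: ASCII lead byte, not JOHAB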
@ -25,19 +25,19 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################
 
-from .enums import ProbingState
 from .charsetprober import CharSetProber
+from .enums import ProbingState
 
 
 class CharSetGroupProber(CharSetProber):
     def __init__(self, lang_filter=None):
-        super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
+        super().__init__(lang_filter=lang_filter)
         self._active_num = 0
         self.probers = []
         self._best_guess_prober = None
 
     def reset(self):
-        super(CharSetGroupProber, self).reset()
+        super().reset()
         self._active_num = 0
         for prober in self.probers:
             if prober:
@ -75,7 +75,7 @@ class CharSetGroupProber(CharSetProber):
                 self._best_guess_prober = prober
                 self._state = ProbingState.FOUND_IT
                 return self.state
-            elif state == ProbingState.NOT_ME:
+            if state == ProbingState.NOT_ME:
                 prober.active = False
                 self._active_num -= 1
                 if self._active_num <= 0:
@ -87,7 +87,7 @@ class CharSetGroupProber(CharSetProber):
         state = self.state
         if state == ProbingState.FOUND_IT:
             return 0.99
-        elif state == ProbingState.NOT_ME:
+        if state == ProbingState.NOT_ME:
             return 0.01
         best_conf = 0.0
         self._best_guess_prober = None
@ -95,10 +95,12 @@ class CharSetGroupProber(CharSetProber):
             if not prober:
                 continue
             if not prober.active:
-                self.logger.debug('%s not active', prober.charset_name)
+                self.logger.debug("%s not active", prober.charset_name)
                 continue
             conf = prober.get_confidence()
-            self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
+            self.logger.debug(
+                "%s %s confidence = %s", prober.charset_name, prober.language, conf
+            )
             if best_conf < conf:
                 best_conf = conf
                 self._best_guess_prober = prober
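
Reduced to its core, the confidence logic above keeps the highest-scoring active child prober as the best guess; a hypothetical stand-alone version of the selection loop:

def best_guess(probers):
    # Mirrors the loop in CharSetGroupProber.get_confidence().
    best_conf, best = 0.0, None
    for prober in probers:
        if prober is None or not prober.active:
            continue
        conf = prober.get_confidence()
        if conf > best_conf:
            best_conf, best = conf, prober
    return best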
@@ -31,8 +31,12 @@ import re

 from .enums import ProbingState

+INTERNATIONAL_WORDS_PATTERN = re.compile(
+    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
+)
+

-class CharSetProber(object):
+class CharSetProber:

     SHORTCUT_THRESHOLD = 0.95

@@ -48,8 +52,8 @@ class CharSetProber(object):
     def charset_name(self):
         return None

-    def feed(self, buf):
-        pass
+    def feed(self, byte_str):
+        raise NotImplementedError

     @property
     def state(self):
@@ -60,7 +64,7 @@ class CharSetProber(object):

     @staticmethod
     def filter_high_byte_only(buf):
-        buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
+        buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
         return buf

     @staticmethod
@@ -70,12 +74,10 @@ class CharSetProber(object):
         alphabet: english alphabets [a-zA-Z]
         international: international characters [\x80-\xFF]
         marker: everything else [^a-zA-Z\x80-\xFF]

         The input buffer can be thought to contain a series of words delimited
         by markers. This function works to filter all words that contain at
         least one international character. All contiguous sequences of markers
         are replaced by a single space ascii character.

         This filter applies to all scripts which do not use English characters.
         """
         filtered = bytearray()
@@ -83,8 +85,7 @@ class CharSetProber(object):
         # This regex expression filters out only words that have at-least one
         # international character. The word may include one marker character at
         # the end.
-        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
-                           buf)
+        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

         for word in words:
             filtered.extend(word[:-1])
@@ -94,20 +95,17 @@ class CharSetProber(object):
             # similarly across all languages and may thus have similar
             # frequencies).
             last_char = word[-1:]
-            if not last_char.isalpha() and last_char < b'\x80':
-                last_char = b' '
+            if not last_char.isalpha() and last_char < b"\x80":
+                last_char = b" "
             filtered.extend(last_char)

         return filtered

     @staticmethod
-    def filter_with_english_letters(buf):
+    def remove_xml_tags(buf):
         """
         Returns a copy of ``buf`` that retains only the sequences of English
         alphabet and high byte characters that are not between <> characters.
-        Also retains English alphabet and high byte characters immediately
-        before occurrences of >.

         This filter can be applied to all scripts which contain both English
         characters and extended ASCII characters, but is currently only used by
         ``Latin1Prober``.
@@ -115,26 +113,21 @@ class CharSetProber(object):
         filtered = bytearray()
         in_tag = False
         prev = 0
+        buf = memoryview(buf).cast("c")

-        for curr in range(len(buf)):
-            # Slice here to get bytes instead of an int with Python 3
-            buf_char = buf[curr:curr + 1]
-            # Check if we're coming out of or entering an HTML tag
-            if buf_char == b'>':
+        for curr, buf_char in enumerate(buf):
+            # Check if we're coming out of or entering an XML tag
+            if buf_char == b">":
+                prev = curr + 1
                 in_tag = False
-            elif buf_char == b'<':
-                in_tag = True
-
-            # If current character is not extended-ASCII and not alphabetic...
-            if buf_char < b'\x80' and not buf_char.isalpha():
-                # ...and we're not in a tag
+            elif buf_char == b"<":
                 if curr > prev and not in_tag:
                     # Keep everything after last non-extended-ASCII,
                     # non-alphabetic character
                     filtered.extend(buf[prev:curr])
                     # Output a space to delimit stretch we kept
-                    filtered.extend(b' ')
-                prev = curr + 1
+                    filtered.extend(b" ")
+                in_tag = True

         # If we're not in a tag...
         if not in_tag:

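Both filters touched above are static methods, so they can be exercised without building a prober. A small sketch of their observable behaviour (the printed bytes are what the current implementation is expected to return, not authoritative):

    from chardet.charsetprober import CharSetProber

    # Only words containing at least one \x80-\xFF byte survive the filter.
    print(CharSetProber.filter_international_words(b"plain caf\xe9 ascii"))

    # Anything between < and > is treated as a tag and dropped.
    print(CharSetProber.remove_xml_tags(b"caf\xe9 <b>bold</b> th\xe9"))
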
@@ -1 +0,0 @@
-

@@ -12,17 +12,15 @@ If no paths are provided, it takes its input from stdin.

 """

-from __future__ import absolute_import, print_function, unicode_literals
-
 import argparse
 import sys

-from chardet import __version__
-from chardet.compat import PY2
-from chardet.universaldetector import UniversalDetector
+from .. import __version__
+from ..universaldetector import UniversalDetector


-def description_of(lines, name='stdin'):
+def description_of(lines, name="stdin"):
     """
     Return a string describing the probable encoding of a file or
     list of strings.
@@ -41,13 +39,9 @@ def description_of(lines, name='stdin'):
             break
     u.close()
     result = u.result
-    if PY2:
-        name = name.decode(sys.getfilesystemencoding(), 'ignore')
-    if result['encoding']:
-        return '{}: {} with confidence {}'.format(name, result['encoding'],
-                                                  result['confidence'])
-    else:
-        return '{}: no result'.format(name)
+    if result["encoding"]:
+        return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
+    return f"{name}: no result"


 def main(argv=None):
@@ -61,24 +55,32 @@ def main(argv=None):
     # Get command line arguments
     parser = argparse.ArgumentParser(
         description="Takes one or more file paths and reports their detected \
-                     encodings")
-    parser.add_argument('input',
-                        help='File whose encoding we would like to determine. \
-                              (default: stdin)',
-                        type=argparse.FileType('rb'), nargs='*',
-                        default=[sys.stdin if PY2 else sys.stdin.buffer])
-    parser.add_argument('--version', action='version',
-                        version='%(prog)s {}'.format(__version__))
+                     encodings"
+    )
+    parser.add_argument(
+        "input",
+        help="File whose encoding we would like to determine. \
+              (default: stdin)",
+        type=argparse.FileType("rb"),
+        nargs="*",
+        default=[sys.stdin.buffer],
+    )
+    parser.add_argument(
+        "--version", action="version", version=f"%(prog)s {__version__}"
+    )
     args = parser.parse_args(argv)

     for f in args.input:
         if f.isatty():
-            print("You are running chardetect interactively. Press " +
-                  "CTRL-D twice at the start of a blank line to signal the " +
-                  "end of your input. If you want help, run chardetect " +
-                  "--help\n", file=sys.stderr)
+            print(
+                "You are running chardetect interactively. Press "
+                "CTRL-D twice at the start of a blank line to signal the "
+                "end of your input. If you want help, run chardetect "
+                "--help\n",
+                file=sys.stderr,
+            )
         print(description_of(f, f.name))


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()

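The CLI is now Python 3-only: the PY2 branches are gone and description_of() formats with f-strings. It remains usable as a library function; a sketch (the file name is hypothetical):

    from chardet.cli.chardetect import description_of

    with open("subtitle.srt", "rb") as f:  # hypothetical input file
        # e.g. "subtitle.srt: utf-8 with confidence 0.99"
        print(description_of(f, f.name))
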
@@ -30,7 +30,7 @@ import logging
 from .enums import MachineState


-class CodingStateMachine(object):
+class CodingStateMachine:
     """
     A state machine to verify a byte sequence for a particular encoding. For
     each byte the detector receives, it will feed that byte to every active
@@ -52,6 +52,7 @@ class CodingStateMachine(object):
          negative answer for this encoding. Detector will exclude this
          encoding from consideration from here on.
     """
+
     def __init__(self, sm):
         self._model = sm
         self._curr_byte_pos = 0
@@ -66,14 +67,13 @@ class CodingStateMachine(object):
     def next_state(self, c):
         # for each byte we get its class
         # if it is first byte, we also get byte length
-        byte_class = self._model['class_table'][c]
+        byte_class = self._model["class_table"][c]
         if self._curr_state == MachineState.START:
             self._curr_byte_pos = 0
-            self._curr_char_len = self._model['char_len_table'][byte_class]
+            self._curr_char_len = self._model["char_len_table"][byte_class]
         # from byte's class and state_table, we get its next state
-        curr_state = (self._curr_state * self._model['class_factor']
-                      + byte_class)
-        self._curr_state = self._model['state_table'][curr_state]
+        curr_state = self._curr_state * self._model["class_factor"] + byte_class
+        self._curr_state = self._model["state_table"][curr_state]
         self._curr_byte_pos += 1
         return self._curr_state

@@ -81,8 +81,8 @@ class CodingStateMachine(object):
         return self._curr_char_len

     def get_coding_state_machine(self):
-        return self._model['name']
+        return self._model["name"]

     @property
     def language(self):
-        return self._model['language']
+        return self._model["language"]

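The docstring above describes the three answers a state machine can give per byte: START (a legal character just completed), ITS_ME (the sequence is unambiguous), and ERROR (illegal for this encoding). A minimal sketch feeding one multi-byte character through the UTF-8 model (assumes the vendored chardet; expected to print True once the two-byte sequence closes cleanly):

    from chardet.codingstatemachine import CodingStateMachine
    from chardet.enums import MachineState
    from chardet.mbcssm import UTF8_SM_MODEL

    sm = CodingStateMachine(UTF8_SM_MODEL)
    for byte in "é".encode("utf-8"):  # b"\xc3\xa9", a two-byte sequence
        state = sm.next_state(byte)
    print(state == MachineState.START)
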
@@ -33,7 +33,7 @@ from .mbcssm import CP949_SM_MODEL

 class CP949Prober(MultiByteCharSetProber):
     def __init__(self):
-        super(CP949Prober, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
         # NOTE: CP949 is a superset of EUC-KR, so the distribution should be
         # not different.

@@ -5,20 +5,22 @@ All of the Enums that are used throughout the chardet package.
 """


-class InputState(object):
+class InputState:
     """
     This enum represents the different states a universal detector can be in.
     """
+
     PURE_ASCII = 0
     ESC_ASCII = 1
     HIGH_BYTE = 2


-class LanguageFilter(object):
+class LanguageFilter:
     """
     This enum represents the different language filters we can apply to a
     ``UniversalDetector``.
     """
+
     CHINESE_SIMPLIFIED = 0x01
     CHINESE_TRADITIONAL = 0x02
     JAPANESE = 0x04
@@ -29,28 +31,31 @@ class LanguageFilter(object):
     CJK = CHINESE | JAPANESE | KOREAN


-class ProbingState(object):
+class ProbingState:
     """
     This enum represents the different states a prober can be in.
     """
+
     DETECTING = 0
     FOUND_IT = 1
     NOT_ME = 2


-class MachineState(object):
+class MachineState:
     """
     This enum represents the different states a state machine can be in.
     """
+
     START = 0
     ERROR = 1
     ITS_ME = 2


-class SequenceLikelihood(object):
+class SequenceLikelihood:
     """
     This enum represents the likelihood of a character following the previous one.
     """
+
     NEGATIVE = 0
     UNLIKELY = 1
     LIKELY = 2
@@ -62,13 +67,14 @@ class SequenceLikelihood(object):
         return 4


-class CharacterCategory(object):
+class CharacterCategory:
     """
     This enum represents the different categories language models for
     ``SingleByteCharsetProber`` put characters into.

     Anything less than CONTROL is considered a letter.
     """
+
     UNDEFINED = 255
     LINE_BREAK = 254
     SYMBOL = 253

@@ -27,9 +27,13 @@

 from .charsetprober import CharSetProber
 from .codingstatemachine import CodingStateMachine
-from .enums import LanguageFilter, ProbingState, MachineState
-from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
-                    ISO2022KR_SM_MODEL)
+from .enums import LanguageFilter, MachineState, ProbingState
+from .escsm import (
+    HZ_SM_MODEL,
+    ISO2022CN_SM_MODEL,
+    ISO2022JP_SM_MODEL,
+    ISO2022KR_SM_MODEL,
+)


 class EscCharSetProber(CharSetProber):
@@ -40,7 +44,7 @@ class EscCharSetProber(CharSetProber):
     """

     def __init__(self, lang_filter=None):
-        super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
+        super().__init__(lang_filter=lang_filter)
         self.coding_sm = []
         if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
             self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
@@ -56,7 +60,7 @@ class EscCharSetProber(CharSetProber):
         self.reset()

     def reset(self):
-        super(EscCharSetProber, self).reset()
+        super().reset()
         for coding_sm in self.coding_sm:
             if not coding_sm:
                 continue
@@ -75,10 +79,7 @@ class EscCharSetProber(CharSetProber):
         return self._detected_language

     def get_confidence(self):
-        if self._detected_charset:
-            return 0.99
-        else:
-            return 0.00
+        return 0.99 if self._detected_charset else 0.00

     def feed(self, byte_str):
         for c in byte_str:

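EscCharSetProber only fires on escape-sequence encodings, so an ISO-2022-JP sample makes a handy smoke test for the simplified get_confidence(). A sketch (assumes the prober reaches ITS_ME on the ESC $ B sequence, as the state tables below intend):

    from chardet.enums import LanguageFilter, ProbingState
    from chardet.escprober import EscCharSetProber

    prober = EscCharSetProber(LanguageFilter.JAPANESE)
    prober.feed("日本語のテキスト".encode("iso2022_jp"))
    print(prober.state == ProbingState.FOUND_IT, prober.charset_name)
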
@@ -27,6 +27,7 @@

 from .enums import MachineState

+# fmt: off
 HZ_CLS = (
     1, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
     0, 0, 0, 0, 0, 0, 0, 0,  # 08 - 0f
@@ -70,16 +71,20 @@ MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,Ma
     4, MachineState.ERROR, 4, 4, 4, MachineState.ERROR, 4, MachineState.ERROR,  # 20-27
     4, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 28-2f
 )
+# fmt: on

 HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)

-HZ_SM_MODEL = {'class_table': HZ_CLS,
-               'class_factor': 6,
-               'state_table': HZ_ST,
-               'char_len_table': HZ_CHAR_LEN_TABLE,
-               'name': "HZ-GB-2312",
-               'language': 'Chinese'}
+HZ_SM_MODEL = {
+    "class_table": HZ_CLS,
+    "class_factor": 6,
+    "state_table": HZ_ST,
+    "char_len_table": HZ_CHAR_LEN_TABLE,
+    "name": "HZ-GB-2312",
+    "language": "Chinese",
+}

+# fmt: off
 ISO2022CN_CLS = (
     2, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
     0, 0, 0, 0, 0, 0, 0, 0,  # 08 - 0f
@@ -125,16 +130,20 @@ MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,Mac
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 30-37
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START,  # 38-3f
 )
+# fmt: on

 ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)

-ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
-                      'class_factor': 9,
-                      'state_table': ISO2022CN_ST,
-                      'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-CN",
-                      'language': 'Chinese'}
+ISO2022CN_SM_MODEL = {
+    "class_table": ISO2022CN_CLS,
+    "class_factor": 9,
+    "state_table": ISO2022CN_ST,
+    "char_len_table": ISO2022CN_CHAR_LEN_TABLE,
+    "name": "ISO-2022-CN",
+    "language": "Chinese",
+}

+# fmt: off
 ISO2022JP_CLS = (
     2, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
     0, 0, 0, 0, 0, 0, 2, 2,  # 08 - 0f
@@ -181,16 +190,20 @@ MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,Mach
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 38-3f
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, MachineState.START,  # 40-47
 )
+# fmt: on

 ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

-ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
-                      'class_factor': 10,
-                      'state_table': ISO2022JP_ST,
-                      'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-JP",
-                      'language': 'Japanese'}
+ISO2022JP_SM_MODEL = {
+    "class_table": ISO2022JP_CLS,
+    "class_factor": 10,
+    "state_table": ISO2022JP_ST,
+    "char_len_table": ISO2022JP_CHAR_LEN_TABLE,
+    "name": "ISO-2022-JP",
+    "language": "Japanese",
+}

+# fmt: off
 ISO2022KR_CLS = (
     2, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
     0, 0, 0, 0, 0, 0, 0, 0,  # 08 - 0f
@@ -233,14 +246,15 @@ MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,Ma
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 18-1f
     MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 20-27
 )
+# fmt: on

 ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)

-ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
-                      'class_factor': 6,
-                      'state_table': ISO2022KR_ST,
-                      'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
-                      'name': "ISO-2022-KR",
-                      'language': 'Korean'}
+ISO2022KR_SM_MODEL = {
+    "class_table": ISO2022KR_CLS,
+    "class_factor": 6,
+    "state_table": ISO2022KR_ST,
+    "char_len_table": ISO2022KR_CHAR_LEN_TABLE,
+    "name": "ISO-2022-KR",
+    "language": "Korean",
+}

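The recurring `# fmt: off` / `# fmt: on` pairs added above are Black formatter directives: everything between them is left untouched, which keeps the hand-aligned state tables readable instead of being reflowed one element per line. A tiny illustration of the idiom:

    # fmt: off
    IDENTITY_3X3 = (
        1, 0, 0,  # row 0
        0, 1, 0,  # row 1
        0, 0, 1,  # row 2
    )
    # fmt: on  # Black resumes normal formatting here
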
@@ -25,24 +25,24 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-from .enums import ProbingState, MachineState
-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import EUCJPDistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .enums import MachineState, ProbingState
 from .jpcntx import EUCJPContextAnalysis
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import EUCJP_SM_MODEL


 class EUCJPProber(MultiByteCharSetProber):
     def __init__(self):
-        super(EUCJPProber, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
         self.distribution_analyzer = EUCJPDistributionAnalysis()
         self.context_analyzer = EUCJPContextAnalysis()
         self.reset()

     def reset(self):
-        super(EUCJPProber, self).reset()
+        super().reset()
         self.context_analyzer.reset()

     @property
@@ -54,34 +54,37 @@ class EUCJPProber(MultiByteCharSetProber):
         return "Japanese"

     def feed(self, byte_str):
-        for i in range(len(byte_str)):
-            # PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
-            coding_state = self.coding_sm.next_state(byte_str[i])
+        for i, byte in enumerate(byte_str):
+            # PY3K: byte_str is a byte array, so byte is an int, not a byte
+            coding_state = self.coding_sm.next_state(byte)
             if coding_state == MachineState.ERROR:
-                self.logger.debug('%s %s prober hit error at byte %s',
-                                  self.charset_name, self.language, i)
+                self.logger.debug(
+                    "%s %s prober hit error at byte %s",
+                    self.charset_name,
+                    self.language,
+                    i,
+                )
                 self._state = ProbingState.NOT_ME
                 break
-            elif coding_state == MachineState.ITS_ME:
+            if coding_state == MachineState.ITS_ME:
                 self._state = ProbingState.FOUND_IT
                 break
-            elif coding_state == MachineState.START:
+            if coding_state == MachineState.START:
                 char_len = self.coding_sm.get_current_charlen()
                 if i == 0:
-                    self._last_char[1] = byte_str[0]
+                    self._last_char[1] = byte
                     self.context_analyzer.feed(self._last_char, char_len)
                     self.distribution_analyzer.feed(self._last_char, char_len)
                 else:
-                    self.context_analyzer.feed(byte_str[i - 1:i + 1],
-                                               char_len)
-                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
-                                                    char_len)
+                    self.context_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
+                    self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)

         self._last_char[0] = byte_str[-1]

         if self.state == ProbingState.DETECTING:
-            if (self.context_analyzer.got_enough_data() and
-                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
+            if self.context_analyzer.got_enough_data() and (
+                self.get_confidence() > self.SHORTCUT_THRESHOLD
+            ):
                 self._state = ProbingState.FOUND_IT

         return self.state

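The rewritten feed() loop is what ultimately powers Japanese detection through the public API. An end-to-end sketch (confidence values vary with sample length, so the exact number is illustrative):

    import chardet

    sample = "これは日本語の文章です。" * 10
    print(chardet.detect(sample.encode("euc_jp")))
    # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99, 'language': 'Japanese'}
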
@@ -43,6 +43,7 @@ EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
 EUCKR_TABLE_SIZE = 2352

 # Char to FreqOrder table ,
+# fmt: off
 EUCKR_CHAR_TO_FREQ_ORDER = (
   13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722,  87,
 1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
@@ -192,4 +193,4 @@ EUCKR_CHAR_TO_FREQ_ORDER = (
 2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
  670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642,  # 512, 256
 )
+# fmt: on

@@ -25,15 +25,15 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import EUCKRDistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import EUCKR_SM_MODEL


 class EUCKRProber(MultiByteCharSetProber):
     def __init__(self):
-        super(EUCKRProber, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
         self.distribution_analyzer = EUCKRDistributionAnalysis()
         self.reset()

@@ -43,9 +43,10 @@

 EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75

-# Char to FreqOrder table ,
+# Char to FreqOrder table
 EUCTW_TABLE_SIZE = 5376

+# fmt: off
 EUCTW_CHAR_TO_FREQ_ORDER = (
     1, 1800, 1506, 255, 1431, 198, 9, 82, 6, 7310, 177, 202, 3615, 1256, 2808, 110,  # 2742
     3735, 33, 3241, 261, 76, 44, 2113, 16, 2931, 2184, 1176, 659, 3868, 26, 3404, 2643,  # 2758
@@ -384,4 +385,4 @@ EUCTW_CHAR_TO_FREQ_ORDER = (
     890, 3614, 3864, 8110, 1877, 3732, 3402, 8111, 2183, 2353, 3403, 1652, 8112, 8113, 8114, 941,  # 8086
     2294, 208, 3499, 4057, 2019, 330, 4294, 3865, 2892, 2492, 3733, 4295, 8115, 8116, 8117, 8118,  # 8102
 )
+# fmt: on

@@ -25,14 +25,15 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import EUCTWDistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import EUCTW_SM_MODEL


 class EUCTWProber(MultiByteCharSetProber):
     def __init__(self):
-        super(EUCTWProber, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
         self.distribution_analyzer = EUCTWDistributionAnalysis()
         self.reset()

@@ -43,6 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9

 GB2312_TABLE_SIZE = 3760

+# fmt: off
 GB2312_CHAR_TO_FREQ_ORDER = (
 1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
 2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
@@ -280,4 +281,4 @@ GB2312_CHAR_TO_FREQ_ORDER = (
  381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
  852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483,  #last 512
 )
+# fmt: on

@@ -25,14 +25,15 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import GB2312DistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import GB2312_SM_MODEL


 class GB2312Prober(MultiByteCharSetProber):
     def __init__(self):
-        super(GB2312Prober, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
         self.distribution_analyzer = GB2312DistributionAnalysis()
         self.reset()

@@ -125,18 +125,19 @@ from .enums import ProbingState
 # model probers scores. The answer is returned in the form of the name of the
 # charset identified, either "windows-1255" or "ISO-8859-8".


 class HebrewProber(CharSetProber):
     # windows-1255 / ISO-8859-8 code points of interest
-    FINAL_KAF = 0xea
-    NORMAL_KAF = 0xeb
-    FINAL_MEM = 0xed
-    NORMAL_MEM = 0xee
-    FINAL_NUN = 0xef
-    NORMAL_NUN = 0xf0
-    FINAL_PE = 0xf3
-    NORMAL_PE = 0xf4
-    FINAL_TSADI = 0xf5
-    NORMAL_TSADI = 0xf6
+    FINAL_KAF = 0xEA
+    NORMAL_KAF = 0xEB
+    FINAL_MEM = 0xED
+    NORMAL_MEM = 0xEE
+    FINAL_NUN = 0xEF
+    NORMAL_NUN = 0xF0
+    FINAL_PE = 0xF3
+    NORMAL_PE = 0xF4
+    FINAL_TSADI = 0xF5
+    NORMAL_TSADI = 0xF6

     # Minimum Visual vs Logical final letter score difference.
     # If the difference is below this, don't rely solely on the final letter score
@@ -152,7 +153,7 @@ class HebrewProber(CharSetProber):
     LOGICAL_HEBREW_NAME = "windows-1255"

     def __init__(self):
-        super(HebrewProber, self).__init__()
+        super().__init__()
         self._final_char_logical_score = None
         self._final_char_visual_score = None
         self._prev = None
@@ -167,17 +168,22 @@ class HebrewProber(CharSetProber):
         # The two last characters seen in the previous buffer,
         # mPrev and mBeforePrev are initialized to space in order to simulate
         # a word delimiter at the beginning of the data
-        self._prev = ' '
-        self._before_prev = ' '
+        self._prev = " "
+        self._before_prev = " "
         # These probers are owned by the group prober.

-    def set_model_probers(self, logicalProber, visualProber):
-        self._logical_prober = logicalProber
-        self._visual_prober = visualProber
+    def set_model_probers(self, logical_prober, visual_prober):
+        self._logical_prober = logical_prober
+        self._visual_prober = visual_prober

     def is_final(self, c):
-        return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
-                     self.FINAL_PE, self.FINAL_TSADI]
+        return c in [
+            self.FINAL_KAF,
+            self.FINAL_MEM,
+            self.FINAL_NUN,
+            self.FINAL_PE,
+            self.FINAL_TSADI,
+        ]

     def is_non_final(self, c):
         # The normal Tsadi is not a good Non-Final letter due to words like
@@ -190,8 +196,7 @@ class HebrewProber(CharSetProber):
         # for example legally end with a Non-Final Pe or Kaf. However, the
         # benefit of these letters as Non-Final letters outweighs the damage
         # since these words are quite rare.
-        return c in [self.NORMAL_KAF, self.NORMAL_MEM,
-                     self.NORMAL_NUN, self.NORMAL_PE]
+        return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]

     def feed(self, byte_str):
         # Final letter analysis for logical-visual decision.
@@ -227,9 +232,9 @@ class HebrewProber(CharSetProber):
         byte_str = self.filter_high_byte_only(byte_str)

         for cur in byte_str:
-            if cur == ' ':
+            if cur == " ":
                 # We stand on a space - a word just ended
-                if self._before_prev != ' ':
+                if self._before_prev != " ":
                     # next-to-last char was not a space so self._prev is not a
                     # 1 letter word
                     if self.is_final(self._prev):
@@ -241,8 +246,11 @@ class HebrewProber(CharSetProber):
                         self._final_char_visual_score += 1
             else:
                 # Not standing on a space
-                if ((self._before_prev == ' ') and
-                        (self.is_final(self._prev)) and (cur != ' ')):
+                if (
+                    (self._before_prev == " ")
+                    and (self.is_final(self._prev))
+                    and (cur != " ")
+                ):
                     # case (3) [-2:space][-1:final letter][cur:not space]
                     self._final_char_visual_score += 1
             self._before_prev = self._prev
@@ -263,8 +271,9 @@ class HebrewProber(CharSetProber):
             return self.VISUAL_HEBREW_NAME

         # It's not dominant enough, try to rely on the model scores instead.
-        modelsub = (self._logical_prober.get_confidence()
-                    - self._visual_prober.get_confidence())
+        modelsub = (
+            self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
+        )
         if modelsub > self.MIN_MODEL_DISTANCE:
             return self.LOGICAL_HEBREW_NAME
         if modelsub < -self.MIN_MODEL_DISTANCE:
@@ -281,12 +290,13 @@ class HebrewProber(CharSetProber):

     @property
     def language(self):
-        return 'Hebrew'
+        return "Hebrew"

     @property
     def state(self):
         # Remain active as long as any of the model probers are active.
-        if (self._logical_prober.state == ProbingState.NOT_ME) and \
-           (self._visual_prober.state == ProbingState.NOT_ME):
+        if (self._logical_prober.state == ProbingState.NOT_ME) and (
+            self._visual_prober.state == ProbingState.NOT_ME
+        ):
             return ProbingState.NOT_ME
         return ProbingState.DETECTING

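The comments above encode the key fact behind the logical-versus-visual decision: the five Hebrew final letterforms appear at word ends in logical (windows-1255) text but at word starts in visual (ISO-8859-8) text. The helper predicates can be checked in isolation (0xED and 0xEE are the windows-1255 codes for final and non-final mem, per the constants in this hunk):

    from chardet.hebrewprober import HebrewProber

    prober = HebrewProber()
    print(prober.is_final(0xED))      # True: ם, legal only at a word's end
    print(prober.is_non_final(0xEE))  # True: מ, the non-final form
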
@@ -46,6 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
 # Char to FreqOrder table ,
 JIS_TABLE_SIZE = 4368

+# fmt: off
 JIS_CHAR_TO_FREQ_ORDER = (
   40,   1,   6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510,  # 16
 3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247,  18, 179,5071, 856,1661,  # 32
@@ -321,5 +322,4 @@ JIS_CHAR_TO_FREQ_ORDER = (
 1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767,  # 4352
 2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137,  # 4368 #last 512
 )
+# fmt: on

libs/chardet/johabfreq.py (new file, 2382 lines; diff suppressed because it is too large)

@@ -1,7 +1,13 @@
 ######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
 # Contributor(s):
-#   Dan Blanchard
-#   Ian Cordasco
+#   Mark Pilgrim - port to Python
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -19,18 +25,23 @@
 # 02110-1301 USA
 ######################### END LICENSE BLOCK #########################

-import sys
+from .chardistribution import JOHABDistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .mbcharsetprober import MultiByteCharSetProber
+from .mbcssm import JOHAB_SM_MODEL


-if sys.version_info < (3, 0):
-    PY2 = True
-    PY3 = False
-    string_types = (str, unicode)
-    text_type = unicode
-    iteritems = dict.iteritems
-else:
-    PY2 = False
-    PY3 = True
-    string_types = (bytes, str)
-    text_type = str
-    iteritems = dict.items
+class JOHABProber(MultiByteCharSetProber):
+    def __init__(self):
+        super().__init__()
+        self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
+        self.distribution_analyzer = JOHABDistributionAnalysis()
+        self.reset()
+
+    @property
+    def charset_name(self):
+        return "Johab"
+
+    @property
+    def language(self):
+        return "Korean"

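compat.py, the old Python 2/3 shim, is deleted outright; in its slot the diff introduces a Johab prober wired to the new JOHAB state machine and distribution table. CPython ships a johab codec, so a quick round trip is possible (whether detect() actually settles on Johab for a given sample is not guaranteed; the printed dict is illustrative):

    import chardet
    from chardet.johabprober import JOHABProber

    print(JOHABProber().charset_name, JOHABProber().language)  # Johab Korean
    print(chardet.detect("안녕하세요, 반갑습니다.".encode("johab") * 5))
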
@@ -27,7 +27,8 @@


 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
-jp2CharContext = (
+# fmt: off
+jp2_char_context = (
     (0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
     (2, 4, 0, 4, 0, 3, 0, 4, 0, 3, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 3, 3, 4, 2, 3, 3, 3, 2, 4, 1, 4, 3, 3, 1, 5, 4, 3, 4, 3, 4, 3, 5, 3, 0, 3, 5, 4, 2, 0, 3, 1, 0, 3, 3, 0, 3, 3, 0, 1, 1, 0, 4, 3, 0, 3, 3, 0, 4, 0, 2, 0, 3, 5, 5, 5, 5, 4, 0, 4, 1, 0, 3, 4),
     (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
@@ -112,8 +113,10 @@ jp2CharContext = (
     (0, 4, 0, 4, 0, 4, 0, 3, 0, 4, 4, 3, 4, 2, 4, 3, 2, 0, 4, 4, 4, 3, 5, 3, 5, 3, 3, 2, 4, 2, 4, 3, 4, 3, 1, 4, 0, 2, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 4, 1, 3, 4, 3, 2, 1, 2, 1, 3, 3, 3, 4, 4, 3, 3, 5, 0, 4, 0, 3, 0, 4, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3),
     (0, 4, 0, 3, 0, 3, 0, 3, 0, 3, 5, 5, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 3, 5, 3, 3, 1, 3, 2, 4, 5, 5, 5, 5, 4, 3, 4, 5, 5, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 4, 3, 2, 2, 1, 2, 0, 3, 0, 0, 4, 1),
 )
+# fmt: on

-class JapaneseContextAnalysis(object):
+
+class JapaneseContextAnalysis:
     NUM_OF_CATEGORY = 6
     DONT_KNOW = -1
     ENOUGH_REL_THRESHOLD = 100
@@ -164,7 +167,9 @@ class JapaneseContextAnalysis(object):
                 if self._total_rel > self.MAX_REL_THRESHOLD:
                     self._done = True
                     break
-                self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
+                self._rel_sample[
+                    jp2_char_context[self._last_char_order][order]
+                ] += 1
             self._last_char_order = order

     def got_enough_data(self):
@@ -174,15 +179,15 @@ class JapaneseContextAnalysis(object):
         # This is just one way to calculate confidence. It works well for me.
         if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
             return (self._total_rel - self._rel_sample[0]) / self._total_rel
-        else:
-            return self.DONT_KNOW
+        return self.DONT_KNOW

-    def get_order(self, byte_str):
+    def get_order(self, _):
         return -1, 1


 class SJISContextAnalysis(JapaneseContextAnalysis):
     def __init__(self):
-        super(SJISContextAnalysis, self).__init__()
+        super().__init__()
         self._charset_name = "SHIFT_JIS"

     @property
@@ -209,6 +214,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):

         return -1, char_len

+
 class EUCJPContextAnalysis(JapaneseContextAnalysis):
     def get_order(self, byte_str):
         if not byte_str:
@@ -229,5 +235,3 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
             return second_char - 0xA1, char_len

         return -1, char_len
-
-

@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 from chardet.sbcharsetprober import SingleByteCharSetModel

-
 # 3: Positive
 # 2: Likely
 # 1: Unlikely
@@ -4373,13 +4369,15 @@ ISO_8859_5_BULGARIAN_CHAR_TO_ORDER = {
     255: 253,  # 'џ'
 }

-ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5',
-                                                    language='Bulgarian',
+ISO_8859_5_BULGARIAN_MODEL = SingleByteCharSetModel(
+    charset_name="ISO-8859-5",
+    language="Bulgarian",
     char_to_order_map=ISO_8859_5_BULGARIAN_CHAR_TO_ORDER,
     language_model=BULGARIAN_LANG_MODEL,
     typical_positive_ratio=0.969392,
     keep_ascii_letters=False,
-    alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя')
+    alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
+)

 WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = {
     0: 255,  # '\x00'
@@ -4640,11 +4638,12 @@ WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER = {
     255: 16,  # 'я'
 }

-WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251',
-                                                      language='Bulgarian',
+WINDOWS_1251_BULGARIAN_MODEL = SingleByteCharSetModel(
+    charset_name="windows-1251",
+    language="Bulgarian",
     char_to_order_map=WINDOWS_1251_BULGARIAN_CHAR_TO_ORDER,
     language_model=BULGARIAN_LANG_MODEL,
     typical_positive_ratio=0.969392,
     keep_ascii_letters=False,
-    alphabet='АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя')
+    alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
+)

@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 from chardet.sbcharsetprober import SingleByteCharSetModel

-
 # 3: Positive
 # 2: Likely
 # 1: Unlikely
@@ -4121,13 +4117,15 @@ WINDOWS_1253_GREEK_CHAR_TO_ORDER = {
     255: 253,  # None
 }

-WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel(charset_name='windows-1253',
-                                                  language='Greek',
+WINDOWS_1253_GREEK_MODEL = SingleByteCharSetModel(
+    charset_name="windows-1253",
+    language="Greek",
     char_to_order_map=WINDOWS_1253_GREEK_CHAR_TO_ORDER,
     language_model=GREEK_LANG_MODEL,
     typical_positive_ratio=0.982851,
     keep_ascii_letters=False,
-    alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ')
+    alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
+)

 ISO_8859_7_GREEK_CHAR_TO_ORDER = {
     0: 255,  # '\x00'
@@ -4388,11 +4386,12 @@ ISO_8859_7_GREEK_CHAR_TO_ORDER = {
     255: 253,  # None
 }

-ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-7',
-                                                language='Greek',
+ISO_8859_7_GREEK_MODEL = SingleByteCharSetModel(
+    charset_name="ISO-8859-7",
+    language="Greek",
     char_to_order_map=ISO_8859_7_GREEK_CHAR_TO_ORDER,
     language_model=GREEK_LANG_MODEL,
     typical_positive_ratio=0.982851,
     keep_ascii_letters=False,
-    alphabet='ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ')
+    alphabet="ΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίαβγδεζηθικλμνξοπρςστυφχψωόύώ",
+)

@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 from chardet.sbcharsetprober import SingleByteCharSetModel

-
 # 3: Positive
 # 2: Likely
 # 1: Unlikely
@@ -4373,11 +4369,12 @@ WINDOWS_1255_HEBREW_CHAR_TO_ORDER = {
     255: 253,  # None
 }

-WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(charset_name='windows-1255',
-                                                   language='Hebrew',
+WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(
+    charset_name="windows-1255",
+    language="Hebrew",
     char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
     language_model=HEBREW_LANG_MODEL,
     typical_positive_ratio=0.984004,
     keep_ascii_letters=False,
-    alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ')
+    alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
+)

@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 from chardet.sbcharsetprober import SingleByteCharSetModel

-
 # 3: Positive
 # 2: Likely
 # 1: Unlikely
@@ -4373,13 +4369,15 @@ WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER = {
     255: 253,  # '˙'
 }

-WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1250',
-                                                      language='Hungarian',
+WINDOWS_1250_HUNGARIAN_MODEL = SingleByteCharSetModel(
+    charset_name="windows-1250",
+    language="Hungarian",
     char_to_order_map=WINDOWS_1250_HUNGARIAN_CHAR_TO_ORDER,
     language_model=HUNGARIAN_LANG_MODEL,
     typical_positive_ratio=0.947368,
     keep_ascii_letters=True,
-    alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű')
+    alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
+)

 ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = {
     0: 255,  # '\x00'
@@ -4640,11 +4638,12 @@ ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER = {
     255: 253,  # '˙'
 }

-ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-2',
-                                                    language='Hungarian',
+ISO_8859_2_HUNGARIAN_MODEL = SingleByteCharSetModel(
+    charset_name="ISO-8859-2",
+    language="Hungarian",
     char_to_order_map=ISO_8859_2_HUNGARIAN_CHAR_TO_ORDER,
     language_model=HUNGARIAN_LANG_MODEL,
     typical_positive_ratio=0.947368,
     keep_ascii_letters=True,
-    alphabet='ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű')
+    alphabet="ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvzÁÉÍÓÖÚÜáéíóöúüŐőŰű",
+)

@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-

from chardet.sbcharsetprober import SingleByteCharSetModel


# 3: Positive
# 2: Likely
# 1: Unlikely

@@ -4373,13 +4369,15 @@ IBM866_RUSSIAN_CHAR_TO_ORDER = {
    255: 255,  # '\xa0'
}

-IBM866_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM866',
-                                              language='Russian',
+IBM866_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="IBM866",
+    language="Russian",
    char_to_order_map=IBM866_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)

WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = {
    0: 255,  # '\x00'

@@ -4640,13 +4638,15 @@ WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER = {
    255: 16,  # 'я'
}

-WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='windows-1251',
-                                                    language='Russian',
+WINDOWS_1251_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="windows-1251",
+    language="Russian",
    char_to_order_map=WINDOWS_1251_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)

IBM855_RUSSIAN_CHAR_TO_ORDER = {
    0: 255,  # '\x00'

@@ -4907,13 +4907,15 @@ IBM855_RUSSIAN_CHAR_TO_ORDER = {
    255: 255,  # '\xa0'
}

-IBM855_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='IBM855',
-                                              language='Russian',
+IBM855_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="IBM855",
+    language="Russian",
    char_to_order_map=IBM855_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)

KOI8_R_RUSSIAN_CHAR_TO_ORDER = {
    0: 255,  # '\x00'

@@ -5174,13 +5176,15 @@ KOI8_R_RUSSIAN_CHAR_TO_ORDER = {
    255: 70,  # 'Ъ'
}

-KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='KOI8-R',
-                                              language='Russian',
+KOI8_R_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="KOI8-R",
+    language="Russian",
    char_to_order_map=KOI8_R_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)

MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = {
    0: 255,  # '\x00'

@@ -5441,13 +5445,15 @@ MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER = {
    255: 255,  # '€'
}

-MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='MacCyrillic',
-                                                   language='Russian',
+MACCYRILLIC_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="MacCyrillic",
+    language="Russian",
    char_to_order_map=MACCYRILLIC_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)

ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = {
    0: 255,  # '\x00'

@@ -5708,11 +5714,12 @@ ISO_8859_5_RUSSIAN_CHAR_TO_ORDER = {
    255: 255,  # 'џ'
}

-ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-5',
-                                                  language='Russian',
+ISO_8859_5_RUSSIAN_MODEL = SingleByteCharSetModel(
+    charset_name="ISO-8859-5",
+    language="Russian",
    char_to_order_map=ISO_8859_5_RUSSIAN_CHAR_TO_ORDER,
    language_model=RUSSIAN_LANG_MODEL,
    typical_positive_ratio=0.976601,
    keep_ascii_letters=False,
-    alphabet='ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё')
+    alphabet="ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё",
+)
@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-

from chardet.sbcharsetprober import SingleByteCharSetModel


# 3: Positive
# 2: Likely
# 1: Unlikely

@@ -4373,11 +4369,12 @@ TIS_620_THAI_CHAR_TO_ORDER = {
    255: 253,  # None
}

-TIS_620_THAI_MODEL = SingleByteCharSetModel(charset_name='TIS-620',
-                                            language='Thai',
+TIS_620_THAI_MODEL = SingleByteCharSetModel(
+    charset_name="TIS-620",
+    language="Thai",
    char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
    language_model=THAI_LANG_MODEL,
    typical_positive_ratio=0.926386,
    keep_ascii_letters=False,
-    alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛')
+    alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
+)
@@ -1,9 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-

from chardet.sbcharsetprober import SingleByteCharSetModel


# 3: Positive
# 2: Likely
# 1: Unlikely

@@ -4373,11 +4369,12 @@ ISO_8859_9_TURKISH_CHAR_TO_ORDER = {
    255: 107,  # 'ÿ'
}

-ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9',
-                                                  language='Turkish',
+ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(
+    charset_name="ISO-8859-9",
+    language="Turkish",
    char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
    language_model=TURKISH_LANG_MODEL,
    typical_positive_ratio=0.97029,
    keep_ascii_letters=True,
-    alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş')
+    alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
+)
@@ -41,6 +41,7 @@ ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

+# fmt: off
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F

@@ -91,11 +92,12 @@ Latin1ClassModel = (
    0, 3, 1, 3, 1, 1, 1, 3,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3,  # ASO
)
+# fmt: on


class Latin1Prober(CharSetProber):
    def __init__(self):
-        super(Latin1Prober, self).__init__()
+        super().__init__()
        self._last_char_class = None
        self._freq_counter = None
        self.reset()

@@ -103,7 +105,7 @@ class Latin1Prober(CharSetProber):
    def reset(self):
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
-        CharSetProber.reset(self)
+        super().reset()

    @property
    def charset_name(self):

@@ -114,11 +116,10 @@ class Latin1Prober(CharSetProber):
        return ""

    def feed(self, byte_str):
-        byte_str = self.filter_with_english_letters(byte_str)
+        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = Latin1_CharToClass[c]
-            freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
-                                    + char_class]
+            freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                self._state = ProbingState.NOT_ME
                break

@@ -132,14 +133,13 @@ class Latin1Prober(CharSetProber):
            return 0.01

        total = sum(self._freq_counter)
-        if total < 0.01:
-            confidence = 0.0
-        else:
-            confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
-                          / total)
-        if confidence < 0.0:
-            confidence = 0.0
+        confidence = (
+            0.0
+            if total < 0.01
+            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
+        )
+        confidence = max(confidence, 0.0)
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
-        confidence = confidence * 0.73
+        confidence *= 0.73
        return confidence
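For reference, the rewritten get_confidence() body above is behavior-preserving: the conditional expression plus max() reproduces the old if/else chain, including the clamp at zero. A standalone sketch of the same arithmetic (freq_counter here stands in for the prober's frequency buckets):

def latin1_confidence(freq_counter):
    # Mirrors the body above: bucket 3 counts "positive" transitions,
    # bucket 1 counts "unlikely" ones, weighted 20x against.
    total = sum(freq_counter)
    confidence = (
        0.0
        if total < 0.01
        else (freq_counter[3] - freq_counter[1] * 20.0) / total
    )
    confidence = max(confidence, 0.0)
    # Damp Latin-1 so more specific probers can win ties.
    return confidence * 0.73

print(latin1_confidence([5, 0, 0, 40]))  # (40 / 45) * 0.73 ~= 0.6489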
@@ -28,7 +28,7 @@
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
-from .enums import ProbingState, MachineState
+from .enums import MachineState, ProbingState


class MultiByteCharSetProber(CharSetProber):

@@ -37,13 +37,13 @@ class MultiByteCharSetProber(CharSetProber):
    """

    def __init__(self, lang_filter=None):
-        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
+        super().__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        self._last_char = [0, 0]

    def reset(self):
-        super(MultiByteCharSetProber, self).reset()
+        super().reset()
        if self.coding_sm:
            self.coding_sm.reset()
        if self.distribution_analyzer:

@@ -59,30 +59,34 @@ class MultiByteCharSetProber(CharSetProber):
        raise NotImplementedError

    def feed(self, byte_str):
-        for i in range(len(byte_str)):
-            coding_state = self.coding_sm.next_state(byte_str[i])
+        for i, byte in enumerate(byte_str):
+            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
-                self.logger.debug('%s %s prober hit error at byte %s',
-                                  self.charset_name, self.language, i)
+                self.logger.debug(
+                    "%s %s prober hit error at byte %s",
+                    self.charset_name,
+                    self.language,
+                    i,
+                )
                self._state = ProbingState.NOT_ME
                break
-            elif coding_state == MachineState.ITS_ME:
+            if coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
-            elif coding_state == MachineState.START:
+            if coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
-                    self._last_char[1] = byte_str[0]
+                    self._last_char[1] = byte
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
-                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
-                                                    char_len)
+                    self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)

        self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
-            if (self.distribution_analyzer.got_enough_data() and
-                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
+            if self.distribution_analyzer.got_enough_data() and (
+                self.get_confidence() > self.SHORTCUT_THRESHOLD
+            ):
                self._state = ProbingState.FOUND_IT

        return self.state
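One detail behind the rewritten loop above: iterating or indexing a Python bytes object yields ints, which is why `byte` can be passed straight to next_state(), while slicing yields a bytes window for the distribution analyzer. A small illustration (the sample bytes are illustrative):

data = "あい".encode("shift_jis")  # b'\x82\xa0\x82\xa2'
i = 1
print(data[i])              # 160: indexing bytes gives an int
print(data[i - 1 : i + 1])  # b'\x82\xa0': slicing gives a 2-byte window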
@@ -27,20 +27,21 @@
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

-from .charsetgroupprober import CharSetGroupProber
-from .utf8prober import UTF8Prober
-from .sjisprober import SJISProber
-from .eucjpprober import EUCJPProber
-from .gb2312prober import GB2312Prober
-from .euckrprober import EUCKRProber
-from .cp949prober import CP949Prober
from .big5prober import Big5Prober
+from .charsetgroupprober import CharSetGroupProber
+from .cp949prober import CP949Prober
+from .eucjpprober import EUCJPProber
+from .euckrprober import EUCKRProber
from .euctwprober import EUCTWProber
+from .gb2312prober import GB2312Prober
+from .johabprober import JOHABProber
+from .sjisprober import SJISProber
+from .utf8prober import UTF8Prober


class MBCSGroupProber(CharSetGroupProber):
    def __init__(self, lang_filter=None):
-        super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
+        super().__init__(lang_filter=lang_filter)
        self.probers = [
            UTF8Prober(),
            SJISProber(),

@@ -49,6 +50,7 @@ class MBCSGroupProber(CharSetGroupProber):
            EUCKRProber(),
            CP949Prober(),
            Big5Prober(),
-            EUCTWProber()
+            EUCTWProber(),
+            JOHABProber(),
        ]
        self.reset()
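A sketch of the group prober in isolation; this touches chardet's internal API rather than its stable public surface, so treat the exact attribute names as subject to change between versions:

from chardet.mbcsgroupprober import MBCSGroupProber

prober = MBCSGroupProber()
prober.feed("こんにちは、世界。".encode("shift_jis") * 10)
# The group defers to whichever child prober is most confident.
print(prober.charset_name, prober.get_confidence())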
@@ -29,6 +29,7 @@ from .enums import MachineState

# BIG5

+# fmt: off
BIG5_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07    #allow 0x00 as legal value
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f

@@ -69,17 +70,20 @@ BIG5_ST = (
    MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
)
+# fmt: on

BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)

-BIG5_SM_MODEL = {'class_table': BIG5_CLS,
-                 'class_factor': 5,
-                 'state_table': BIG5_ST,
-                 'char_len_table': BIG5_CHAR_LEN_TABLE,
-                 'name': 'Big5'}
+BIG5_SM_MODEL = {
+    "class_table": BIG5_CLS,
+    "class_factor": 5,
+    "state_table": BIG5_ST,
+    "char_len_table": BIG5_CHAR_LEN_TABLE,
+    "name": "Big5",
+}

# CP949
+# fmt: off
CP949_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,  # 00 - 0f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  # 10 - 1f

@@ -109,17 +113,20 @@ CP949_ST = (
    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
    MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
)
+# fmt: on

CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)

-CP949_SM_MODEL = {'class_table': CP949_CLS,
-                  'class_factor': 10,
-                  'state_table': CP949_ST,
-                  'char_len_table': CP949_CHAR_LEN_TABLE,
-                  'name': 'CP949'}
+CP949_SM_MODEL = {
+    "class_table": CP949_CLS,
+    "class_factor": 10,
+    "state_table": CP949_ST,
+    "char_len_table": CP949_CHAR_LEN_TABLE,
+    "name": "CP949",
+}

# EUC-JP
+# fmt: off
EUCJP_CLS = (
    4, 4, 4, 4, 4, 4, 4, 4,  # 00 - 07
    4, 4, 4, 4, 4, 4, 5, 5,  # 08 - 0f

@@ -162,17 +169,20 @@ EUCJP_ST = (
    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
    3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
)
+# fmt: on

EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)

-EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
-                  'class_factor': 6,
-                  'state_table': EUCJP_ST,
-                  'char_len_table': EUCJP_CHAR_LEN_TABLE,
-                  'name': 'EUC-JP'}
+EUCJP_SM_MODEL = {
+    "class_table": EUCJP_CLS,
+    "class_factor": 6,
+    "state_table": EUCJP_ST,
+    "char_len_table": EUCJP_CHAR_LEN_TABLE,
+    "name": "EUC-JP",
+}

# EUC-KR
+# fmt: off
EUCKR_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f

@@ -212,17 +222,77 @@ EUCKR_ST = (
    MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
)
+# fmt: on

EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)

-EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
-                  'class_factor': 4,
-                  'state_table': EUCKR_ST,
-                  'char_len_table': EUCKR_CHAR_LEN_TABLE,
-                  'name': 'EUC-KR'}
+EUCKR_SM_MODEL = {
+    "class_table": EUCKR_CLS,
+    "class_factor": 4,
+    "state_table": EUCKR_ST,
+    "char_len_table": EUCKR_CHAR_LEN_TABLE,
+    "name": "EUC-KR",
+}
+
+# JOHAB
+# fmt: off
+JOHAB_CLS = (
+    4,4,4,4,4,4,4,4,  # 00 - 07
+    4,4,4,4,4,4,0,0,  # 08 - 0f
+    4,4,4,4,4,4,4,4,  # 10 - 17
+    4,4,4,0,4,4,4,4,  # 18 - 1f
+    4,4,4,4,4,4,4,4,  # 20 - 27
+    4,4,4,4,4,4,4,4,  # 28 - 2f
+    4,3,3,3,3,3,3,3,  # 30 - 37
+    3,3,3,3,3,3,3,3,  # 38 - 3f
+    3,1,1,1,1,1,1,1,  # 40 - 47
+    1,1,1,1,1,1,1,1,  # 48 - 4f
+    1,1,1,1,1,1,1,1,  # 50 - 57
+    1,1,1,1,1,1,1,1,  # 58 - 5f
+    1,1,1,1,1,1,1,1,  # 60 - 67
+    1,1,1,1,1,1,1,1,  # 68 - 6f
+    1,1,1,1,1,1,1,1,  # 70 - 77
+    1,1,1,1,1,1,1,2,  # 78 - 7f
+    6,6,6,6,8,8,8,8,  # 80 - 87
+    8,8,8,8,8,8,8,8,  # 88 - 8f
+    8,7,7,7,7,7,7,7,  # 90 - 97
+    7,7,7,7,7,7,7,7,  # 98 - 9f
+    7,7,7,7,7,7,7,7,  # a0 - a7
+    7,7,7,7,7,7,7,7,  # a8 - af
+    7,7,7,7,7,7,7,7,  # b0 - b7
+    7,7,7,7,7,7,7,7,  # b8 - bf
+    7,7,7,7,7,7,7,7,  # c0 - c7
+    7,7,7,7,7,7,7,7,  # c8 - cf
+    7,7,7,7,5,5,5,5,  # d0 - d7
+    5,9,9,9,9,9,9,5,  # d8 - df
+    9,9,9,9,9,9,9,9,  # e0 - e7
+    9,9,9,9,9,9,9,9,  # e8 - ef
+    9,9,9,9,9,9,9,9,  # f0 - f7
+    9,9,5,5,5,5,5,0   # f8 - ff
+)
+
+JOHAB_ST = (
+    # cls = 0 1 2 3 4 5 6 7 8 9
+    MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 ,  # MachineState.START
+    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,  # MachineState.ITS_ME
+    MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,  # MachineState.ERROR
+    MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,  # 3
+    MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,  # 4
+)
+# fmt: on
+
+JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
+
+JOHAB_SM_MODEL = {
+    "class_table": JOHAB_CLS,
+    "class_factor": 10,
+    "state_table": JOHAB_ST,
+    "char_len_table": JOHAB_CHAR_LEN_TABLE,
+    "name": "Johab",
+}

# EUC-TW
+# fmt: off
EUCTW_CLS = (
    2, 2, 2, 2, 2, 2, 2, 2,  # 00 - 07
    2, 2, 2, 2, 2, 2, 0, 0,  # 08 - 0f

@@ -266,17 +336,20 @@ EUCTW_ST = (
    5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
    MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
+# fmt: on

EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)

-EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
-                  'class_factor': 7,
-                  'state_table': EUCTW_ST,
-                  'char_len_table': EUCTW_CHAR_LEN_TABLE,
-                  'name': 'x-euc-tw'}
+EUCTW_SM_MODEL = {
+    "class_table": EUCTW_CLS,
+    "class_factor": 7,
+    "state_table": EUCTW_ST,
+    "char_len_table": EUCTW_CHAR_LEN_TABLE,
+    "name": "x-euc-tw",
+}

# GB2312
+# fmt: off
GB2312_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f

@@ -320,6 +393,7 @@ GB2312_ST = (
    MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
)
+# fmt: on

# To be accurate, the length of class 6 can be either 2 or 4.
# But it is not necessary to discriminate between the two since

@@ -328,14 +402,16 @@ GB2312_ST = (
# 2 here.
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)

-GB2312_SM_MODEL = {'class_table': GB2312_CLS,
-                   'class_factor': 7,
-                   'state_table': GB2312_ST,
-                   'char_len_table': GB2312_CHAR_LEN_TABLE,
-                   'name': 'GB2312'}
+GB2312_SM_MODEL = {
+    "class_table": GB2312_CLS,
+    "class_factor": 7,
+    "state_table": GB2312_ST,
+    "char_len_table": GB2312_CHAR_LEN_TABLE,
+    "name": "GB2312",
+}

# Shift_JIS
+# fmt: off
SJIS_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f

@@ -370,25 +446,28 @@ SJIS_CLS = (
    3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
    3, 3, 3, 3, 3, 4, 4, 4,  # e8 - ef
    3, 3, 3, 3, 3, 3, 3, 3,  # f0 - f7
-    3,3,3,3,3,0,0,0)  # f8 - ff
+    3, 3, 3, 3, 3, 0, 0, 0,  # f8 - ff
+)

SJIS_ST = (
    MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
    MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
)
+# fmt: on

SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)

-SJIS_SM_MODEL = {'class_table': SJIS_CLS,
-                 'class_factor': 6,
-                 'state_table': SJIS_ST,
-                 'char_len_table': SJIS_CHAR_LEN_TABLE,
-                 'name': 'Shift_JIS'}
+SJIS_SM_MODEL = {
+    "class_table": SJIS_CLS,
+    "class_factor": 6,
+    "state_table": SJIS_ST,
+    "char_len_table": SJIS_CHAR_LEN_TABLE,
+    "name": "Shift_JIS",
+}

# UCS2-BE
+# fmt: off
UCS2BE_CLS = (
    0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
    0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f

@@ -433,17 +512,20 @@ UCS2BE_ST = (
    5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
    6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
+# fmt: on

UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)

-UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
-                   'class_factor': 6,
-                   'state_table': UCS2BE_ST,
-                   'char_len_table': UCS2BE_CHAR_LEN_TABLE,
-                   'name': 'UTF-16BE'}
+UCS2BE_SM_MODEL = {
+    "class_table": UCS2BE_CLS,
+    "class_factor": 6,
+    "state_table": UCS2BE_ST,
+    "char_len_table": UCS2BE_CHAR_LEN_TABLE,
+    "name": "UTF-16BE",
+}

# UCS2-LE
+# fmt: off
UCS2LE_CLS = (
    0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
    0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f

@@ -488,17 +570,20 @@ UCS2LE_ST = (
    5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
    5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
)
+# fmt: on

UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)

-UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
-                   'class_factor': 6,
-                   'state_table': UCS2LE_ST,
-                   'char_len_table': UCS2LE_CHAR_LEN_TABLE,
-                   'name': 'UTF-16LE'}
+UCS2LE_SM_MODEL = {
+    "class_table": UCS2LE_CLS,
+    "class_factor": 6,
+    "state_table": UCS2LE_ST,
+    "char_len_table": UCS2LE_CHAR_LEN_TABLE,
+    "name": "UTF-16LE",
+}

# UTF-8
+# fmt: off
UTF8_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07  #allow 0x00 as a legal value
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f

@@ -562,11 +647,14 @@ UTF8_ST = (
    MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
    MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
)
+# fmt: on

UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)

-UTF8_SM_MODEL = {'class_table': UTF8_CLS,
-                 'class_factor': 16,
-                 'state_table': UTF8_ST,
-                 'char_len_table': UTF8_CHAR_LEN_TABLE,
-                 'name': 'UTF-8'}
+UTF8_SM_MODEL = {
+    "class_table": UTF8_CLS,
+    "class_factor": 16,
+    "state_table": UTF8_ST,
+    "char_len_table": UTF8_CHAR_LEN_TABLE,
+    "name": "UTF-8",
+}
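Since CPython ships a 'johab' codec, the new JOHAB state machine can be exercised end to end through chardet's public API. A sketch (detection needs a reasonably long sample, and the reported name can vary by chardet version):

import chardet

korean = "대한민국의 수도는 서울특별시입니다. " * 40
print(chardet.detect(korean.encode("johab")))
# Expected to identify the Korean Johab encoding given sufficient input.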
@ -1,19 +1,16 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
"""
|
||||||
Metadata about languages used by our model training code for our
|
Metadata about languages used by our model training code for our
|
||||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||||
|
|
||||||
This code is based on the language metadata from the uchardet project.
|
This code is based on the language metadata from the uchardet project.
|
||||||
"""
|
"""
|
||||||
from __future__ import absolute_import, print_function
|
|
||||||
|
|
||||||
from string import ascii_letters
|
from string import ascii_letters
|
||||||
|
|
||||||
|
# TODO: Add Ukrainian (KOI8-U)
|
||||||
|
|
||||||
# TODO: Add Ukranian (KOI8-U)
|
|
||||||
|
|
||||||
class Language(object):
|
class Language:
|
||||||
"""Metadata about a language useful for training models
|
"""Metadata about a language useful for training models
|
||||||
|
|
||||||
:ivar name: The human name for the language, in English.
|
:ivar name: The human name for the language, in English.
|
||||||
|
@ -33,9 +30,17 @@ class Language(object):
|
||||||
Wikipedia for training data.
|
Wikipedia for training data.
|
||||||
:type wiki_start_pages: list of str
|
:type wiki_start_pages: list of str
|
||||||
"""
|
"""
|
||||||
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
|
||||||
alphabet=None, wiki_start_pages=None):
|
def __init__(
|
||||||
super(Language, self).__init__()
|
self,
|
||||||
|
name=None,
|
||||||
|
iso_code=None,
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=None,
|
||||||
|
alphabet=None,
|
||||||
|
wiki_start_pages=None,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
self.iso_code = iso_code
|
self.iso_code = iso_code
|
||||||
self.use_ascii = use_ascii
|
self.use_ascii = use_ascii
|
||||||
|
@ -46,246 +51,282 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
alphabet = ascii_letters
|
alphabet = ascii_letters
|
||||||
elif not alphabet:
|
elif not alphabet:
|
||||||
raise ValueError('Must supply alphabet if use_ascii is False')
|
raise ValueError("Must supply alphabet if use_ascii is False")
|
||||||
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||||
self.wiki_start_pages = wiki_start_pages
|
self.wiki_start_pages = wiki_start_pages
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '{}({})'.format(self.__class__.__name__,
|
param_str = ", ".join(
|
||||||
', '.join('{}={!r}'.format(k, v)
|
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||||
for k, v in self.__dict__.items()
|
)
|
||||||
if not k.startswith('_')))
|
return f"{self.__class__.__name__}({param_str})"
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {'Arabic': Language(name='Arabic',
|
LANGUAGES = {
|
||||||
iso_code='ar',
|
"Arabic": Language(
|
||||||
|
name="Arabic",
|
||||||
|
iso_code="ar",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
# We only support encodings that use isolated
|
# We only support encodings that use isolated
|
||||||
# forms, because the current recommendation is
|
# forms, because the current recommendation is
|
||||||
# that the rendering system handles presentation
|
# that the rendering system handles presentation
|
||||||
# forms. This means we purposefully skip IBM864.
|
# forms. This means we purposefully skip IBM864.
|
||||||
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
|
||||||
'CP720', 'CP864'],
|
alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
|
||||||
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
wiki_start_pages=["الصفحة_الرئيسية"],
|
||||||
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
),
|
||||||
'Belarusian': Language(name='Belarusian',
|
"Belarusian": Language(
|
||||||
iso_code='be',
|
name="Belarusian",
|
||||||
|
iso_code="be",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
|
||||||
'IBM866', 'MacCyrillic'],
|
alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
|
||||||
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
wiki_start_pages=["Галоўная_старонка"],
|
||||||
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
),
|
||||||
wiki_start_pages=[u'Галоўная_старонка']),
|
"Bulgarian": Language(
|
||||||
'Bulgarian': Language(name='Bulgarian',
|
name="Bulgarian",
|
||||||
iso_code='bg',
|
iso_code="bg",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
|
||||||
'IBM855'],
|
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||||
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
wiki_start_pages=["Начална_страница"],
|
||||||
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
),
|
||||||
wiki_start_pages=[u'Начална_страница']),
|
"Czech": Language(
|
||||||
'Czech': Language(name='Czech',
|
name="Czech",
|
||||||
iso_code='cz',
|
iso_code="cz",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
|
||||||
wiki_start_pages=[u'Hlavní_strana']),
|
wiki_start_pages=["Hlavní_strana"],
|
||||||
'Danish': Language(name='Danish',
|
),
|
||||||
iso_code='da',
|
"Danish": Language(
|
||||||
|
name="Danish",
|
||||||
|
iso_code="da",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||||
'WINDOWS-1252'],
|
alphabet="æøåÆØÅ",
|
||||||
alphabet=u'æøåÆØÅ',
|
wiki_start_pages=["Forside"],
|
||||||
wiki_start_pages=[u'Forside']),
|
),
|
||||||
'German': Language(name='German',
|
"German": Language(
|
||||||
iso_code='de',
|
name="German",
|
||||||
|
iso_code="de",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||||
alphabet=u'äöüßÄÖÜ',
|
alphabet="äöüßÄÖÜ",
|
||||||
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||||
'Greek': Language(name='Greek',
|
),
|
||||||
iso_code='el',
|
"Greek": Language(
|
||||||
|
name="Greek",
|
||||||
|
iso_code="el",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
charsets=["ISO-8859-7", "WINDOWS-1253"],
|
||||||
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
|
||||||
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
wiki_start_pages=["Πύλη:Κύρια"],
|
||||||
wiki_start_pages=[u'Πύλη:Κύρια']),
|
),
|
||||||
'English': Language(name='English',
|
"English": Language(
|
||||||
iso_code='en',
|
name="English",
|
||||||
|
iso_code="en",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||||
wiki_start_pages=[u'Main_Page']),
|
wiki_start_pages=["Main_Page"],
|
||||||
'Esperanto': Language(name='Esperanto',
|
),
|
||||||
iso_code='eo',
|
"Esperanto": Language(
|
||||||
|
name="Esperanto",
|
||||||
|
iso_code="eo",
|
||||||
# Q, W, X, and Y not used at all
|
# Q, W, X, and Y not used at all
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-3'],
|
charsets=["ISO-8859-3"],
|
||||||
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
|
||||||
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
|
||||||
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
),
|
||||||
'Spanish': Language(name='Spanish',
|
"Spanish": Language(
|
||||||
iso_code='es',
|
name="Spanish",
|
||||||
|
iso_code="es",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||||
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
wiki_start_pages=["Wikipedia:Portada"],
|
||||||
wiki_start_pages=[u'Wikipedia:Portada']),
|
),
|
||||||
'Estonian': Language(name='Estonian',
|
"Estonian": Language(
|
||||||
iso_code='et',
|
name="Estonian",
|
||||||
|
iso_code="et",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-4', 'ISO-8859-13',
|
charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
|
||||||
'WINDOWS-1257'],
|
|
||||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||||
# loanwords
|
# loanwords
|
||||||
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
|
||||||
u'abdeghijklmnoprstuvõäöü'),
|
wiki_start_pages=["Esileht"],
|
||||||
wiki_start_pages=[u'Esileht']),
|
),
|
||||||
'Finnish': Language(name='Finnish',
|
"Finnish": Language(
|
||||||
iso_code='fi',
|
name="Finnish",
|
||||||
|
iso_code="fi",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ÅÄÖŠŽåäöšž",
|
||||||
alphabet=u'ÅÄÖŠŽåäöšž',
|
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||||
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
),
|
||||||
'French': Language(name='French',
|
"French": Language(
|
||||||
iso_code='fr',
|
name="French",
|
||||||
|
iso_code="fr",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||||
'WINDOWS-1252'],
|
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||||
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||||
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
),
|
||||||
u'Bœuf (animal)']),
|
"Hebrew": Language(
|
||||||
'Hebrew': Language(name='Hebrew',
|
name="Hebrew",
|
||||||
iso_code='he',
|
iso_code="he",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
charsets=["ISO-8859-8", "WINDOWS-1255"],
|
||||||
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||||
wiki_start_pages=[u'עמוד_ראשי']),
|
wiki_start_pages=["עמוד_ראשי"],
|
||||||
'Croatian': Language(name='Croatian',
|
),
|
||||||
iso_code='hr',
|
"Croatian": Language(
|
||||||
|
name="Croatian",
|
||||||
|
iso_code="hr",
|
||||||
# Q, W, X, Y are only used for foreign words.
|
# Q, W, X, Y are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
|
||||||
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
wiki_start_pages=["Glavna_stranica"],
|
||||||
wiki_start_pages=[u'Glavna_stranica']),
|
),
|
||||||
'Hungarian': Language(name='Hungarian',
|
"Hungarian": Language(
|
||||||
iso_code='hu',
|
name="Hungarian",
|
||||||
|
iso_code="hu",
|
||||||
# Q, W, X, Y are only used for foreign words.
|
# Q, W, X, Y are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
|
||||||
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
wiki_start_pages=["Kezdőlap"],
|
||||||
wiki_start_pages=[u'Kezdőlap']),
|
),
|
||||||
'Italian': Language(name='Italian',
|
"Italian": Language(
|
||||||
iso_code='it',
|
name="Italian",
|
||||||
|
iso_code="it",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||||
'WINDOWS-1252'],
|
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||||
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
wiki_start_pages=["Pagina_principale"],
|
||||||
wiki_start_pages=[u'Pagina_principale']),
|
),
|
||||||
'Lithuanian': Language(name='Lithuanian',
|
"Lithuanian": Language(
|
||||||
iso_code='lt',
|
name="Lithuanian",
|
||||||
|
iso_code="lt",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||||
'ISO-8859-4'],
|
|
||||||
# Q, W, and X not used at all
|
# Q, W, and X not used at all
|
||||||
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
|
||||||
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
wiki_start_pages=["Pagrindinis_puslapis"],
|
||||||
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
),
|
||||||
'Latvian': Language(name='Latvian',
|
"Latvian": Language(
|
||||||
iso_code='lv',
|
name="Latvian",
|
||||||
|
iso_code="lv",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||||
'ISO-8859-4'],
|
|
||||||
# Q, W, X, Y are only for loanwords
|
# Q, W, X, Y are only for loanwords
|
||||||
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
|
||||||
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
wiki_start_pages=["Sākumlapa"],
|
||||||
wiki_start_pages=[u'Sākumlapa']),
|
),
|
||||||
'Macedonian': Language(name='Macedonian',
|
"Macedonian": Language(
|
||||||
iso_code='mk',
|
name="Macedonian",
|
||||||
|
iso_code="mk",
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||||
'MacCyrillic', 'IBM855'],
|
alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
|
||||||
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
wiki_start_pages=["Главна_страница"],
|
||||||
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
),
|
||||||
wiki_start_pages=[u'Главна_страница']),
|
"Dutch": Language(
|
||||||
'Dutch': Language(name='Dutch',
|
name="Dutch",
|
||||||
iso_code='nl',
|
iso_code="nl",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||||
wiki_start_pages=[u'Hoofdpagina']),
|
wiki_start_pages=["Hoofdpagina"],
|
||||||
'Polish': Language(name='Polish',
|
),
|
||||||
iso_code='pl',
|
"Polish": Language(
|
||||||
|
name="Polish",
|
||||||
|
iso_code="pl",
|
||||||
# Q and X are only used for foreign words.
|
# Q and X are only used for foreign words.
|
||||||
use_ascii=False,
|
use_ascii=False,
|
||||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||||
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
|
||||||
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
wiki_start_pages=["Wikipedia:Strona_główna"],
|
||||||
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
),
|
||||||
'Portuguese': Language(name='Portuguese',
|
"Portuguese": Language(
|
||||||
iso_code='pt',
|
name="Portuguese",
|
libs/chardet/metadata/languages.py

+        iso_code="pt",
         use_ascii=True,
-        charsets=['ISO-8859-1', 'ISO-8859-15',
-                  'WINDOWS-1252'],
-        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
-        wiki_start_pages=[u'Wikipédia:Página_principal']),
+        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
+        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
+        wiki_start_pages=["Wikipédia:Página_principal"],
+    ),
-    'Romanian': Language(name='Romanian',
-                         iso_code='ro',
+    "Romanian": Language(
+        name="Romanian",
+        iso_code="ro",
         use_ascii=True,
-        charsets=['ISO-8859-2', 'WINDOWS-1250'],
-        alphabet=u'ăâîșțĂÂÎȘȚ',
-        wiki_start_pages=[u'Pagina_principală']),
-    'Russian': Language(name='Russian',
-                        iso_code='ru',
+        charsets=["ISO-8859-2", "WINDOWS-1250"],
+        alphabet="ăâîșțĂÂÎȘȚ",
+        wiki_start_pages=["Pagina_principală"],
+    ),
+    "Russian": Language(
+        name="Russian",
+        iso_code="ru",
         use_ascii=False,
-        charsets=['ISO-8859-5', 'WINDOWS-1251',
-                  'KOI8-R', 'MacCyrillic', 'IBM866',
-                  'IBM855'],
-        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
-                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
-        wiki_start_pages=[u'Заглавная_страница']),
-    'Slovak': Language(name='Slovak',
-                       iso_code='sk',
+        charsets=[
+            "ISO-8859-5",
+            "WINDOWS-1251",
+            "KOI8-R",
+            "MacCyrillic",
+            "IBM866",
+            "IBM855",
+        ],
+        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
+        wiki_start_pages=["Заглавная_страница"],
+    ),
+    "Slovak": Language(
+        name="Slovak",
+        iso_code="sk",
         use_ascii=True,
-        charsets=['ISO-8859-2', 'WINDOWS-1250'],
-        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
-        wiki_start_pages=[u'Hlavná_stránka']),
-    'Slovene': Language(name='Slovene',
-                        iso_code='sl',
+        charsets=["ISO-8859-2", "WINDOWS-1250"],
+        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
+        wiki_start_pages=["Hlavná_stránka"],
+    ),
+    "Slovene": Language(
+        name="Slovene",
+        iso_code="sl",
         # Q, W, X, Y are only used for foreign words.
         use_ascii=False,
-        charsets=['ISO-8859-2', 'WINDOWS-1250'],
-        alphabet=(u'abcčdefghijklmnoprsštuvzž'
-                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
-        wiki_start_pages=[u'Glavna_stran']),
+        charsets=["ISO-8859-2", "WINDOWS-1250"],
+        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
+        wiki_start_pages=["Glavna_stran"],
+    ),
     # Serbian can be written in both Latin and Cyrillic, but there's no
     # simple way to get the Latin alphabet pages from Wikipedia through
     # the API, so for now we just support Cyrillic.
-    'Serbian': Language(name='Serbian',
-                        iso_code='sr',
-                        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
-                                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
-                        charsets=['ISO-8859-5', 'WINDOWS-1251',
-                                  'MacCyrillic', 'IBM855'],
-                        wiki_start_pages=[u'Главна_страна']),
-    'Thai': Language(name='Thai',
-                     iso_code='th',
+    "Serbian": Language(
+        name="Serbian",
+        iso_code="sr",
+        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
+        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
+        wiki_start_pages=["Главна_страна"],
+    ),
+    "Thai": Language(
+        name="Thai",
+        iso_code="th",
         use_ascii=False,
-        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
-        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
-        wiki_start_pages=[u'หน้าหลัก']),
-    'Turkish': Language(name='Turkish',
-                        iso_code='tr',
+        charsets=["ISO-8859-11", "TIS-620", "CP874"],
+        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
+        wiki_start_pages=["หน้าหลัก"],
+    ),
+    "Turkish": Language(
+        name="Turkish",
+        iso_code="tr",
         # Q, W, and X are not used by Turkish
         use_ascii=False,
-        charsets=['ISO-8859-3', 'ISO-8859-9',
-                  'WINDOWS-1254'],
-        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
-                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
-        wiki_start_pages=[u'Ana_Sayfa']),
-    'Vietnamese': Language(name='Vietnamese',
-                           iso_code='vi',
+        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
+        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
+        wiki_start_pages=["Ana_Sayfa"],
+    ),
+    "Vietnamese": Language(
+        name="Vietnamese",
+        iso_code="vi",
         use_ascii=False,
         # Windows-1258 is the only common 8-bit
         # Vietnamese encoding supported by Python.
@@ -303,8 +344,8 @@ LANGUAGES = {'Arabic': Language(name='Arabic',
         # scheme has declined dramatically following
         # the adoption of Unicode on the World Wide
         # Web.
-        charsets=['WINDOWS-1258'],
-        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
-                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
-        wiki_start_pages=[u'Chữ_Quốc_ngữ']),
+        charsets=["WINDOWS-1258"],
+        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
+        wiki_start_pages=["Chữ_Quốc_ngữ"],
+    ),
 }
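The block above is pure metadata: each `Language` entry in `LANGUAGES` records the ISO code, the legacy charsets, and the alphabet used to train that language's single-byte models, and the commit only reformats it from single-quoted, `u''`-prefixed literals to black-style double quotes. A minimal exploratory sketch of reading one entry (`chardet.metadata` is an internal package, so this layout is not a stable API):

# Exploratory sketch: inspect one entry of the LANGUAGES table above.
from chardet.metadata.languages import LANGUAGES

turkish = LANGUAGES["Turkish"]
print(turkish.iso_code)   # tr
print(turkish.charsets)   # ['ISO-8859-3', 'ISO-8859-9', 'WINDOWS-1254']
print(turkish.use_ascii)  # False (Q, W, and X are not used by Turkish)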
libs/chardet/sbcharsetprober.py

@@ -31,15 +31,18 @@ from collections import namedtuple
 from .charsetprober import CharSetProber
 from .enums import CharacterCategory, ProbingState, SequenceLikelihood
 
-SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
-                                    ['charset_name',
-                                     'language',
-                                     'char_to_order_map',
-                                     'language_model',
-                                     'typical_positive_ratio',
-                                     'keep_ascii_letters',
-                                     'alphabet'])
+SingleByteCharSetModel = namedtuple(
+    "SingleByteCharSetModel",
+    [
+        "charset_name",
+        "language",
+        "char_to_order_map",
+        "language_model",
+        "typical_positive_ratio",
+        "keep_ascii_letters",
+        "alphabet",
+    ],
+)
 
 
 class SingleByteCharSetProber(CharSetProber):
@@ -48,27 +51,29 @@ class SingleByteCharSetProber(CharSetProber):
     POSITIVE_SHORTCUT_THRESHOLD = 0.95
     NEGATIVE_SHORTCUT_THRESHOLD = 0.05
 
-    def __init__(self, model, reversed=False, name_prober=None):
-        super(SingleByteCharSetProber, self).__init__()
+    def __init__(self, model, is_reversed=False, name_prober=None):
+        super().__init__()
         self._model = model
         # TRUE if we need to reverse every pair in the model lookup
-        self._reversed = reversed
+        self._reversed = is_reversed
         # Optional auxiliary prober for name decision
         self._name_prober = name_prober
         self._last_order = None
         self._seq_counters = None
         self._total_seqs = None
         self._total_char = None
+        self._control_char = None
         self._freq_char = None
         self.reset()
 
     def reset(self):
-        super(SingleByteCharSetProber, self).reset()
+        super().reset()
         # char order of last character
         self._last_order = 255
         self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
         self._total_seqs = 0
         self._total_char = 0
+        self._control_char = 0
         # characters that fall in our sampling range
         self._freq_char = 0
 
@@ -76,20 +81,20 @@ class SingleByteCharSetProber(CharSetProber):
     def charset_name(self):
         if self._name_prober:
             return self._name_prober.charset_name
-        else:
-            return self._model.charset_name
+        return self._model.charset_name
 
     @property
     def language(self):
         if self._name_prober:
             return self._name_prober.language
-        else:
-            return self._model.language
+        return self._model.language
 
     def feed(self, byte_str):
         # TODO: Make filter_international_words keep things in self.alphabet
         if not self._model.keep_ascii_letters:
             byte_str = self.filter_international_words(byte_str)
+        else:
+            byte_str = self.remove_xml_tags(byte_str)
         if not byte_str:
             return self.state
         char_to_order_map = self._model.char_to_order_map
@@ -103,9 +108,6 @@ class SingleByteCharSetProber(CharSetProber):
             # _total_char purposes.
             if order < CharacterCategory.CONTROL:
                 self._total_char += 1
-            # TODO: Follow uchardet's lead and discount confidence for frequent
-            # control characters.
-            # See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
             if order < self.SAMPLE_SIZE:
                 self._freq_char += 1
                 if self._last_order < self.SAMPLE_SIZE:
@@ -122,14 +124,17 @@ class SingleByteCharSetProber(CharSetProber):
         if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
             confidence = self.get_confidence()
             if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
-                self.logger.debug('%s confidence = %s, we have a winner',
-                                  charset_name, confidence)
+                self.logger.debug(
+                    "%s confidence = %s, we have a winner", charset_name, confidence
+                )
                 self._state = ProbingState.FOUND_IT
             elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
-                self.logger.debug('%s confidence = %s, below negative '
-                                  'shortcut threshhold %s', charset_name,
-                                  confidence,
-                                  self.NEGATIVE_SHORTCUT_THRESHOLD)
+                self.logger.debug(
+                    "%s confidence = %s, below negative shortcut threshold %s",
+                    charset_name,
+                    confidence,
+                    self.NEGATIVE_SHORTCUT_THRESHOLD,
+                )
                 self._state = ProbingState.NOT_ME
 
         return self.state
@@ -137,8 +142,18 @@ class SingleByteCharSetProber(CharSetProber):
     def get_confidence(self):
         r = 0.01
        if self._total_seqs > 0:
-            r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
-                 self._total_seqs / self._model.typical_positive_ratio)
+            r = (
+                (
+                    self._seq_counters[SequenceLikelihood.POSITIVE]
+                    + 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
+                )
+                / self._total_seqs
+                / self._model.typical_positive_ratio
+            )
+            # The more control characters (proportionnaly to the size
+            # of the text), the less confident we become in the current
+            # charset.
+            r = r * (self._total_char - self._control_char) / self._total_char
             r = r * self._freq_char / self._total_char
         if r >= 1.0:
             r = 0.99
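The `get_confidence` rewrite above changes the math, not just the formatting: LIKELY sequences now count at a 0.25 weight next to POSITIVE ones, and the uchardet-style control-character discount replaces the old TODO. A standalone sketch of the new formula with made-up counter values (the real numbers come from the prober's internal counters):

# Standalone sketch of the new single-byte confidence formula; all counts
# here are hypothetical stand-ins for SingleByteCharSetProber's counters.
def sb_confidence(positive, likely, total_seqs, typical_positive_ratio,
                  total_char, control_char, freq_char):
    r = 0.01
    if total_seqs > 0:
        r = (positive + 0.25 * likely) / total_seqs / typical_positive_ratio
        # discount for control characters, proportionally to the text size
        r = r * (total_char - control_char) / total_char
        r = r * freq_char / total_char
    return min(r, 0.99)

# e.g. 700 positive and 100 likely sequences out of 1000, no control chars:
print(sb_confidence(700, 100, 1000, 0.95, 1200, 0, 1100))  # ~0.70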
libs/chardet/sbcsgroupprober.py

@@ -28,16 +28,20 @@
 
 from .charsetgroupprober import CharSetGroupProber
 from .hebrewprober import HebrewProber
-from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
-                                 WINDOWS_1251_BULGARIAN_MODEL)
+from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
 from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
 from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
 
 # from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
 #                                  WINDOWS_1250_HUNGARIAN_MODEL)
-from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
-                               ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
-                               MACCYRILLIC_RUSSIAN_MODEL,
-                               WINDOWS_1251_RUSSIAN_MODEL)
+from .langrussianmodel import (
+    IBM855_RUSSIAN_MODEL,
+    IBM866_RUSSIAN_MODEL,
+    ISO_8859_5_RUSSIAN_MODEL,
+    KOI8_R_RUSSIAN_MODEL,
+    MACCYRILLIC_RUSSIAN_MODEL,
+    WINDOWS_1251_RUSSIAN_MODEL,
+)
 from .langthaimodel import TIS_620_THAI_MODEL
 from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
 from .sbcharsetprober import SingleByteCharSetProber
@@ -45,16 +49,17 @@ from .sbcharsetprober import SingleByteCharSetProber
 
 class SBCSGroupProber(CharSetGroupProber):
     def __init__(self):
-        super(SBCSGroupProber, self).__init__()
+        super().__init__()
         hebrew_prober = HebrewProber()
-        logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
-                                                        False, hebrew_prober)
+        logical_hebrew_prober = SingleByteCharSetProber(
+            WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
+        )
         # TODO: See if using ISO-8859-8 Hebrew model works better here, since
         # it's actually the visual one
-        visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
-                                                       True, hebrew_prober)
-        hebrew_prober.set_model_probers(logical_hebrew_prober,
-                                        visual_hebrew_prober)
+        visual_hebrew_prober = SingleByteCharSetProber(
+            WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
+        )
+        hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
         # TODO: ORDER MATTERS HERE. I changed the order vs what was in master
         # and several tests failed that did not before. Some thought
         # should be put into the ordering, and we should consider making
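The constructor change ripples into these call sites: the parameter formerly named `reversed` (which shadowed the builtin) is now `is_reversed`, and `SBCSGroupProber` passes both it and `name_prober` by keyword. A sketch of the new construction pattern for the paired Hebrew probers:

# Sketch of the new keyword-argument style used by SBCSGroupProber above.
from chardet.hebrewprober import HebrewProber
from chardet.langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
from chardet.sbcharsetprober import SingleByteCharSetProber

hebrew_prober = HebrewProber()
# logical order: pairs looked up as-is; visual order: every pair reversed
logical = SingleByteCharSetProber(
    WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
)
visual = SingleByteCharSetProber(
    WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
)
hebrew_prober.set_model_probers(logical, visual)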
libs/chardet/sjisprober.py

@@ -25,24 +25,24 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from .mbcharsetprober import MultiByteCharSetProber
-from .codingstatemachine import CodingStateMachine
 from .chardistribution import SJISDistributionAnalysis
+from .codingstatemachine import CodingStateMachine
+from .enums import MachineState, ProbingState
 from .jpcntx import SJISContextAnalysis
+from .mbcharsetprober import MultiByteCharSetProber
 from .mbcssm import SJIS_SM_MODEL
-from .enums import ProbingState, MachineState
 
 
 class SJISProber(MultiByteCharSetProber):
     def __init__(self):
-        super(SJISProber, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
         self.distribution_analyzer = SJISDistributionAnalysis()
         self.context_analyzer = SJISContextAnalysis()
         self.reset()
 
     def reset(self):
-        super(SJISProber, self).reset()
+        super().reset()
         self.context_analyzer.reset()
 
     @property
@@ -54,34 +54,40 @@ class SJISProber(MultiByteCharSetProber):
         return "Japanese"
 
     def feed(self, byte_str):
-        for i in range(len(byte_str)):
-            coding_state = self.coding_sm.next_state(byte_str[i])
+        for i, byte in enumerate(byte_str):
+            coding_state = self.coding_sm.next_state(byte)
             if coding_state == MachineState.ERROR:
-                self.logger.debug('%s %s prober hit error at byte %s',
-                                  self.charset_name, self.language, i)
+                self.logger.debug(
+                    "%s %s prober hit error at byte %s",
+                    self.charset_name,
+                    self.language,
+                    i,
+                )
                 self._state = ProbingState.NOT_ME
                 break
-            elif coding_state == MachineState.ITS_ME:
+            if coding_state == MachineState.ITS_ME:
                 self._state = ProbingState.FOUND_IT
                 break
-            elif coding_state == MachineState.START:
+            if coding_state == MachineState.START:
                 char_len = self.coding_sm.get_current_charlen()
                 if i == 0:
-                    self._last_char[1] = byte_str[0]
-                    self.context_analyzer.feed(self._last_char[2 - char_len:],
-                                               char_len)
+                    self._last_char[1] = byte
+                    self.context_analyzer.feed(
+                        self._last_char[2 - char_len :], char_len
+                    )
                     self.distribution_analyzer.feed(self._last_char, char_len)
                 else:
-                    self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
-                                                        - char_len], char_len)
-                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
-                                                    char_len)
+                    self.context_analyzer.feed(
+                        byte_str[i + 1 - char_len : i + 3 - char_len], char_len
+                    )
+                    self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
 
         self._last_char[0] = byte_str[-1]
 
         if self.state == ProbingState.DETECTING:
-            if (self.context_analyzer.got_enough_data() and
-                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
+            if self.context_analyzer.got_enough_data() and (
+                self.get_confidence() > self.SHORTCUT_THRESHOLD
+            ):
                 self._state = ProbingState.FOUND_IT
 
         return self.state
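The `feed` loop above is the usual Python 3 cleanup: iterating over `bytes` already yields integers, so `enumerate` provides both the index (still needed for the slicing) and the byte value without a `byte_str[i]` lookup. A two-line illustration, not chardet code:

# On Python 3, iterating bytes yields ints; enumerate keeps the index too.
for i, byte in enumerate(b"\x82\xa0"):  # Shift_JIS encoding of "あ"
    print(i, byte)  # prints: 0 130, then 1 160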
libs/chardet/universaldetector.py

@@ -46,9 +46,10 @@ from .escprober import EscCharSetProber
 from .latin1prober import Latin1Prober
 from .mbcsgroupprober import MBCSGroupProber
 from .sbcsgroupprober import SBCSGroupProber
+from .utf1632prober import UTF1632Prober
 
 
-class UniversalDetector(object):
+class UniversalDetector:
     """
     The ``UniversalDetector`` class underlies the ``chardet.detect`` function
     and coordinates all of the different charset probers.
@@ -66,20 +67,23 @@ class UniversalDetector(object):
     """
 
     MINIMUM_THRESHOLD = 0.20
-    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
-    ESC_DETECTOR = re.compile(b'(\033|~{)')
-    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
-    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
-                   'iso-8859-2': 'Windows-1250',
-                   'iso-8859-5': 'Windows-1251',
-                   'iso-8859-6': 'Windows-1256',
-                   'iso-8859-7': 'Windows-1253',
-                   'iso-8859-8': 'Windows-1255',
-                   'iso-8859-9': 'Windows-1254',
-                   'iso-8859-13': 'Windows-1257'}
+    HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
+    ESC_DETECTOR = re.compile(b"(\033|~{)")
+    WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
+    ISO_WIN_MAP = {
+        "iso-8859-1": "Windows-1252",
+        "iso-8859-2": "Windows-1250",
+        "iso-8859-5": "Windows-1251",
+        "iso-8859-6": "Windows-1256",
+        "iso-8859-7": "Windows-1253",
+        "iso-8859-8": "Windows-1255",
+        "iso-8859-9": "Windows-1254",
+        "iso-8859-13": "Windows-1257",
+    }
 
     def __init__(self, lang_filter=LanguageFilter.ALL):
         self._esc_charset_prober = None
+        self._utf1632_prober = None
         self._charset_probers = []
         self.result = None
         self.done = None
@@ -91,20 +95,34 @@ class UniversalDetector(object):
         self._has_win_bytes = None
         self.reset()
 
+    @property
+    def input_state(self):
+        return self._input_state
+
+    @property
+    def has_win_bytes(self):
+        return self._has_win_bytes
+
+    @property
+    def charset_probers(self):
+        return self._charset_probers
+
     def reset(self):
         """
         Reset the UniversalDetector and all of its probers back to their
         initial states.  This is called by ``__init__``, so you only need to
         call this directly in between analyses of different documents.
         """
-        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
+        self.result = {"encoding": None, "confidence": 0.0, "language": None}
         self.done = False
         self._got_data = False
         self._has_win_bytes = False
         self._input_state = InputState.PURE_ASCII
-        self._last_char = b''
+        self._last_char = b""
         if self._esc_charset_prober:
             self._esc_charset_prober.reset()
+        if self._utf1632_prober:
+            self._utf1632_prober.reset()
         for prober in self._charset_probers:
             prober.reset()
@@ -125,7 +143,7 @@ class UniversalDetector(object):
         if self.done:
             return
 
-        if not len(byte_str):
+        if not byte_str:
             return
 
         if not isinstance(byte_str, bytearray):
@@ -136,35 +154,36 @@ class UniversalDetector(object):
             # If the data starts with BOM, we know it is UTF
             if byte_str.startswith(codecs.BOM_UTF8):
                 # EF BB BF  UTF-8 with BOM
-                self.result = {'encoding': "UTF-8-SIG",
-                               'confidence': 1.0,
-                               'language': ''}
-            elif byte_str.startswith((codecs.BOM_UTF32_LE,
-                                      codecs.BOM_UTF32_BE)):
+                self.result = {
+                    "encoding": "UTF-8-SIG",
+                    "confidence": 1.0,
+                    "language": "",
+                }
+            elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
                 # FF FE 00 00  UTF-32, little-endian BOM
                 # 00 00 FE FF  UTF-32, big-endian BOM
-                self.result = {'encoding': "UTF-32",
-                               'confidence': 1.0,
-                               'language': ''}
-            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
+                self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
+            elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
                 # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
-                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
-                               'confidence': 1.0,
-                               'language': ''}
-            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
+                self.result = {
+                    "encoding": "X-ISO-10646-UCS-4-3412",
+                    "confidence": 1.0,
+                    "language": "",
+                }
+            elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
                 # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
-                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
-                               'confidence': 1.0,
-                               'language': ''}
+                self.result = {
+                    "encoding": "X-ISO-10646-UCS-4-2143",
+                    "confidence": 1.0,
+                    "language": "",
+                }
             elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                 # FF FE  UTF-16, little endian BOM
                 # FE FF  UTF-16, big endian BOM
-                self.result = {'encoding': "UTF-16",
-                               'confidence': 1.0,
-                               'language': ''}
+                self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
 
             self._got_data = True
-            if self.result['encoding'] is not None:
+            if self.result["encoding"] is not None:
                 self.done = True
                 return
@@ -173,12 +192,29 @@ class UniversalDetector(object):
         if self._input_state == InputState.PURE_ASCII:
             if self.HIGH_BYTE_DETECTOR.search(byte_str):
                 self._input_state = InputState.HIGH_BYTE
-            elif self._input_state == InputState.PURE_ASCII and \
-                    self.ESC_DETECTOR.search(self._last_char + byte_str):
+            elif (
+                self._input_state == InputState.PURE_ASCII
+                and self.ESC_DETECTOR.search(self._last_char + byte_str)
+            ):
                 self._input_state = InputState.ESC_ASCII
 
         self._last_char = byte_str[-1:]
 
+        # next we will look to see if it is appears to be either a UTF-16 or
+        # UTF-32 encoding
+        if not self._utf1632_prober:
+            self._utf1632_prober = UTF1632Prober()
+
+        if self._utf1632_prober.state == ProbingState.DETECTING:
+            if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
+                self.result = {
+                    "encoding": self._utf1632_prober.charset_name,
+                    "confidence": self._utf1632_prober.get_confidence(),
+                    "language": "",
+                }
+                self.done = True
+                return
+
         # If we've seen escape sequences, use the EscCharSetProber, which
         # uses a simple state machine to check for known escape sequences in
         # HZ and ISO-2022 encodings, since those are the only encodings that
@@ -187,12 +223,11 @@ class UniversalDetector(object):
             if not self._esc_charset_prober:
                 self._esc_charset_prober = EscCharSetProber(self.lang_filter)
             if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
-                self.result = {'encoding':
-                               self._esc_charset_prober.charset_name,
-                               'confidence':
-                               self._esc_charset_prober.get_confidence(),
-                               'language':
-                               self._esc_charset_prober.language}
+                self.result = {
+                    "encoding": self._esc_charset_prober.charset_name,
+                    "confidence": self._esc_charset_prober.get_confidence(),
+                    "language": self._esc_charset_prober.language,
+                }
                 self.done = True
         # If we've seen high bytes (i.e., those with values greater than 127),
         # we need to do more complicated checks using all our multi-byte and
@@ -209,9 +244,11 @@ class UniversalDetector(object):
                 self._charset_probers.append(Latin1Prober())
             for prober in self._charset_probers:
                 if prober.feed(byte_str) == ProbingState.FOUND_IT:
-                    self.result = {'encoding': prober.charset_name,
-                                   'confidence': prober.get_confidence(),
-                                   'language': prober.language}
+                    self.result = {
+                        "encoding": prober.charset_name,
+                        "confidence": prober.get_confidence(),
+                        "language": prober.language,
+                    }
                    self.done = True
                    break
             if self.WIN_BYTE_DETECTOR.search(byte_str):
@@ -231,13 +268,11 @@ class UniversalDetector(object):
         self.done = True
 
         if not self._got_data:
-            self.logger.debug('no data received!')
+            self.logger.debug("no data received!")
 
         # Default to ASCII if it is all we've seen so far
         elif self._input_state == InputState.PURE_ASCII:
-            self.result = {'encoding': 'ascii',
-                           'confidence': 1.0,
-                           'language': ''}
+            self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
 
         # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
         elif self._input_state == InputState.HIGH_BYTE:
@@ -257,30 +292,37 @@ class UniversalDetector(object):
                 confidence = max_prober.get_confidence()
                 # Use Windows encoding name instead of ISO-8859 if we saw any
                 # extra Windows-specific bytes
-                if lower_charset_name.startswith('iso-8859'):
+                if lower_charset_name.startswith("iso-8859"):
                     if self._has_win_bytes:
-                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
-                                                            charset_name)
-                self.result = {'encoding': charset_name,
-                               'confidence': confidence,
-                               'language': max_prober.language}
+                        charset_name = self.ISO_WIN_MAP.get(
+                            lower_charset_name, charset_name
+                        )
+                self.result = {
+                    "encoding": charset_name,
+                    "confidence": confidence,
+                    "language": max_prober.language,
+                }
 
         # Log all prober confidences if none met MINIMUM_THRESHOLD
         if self.logger.getEffectiveLevel() <= logging.DEBUG:
-            if self.result['encoding'] is None:
-                self.logger.debug('no probers hit minimum threshold')
+            if self.result["encoding"] is None:
+                self.logger.debug("no probers hit minimum threshold")
                 for group_prober in self._charset_probers:
                     if not group_prober:
                         continue
                     if isinstance(group_prober, CharSetGroupProber):
                         for prober in group_prober.probers:
-                            self.logger.debug('%s %s confidence = %s',
-                                              prober.charset_name,
-                                              prober.language,
-                                              prober.get_confidence())
+                            self.logger.debug(
+                                "%s %s confidence = %s",
+                                prober.charset_name,
+                                prober.language,
+                                prober.get_confidence(),
+                            )
                     else:
-                        self.logger.debug('%s %s confidence = %s',
-                                          group_prober.charset_name,
-                                          group_prober.language,
-                                          group_prober.get_confidence())
+                        self.logger.debug(
+                            "%s %s confidence = %s",
+                            group_prober.charset_name,
+                            group_prober.language,
+                            group_prober.get_confidence(),
+                        )
         return self.result
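The practical effect of wiring `UTF1632Prober` into `feed` is that BOM-less UTF-16/UTF-32 data can now be recognized from its zero-byte pattern before the escape and high-byte probers run. A minimal sketch against the public detector API (the confidence shown is this heuristic's fixed value, and `utf-16le` is the expected outcome here, not a guarantee):

# Sketch: BOM-less UTF-16LE should now be caught by the UTF-16/32 prober.
from chardet.universaldetector import UniversalDetector

data = "hello, world - but two bytes wide".encode("utf-16-le")  # no BOM
detector = UniversalDetector()
detector.feed(data)
detector.close()
print(detector.result)  # e.g. {'encoding': 'utf-16le', 'confidence': 0.85, 'language': ''}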
libs/chardet/utf1632prober.py (new file, 223 lines)

@@ -0,0 +1,223 @@
+######################## BEGIN LICENSE BLOCK ########################
+#
+# Contributor(s):
+#       Jason Zavaglia
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+from .charsetprober import CharSetProber
+from .enums import ProbingState
+
+
+class UTF1632Prober(CharSetProber):
+    """
+    This class simply looks for occurrences of zero bytes, and infers
+    whether the file is UTF16 or UTF32 (low-endian or big-endian)
+    For instance, files looking like ( \0 \0 \0 [nonzero] )+
+    have a good probability to be UTF32BE.  Files looking like ( \0 [nonzero] )+
+    may be guessed to be UTF16BE, and inversely for little-endian varieties.
+    """
+
+    # how many logical characters to scan before feeling confident of prediction
+    MIN_CHARS_FOR_DETECTION = 20
+    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
+    EXPECTED_RATIO = 0.94
+
+    def __init__(self):
+        super().__init__()
+        self.position = 0
+        self.zeros_at_mod = [0] * 4
+        self.nonzeros_at_mod = [0] * 4
+        self._state = ProbingState.DETECTING
+        self.quad = [0, 0, 0, 0]
+        self.invalid_utf16be = False
+        self.invalid_utf16le = False
+        self.invalid_utf32be = False
+        self.invalid_utf32le = False
+        self.first_half_surrogate_pair_detected_16be = False
+        self.first_half_surrogate_pair_detected_16le = False
+        self.reset()
+
+    def reset(self):
+        super().reset()
+        self.position = 0
+        self.zeros_at_mod = [0] * 4
+        self.nonzeros_at_mod = [0] * 4
+        self._state = ProbingState.DETECTING
+        self.invalid_utf16be = False
+        self.invalid_utf16le = False
+        self.invalid_utf32be = False
+        self.invalid_utf32le = False
+        self.first_half_surrogate_pair_detected_16be = False
+        self.first_half_surrogate_pair_detected_16le = False
+        self.quad = [0, 0, 0, 0]
+
+    @property
+    def charset_name(self):
+        if self.is_likely_utf32be():
+            return "utf-32be"
+        if self.is_likely_utf32le():
+            return "utf-32le"
+        if self.is_likely_utf16be():
+            return "utf-16be"
+        if self.is_likely_utf16le():
+            return "utf-16le"
+        # default to something valid
+        return "utf-16"
+
+    @property
+    def language(self):
+        return ""
+
+    def approx_32bit_chars(self):
+        return max(1.0, self.position / 4.0)
+
+    def approx_16bit_chars(self):
+        return max(1.0, self.position / 2.0)
+
+    def is_likely_utf32be(self):
+        approx_chars = self.approx_32bit_chars()
+        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
+            self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
+            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
+            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
+            and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
+            and not self.invalid_utf32be
+        )
+
+    def is_likely_utf32le(self):
+        approx_chars = self.approx_32bit_chars()
+        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
+            self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
+            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
+            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
+            and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
+            and not self.invalid_utf32le
+        )
+
+    def is_likely_utf16be(self):
+        approx_chars = self.approx_16bit_chars()
+        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
+            (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
+            > self.EXPECTED_RATIO
+            and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
+            > self.EXPECTED_RATIO
+            and not self.invalid_utf16be
+        )
+
+    def is_likely_utf16le(self):
+        approx_chars = self.approx_16bit_chars()
+        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
+            (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
+            > self.EXPECTED_RATIO
+            and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
+            > self.EXPECTED_RATIO
+            and not self.invalid_utf16le
+        )
+
+    def validate_utf32_characters(self, quad):
+        """
+        Validate if the quad of bytes is valid UTF-32.
+
+        UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
+        excluding 0x0000D800 - 0x0000DFFF
+
+        https://en.wikipedia.org/wiki/UTF-32
+        """
+        if (
+            quad[0] != 0
+            or quad[1] > 0x10
+            or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
+        ):
+            self.invalid_utf32be = True
+        if (
+            quad[3] != 0
+            or quad[2] > 0x10
+            or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
+        ):
+            self.invalid_utf32le = True
+
+    def validate_utf16_characters(self, pair):
+        """
+        Validate if the pair of bytes is valid UTF-16.
+
+        UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
+        with an exception for surrogate pairs, which must be in the range
+        0xD800-0xDBFF followed by 0xDC00-0xDFFF
+
+        https://en.wikipedia.org/wiki/UTF-16
+        """
+        if not self.first_half_surrogate_pair_detected_16be:
+            if 0xD8 <= pair[0] <= 0xDB:
+                self.first_half_surrogate_pair_detected_16be = True
+            elif 0xDC <= pair[0] <= 0xDF:
+                self.invalid_utf16be = True
+        else:
+            if 0xDC <= pair[0] <= 0xDF:
+                self.first_half_surrogate_pair_detected_16be = False
+            else:
+                self.invalid_utf16be = True
+
+        if not self.first_half_surrogate_pair_detected_16le:
+            if 0xD8 <= pair[1] <= 0xDB:
+                self.first_half_surrogate_pair_detected_16le = True
+            elif 0xDC <= pair[1] <= 0xDF:
+                self.invalid_utf16le = True
+        else:
+            if 0xDC <= pair[1] <= 0xDF:
+                self.first_half_surrogate_pair_detected_16le = False
+            else:
+                self.invalid_utf16le = True
+
+    def feed(self, byte_str):
+        for c in byte_str:
+            mod4 = self.position % 4
+            self.quad[mod4] = c
+            if mod4 == 3:
+                self.validate_utf32_characters(self.quad)
+                self.validate_utf16_characters(self.quad[0:2])
+                self.validate_utf16_characters(self.quad[2:4])
+            if c == 0:
+                self.zeros_at_mod[mod4] += 1
+            else:
+                self.nonzeros_at_mod[mod4] += 1
+            self.position += 1
+        return self.state
+
+    @property
+    def state(self):
+        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
+            # terminal, decided states
+            return self._state
+        if self.get_confidence() > 0.80:
+            self._state = ProbingState.FOUND_IT
+        elif self.position > 4 * 1024:
+            # if we get to 4kb into the file, and we can't conclude it's UTF,
+            # let's give up
+            self._state = ProbingState.NOT_ME
+        return self._state
+
+    def get_confidence(self):
+        return (
+            0.85
+            if (
+                self.is_likely_utf16le()
+                or self.is_likely_utf16be()
+                or self.is_likely_utf32le()
+                or self.is_likely_utf32be()
+            )
+            else 0.00
+        )
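The heuristic in this new file is easy to see end to end: ASCII text encoded as UTF-32BE produces the `( \0 \0 \0 [nonzero] )+` shape from the docstring, so `zeros_at_mod[0..2]` and `nonzeros_at_mod[3]` all saturate and the prober locks in once `MIN_CHARS_FOR_DETECTION` characters have been seen. A direct sketch:

# Direct sketch of the zero-byte heuristic on BOM-less UTF-32BE input.
from chardet.utf1632prober import UTF1632Prober

data = ("x" * 32).encode("utf-32-be")  # "utf-32-be" adds no BOM
prober = UTF1632Prober()
prober.feed(data)
print(prober.charset_name)      # utf-32be
print(prober.get_confidence())  # 0.85, the prober's fixed confidence value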
libs/chardet/utf8prober.py

@@ -26,23 +26,22 @@
 ######################### END LICENSE BLOCK #########################
 
 from .charsetprober import CharSetProber
-from .enums import ProbingState, MachineState
 from .codingstatemachine import CodingStateMachine
+from .enums import MachineState, ProbingState
 from .mbcssm import UTF8_SM_MODEL
 
 
-
 class UTF8Prober(CharSetProber):
     ONE_CHAR_PROB = 0.5
 
     def __init__(self):
-        super(UTF8Prober, self).__init__()
+        super().__init__()
         self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
         self._num_mb_chars = None
         self.reset()
 
     def reset(self):
-        super(UTF8Prober, self).reset()
+        super().reset()
         self.coding_sm.reset()
         self._num_mb_chars = 0
@@ -60,10 +59,10 @@ class UTF8Prober(CharSetProber):
             if coding_state == MachineState.ERROR:
                 self._state = ProbingState.NOT_ME
                 break
-            elif coding_state == MachineState.ITS_ME:
+            if coding_state == MachineState.ITS_ME:
                 self._state = ProbingState.FOUND_IT
                 break
-            elif coding_state == MachineState.START:
+            if coding_state == MachineState.START:
                 if self.coding_sm.get_current_charlen() >= 2:
                     self._num_mb_chars += 1
@@ -78,5 +77,4 @@ class UTF8Prober(CharSetProber):
         if self._num_mb_chars < 6:
             unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
             return 1.0 - unlike
-        else:
-            return unlike
+        return unlike
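The dropped `else:` does not change the result, and the numbers make the logic clear: `unlike` starts at 0.99, each of the first few multi-byte characters halves it via `ONE_CHAR_PROB`, and six or more multi-byte characters short-circuit to 0.99 confidence. A worked sketch of the same arithmetic (standalone, not the chardet class):

# Worked sketch of UTF8Prober.get_confidence; in chardet, unlike starts at 0.99.
ONE_CHAR_PROB = 0.5

def utf8_confidence(num_mb_chars):
    unlike = 0.99
    if num_mb_chars < 6:
        unlike *= ONE_CHAR_PROB ** num_mb_chars
        return 1.0 - unlike
    return unlike

print(utf8_confidence(0))  # ~0.01  -- no multi-byte chars seen yet
print(utf8_confidence(3))  # ~0.876 -- each multi-byte char halves "unlike"
print(utf8_confidence(6))  # 0.99   -- six or more: effectively certain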
libs/chardet/version.py

@@ -5,5 +5,5 @@ from within setup.py and from chardet subpackages.
 :author: Dan Blanchard (dan.blanchard@gmail.com)
 """
 
-__version__ = "4.0.0"
-VERSION = __version__.split('.')
+__version__ = "5.0.0"
+VERSION = __version__.split(".")
libs/click/__init__.py

@@ -41,7 +41,6 @@ from .termui import clear as clear
 from .termui import confirm as confirm
 from .termui import echo_via_pager as echo_via_pager
 from .termui import edit as edit
-from .termui import get_terminal_size as get_terminal_size
 from .termui import getchar as getchar
 from .termui import launch as launch
 from .termui import pause as pause
@@ -68,8 +67,7 @@ from .utils import echo as echo
 from .utils import format_filename as format_filename
 from .utils import get_app_dir as get_app_dir
 from .utils import get_binary_stream as get_binary_stream
-from .utils import get_os_args as get_os_args
 from .utils import get_text_stream as get_text_stream
 from .utils import open_file as open_file
 
-__version__ = "8.0.3"
+__version__ = "8.1.3"
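Note the two deleted re-exports: click 8.1 removes the long-deprecated `click.get_terminal_size` and `click.get_os_args`. Callers are expected to use the stdlib equivalents, e.g.:

# stdlib replacements for the re-exports removed in click 8.1:
import shutil
import sys

cols, rows = shutil.get_terminal_size()  # was click.get_terminal_size()
args = sys.argv[1:]                      # was click.get_os_args()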
libs/click/_compat.py

@@ -388,9 +388,9 @@ def open_stream(
 ) -> t.Tuple[t.IO, bool]:
     binary = "b" in mode
 
-    # Standard streams first. These are simple because they don't need
-    # special handling for the atomic flag. It's entirely ignored.
-    if filename == "-":
+    # Standard streams first. These are simple because they ignore the
+    # atomic flag. Use fsdecode to handle Path("-").
+    if os.fsdecode(filename) == "-":
         if any(m in mode for m in ["w", "a", "x"]):
             if binary:
                 return get_binary_stdout(), False
@@ -561,7 +561,6 @@ if sys.platform.startswith("win") and WIN:
 
         return rv
 
-
 else:
 
     def _get_argv_encoding() -> str:
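The `os.fsdecode` change is a small behavior fix: the stdin/stdout shortcut in `open_stream` now also triggers for `pathlib.Path("-")`, not only the literal string `"-"`. A sketch through the public wrapper, `click.open_file`, which funnels into `open_stream` (assuming a normal terminal; `open_file` keeps the standard streams open after the `with` block):

# Sketch: since click 8.1, Path("-") resolves to the standard streams,
# exactly like the literal string "-".
from pathlib import Path

import click

with click.open_file(Path("-"), "w") as f:  # stdout, not a file named "-"
    f.write("hello\n")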
libs/click/termui.py

@@ -675,7 +675,6 @@ if WIN:
         _translate_ch_to_exc(rv)
         return rv
 
-
 else:
     import tty
     import termios
Some files were not shown because too many files have changed in this diff.