Mirror of https://github.com/morpheus65535/bazarr.git (synced 2025-04-23 22:27:17 -04:00)

Commit 645952c61a: WIP (parent 4e7e3a39d2)
119 changed files with 2332 additions and 5250 deletions
bazarr.py (22 changed lines)

@@ -1,5 +1,7 @@
 # coding=utf-8
 
+from __future__ import absolute_import
+from __future__ import print_function
 import subprocess as sp
 import time
 import os

@@ -12,14 +14,16 @@ from bazarr.get_args import args
 def check_python_version():
     python_version = platform.python_version_tuple()
     minimum_python_version_tuple = (2, 7, 13)
+    minimum_python3_version_tuple = (3, 6, 0)
     minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
+    minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
 
     if int(python_version[0]) > minimum_python_version_tuple[0]:
-        print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
+        if int(python_version[0]) == minimum_python3_version_tuple[0] and int(python_version[1]) < minimum_python3_version_tuple[1]:
+            print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
+            os._exit(0)
-    elif int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]:
-        print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
+    elif int(python_version[0]) == minimum_python_version_tuple[0] and (int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]):
+        print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
         os._exit(0)

@@ -32,10 +36,10 @@ def start_bazarr():
     script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
 
     ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
-    print "Bazarr starting..."
+    print("Bazarr starting...")
     try:
         for line in iter(ep.stdout.readline, ''):
-            sys.stdout.write(line)
+            sys.stdout.buffer.write(line)
     except KeyboardInterrupt:
         pass

@@ -60,16 +64,16 @@ if __name__ == '__main__':
         try:
             os.remove(stopfile)
         except:
-            print 'Unable to delete stop file.'
+            print('Unable to delete stop file.')
         else:
-            print 'Bazarr exited.'
+            print('Bazarr exited.')
             os._exit(0)
 
     if os.path.exists(restartfile):
         try:
             os.remove(restartfile)
         except:
-            print 'Unable to delete restart file.'
+            print('Unable to delete restart file.')
         else:
            start_bazarr()
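The two __future__ imports added at the top are what make the rewritten print calls behave identically on Python 2.7 and Python 3; note also that sys.stdout.buffer, used in the third hunk, exists only on Python 3 (Python 2's sys.stdout accepts bytes directly). A minimal sketch of the print_function effect (illustrative, not code from the repo):

    from __future__ import print_function  # must precede all other statements

    # Without the import, Python 2 parses print("a", "b") as a print
    # statement printing the tuple ('a', 'b'); with it, print is the same
    # builtin function on both interpreters.
    print("Bazarr starting...")
    print("a", "b", sep=", ", end="\n")  # keyword arguments become available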
@@ -1,6 +1,7 @@
 # coding=utf-8
 
-import cPickle as pickle
+from __future__ import absolute_import
+import six.moves.cPickle as pickle
 import base64
 import random
 import platform

@@ -30,7 +31,7 @@ def track_event(category=None, action=None, label=None):
         visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
     except:
         visitor = Visitor()
-        unique_id = long(random.getrandbits(32))
+        unique_id = int(random.getrandbits(32))
         visitor.unique_id = unique_id
 
     session = Session()
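Two migration idioms appear in this file: six.moves aliases standard-library modules that were renamed in Python 3, and Python 2's long type folds into int (Python 3 integers are arbitrary-precision). A small sketch, assuming six is installed:

    import random
    import six.moves.cPickle as pickle  # cPickle on Python 2, pickle on Python 3

    unique_id = int(random.getrandbits(32))      # no long() needed on either version
    blob = pickle.dumps({"visitor": unique_id})
    assert pickle.loads(blob)["visitor"] == unique_id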
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import logging
 import json

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 
 from simpleconfigparser import simpleconfigparser

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import os
 import atexit
 

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import enzyme
 import logging
 import os

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import argparse
 

@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import requests
 import logging

@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import pycountry
 
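The hunks above all add the same line. Under Python 2's implicit relative imports, a bare import inside a package can accidentally pick up a sibling module; from __future__ import absolute_import forces the Python 3 rule, where bare imports always resolve against sys.path and package-local modules must be named explicitly. An illustration with a hypothetical package layout:

    from __future__ import absolute_import

    # Hypothetical package containing its own json.py:
    #   mypkg/__init__.py
    #   mypkg/json.py
    #   mypkg/worker.py   <- this file
    import json                        # now guaranteed to be the stdlib json
    from . import json as local_json   # the sibling must be requested explicitly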
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import requests
 import logging

@@ -13,6 +14,7 @@ from list_subtitles import store_subtitles_movie, list_missing_subtitles_movies,
 from get_subtitle import movies_download_subtitles
 from database import TableMovies, wal_cleaning
+import six
 
 
 def update_all_movies():

@@ -82,7 +84,7 @@ def update_movies():
         if movie["path"] != None and movie['movieFile']['relativePath'] != None:
             try:
-                overview = unicode(movie['overview'])
+                overview = six.text_type(movie['overview'])
             except:
                 overview = ""
             try:

@@ -136,27 +138,27 @@ def update_movies():
                 audioCodec = None
 
             # Add movies in radarr to current movies list
-            current_movies_radarr.append(unicode(movie['tmdbId']))
+            current_movies_radarr.append(six.text_type(movie['tmdbId']))
 
-            if unicode(movie['tmdbId']) in current_movies_db_list:
+            if six.text_type(movie['tmdbId']) in current_movies_db_list:
                 movies_to_update.append({'radarr_id': movie["id"],
-                                         'title': unicode(movie["title"]),
-                                         'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
-                                         'tmdb_id': unicode(movie["tmdbId"]),
-                                         'poster': unicode(poster),
-                                         'fanart': unicode(fanart),
-                                         'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
+                                         'title': six.text_type(movie["title"]),
+                                         'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
+                                         'tmdb_id': six.text_type(movie["tmdbId"]),
+                                         'poster': six.text_type(poster),
+                                         'fanart': six.text_type(fanart),
+                                         'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
                                          'scene_name': sceneName,
-                                         'monitored': unicode(bool(movie['monitored'])),
-                                         'year': unicode(movie['year']),
-                                         'sort_title': unicode(movie['sortTitle']),
-                                         'alternative_titles': unicode(alternativeTitles),
-                                         'format': unicode(format),
-                                         'resolution': unicode(resolution),
-                                         'video_codec': unicode(videoCodec),
-                                         'audio_codec': unicode(audioCodec),
-                                         'overview': unicode(overview),
-                                         'imdb_id': unicode(imdbId)})
+                                         'monitored': six.text_type(bool(movie['monitored'])),
+                                         'year': six.text_type(movie['year']),
+                                         'sort_title': six.text_type(movie['sortTitle']),
+                                         'alternative_titles': six.text_type(alternativeTitles),
+                                         'format': six.text_type(format),
+                                         'resolution': six.text_type(resolution),
+                                         'video_codec': six.text_type(videoCodec),
+                                         'audio_codec': six.text_type(audioCodec),
+                                         'overview': six.text_type(overview),
+                                         'imdb_id': six.text_type(imdbId)})
             else:
                 if movie_default_enabled is True:
                     movies_to_add.append({'radarr_id': movie["id"],

@@ -171,7 +173,7 @@ def update_movies():
                                           'fanart': fanart,
                                           'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                           'scene_name': sceneName,
-                                          'monitored': unicode(bool(movie['monitored'])),
+                                          'monitored': six.text_type(bool(movie['monitored'])),
                                           'sort_title': movie['sortTitle'],
                                           'year': movie['year'],
                                           'alternative_titles': alternativeTitles,

@@ -191,7 +193,7 @@ def update_movies():
                                           'fanart': fanart,
                                           'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
                                           'scene_name': sceneName,
-                                          'monitored': unicode(bool(movie['monitored'])),
+                                          'monitored': six.text_type(bool(movie['monitored'])),
                                           'sort_title': movie['sortTitle'],
                                           'year': movie['year'],
                                           'alternative_titles': alternativeTitles,
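six.text_type (unicode on Python 2, str on Python 3) replaces every unicode(...) coercion above, so values written to the database compare equal to text read back under either interpreter. A sketch with illustrative values:

    import six

    movie = {"tmdbId": 12345, "monitored": True}
    tmdb_id = six.text_type(movie["tmdbId"])             # u'12345' / '12345'
    monitored = six.text_type(bool(movie["monitored"]))  # u'True' / 'True'
    assert isinstance(tmdb_id, six.text_type)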
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import os
 import datetime
 import logging

@@ -159,8 +160,8 @@ def provider_throttle(name, exception):
 
 def throttled_count(name):
     global throttle_count
-    if name in throttle_count.keys():
-        if 'count' in throttle_count[name].keys():
+    if name in list(throttle_count.keys()):
+        if 'count' in list(throttle_count[name].keys()):
             for key, value in throttle_count[name].items():
                 if key == 'count':
                     value += 1
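Wrapping .keys() in list(...) is the conservative futurize-style rewrite: on Python 3, dict.keys() returns a live view rather than a list. For a plain membership test, as here, the wrapper is not strictly required; it becomes necessary when the dict is mutated while being iterated. Sketch:

    throttle_count = {"opensubtitles": {"count": 1}, "stale": {"count": 0}}

    assert "opensubtitles" in throttle_count.keys()  # a view supports membership tests

    for name in list(throttle_count.keys()):  # copy needed: we delete during the loop
        if throttle_count[name]["count"] == 0:
            del throttle_count[name]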
@@ -1,5 +1,7 @@
 # coding=utf-8
 
+from __future__ import absolute_import
+from __future__ import print_function
 import os
 import requests
 import logging

@@ -12,6 +14,7 @@ from config import settings, url_sonarr
 from list_subtitles import list_missing_subtitles
 from database import TableShows
 from utils import get_sonarr_version
+import six
 
 
 def update_series():

@@ -60,7 +63,7 @@ def update_series():
     for i, show in enumerate(r.json(), 1):
         notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
         try:
-            overview = unicode(show['overview'])
+            overview = six.text_type(show['overview'])
         except:
             overview = ""
         try:

@@ -82,17 +85,17 @@ def update_series():
         current_shows_sonarr.append(show['tvdbId'])
 
         if show['tvdbId'] in current_shows_db_list:
-            series_to_update.append({'title': unicode(show["title"]),
-                                     'path': unicode(show["path"]),
+            series_to_update.append({'title': six.text_type(show["title"]),
+                                     'path': six.text_type(show["path"]),
                                      'tvdb_id': int(show["tvdbId"]),
                                      'sonarr_series_id': int(show["id"]),
-                                     'overview': unicode(overview),
-                                     'poster': unicode(poster),
-                                     'fanart': unicode(fanart),
-                                     'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
-                                     'sort_title': unicode(show['sortTitle']),
-                                     'year': unicode(show['year']),
-                                     'alternate_titles': unicode(alternateTitles)})
+                                     'overview': six.text_type(overview),
+                                     'poster': six.text_type(poster),
+                                     'fanart': six.text_type(fanart),
+                                     'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
+                                     'sort_title': six.text_type(show['sortTitle']),
+                                     'year': six.text_type(show['year']),
+                                     'alternate_titles': six.text_type(alternateTitles)})
         else:
             if serie_default_enabled is True:
                 series_to_add.append({'title': show["title"],

@@ -161,9 +164,9 @@ def update_series():
     removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))
 
     for series in removed_series:
-        print TableShows.delete().where(
+        print(TableShows.delete().where(
             TableShows.tvdb_id == series
-        ).execute()
+        ).execute())
 
     logging.debug('BAZARR All series synced from Sonarr into database.')
@@ -1,12 +1,13 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import sys
 import ast
 import logging
 import subprocess
 import time
-import cPickle as pickle
+import six.moves.cPickle as pickle
 import codecs
 import types
 import re

@@ -37,6 +38,9 @@ from database import TableShows, TableEpisodes, TableMovies, TableHistory, Table
 from peewee import fn, JOIN
 
 from analytics import track_event
+import six
+from six.moves import range
+from functools import reduce
 
 
 def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):

@@ -91,11 +95,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
     """
     max_score = 120.0
     min_score = max_score * min_score_movie_perc / 100.0
-    scores = subliminal_scores.movie_scores.keys()
+    scores = list(subliminal_scores.movie_scores.keys())
     if media_type == "series":
         max_score = 360.0
         min_score = max_score * min_score_series_perc / 100.0
-        scores = subliminal_scores.episode_scores.keys()
+        scores = list(subliminal_scores.episode_scores.keys())
         if video.is_special:
             min_score = max_score * min_score_special_ep / 100.0
 

@@ -119,7 +123,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
         hi = "force non-HI"
     language_set = set()
 
-    if not isinstance(language, types.ListType):
+    if not isinstance(language, list):
         language = [language]
 
     if forced == "True":

@@ -185,7 +189,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
 
     saved_any = False
     if downloaded_subtitles:
-        for video, subtitles in downloaded_subtitles.iteritems():
+        for video, subtitles in six.iteritems(downloaded_subtitles):
             if not subtitles:
                 continue
 

@@ -221,10 +225,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
                 else:
                     action = "downloaded"
                 if video.used_scene_name:
-                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                         round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
                 else:
-                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
+                    message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
                         round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
 
                 if use_postprocessing is True:

@@ -444,7 +448,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
             downloaded_path = saved_subtitle.storage_path
             logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
             is_forced_string = " forced" if subtitle.language.forced else ""
-            message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
+            message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
                 score) + "% using manual search."
 
             if use_postprocessing is True:

@@ -749,7 +753,7 @@ def wanted_download_subtitles(path, l, count_episodes):
 
     for episode in episodes_details:
         attempt = episode.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(episode.missing_subtitles):
             if attempt is None:

@@ -762,7 +766,7 @@ def wanted_download_subtitles(path, l, count_episodes):
 
             TableEpisodes.update(
                 {
-                    TableEpisodes.failed_attempts: unicode(attempt)
+                    TableEpisodes.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id

@@ -818,7 +822,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
 
     for movie in movies_details:
         attempt = movie.failed_attempts
-        if type(attempt) == unicode:
+        if type(attempt) == six.text_type:
             attempt = ast.literal_eval(attempt)
         for language in ast.literal_eval(movie.missing_subtitles):
             if attempt is None:

@@ -831,7 +835,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
 
             TableMovies.update(
                 {
-                    TableMovies.failed_attempts: unicode(attempt)
+                    TableMovies.failed_attempts: six.text_type(attempt)
                 }
             ).where(
                 TableMovies.radarr_id == movie.radarr_id

@@ -991,7 +995,7 @@ def refine_from_db(path, video):
         TableMovies.audio_codec,
         TableMovies.imdb_id
     ).where(
-        TableMovies.path == unicode(path_replace_reverse_movie(path))
+        TableMovies.path == six.text_type(path_replace_reverse_movie(path))
     ).first()
 
     if data:
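Two more recurring rewrites in this file: dict.iteritems() does not exist on Python 3, and the types module lost its ListType/UnicodeType aliases. six papers over the first; plain builtins replace the second. Sketch:

    import six

    downloaded = {"video.mkv": ["sub.en.srt", "sub.fr.srt"]}
    for video, subs in six.iteritems(downloaded):  # iteritems() on Py2, items() on Py3
        assert isinstance(subs, list)              # replaces types.ListType
    assert isinstance(u"en", six.text_type)        # replaces types.UnicodeType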
@@ -1,4 +1,5 @@
 # coding=utf-8
+from __future__ import absolute_import
 import ast
 import os
 import re

@@ -126,7 +127,7 @@ def force_unicode(s):
     :param s: string
     :return: unicode string
     """
-    if not isinstance(s, types.UnicodeType):
+    if not isinstance(s, str):
         try:
             s = s.decode("utf-8")
         except UnicodeDecodeError:
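One wrinkle in this hunk: on Python 2, str is the byte-string type, so isinstance(s, str) is not equivalent to the old types.UnicodeType check while the code still runs under 2.7. A dual-version helper would normally test against six.text_type instead; a hedged sketch of that variant (not the committed code, and the fallback encoding is assumed for illustration):

    import six

    def force_unicode(s):
        """Return s as text, decoding UTF-8 bytes if needed."""
        if not isinstance(s, six.text_type):
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                s = s.decode("latin-1")  # assumed fallback, illustration only
        return s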
@@ -1,12 +1,13 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import logging
 import time
 import rarfile
 
 from cork import Cork
-from ConfigParser2 import ConfigParser
+from backports import configparser2
 from config import settings
 from check_update import check_releases
 from get_args import args

@@ -66,7 +67,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')):
 
 config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini'))
 
-cfg = ConfigParser()
+cfg = configparser2.ConfigParser()
 
 
 def init_binaries():
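The vendored ConfigParser2 module (deleted at the bottom of this commit) gives way to the configparser2 backport package. Assuming the backport mirrors the standard configparser interface, as the cfg = configparser2.ConfigParser() line suggests, usage stays the same:

    from backports import configparser2

    cfg = configparser2.ConfigParser()
    cfg.read("config.ini")              # missing files are silently skipped
    if cfg.has_section("general"):
        port = cfg.getint("general", "port")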
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import sys
 
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import gc
 import os
 import babelfish

@@ -24,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
 from queueconfig import notifications
 from embedded_subs_reader import embedded_subs_reader
+import six
 
 gc.enable()
 

@@ -63,7 +65,7 @@ def store_subtitles(file):
         logging.exception("BAZARR unable to index external subtitles.")
         pass
     else:
-        for subtitle, language in subtitles.iteritems():
+        for subtitle, language in six.iteritems(subtitles):
             subtitle_path = get_external_subtitles_path(file, subtitle)
             if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
                 logging.debug("BAZARR external subtitles detected: " + "pb")

@@ -155,7 +157,7 @@ def store_subtitles_movie(file):
         logging.exception("BAZARR unable to index external subtitles.")
         pass
     else:
-        for subtitle, language in subtitles.iteritems():
+        for subtitle, language in six.iteritems(subtitles):
             if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
                 logging.debug("BAZARR external subtitles detected: " + "pb")
                 actual_subtitles.append(
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import logging
 import re

@@ -9,6 +10,7 @@ import platform
 from logging.handlers import TimedRotatingFileHandler
 from get_args import args
 from config import settings
+import six
 
 
 logger = logging.getLogger()

@@ -107,10 +109,10 @@ class MyFilter(logging.Filter):
 
 class ArgsFilteringFilter(logging.Filter):
     def filter_args(self, record, func):
-        if isinstance(record.args, (types.ListType, types.TupleType)):
+        if isinstance(record.args, (list, tuple)):
             final_args = []
             for arg in record.args:
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     final_args.append(arg)
                     continue
 

@@ -118,7 +120,7 @@ class ArgsFilteringFilter(logging.Filter):
             record.args = type(record.args)(final_args)
         elif isinstance(record.args, dict):
             for key, arg in record.args.items():
-                if not isinstance(arg, basestring):
+                if not isinstance(arg, six.string_types):
                     continue
 
                 record.args[key] = func(arg)
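basestring was removed in Python 3; six.string_types is (str, unicode) on Python 2 and (str,) on Python 3, so a single isinstance() covers both runtimes. Sketch:

    import six

    for arg in ("plain", u"text", 42):
        if isinstance(arg, six.string_types):
            pass  # only the two string arguments reach this branch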
@@ -1,5 +1,8 @@
 # coding=utf-8
 
+import six
+from six.moves import zip
+from functools import reduce
 bazarr_version = '0.8.2'
 
 import gc

@@ -12,7 +15,7 @@ import pretty
 import math
 import ast
 import hashlib
-import urllib
+import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
 import warnings
 import queueconfig
 import platform

@@ -1575,12 +1578,12 @@ def save_settings():
     settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
     settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
 
-    before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
-              unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
-              unicode(settings.general.getboolean('use_radarr')), unicode(settings.general.path_mappings_movie))
-    after = (unicode(settings_general_ip), int(settings_general_port), unicode(settings_general_baseurl),
-             unicode(settings_general_pathmapping), unicode(settings_general_use_sonarr),
-             unicode(settings_general_use_radarr), unicode(settings_general_pathmapping_movie))
+    before = (six.text_type(settings.general.ip), int(settings.general.port), six.text_type(settings.general.base_url),
+              six.text_type(settings.general.path_mappings), six.text_type(settings.general.getboolean('use_sonarr')),
+              six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
+    after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
+             six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
+             six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))
 
     settings.general.ip = text_type(settings_general_ip)
     settings.general.port = text_type(settings_general_port)

@@ -1645,7 +1648,7 @@ def save_settings():
     settings_proxy_password = request.forms.get('settings_proxy_password')
     settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
 
-    before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
+    before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
     if before_proxy_password[0] != settings_proxy_type:
         configured()
     if before_proxy_password[1] == settings_proxy_password:

@@ -2029,7 +2032,7 @@ def remove_subtitles():
         history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles(unicode(episodePath))
+    store_subtitles(six.text_type(episodePath))
     list_missing_subtitles(sonarrSeriesId)
 
 

@@ -2048,7 +2051,7 @@ def remove_subtitles_movie():
         history_log_movie(0, radarrId, result)
     except OSError as e:
         logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
-    store_subtitles_movie(unicode(moviePath))
+    store_subtitles_movie(six.text_type(moviePath))
     list_missing_subtitles_movies(radarrId)
 
 

@@ -2082,7 +2085,7 @@ def get_subtitle():
             score = result[4]
             history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:

@@ -2140,7 +2143,7 @@ def manual_get_subtitle():
             score = result[4]
             history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
         redirect(ref)
     except OSError:

@@ -2184,7 +2187,7 @@ def perform_manual_upload_subtitle():
             score = 360
             history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
             send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
-            store_subtitles(unicode(episodePath))
+            store_subtitles(six.text_type(episodePath))
             list_missing_subtitles(sonarrSeriesId)
 
         redirect(ref)

@@ -2221,7 +2224,7 @@ def get_subtitle_movie():
             score = result[4]
             history_log_movie(1, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:

@@ -2277,7 +2280,7 @@ def manual_get_subtitle_movie():
             score = result[4]
             history_log_movie(2, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
         redirect(ref)
     except OSError:

@@ -2320,7 +2323,7 @@ def perform_manual_upload_subtitle_movie():
             score = 120
             history_log_movie(4, radarrId, message, path, language_code, provider, score)
             send_notifications_movie(radarrId, message)
-            store_subtitles_movie(unicode(moviePath))
+            store_subtitles_movie(six.text_type(moviePath))
             list_missing_subtitles_movies(radarrId)
 
         redirect(ref)

@@ -2421,7 +2424,7 @@ def api_history():
 @route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_url(protocol, url):
-    url = urllib.unquote(url)
+    url = six.moves.urllib.parse.unquote(url)
     try:
         result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
     except:

@@ -2433,7 +2436,7 @@ def test_url(protocol, url):
 @route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
 @custom_auth_basic(check_credentials)
 def test_notification(protocol, provider):
-    provider = urllib.unquote(provider)
+    provider = six.moves.urllib.parse.unquote(provider)
     apobj = apprise.Apprise()
     apobj.add(protocol + "://" + provider)
 
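urllib was split up in Python 3 (urllib.unquote became urllib.parse.unquote); six.moves.urllib points at the right submodule on both interpreters. Sketch:

    import six.moves.urllib.parse

    url = six.moves.urllib.parse.unquote("192.168.0.1%3A8989/api")
    assert url == "192.168.0.1:8989/api"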
@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import apprise
 import os
 import logging

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from collections import deque
 import json
 

@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 from get_episodes import sync_episodes, update_all_episodes
 from get_movies import update_movies, update_all_movies
 from get_series import update_series

@@ -1,5 +1,6 @@
 # coding=utf-8
 
+from __future__ import absolute_import
 import os
 import time
 import platform
@@ -1,797 +0,0 @@
-"""Configuration file parser.
-
-A setup file consists of sections, lead by a "[section]" header,
-and followed by "name: value" entries, with continuations and such in
-the style of RFC 822.
-
-The option values can contain format strings which refer to other values in
-the same section, or values in a special [DEFAULT] section.
-
-For example:
-
-    something: %(dir)s/whatever
-
-would resolve the "%(dir)s" to the value of dir.  All reference
-expansions are done late, on demand.
-
-Intrinsic defaults can be specified by passing them into the
-ConfigParser constructor as a dictionary.
-
-class:
-
-ConfigParser -- responsible for parsing a list of
-                configuration files, and managing the parsed database.
-
-    methods:
-
-    __init__(defaults=None)
-        create the parser and specify a dictionary of intrinsic defaults.  The
-        keys must be strings, the values must be appropriate for %()s string
-        interpolation.  Note that `__name__' is always an intrinsic default;
-        its value is the section's name.
-
-    sections()
-        return all the configuration section names, sans DEFAULT
-
-    has_section(section)
-        return whether the given section exists
-
-    has_option(section, option)
-        return whether the given option exists in the given section
-
-    options(section)
-        return list of configuration options for the named section
-
-    read(filenames)
-        read and parse the list of named configuration files, given by
-        name.  A single filename is also allowed.  Non-existing files
-        are ignored.  Return list of successfully read files.
-
-    readfp(fp, filename=None)
-        read and parse one configuration file, given as a file object.
-        The filename defaults to fp.name; it is only used in error
-        messages (if fp has no `name' attribute, the string `<???>' is used).
-
-    get(section, option, raw=False, vars=None)
-        return a string value for the named option.  All % interpolations are
-        expanded in the return values, based on the defaults passed into the
-        constructor and the DEFAULT section.  Additional substitutions may be
-        provided using the `vars' argument, which must be a dictionary whose
-        contents override any pre-existing defaults.
-
-    getint(section, options)
-        like get(), but convert value to an integer
-
-    getfloat(section, options)
-        like get(), but convert value to a float
-
-    getboolean(section, options)
-        like get(), but convert value to a boolean (currently case
-        insensitively defined as 0, false, no, off for False, and 1, true,
-        yes, on for True).  Returns False or True.
-
-    items(section, raw=False, vars=None)
-        return a list of tuples with (name, value) for each option
-        in the section.
-
-    remove_section(section)
-        remove the given file section and all its options
-
-    remove_option(section, option)
-        remove the given option from the given section
-
-    set(section, option, value)
-        set the given option
-
-    write(fp)
-        write the configuration state in .ini format
-"""
-
-try:
-    from collections import OrderedDict as _default_dict
-except ImportError:
-    # fallback for setup.py which hasn't yet built _collections
-    _default_dict = dict
-
-import re
-
-__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError",
-           "InterpolationError", "InterpolationDepthError",
-           "InterpolationSyntaxError", "ParsingError",
-           "MissingSectionHeaderError",
-           "ConfigParser", "SafeConfigParser", "RawConfigParser",
-           "DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"]
-
-DEFAULTSECT = "DEFAULT"
-
-MAX_INTERPOLATION_DEPTH = 10
-
-
-# exception classes
-class Error(Exception):
-    """Base class for ConfigParser exceptions."""
-
-    def _get_message(self):
-        """Getter for 'message'; needed only to override deprecation in
-        BaseException."""
-        return self.__message
-
-    def _set_message(self, value):
-        """Setter for 'message'; needed only to override deprecation in
-        BaseException."""
-        self.__message = value
-
-    # BaseException.message has been deprecated since Python 2.6.  To prevent
-    # DeprecationWarning from popping up over this pre-existing attribute, use
-    # a new property that takes lookup precedence.
-    message = property(_get_message, _set_message)
-
-    def __init__(self, msg=''):
-        self.message = msg
-        Exception.__init__(self, msg)
-
-    def __repr__(self):
-        return self.message
-
-    __str__ = __repr__
-
-
-class NoSectionError(Error):
-    """Raised when no section matches a requested option."""
-
-    def __init__(self, section):
-        Error.__init__(self, 'No section: %r' % (section,))
-        self.section = section
-        self.args = (section, )
-
-
-class DuplicateSectionError(Error):
-    """Raised when a section is multiply-created."""
-
-    def __init__(self, section):
-        Error.__init__(self, "Section %r already exists" % section)
-        self.section = section
-        self.args = (section, )
-
-
-class NoOptionError(Error):
-    """A requested option was not found."""
-
-    def __init__(self, option, section):
-        Error.__init__(self, "No option %r in section: %r" %
-                       (option, section))
-        self.option = option
-        self.section = section
-        self.args = (option, section)
-
-
-class InterpolationError(Error):
-    """Base class for interpolation-related exceptions."""
-
-    def __init__(self, option, section, msg):
-        Error.__init__(self, msg)
-        self.option = option
-        self.section = section
-        self.args = (option, section, msg)
-
-
-class InterpolationMissingOptionError(InterpolationError):
-    """A string substitution required a setting which was not available."""
-
-    def __init__(self, option, section, rawval, reference):
-        msg = ("Bad value substitution:\n"
-               "\tsection: [%s]\n"
-               "\toption : %s\n"
-               "\tkey    : %s\n"
-               "\trawval : %s\n"
-               % (section, option, reference, rawval))
-        InterpolationError.__init__(self, option, section, msg)
-        self.reference = reference
-        self.args = (option, section, rawval, reference)
-
-
-class InterpolationSyntaxError(InterpolationError):
-    """Raised when the source text into which substitutions are made
-    does not conform to the required syntax."""
-
-
-class InterpolationDepthError(InterpolationError):
-    """Raised when substitutions are nested too deeply."""
-
-    def __init__(self, option, section, rawval):
-        msg = ("Value interpolation too deeply recursive:\n"
-               "\tsection: [%s]\n"
-               "\toption : %s\n"
-               "\trawval : %s\n"
-               % (section, option, rawval))
-        InterpolationError.__init__(self, option, section, msg)
-        self.args = (option, section, rawval)
-
-
-class ParsingError(Error):
-    """Raised when a configuration file does not follow legal syntax."""
-
-    def __init__(self, filename):
-        Error.__init__(self, 'File contains parsing errors: %s' % filename)
-        self.filename = filename
-        self.errors = []
-        self.args = (filename, )
-
-    def append(self, lineno, line):
-        self.errors.append((lineno, line))
-        self.message += '\n\t[line %2d]: %s' % (lineno, line)
-
-
-class MissingSectionHeaderError(ParsingError):
-    """Raised when a key-value pair is found before any section header."""
-
-    def __init__(self, filename, lineno, line):
-        Error.__init__(
-            self,
-            'File contains no section headers.\nfile: %s, line: %d\n%r' %
-            (filename, lineno, line))
-        self.filename = filename
-        self.lineno = lineno
-        self.line = line
-        self.args = (filename, lineno, line)
-
-
-class RawConfigParser:
-    def __init__(self, defaults=None, dict_type=_default_dict,
-                 allow_no_value=False):
-        self._dict = dict_type
-        self._sections = self._dict()
-        self._defaults = self._dict()
-        if allow_no_value:
-            self._optcre = self.OPTCRE_NV
-        else:
-            self._optcre = self.OPTCRE
-        if defaults:
-            for key, value in defaults.items():
-                self._defaults[self.optionxform(key)] = value
-        self.comment_store = None  ## used for storing comments in ini
-
-    def defaults(self):
-        return self._defaults
-
-    def sections(self):
-        """Return a list of section names, excluding [DEFAULT]"""
-        # self._sections will never have [DEFAULT] in it
-        return self._sections.keys()
-
-    def add_section(self, section):
-        """Create a new section in the configuration.
-
-        Raise DuplicateSectionError if a section by the specified name
-        already exists. Raise ValueError if name is DEFAULT or any of it's
-        case-insensitive variants.
-        """
-        if section.lower() == "default":
-            raise ValueError, 'Invalid section name: %s' % section
-
-        if section in self._sections:
-            raise DuplicateSectionError(section)
-        self._sections[section] = self._dict()
-
-    def has_section(self, section):
-        """Indicate whether the named section is present in the configuration.
-
-        The DEFAULT section is not acknowledged.
-        """
-        return section in self._sections
-
-    def options(self, section):
-        """Return a list of option names for the given section name."""
-        try:
-            opts = self._sections[section].copy()
-        except KeyError:
-            raise NoSectionError(section)
-        opts.update(self._defaults)
-        if '__name__' in opts:
-            del opts['__name__']
-        return opts.keys()
-
-    def read(self, filenames):
-        """Read and parse a filename or a list of filenames.
-
-        Files that cannot be opened are silently ignored; this is
-        designed so that you can specify a list of potential
-        configuration file locations (e.g. current directory, user's
-        home directory, systemwide directory), and all existing
-        configuration files in the list will be read.  A single
-        filename may also be given.
-
-        Return list of successfully read files.
-        """
-        if isinstance(filenames, basestring):
-            filenames = [filenames]
-        read_ok = []
-        for filename in filenames:
-            try:
-                fp = open(filename)
-            except IOError:
-                continue
-            self._read(fp, filename)
-            fp.close()
-            read_ok.append(filename)
-        return read_ok
-
-    def readfp(self, fp, filename=None):
-        """Like read() but the argument must be a file-like object.
-
-        The `fp' argument must have a `readline' method.  Optional
-        second argument is the `filename', which if not given, is
-        taken from fp.name.  If fp has no `name' attribute, `<???>' is
-        used.
-        """
-        if filename is None:
-            try:
-                filename = fp.name
-            except AttributeError:
-                filename = '<???>'
-        self._read(fp, filename)
-
-    def get(self, section, option):
-        opt = self.optionxform(option)
-        if section not in self._sections:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-            if opt in self._defaults:
-                return self._defaults[opt]
-            else:
-                raise NoOptionError(option, section)
-        elif opt in self._sections[section]:
-            return self._sections[section][opt]
-        elif opt in self._defaults:
-            return self._defaults[opt]
-        else:
-            raise NoOptionError(option, section)
-
-    def items(self, section):
-        try:
-            d2 = self._sections[section]
-        except KeyError:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-            d2 = self._dict()
-        d = self._defaults.copy()
-        d.update(d2)
-        if "__name__" in d:
-            del d["__name__"]
-        return d.items()
-
-    def _get(self, section, conv, option):
-        return conv(self.get(section, option))
-
-    def getint(self, section, option):
-        return self._get(section, int, option)
-
-    def getfloat(self, section, option):
-        return self._get(section, float, option)
-
-    _boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
-                       '0': False, 'no': False, 'false': False, 'off': False}
-
-    def getboolean(self, section, option):
-        v = self.get(section, option)
-        if v.lower() not in self._boolean_states:
-            raise ValueError, 'Not a boolean: %s' % v
-        return self._boolean_states[v.lower()]
-
-    def optionxform(self, optionstr):
-        return optionstr.lower()
-
-    def has_option(self, section, option):
-        """Check for the existence of a given option in a given section."""
-        if not section or section == DEFAULTSECT:
-            option = self.optionxform(option)
-            return option in self._defaults
-        elif section not in self._sections:
-            return False
-        else:
-            option = self.optionxform(option)
-            return (option in self._sections[section]
-                    or option in self._defaults)
-
-    def set(self, section, option, value=None):
-        """Set an option."""
-        if not section or section == DEFAULTSECT:
-            sectdict = self._defaults
-        else:
-            try:
-                sectdict = self._sections[section]
-            except KeyError:
-                raise NoSectionError(section)
-        sectdict[self.optionxform(option)] = value
-
-    def write(self, fp):
-        """Write an .ini-format representation of the configuration state."""
-        if self._defaults:
-            fp.write("[%s]\n" % DEFAULTSECT)
-            for (key, value) in self._defaults.items():
-                fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
-            fp.write("\n")
-        for section in self._sections:
-            fp.write("[%s]\n" % section)
-            for (key, value) in self._sections[section].items():
-                if key == "__name__":
-                    continue
-                if (value is not None) or (self._optcre == self.OPTCRE):
-                    key = " = ".join((key, str(value).replace('\n', '\n\t')))
-                fp.write("%s\n" % (key))
-            fp.write("\n")
-
-    def remove_option(self, section, option):
-        """Remove an option."""
-        if not section or section == DEFAULTSECT:
-            sectdict = self._defaults
-        else:
-            try:
-                sectdict = self._sections[section]
-            except KeyError:
-                raise NoSectionError(section)
-        option = self.optionxform(option)
-        existed = option in sectdict
-        if existed:
-            del sectdict[option]
-        return existed
-
-    def remove_section(self, section):
-        """Remove a file section."""
-        existed = section in self._sections
-        if existed:
-            del self._sections[section]
-        return existed
-
-    #
-    # Regular expressions for parsing section headers and options.
-    #
-    SECTCRE = re.compile(
-        r'\['                                 # [
-        r'(?P<header>[^]]+)'                  # very permissive!
-        r'\]'                                 # ]
-        )
-    OPTCRE = re.compile(
-        r'(?P<option>[^:=\s][^:=]*)'          # very permissive!
-        r'\s*(?P<vi>[:=])\s*'                 # any number of space/tab,
-                                              # followed by separator
-                                              # (either : or =), followed
-                                              # by any # space/tab
-        r'(?P<value>.*)$'                     # everything up to eol
-        )
-    OPTCRE_NV = re.compile(
-        r'(?P<option>[^:=\s][^:=]*)'          # very permissive!
-        r'\s*(?:'                             # any number of space/tab,
-        r'(?P<vi>[:=])\s*'                    # optionally followed by
-                                              # separator (either : or
-                                              # =), followed by any #
-                                              # space/tab
-        r'(?P<value>.*))?$'                   # everything up to eol
-        )
-
-    def _read(self, fp, fpname):
-        """Parse a sectioned setup file.
-
-        The sections in setup file contains a title line at the top,
-        indicated by a name in square brackets (`[]'), plus key/value
-        options lines, indicated by `name: value' format lines.
-        Continuations are represented by an embedded newline then
-        leading whitespace.  Blank lines, lines beginning with a '#',
-        and just about everything else are ignored.
-        """
-
-        comment_store = {}
-        cursect = None                        # None, or a dictionary
-        optname = None
-        lineno = 0
-        e = None                              # None, or an exception
-        while True:
-            line = fp.readline()
-            if not line:
-                break
-            lineno = lineno + 1
-            # comment or blank line?
-            if line.strip() == '':
-                continue
-            ### store comments for doc purposes
-            ### Deal with cases of sections and options being there or not
-            if line[0] in '#;' and cursect is not None:
-                if optname is None:
-                    comment_store.setdefault(cursect['__name__'] +
-                                             "::" + "global", []).append(line)
-                else:
-                    comment_store.setdefault(cursect['__name__'] +
-                                             "::" + optname, []).append(line)
-                continue
-            elif line[0] in '#;' and cursect is None:
-                comment_store.setdefault("global" +
-                                         "::" + optname, []).append(line)
-                continue
-
-            if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
-                # no leading whitespace
-                continue
-            # continuation line?
-            if line[0].isspace() and cursect is not None and optname:
-                value = line.strip()
-                if value:
-                    cursect[optname].append(value)
-            # a section header or option header?
-            else:
-                # is it a section header?
-                mo = self.SECTCRE.match(line)
-                if mo:
-                    sectname = mo.group('header')
-                    if sectname in self._sections:
-                        cursect = self._sections[sectname]
-                    elif sectname == DEFAULTSECT:
-                        cursect = self._defaults
-                    else:
-                        cursect = self._dict()
-                        cursect['__name__'] = sectname
-                        self._sections[sectname] = cursect
-                    # So sections can't start with a continuation line
-                    optname = None
-                # no section header in the file?
-                elif cursect is None:
-                    raise MissingSectionHeaderError(fpname, lineno, line)
-                # an option line?
-                else:
-                    mo = self._optcre.match(line)
-                    if mo:
-                        optname, vi, optval = mo.group('option', 'vi', 'value')
-                        optname = self.optionxform(optname.rstrip())
-                        # This check is fine because the OPTCRE cannot
-                        # match if it would set optval to None
-                        if optval is not None:
-                            if vi in ('=', ':') and ';' in optval:
-                                # ';' is a comment delimiter only if it follows
-                                # a spacing character
-                                pos = optval.find(';')
-                                if pos != -1 and optval[pos-1].isspace():
-                                    optval = optval[:pos]
-                            optval = optval.strip()
-                            # allow empty values
-                            if optval == '""':
-                                optval = ''
-                            cursect[optname] = [optval]
-                        else:
-                            # valueless option handling
-                            cursect[optname] = optval
-                    else:
-                        # a non-fatal parsing error occurred.  set up the
-                        # exception but keep going. the exception will be
-                        # raised at the end of the file and will contain a
-                        # list of all bogus lines
-                        if not e:
-                            e = ParsingError(fpname)
-                        e.append(lineno, repr(line))
-        # if any parsing errors occurred, raise an exception
-        if e:
-            raise e
-
-        # join the multi-line values collected while reading
-        all_sections = [self._defaults]
-        all_sections.extend(self._sections.values())
-        for options in all_sections:
-            for name, val in options.items():
-                if isinstance(val, list):
-                    options[name] = '\n'.join(val)
-        self.comment_store = comment_store
-
-    def ini_as_rst(self):
-        """trivial helper function to putput comment_stroe as rest
-
-        .. todo:: write actual doctests with string input
-        >> p = ConfigParser2.SafeConfigParser()
-        >> p.read(f)
-        ['/usr/home/pbrian/src/public/configparser2/example.ini']
-        >> open("/tmp/foo.rst", "w").write(p.ini_as_rst())
-
-        """
-        outstr = ".. rst version of ini file\n\n"
-        _cursectname = None
-        for item in sorted(self.comment_store.keys()):
-            _sect, _opt = item.split("::")
-            if _sect != _cursectname:
-                outstr += "\n%s\n%s\n" % (_sect, "-" * len(_sect))
-                _cursectname = _sect
-            txt = " ".join(self.comment_store[item])
-            txt = txt.replace("#", "").replace(";", "")
-            outstr += ":%s: %s" % (_opt, txt)
-        return outstr
-
-
-import UserDict as _UserDict
-
-
-class _Chainmap(_UserDict.DictMixin):
-    """Combine multiple mappings for successive lookups.
-
-    For example, to emulate Python's normal lookup sequence:
-
-        import __builtin__
-        pylookup = _Chainmap(locals(), globals(), vars(__builtin__))
-    """
-
-    def __init__(self, *maps):
-        self._maps = maps
-
-    def __getitem__(self, key):
-        for mapping in self._maps:
-            try:
-                return mapping[key]
-            except KeyError:
-                pass
-        raise KeyError(key)
-
-    def keys(self):
-        result = []
-        seen = set()
-        for mapping in self._maps:
-            for key in mapping:
-                if key not in seen:
-                    result.append(key)
-                    seen.add(key)
-        return result
-
-
-class ConfigParser(RawConfigParser):
-
-    def get(self, section, option, raw=False, vars=None):
-        """Get an option value for a given section.
-
-        If `vars' is provided, it must be a dictionary. The option is looked up
-        in `vars' (if provided), `section', and in `defaults' in that order.
-
-        All % interpolations are expanded in the return values, unless the
-        optional argument `raw' is true.  Values for interpolation keys are
-        looked up in the same manner as the option.
-
-        The section DEFAULT is special.
-        """
-        sectiondict = {}
-        try:
-            sectiondict = self._sections[section]
-        except KeyError:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-        # Update with the entry specific variables
-        vardict = {}
-        if vars:
-            for key, value in vars.items():
-                vardict[self.optionxform(key)] = value
-        d = _Chainmap(vardict, sectiondict, self._defaults)
-        option = self.optionxform(option)
-        try:
-            value = d[option]
-        except KeyError:
-            raise NoOptionError(option, section)
-
-        if raw or value is None:
-            return value
-        else:
-            return self._interpolate(section, option, value, d)
-
-    def items(self, section, raw=False, vars=None):
-        """Return a list of tuples with (name, value) for each option
-        in the section.
-
-        All % interpolations are expanded in the return values, based on the
-        defaults passed into the constructor, unless the optional argument
-        `raw' is true.  Additional substitutions may be provided using the
-        `vars' argument, which must be a dictionary whose contents overrides
-        any pre-existing defaults.
-
-        The section DEFAULT is special.
-        """
-        d = self._defaults.copy()
-        try:
-            d.update(self._sections[section])
-        except KeyError:
-            if section != DEFAULTSECT:
-                raise NoSectionError(section)
-        # Update with the entry specific variables
-        if vars:
-            for key, value in vars.items():
-                d[self.optionxform(key)] = value
-        options = d.keys()
-        if "__name__" in options:
-            options.remove("__name__")
-        if raw:
-            return [(option, d[option])
-                    for option in options]
-        else:
-            return [(option, self._interpolate(section, option, d[option], d))
-                    for option in options]
-
-    def _interpolate(self, section, option, rawval, vars):
-        # do the string interpolation
-        value = rawval
-        depth = MAX_INTERPOLATION_DEPTH
-        while depth:                    # Loop through this until it's done
-            depth -= 1
-            if value and "%(" in value:
-                value = self._KEYCRE.sub(self._interpolation_replace, value)
-                try:
-                    value = value % vars
-                except KeyError, e:
-                    raise InterpolationMissingOptionError(
-                        option, section, rawval, e.args[0])
-            else:
-                break
-        if value and "%(" in value:
-            raise InterpolationDepthError(option, section, rawval)
-        return value
-
-    _KEYCRE = re.compile(r"%\(([^)]*)\)s|.")
-
-    def _interpolation_replace(self, match):
-        s = match.group(1)
-        if s is None:
-            return match.group()
-        else:
-            return "%%(%s)s" % self.optionxform(s)
-
-
-class SafeConfigParser(ConfigParser):
-
-    def _interpolate(self, section, option, rawval, vars):
-        # do the string interpolation
-        L = []
-        self._interpolate_some(option, L, rawval, section, vars, 1)
-        return ''.join(L)
-
-    _interpvar_re = re.compile(r"%\(([^)]+)\)s")
-
-    def _interpolate_some(self, option, accum, rest, section, map, depth):
-        if depth > MAX_INTERPOLATION_DEPTH:
-            raise InterpolationDepthError(option, section, rest)
-        while rest:
-            p = rest.find("%")
-            if p < 0:
-                accum.append(rest)
-                return
-            if p > 0:
-                accum.append(rest[:p])
-                rest = rest[p:]
-            # p is no longer used
-            c = rest[1:2]
-            if c == "%":
-                accum.append("%")
-                rest = rest[2:]
-            elif c == "(":
-                m = self._interpvar_re.match(rest)
-                if m is None:
-                    raise InterpolationSyntaxError(option, section,
-                        "bad interpolation variable reference %r" % rest)
-                var = self.optionxform(m.group(1))
-                rest = rest[m.end():]
-                try:
-                    v = map[var]
-                except KeyError:
-                    raise InterpolationMissingOptionError(
-                        option, section, rest, var)
-                if "%" in v:
-                    self._interpolate_some(option, accum, v,
-                                           section, map, depth + 1)
-                else:
-                    accum.append(v)
-            else:
-                raise InterpolationSyntaxError(
-                    option, section,
-                    "'%%' must be followed by '%%' or '(', found: %r" % (rest,))
-
-    def set(self, section, option, value=None):
-        """Set an option.  Extend ConfigParser.set: check for string values."""
-        # The only legal non-string value if we allow valueless
-        # options is None, so we need to check if the value is a
-        # string if:
-        # - we do not allow valueless options, or
-        # - we allow valueless options but the value is not None
-        if self._optcre is self.OPTCRE or value:
-            if not isinstance(value, basestring):
-                raise TypeError("option values must be strings")
-        if value is not None:
-            # check for bad percent signs:
-            # first, replace all "good" interpolations
-            tmp_value = value.replace('%%', '')
-            tmp_value = self._interpvar_re.sub('', tmp_value)
-            # then, check if there's a lone percent sign left
-            if '%' in tmp_value:
-                raise ValueError("invalid interpolation syntax in %r at "
-                                 "position %d" % (value, tmp_value.find('%')))
-        ConfigParser.set(self, section, option, value)
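The docstring of the deleted parser describes the %(name)s interpolation it performed; the same behaviour is available from the standard-library parser that replaces it, with values resolved on demand against the section and [DEFAULT]. A minimal sketch using the Python 3 stdlib configparser:

    import configparser

    INI = "[DEFAULT]\ndir = /opt/bazarr\n\n[paths]\nsomething = %(dir)s/whatever\n"
    cfg = configparser.ConfigParser()
    cfg.read_string(INI)
    assert cfg.get("paths", "something") == "/opt/bazarr/whatever"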
@@ -1,43 +0,0 @@
-Behold, mortal, the origins of Beautiful Soup...
-================================================
-
-Leonard Richardson is the primary programmer.
-
-Aaron DeVore is awesome.
-
-Mark Pilgrim provided the encoding detection code that forms the base
-of UnicodeDammit.
-
-Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
-Soup 4 working under Python 3.
-
-Simon Willison wrote soupselect, which was used to make Beautiful Soup
-support CSS selectors.
-
-Sam Ruby helped with a lot of edge cases.
-
-Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
-work in solving the nestable tags conundrum.
-
-An incomplete list of people have contributed patches to Beautiful
-Soup:
-
- Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
- Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
- Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
- Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
- Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
- Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
- Webster, Paul Wright, Danny Yoo
-
-An incomplete list of people who made suggestions or found bugs or
-found ways to break Beautiful Soup:
-
- Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
- Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
- Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
- warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
- Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
- Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
- Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
- Sousa Rocha, Yichun Wei, Per Vognsen
@@ -1,27 +0,0 @@
Beautiful Soup is made available under the MIT license:

 Copyright (c) 2004-2015 Leonard Richardson

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.

Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license. Copyright (c) 2006-2013
James Graham and other contributors
1190	libs/bs4/NEWS.txt
File diff suppressed because it is too large
@@ -1,63 +0,0 @@
= Introduction =

 >>> from bs4 import BeautifulSoup
 >>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
 >>> print soup.prettify()
 <html>
  <body>
   <p>
    Some
    <b>
     bad
     <i>
      HTML
     </i>
    </b>
   </p>
  </body>
 </html>
 >>> soup.find(text="bad")
 u'bad'

 >>> soup.i
 <i>HTML</i>

 >>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
 >>> print soup.prettify()
 <?xml version="1.0" encoding="utf-8"?>
 <tag1>
  Some
  <tag2/>
  bad
  <tag3>
   XML
  </tag3>
 </tag1>

= Full documentation =

The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.

= Running the unit tests =

Beautiful Soup supports unit test discovery from the project root directory:

 $ nosetests

 $ python -m unittest discover -s bs4 # Python 2.7 and up

If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.

= Links =

Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
  http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
@@ -1,31 +0,0 @@
Additions
---------

More of the jQuery API: nextUntil?

Optimizations
-------------

The html5lib tree builder doesn't use the standard tree-building API,
which worries me and has resulted in a number of bugs.

markup_attr_map can be optimized since it's always a map now.

Upon encountering UTF-16LE data or some other uncommon serialization
of Unicode, UnicodeDammit will convert the data to Unicode, then
encode it as UTF-8. This is wasteful because it will just get decoded
back to Unicode.

CDATA
-----

The elementtree XMLParser has a strip_cdata argument that, when set to
False, should allow Beautiful Soup to preserve CDATA sections instead
of treating them as text. Except it doesn't. (This argument is also
present for HTMLParser, and also does nothing there.)

Currently, html5lib converts CDATA sections into comments. An
as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.
@@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.6.0"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.8.0"
+__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
 import sys
 import traceback
 import warnings

@@ -50,7 +49,7 @@ from .element import (

 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

 class BeautifulSoup(Tag):
     """

@@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
     like HTML's <br> tag), call handle_starttag and then
     handle_endtag.
     """
-    ROOT_TAG_NAME = u'[document]'
+    ROOT_TAG_NAME = '[document]'

     # If the end-user gives no indication which tree builder they
     # want, look for one with these features.

@@ -82,16 +81,56 @@ class BeautifulSoup(Tag):

     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
+        """Constructor.
+
+        :param markup: A string or a file-like object representing
+         markup to be parsed.
+
+        :param features: Desirable features of the parser to be used. This
+         may be the name of a specific parser ("lxml", "lxml-xml",
+         "html.parser", or "html5lib") or it may be the type of markup
+         to be used ("html", "html5", "xml"). It's recommended that you
+         name a specific parser, so that Beautiful Soup gives you the
+         same results across platforms and virtual environments.
+
+        :param builder: A TreeBuilder subclass to instantiate (or
+         instance to use) instead of looking one up based on
+         `features`. You only need to use this if you've implemented a
+         custom TreeBuilder.
+
+        :param parse_only: A SoupStrainer. Only parts of the document
+         matching the SoupStrainer will be considered. This is useful
+         when parsing part of a document that would otherwise be too
+         large to fit into memory.
+
+        :param from_encoding: A string indicating the encoding of the
+         document to be parsed. Pass this in if Beautiful Soup is
+         guessing wrongly about the document's encoding.
+
+        :param exclude_encodings: A list of strings indicating
+         encodings known to be wrong. Pass this in if you don't know
+         the document's encoding but you know Beautiful Soup's guess is
+         wrong.
+
+        :param kwargs: For backwards compatibility purposes, the
+         constructor accepts certain keyword arguments used in
+         Beautiful Soup 3. None of these arguments do anything in
+         Beautiful Soup 4; they will result in a warning and then be ignored.
+
+         Apart from this, any keyword arguments passed into the BeautifulSoup
+         constructor are propagated to the TreeBuilder constructor. This
+         makes it possible to configure a TreeBuilder beyond saying
+         which one to use.
+        """

         if 'convertEntities' in kwargs:
             del kwargs['convertEntities']
             warnings.warn(
                 "BS4 does not respect the convertEntities argument to the "
                 "BeautifulSoup constructor. Entities are always converted "
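As the rewritten warning and docstring suggest, naming a parser explicitly keeps results stable across environments. A minimal sketch (the markup string is made up for illustration):

    from bs4 import BeautifulSoup

    markup = "<p>Some<b>bad<i>HTML"
    # Explicit parser: the same tree on every system that has html.parser.
    soup = BeautifulSoup(markup, features="html.parser")
    # BeautifulSoup(markup) alone would emit the "no parser specified" warning.
    print(soup.p.b.i.string)  # -> HTML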
@@ -142,18 +181,22 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")

-        if from_encoding and isinstance(markup, unicode):
+        if from_encoding and isinstance(markup, str):
             warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
             from_encoding = None

-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-
-        if builder is None:
-            original_features = features
-            if isinstance(features, basestring):
+        # We need this information to track whether or not the builder
+        # was specified well enough that we can omit the 'you need to
+        # specify a parser' warning.
+        original_builder = builder
+        original_features = features
+
+        if isinstance(builder, type):
+            # A builder class was passed in; it needs to be instantiated.
+            builder_class = builder
+            builder = None
+        elif builder is None:
+            if isinstance(features, str):
                 features = [features]
             if features is None or len(features) == 0:
                 features = self.DEFAULT_BUILDER_FEATURES
@@ -163,41 +206,73 @@ class BeautifulSoup(Tag):
                     "Couldn't find a tree builder with the features you "
                     "requested: %s. Do you need to install a parser library?"
                     % ",".join(features))
-            builder = builder_class()
-            if not (original_features == builder.NAME or
-                    original_features in builder.ALTERNATE_NAMES):
+
+        # At this point either we have a TreeBuilder instance in
+        # builder, or we have a builder_class that we can instantiate
+        # with the remaining **kwargs.
+        if builder is None:
+            builder = builder_class(**kwargs)
+            if not original_builder and not (
+                    original_features == builder.NAME or
+                    original_features in builder.ALTERNATE_NAMES
+            ):
                 if builder.is_xml:
                     markup_type = "XML"
                 else:
                     markup_type = "HTML"

-                caller = traceback.extract_stack()[0]
-                filename = caller[0]
-                line_number = caller[1]
-                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
-                    filename=filename,
-                    line_number=line_number,
-                    parser=builder.NAME,
-                    markup_type=markup_type))
+                # This code adapted from warnings.py so that we get the same line
+                # of code as our warnings.warn() call gets, even if the answer is wrong
+                # (as it may be in a multithreading situation).
+                caller = None
+                try:
+                    caller = sys._getframe(1)
+                except ValueError:
+                    pass
+                if caller:
+                    globals = caller.f_globals
+                    line_number = caller.f_lineno
+                else:
+                    globals = sys.__dict__
+                    line_number = 1
+                filename = globals.get('__file__')
+                if filename:
+                    fnl = filename.lower()
+                    if fnl.endswith((".pyc", ".pyo")):
+                        filename = filename[:-1]
+                if filename:
+                    # If there is no filename at all, the user is most likely in a REPL,
+                    # and the warning is not necessary.
+                    values = dict(
+                        filename=filename,
+                        line_number=line_number,
+                        parser=builder.NAME,
+                        markup_type=markup_type
+                    )
+                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+        else:
+            if kwargs:
+                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

         self.builder = builder
         self.is_xml = builder.is_xml
         self.known_xml = self.is_xml
         self.builder.soup = self
+
+        self._namespaces = dict()
         self.parse_only = parse_only

+        self.builder.initialize_soup(self)
+
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         elif len(markup) <= 256 and (
                 (isinstance(markup, bytes) and not b'<' in markup)
-                or (isinstance(markup, unicode) and not u'<' in markup)
+                or (isinstance(markup, str) and not '<' in markup)
         ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if (isinstance(markup, unicode)
+            if (isinstance(markup, str)
                 and not os.path.supports_unicode_filenames):
                 possible_filename = markup.encode("utf8")
             else:
@@ -205,13 +280,13 @@ class BeautifulSoup(Tag):
             is_file = False
             try:
                 is_file = os.path.exists(possible_filename)
-            except Exception, e:
+            except Exception as e:
                 # This is almost certainly a problem involving
                 # characters not valid in filenames on this
                 # system. Just let it go.
                 pass
             if is_file:
-                if isinstance(markup, unicode):
+                if isinstance(markup, str):
                     markup = markup.encode("utf8")
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should'

@@ -263,9 +338,9 @@ class BeautifulSoup(Tag):
         if isinstance(markup, bytes):
             space = b' '
             cant_start_with = (b"http:", b"https:")
-        elif isinstance(markup, unicode):
-            space = u' '
-            cant_start_with = (u"http:", u"https:")
+        elif isinstance(markup, str):
+            space = ' '
+            cant_start_with = ("http:", "https:")
         else:
             return
@@ -302,9 +377,10 @@ class BeautifulSoup(Tag):
         self.preserve_whitespace_tag_stack = []
         self.pushTag(self)

-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+        kwattrs.update(attrs)
+        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)

     def new_string(self, s, subclass=NavigableString):
         """Create a new NavigableString associated with this soup."""
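A sketch of what the widened new_tag() signature allows: attribute names like "class" or "data-role" are not valid Python keywords, so they can now travel in the attrs dict (names below are illustrative):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p></p>", "html.parser")
    tag = soup.new_tag("a", attrs={"data-role": "button", "class": "big"})
    tag["href"] = "http://example.com/"
    soup.p.append(tag)
    print(soup)
    # -> <p><a class="big" data-role="button" href="http://example.com/"></a></p>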
@@ -327,7 +403,7 @@ class BeautifulSoup(Tag):

     def pushTag(self, tag):
         #print "Push", tag.name
-        if self.currentTag:
+        if self.currentTag is not None:
             self.currentTag.contents.append(tag)
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]

@@ -336,7 +412,7 @@ class BeautifulSoup(Tag):

     def endData(self, containerClass=NavigableString):
         if self.current_data:
-            current_data = u''.join(self.current_data)
+            current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
             # nothing but ASCII spaces, replace it with a single space
             # or newline.
@@ -366,60 +442,71 @@ class BeautifulSoup(Tag):

     def object_was_parsed(self, o, parent=None, most_recent_element=None):
         """Add an object to the parse tree."""
-        parent = parent or self.currentTag
-        previous_element = most_recent_element or self._most_recent_element
+        if parent is None:
+            parent = self.currentTag
+        if most_recent_element is not None:
+            previous_element = most_recent_element
+        else:
+            previous_element = self._most_recent_element

         next_element = previous_sibling = next_sibling = None
         if isinstance(o, Tag):
             next_element = o.next_element
             next_sibling = o.next_sibling
             previous_sibling = o.previous_sibling
-            if not previous_element:
+            if previous_element is None:
                 previous_element = o.previous_element

+        fix = parent.next_element is not None
+
         o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

         self._most_recent_element = o
         parent.contents.append(o)

-        if parent.next_sibling:
-            # This node is being inserted into an element that has
-            # already been parsed. Deal with any dangling references.
-            index = len(parent.contents)-1
-            while index >= 0:
-                if parent.contents[index] is o:
-                    break
-                index -= 1
-            else:
-                raise ValueError(
-                    "Error building tree: supposedly %r was inserted "
-                    "into %r after the fact, but I don't see it!" % (
-                        o, parent
-                    )
-                )
-            if index == 0:
-                previous_element = parent
-                previous_sibling = None
-            else:
-                previous_element = previous_sibling = parent.contents[index-1]
-            if index == len(parent.contents)-1:
-                next_element = parent.next_sibling
-                next_sibling = None
-            else:
-                next_element = next_sibling = parent.contents[index+1]
+        # Check if we are inserting into an already parsed node.
+        if fix:
+            self._linkage_fixer(parent)

-            o.previous_element = previous_element
-            if previous_element:
-                previous_element.next_element = o
-            o.next_element = next_element
-            if next_element:
-                next_element.previous_element = o
-            o.next_sibling = next_sibling
-            if next_sibling:
-                next_sibling.previous_sibling = o
-            o.previous_sibling = previous_sibling
-            if previous_sibling:
-                previous_sibling.next_sibling = o
+    def _linkage_fixer(self, el):
+        """Make sure linkage of this fragment is sound."""
+
+        first = el.contents[0]
+        child = el.contents[-1]
+        descendant = child
+
+        if child is first and el.parent is not None:
+            # Parent should be linked to first child
+            el.next_element = child
+            # We are no longer linked to whatever this element is
+            prev_el = child.previous_element
+            if prev_el is not None and prev_el is not el:
+                prev_el.next_element = None
+            # First child should be linked to the parent, and no previous siblings.
+            child.previous_element = el
+            child.previous_sibling = None
+
+        # We have no sibling as we've been appended as the last.
+        child.next_sibling = None
+
+        # This index is a tag, dig deeper for a "last descendant"
+        if isinstance(child, Tag) and child.contents:
+            descendant = child._last_descendant(False)
+
+        # As the final step, link last descendant. It should be linked
+        # to the parent's next sibling (if found), else walk up the chain
+        # and find a parent with a sibling. It should have no next sibling.
+        descendant.next_element = None
+        descendant.next_sibling = None
+        target = el
+        while True:
+            if target is None:
+                break
+            elif target.next_sibling is not None:
+                descendant.next_element = target.next_sibling
+                target.next_sibling.previous_element = child
+                break
+            target = target.parent

     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
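The new _linkage_fixer() keeps the linear next_element chain sound when a node is added to a tree that already exists. A small illustration of the invariant (edit-time insertion shown; the parse-time path maintains the same links):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div><p>one</p></div><span>after</span>", "html.parser")
    new_p = soup.new_tag("p")
    new_p.string = "two"
    soup.div.append(new_p)
    # The last descendant of the appended node now points past the <div>:
    assert new_p.string.next_element is soup.span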
@@ -465,7 +552,7 @@ class BeautifulSoup(Tag):
                           self.currentTag, self._most_recent_element)
         if tag is None:
             return tag
-        if self._most_recent_element:
+        if self._most_recent_element is not None:
             self._most_recent_element.next_element = tag
         self._most_recent_element = tag
         self.pushTag(tag)

@@ -490,9 +577,9 @@ class BeautifulSoup(Tag):
             encoding_part = ''
             if eventual_encoding != None:
                 encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
         else:
-            prefix = u''
+            prefix = ''
         if not pretty_print:
             indent_level = None
         else:

@@ -526,4 +613,4 @@ class FeatureNotFound(ValueError):
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
+    print(soup.prettify())
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

 from collections import defaultdict
 import itertools

@@ -7,8 +7,7 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
-    HTMLAwareEntitySubstitution,
-    whitespace_re
+    nonwhitespace_re
     )

 __all__ = [
@@ -90,18 +89,46 @@ class TreeBuilder(object):

     is_xml = False
     picklable = False
-    preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+    USE_DEFAULT = object()

-    def __init__(self):
+    def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+        """Constructor.
+
+        :param multi_valued_attributes: If this is set to None, the
+         TreeBuilder will not turn any values for attributes like
+         'class' into lists. Setting this to a dictionary will
+         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+         for an example.
+
+         Internally, these are called "CDATA list attributes", but that
+         probably doesn't make sense to an end-user, so the argument name
+         is `multi_valued_attributes`.
+
+        :param preserve_whitespace_tags:
+        """
         self.soup = None
+        if multi_valued_attributes is self.USE_DEFAULT:
+            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = multi_valued_attributes
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup

     def reset(self):
         pass
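A sketch of the new knob in action: since BeautifulSoup forwards unknown keyword arguments to the TreeBuilder, multi_valued_attributes can be set straight from the constructor (html.parser assumed available):

    from bs4 import BeautifulSoup

    markup = '<a class="foo bar">link</a>'
    print(BeautifulSoup(markup, "html.parser").a["class"])
    # -> ['foo', 'bar']   (default: 'class' is multi-valued)

    soup = BeautifulSoup(markup, "html.parser", multi_valued_attributes=None)
    print(soup.a["class"])
    # -> 'foo bar'        (splitting disabled)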
@@ -125,7 +152,7 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags

     def feed(self, markup):
         raise NotImplementedError()

@@ -160,14 +187,14 @@ class TreeBuilder(object):
         universal = self.cdata_list_attributes.get('*', [])
         tag_specific = self.cdata_list_attributes.get(
             tag_name.lower(), None)
-        for attr in attrs.keys():
+        for attr in list(attrs.keys()):
             if attr in universal or (tag_specific and attr in tag_specific):
                 # We have a "class"-type attribute whose string
                 # value is a whitespace-separated list of
                 # values. Split it into a list.
                 value = attrs[attr]
-                if isinstance(value, basestring):
-                    values = whitespace_re.split(value)
+                if isinstance(value, str):
+                    values = nonwhitespace_re.findall(value)
                 else:
                     # html5lib sometimes calls setAttributes twice
                     # for the same tag when rearranging the parse
@@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
     Such as which tags are empty-element tags.
     """

-    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set([
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

-        # These are from HTML4, removed in HTML5.
-        'spacer', 'frame'
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
     ])

+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
     # class="foo bar" means that the 'class' attribute has two values,

@@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
         "*" : ['class', 'accesskey', 'dropzone'],
         "a" : ['rel', 'rev'],
         "link" : ['rel', 'rev'],

@@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
     }

+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

 __all__ = [
     'HTML5TreeBuilder',

@@ -15,7 +15,7 @@ from bs4.builder import (
     )
 from bs4.element import (
     NamespacedAttribute,
-    whitespace_re,
+    nonwhitespace_re,
     )
 import html5lib
 from html5lib.constants import (

@@ -33,7 +33,7 @@ try:
     # Pre-0.99999999
     from html5lib.treebuilders import _base as treebuilder_base
     new_html5lib = False
-except ImportError, e:
+except ImportError as e:
     # 0.99999999 and up
     from html5lib.treebuilders import base as treebuilder_base
     new_html5lib = True
@@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)

         extra_kwargs = dict()
-        if not isinstance(markup, unicode):
+        if not isinstance(markup, str):
             if new_html5lib:
                 extra_kwargs['override_encoding'] = self.user_specified_encoding
             else:

@@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         doc = parser.parse(markup, **extra_kwargs)

         # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
             original_encoding = parser.tokenizer.stream.charEncoding[0]
-            if not isinstance(original_encoding, basestring):
+            if not isinstance(original_encoding, str):
                 # In 0.99999999 and up, the encoding is an html5lib
                 # Encoding object. We want to use a string for compatibility
                 # with other tree builders.

@@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment


 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

@@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
             rv.append("|%s<%s>" % (' ' * indent, name))
             if element.attrs:
                 attributes = []
-                for name, value in element.attrs.items():
+                for name, value in list(element.attrs.items()):
                     if isinstance(name, NamespacedAttribute):
                         name = "%s %s" % (prefixes[name.namespace], name.name)
                     if isinstance(value, list):
@@ -199,14 +199,14 @@ class AttrList(object):
     def __setitem__(self, name, value):
         # If this attribute is a multi-valued attribute for this element,
         # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes
         if (name in list_attr['*']
             or (self.element.name in list_attr
                 and name in list_attr[self.element.name])):
             # A node that is being cloned may have already undergone
             # this procedure.
             if not isinstance(value, list):
-                value = whitespace_re.split(value)
+                value = nonwhitespace_re.findall(value)
         self.element[name] = value
     def items(self):
         return list(self.attrs.items())

@@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):

     def appendChild(self, node):
         string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
             # Some other piece of code decided to pass in a string
             # instead of creating a TextElement object to contain the
             # string.

@@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
             child = node.element
             node.parent = self

-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
             node.element.extract()

-        if (string_child and self.element.contents
+        if (string_child is not None and self.element.contents
             and self.element.contents[-1].__class__ == NavigableString):
             # We are appending a string onto another string.
             # TODO This has O(n^2) performance, for input like

@@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
                 old_element.replace_with(new_element)
                 self.soup._most_recent_element = new_element
         else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                 # Create a brand new NavigableString from this string.
                 child = self.soup.new_string(node)

@@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):

             self.soup.builder._replace_cdata_list_attribute_values(
                 self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                 self.element[name] = value

             # The attributes may contain variables that need substitution.

@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
         # Set the first child's previous_element and previous_sibling
         # to elements within the new parent
         first_child = to_append[0]
-        if new_parents_last_descendant:
+        if new_parents_last_descendant is not None:
             first_child.previous_element = new_parents_last_descendant
         else:
             first_child.previous_element = new_parent_element
         first_child.previous_sibling = new_parents_last_child
-        if new_parents_last_descendant:
+        if new_parents_last_descendant is not None:
             new_parents_last_descendant.next_element = first_child
         else:
             new_parent_element.next_element = first_child
-        if new_parents_last_child:
+        if new_parents_last_child is not None:
             new_parents_last_child.next_sibling = first_child

         # Find the very last element being moved. It is now the

@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
         last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

         last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
-        if new_parents_last_descendant_next_element:
+        if new_parents_last_descendant_next_element is not None:
             # TODO: This code has no test coverage and I'm not sure
             # how to get html5lib to go through this path, but it's
             # just the other side of the previous line.
@@ -1,17 +1,18 @@
 # encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

 __all__ = [
     'HTMLParserTreeBuilder',
     ]

-from HTMLParser import HTMLParser
+from html.parser import HTMLParser

 try:
-    from HTMLParser import HTMLParseError
-except ImportError, e:
+    from html.parser import HTMLParseError
+except ImportError as e:
     # HTMLParseError is removed in Python 3.5. Since it can never be
     # thrown in 3.5, we can just define our own class as a placeholder.
     class HTMLParseError(Exception):

@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # order. It's a list of closing tags we've already handled and
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []

+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+
     def handle_startendtag(self, name, attrs):
         # This is only called when the markup looks like
         # <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)

-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8221; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError as e:
+                    pass
+        if not data:
+            try:
+                data = chr(real_name)
+            except (ValueError, OverflowError) as e:
+                pass
+        data = data or "\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)

     def handle_entityref(self, name):

@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)

     def handle_comment(self, data):
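A quick illustration of the Windows-1252 compensation above: &#147; names a Windows-1252 byte rather than a sensible Unicode code point, and the fallback decodes it anyway (html.parser assumed available):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>&#147;quoted&#148;</p>", "html.parser")
    print(soup.p.string)   # -> “quoted”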
|
@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
NAME = HTMLPARSER
|
||||
features = [NAME, HTML, STRICT]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||
parser_args = parser_args or []
|
||||
parser_kwargs = parser_kwargs or {}
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
kwargs['strict'] = False
|
||||
parser_kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
parser_kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (parser_args, parser_kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
|
@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
if isinstance(markup, str):
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
|
@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
|||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
parser.close()
|
||||
except HTMLParseError as e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
|
|
|
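A hedged sketch of the split-out arguments: options for the TreeBuilder superclass ride on **kwargs, while parser_kwargs is reserved for Python's own HTMLParser (the convert_charrefs entry assumes a Python version whose HTMLParser constructor accepts that flag):

    from bs4.builder._htmlparser import HTMLParserTreeBuilder

    builder = HTMLParserTreeBuilder(multi_valued_attributes=None)
    args, kwargs = builder.parser_args
    print(kwargs)   # -> {'convert_charrefs': False} on modern Python 3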
@@ -1,13 +1,18 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

 __all__ = [
     'LXMLTreeBuilderForXML',
     'LXMLTreeBuilder',
     ]

+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError as e:
+    from collections import Callable
+
 from io import BytesIO
-from StringIO import StringIO
-import collections
+from io import StringIO
 from lxml import etree
 from bs4.element import (
     Comment,

@@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector

 LXML = 'lxml'

+def _invert(d):
+    "Invert a dictionary."
+    return dict((v,k) for k, v in list(d.items()))
+
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):

     # This namespace mapping is specified in the XML Namespace
     # standard.
-    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+    def initialize_soup(self, soup):
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+        """
+        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+
+    def _register_namespaces(self, mapping):
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
+
+        This might be useful later on when creating CSS selectors.
+        """
+        for key, value in list(mapping.items()):
+            if key and key not in self.soup._namespaces:
+                # Let the BeautifulSoup object know about a new namespace.
+                # If there are multiple namespaces defined with the same
+                # prefix, the first one in the document takes precedence.
+                self.soup._namespaces[key] = value

     def default_parser(self, encoding):
         # This can either return a parser object or a class, which
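A sketch of the effect on a namespaced document (lxml assumed installed; _namespaces is an internal attribute, shown here only to illustrate the bookkeeping):

    from bs4 import BeautifulSoup

    xml = '<root xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:title>x</dc:title></root>'
    soup = BeautifulSoup(xml, "lxml-xml")
    print(soup._namespaces)
    # -> {'xml': 'http://www.w3.org/XML/1998/namespace',
    #     'dc': 'http://purl.org/dc/elements/1.1/'}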
@@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # Use the default parser.
         parser = self.default_parser(encoding)

-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser

-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
         # TODO: Issue a warning if parser is present but not a
         # callable, since that means there's no way to create new
         # parsers for different encodings.

@@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
         # name. Copied from lxml's src/lxml/sax.py.
@@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         else:
             self.processing_instruction_class = XMLProcessingInstruction

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
             yield markup, None, document_declared_encoding, False

-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # No, apparently not. Convert the Unicode to UTF-8 and
             # tell lxml to parse it as UTF-8.
             yield (markup.encode("utf8"), "utf8",

@@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
             markup = StringIO(markup)

         # Call feed() at least once, even if the markup is empty,
@@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             if len(data) != 0:
                 self.parser.feed(data)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))

     def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

     def start(self, name, attrs, nsmap={}):
         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
         attrs = dict(attrs)
         nsprefix = None
-        # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
-            # There are no new namespaces for this tag, but
-            # non-default namespaces are in play, so we need a
-            # separate tag stack to know when they end.
-            self.nsmaps.append(None)
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
+            # There are no new namespaces for this tag, but
+            # non-default namespaces are in play, so we need a
+            # separate tag stack to know when they end.
+            self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
-            self.nsmaps.append(inverted_nsmap)
+
+            # First, Let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))

         # Also treat the namespace mapping as a set of attributes on the
         # tag, so we can recreate it later.
         attrs = attrs.copy()
-        for prefix, namespace in nsmap.items():
+        for prefix, namespace in list(nsmap.items()):
             attribute = NamespacedAttribute(
                 "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
             attrs[attribute] = namespace
@@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # from lxml with namespaces attached to their names, and
         # turn them into NamespacedAttribute objects.
         new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
             namespace, attr = self._getNsTag(attr)
             if namespace is None:
                 new_attrs[attr] = value

@@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):

     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

@@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser = self.parser_for(encoding)
             self.parser.feed(markup)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))

     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment
@@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
 import re
 import logging
 import string

@@ -46,9 +45,9 @@ except ImportError:
     pass

 xml_encoding_re = re.compile(
-    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

 class EntitySubstitution(object):
@@ -58,15 +57,24 @@ class EntitySubstitution(object):
         lookup = {}
         reverse_lookup = {}
         characters_for_re = []
-        for codepoint, name in list(codepoint2name.items()):
-            character = unichr(codepoint)
-            if codepoint != 34:
+
+        # &apos is an XHTML and HTML 5 entity, but not an HTML 4
+        # entity. We don't want to use it, but we want to recognize it on the way in.
+        #
+        # TODO: Ideally we would be able to recognize all HTML 5 named
+        # entities, but that's a little tricky.
+        extra = [(39, 'apos')]
+        for codepoint, name in list(codepoint2name.items()) + extra:
+            character = chr(codepoint)
+            if codepoint not in (34, 39):
                 # There's no point in turning the quotation mark into
-                # &quot;, unless it happens within an attribute value, which
-                # is handled elsewhere.
+                # &quot; or the single quote into &apos;, unless it
+                # happens within an attribute value, which is handled
+                # elsewhere.
                 characters_for_re.append(character)
                 lookup[character] = name
-            # But we do want to turn &quot; into the quotation mark.
+            # But we do want to recognize those entities on the way in and
+            # convert them to Unicode characters.
             reverse_lookup[name] = character
         re_definition = "[%s]" % "".join(characters_for_re)
         return lookup, reverse_lookup, re.compile(re_definition)
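The generated tables are exposed as class attributes; a quick check of the asymmetry introduced above (attribute and method names match the bs4 source):

    from bs4.dammit import EntitySubstitution

    # &apos; is recognized on the way in...
    print(EntitySubstitution.HTML_ENTITY_TO_CHARACTER["apos"])      # -> '
    # ...but a bare single quote is not escaped on the way out.
    print(EntitySubstitution.substitute_html("it's < that"))        # -> it's &lt; that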
@@ -82,7 +90,7 @@ class EntitySubstitution(object):
     }

     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                            ")")

     AMPERSAND_OR_BRACKET = re.compile("([<>&])")

@@ -274,7 +282,7 @@ class EncodingDetector:
     def strip_byte_order_mark(cls, data):
         """If a byte-order mark is present, strip it and return the encoding it implies."""
         encoding = None
-        if isinstance(data, unicode):
+        if isinstance(data, str):
             # Unicode data cannot have a byte-order mark.
             return data, encoding
         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -352,9 +360,9 @@ class UnicodeDammit:
             markup, override_encodings, is_html, exclude_encodings)

         # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, unicode) or markup == '':
+        if isinstance(markup, str) or markup == '':
             self.markup = markup
-            self.unicode_markup = unicode(markup)
+            self.unicode_markup = str(markup)
             self.original_encoding = None
             return

@@ -438,7 +446,7 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
-        return unicode(data, encoding, errors)
+        return str(data, encoding, errors)

     @property
     def declared_html_encoding(self):
@@ -1,12 +1,11 @@
 """Diagnostic functions, mainly for use when doing tech support."""

-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
 __license__ = "MIT"

 import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry

@@ -22,8 +21,8 @@ import cProfile

 def diagnose(data):
     """Diagnostic suite for isolating common problems."""
-    print "Diagnostic running on Beautiful Soup %s" % __version__
-    print "Python version %s" % sys.version
+    print("Diagnostic running on Beautiful Soup %s" % __version__)
+    print("Python version %s" % sys.version)

     basic_parsers = ["html.parser", "html5lib", "lxml"]
     for name in basic_parsers:
|
@ -32,16 +31,16 @@ def diagnose(data):
|
|||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
print((
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
name))
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
basic_parsers.append("lxml-xml")
|
||||
try:
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
except ImportError, e:
|
||||
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||
except ImportError as e:
|
||||
print (
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
@@ -49,37 +48,43 @@ def diagnose(data):
     if 'html5lib' in basic_parsers:
         try:
             import html5lib
-            print "Found html5lib version %s" % html5lib.__version__
-        except ImportError, e:
+            print("Found html5lib version %s" % html5lib.__version__)
+        except ImportError as e:
             print (
                 "html5lib is not installed or couldn't be imported.")

     if hasattr(data, 'read'):
         data = data.read()
-    elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
-        with open(data) as fp:
-            data = fp.read()
     elif data.startswith("http:") or data.startswith("https:"):
-        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
-        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
         return
-    print
+    else:
+        try:
+            if os.path.exists(data):
+                print('"%s" looks like a filename. Reading data from the file.' % data)
+                with open(data) as fp:
+                    data = fp.read()
+        except ValueError:
+            # This can happen on some platforms when the 'filename' is
+            # too long. Assume it's data and not a filename.
+            pass
+    print()

     for parser in basic_parsers:
-        print "Trying to parse your markup with %s" % parser
+        print("Trying to parse your markup with %s" % parser)
         success = False
         try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
             success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
             traceback.print_exc()
         if success:
-            print "Here's what %s did with the markup:" % parser
-            print soup.prettify()
+            print("Here's what %s did with the markup:" % parser)
+            print(soup.prettify())

-        print "-" * 80
+        print("-" * 80)

 def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.

@@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
     """
     from lxml import etree
     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-        print("%s, %4s, %s" % (event, element.tag, element.text))
+        print(("%s, %4s, %s" % (event, element.tag, element.text)))

 class AnnouncingParser(HTMLParser):
     """Announces HTMLParser parse events, without doing anything else."""
@ -149,7 +154,7 @@ def rword(length=5):
|
|||
|
||||
def rsentence(length=4):
|
||||
"Generate a random sentence-like string."
|
||||
return " ".join(rword(random.randint(4,9)) for i in range(length))
|
||||
return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
|
||||
|
||||
def rdoc(num_elements=1000):
|
||||
"""Randomly generate an invalid HTML document."""
|
||||
|
@ -171,9 +176,9 @@ def rdoc(num_elements=1000):
|
|||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
|
@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
|
|||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
||||
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
||||
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
||||
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
|
|
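For reference, the ported diagnose() entry point can be driven directly from a Python shell. A minimal sketch, assuming this vendored bs4 is importable (the sample markup is made up):

    from bs4.diagnose import diagnose

    # Prints the installed parser versions, then shows how each available
    # parser handles the markup.
    diagnose("<p>Some <b>unclosed markup")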
(File diff suppressed because it is too large.)
@@ -1,7 +1,7 @@
# encoding: utf-8
"""Helper classes for tests."""

-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"

import pickle

@@ -16,29 +16,66 @@ from bs4.element import (
    ContentMetaAttributeValue,
    Doctype,
    SoupStrainer,
    Tag
)

from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder

BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b> </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""


class SoupTest(unittest.TestCase):

    @property
    def default_builder(self):
-        return default_builder()
+        return default_builder

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup."""
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

-    def document_for(self, markup):
+    def document_for(self, markup, **kwargs):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
-        return self.default_builder.test_fragment_to_document(markup)
+        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        builder = self.default_builder
@@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase):
            self.assertEqual(earlier, e.previous_element)
            earlier = e

    def linkage_validator(self, el, _recursive_call=False):
        """Ensure proper linkage throughout the document."""
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        idx = 0
        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for child in el.contents:
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert el.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                            el, el.next_element, child
                        )
                    assert child.previous_element is el,\
                        "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                            child, child.previous_element, el
                        )
                    assert child.previous_sibling is None,\
                        "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                            child, child.previous_sibling, None
                        )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                assert child.previous_sibling is el.contents[idx - 1],\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, el.contents[idx - 1]
                    )
                assert el.contents[idx - 1].next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
                    )

                if last_child is not None:
                    assert child.previous_element is last_child,\
                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                            child, child.previous_element, last_child, child.parent.contents
                        )
                    assert last_child.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            last_child, last_child.next_element, child
                        )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # A bubbled up descendant should have no next siblings
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If last child, there are no next siblings
            if idx == last_idx:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )
            idx += 1

        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if not _recursive_call and child is not None:
            target = el
            while True:
                if target is None:
                    assert child.next_element is None, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, None
                        )
                    break
                elif target.next_sibling is not None:
                    assert child.next_element is target.next_sibling, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, target.next_sibling
                        )
                    break
                target = target.parent

            # We are done, so nothing to return
            return None
        else:
            # Return the child to the recursive caller
            return child


class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.
@@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
        soup = self.soup("")
        new_tag = soup.new_tag(name)
        self.assertEqual(True, new_tag.is_empty_element)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.

@@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object):
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_namespaced_html(self):
        """When a namespaced XML document is parsed as HTML it should
        be treated as HTML with weird tag names.
        """
        markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
        soup = self.soup(markup)
        self.assertEqual(2, len(soup.find_all("ns1:foo")))

    def test_processing_instruction(self):
        # We test both Unicode and bytestring to verify that
        # process_markup correctly sets processing_instruction_class
        # even when the markup is already Unicode and there is no
        # need to process anything.
-        markup = u"""<?PITarget PIContent?>"""
+        markup = """<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.decode())

@@ -292,6 +452,18 @@ Hello, world!
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_multivalued_attribute_with_whitespace(self):
        # Whitespace separating the values of a multi-valued attribute
        # should be ignored.

        markup = '<div class=" foo bar "></a>'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.div['class'])

        # If you search by the literal name of the class it's like the whitespace
        # wasn't there.
        self.assertEqual(soup.div, soup.find('div', class_="foo bar"))

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
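The behavior pinned down by test_multivalued_attribute_with_whitespace, as a standalone sketch (html.parser stands in for whichever builder the suite runs with):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div class=" foo bar "></div>', "html.parser")
    print(soup.div["class"])                               # ['foo', 'bar'] -- whitespace stripped
    print(soup.find("div", class_="foo bar") == soup.div)  # True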
@@ -311,15 +483,41 @@ Hello, world!
    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_strings_resembling_character_entity_references(self):
        # "&T" and "&p" look like incomplete character entities, but they are
        # not.
        self.assertSoupEquals(
            "<p>&bull; AT&T is in the s&p 500</p>",
            "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
        )

    def test_apos_entity(self):
        self.assertSoupEquals(
            "<p>Bob&apos;s Bar</p>",
            "<p>Bob's Bar</p>",
        )

    def test_entities_in_foreign_document_encoding(self):
        # &#147; and &#148; are invalid numeric entities referencing
        # Windows-1252 characters. &#45; references a character common
        # to Windows-1252 and Unicode, and &#9731; references a
        # character only found in Unicode.
        #
        # All of these entities should be converted to Unicode
        # characters.
        markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
        soup = self.soup(markup)
        self.assertEqual("\u201cHello\u201d -\u2603", soup.p.string)

    def test_entities_in_attributes_converted_to_unicode(self):
-        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
-        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

@@ -330,11 +528,11 @@ Hello, world!
            '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
-        expect = u"\N{REPLACEMENT CHARACTER}"
+        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
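A hedged illustration of the entity handling asserted above (html.parser; the numeric references are Windows-1252 code points that the parser maps to real Unicode characters):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>&#147;Hi&#148;</p>", "html.parser")
    print(soup.p.string)   # “Hi” -- invalid Windows-1252 entities become Unicode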
@@ -408,9 +606,9 @@ Hello, world!
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
-        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""

@@ -450,7 +648,7 @@ Hello, world!
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
-        expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
+        expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):

@@ -460,15 +658,15 @@ Hello, world!
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
-            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
-        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
-        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

@@ -477,7 +675,7 @@ Hello, world!
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
-        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
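The same smart-quote conversion works on raw Windows-1252 bytes fed straight to the constructor; a minimal sketch:

    from bs4 import BeautifulSoup

    quote = b"<p>\x91Foo\x92</p>"           # Windows-1252 smart quotes
    soup = BeautifulSoup(quote, "html.parser")
    print(soup.p.string)                     # ‘Foo’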
@@ -586,6 +784,13 @@ Hello, world!
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

        soup = self.soup(BAD_DOCUMENT)
        self.linkage_validator(soup)


class XMLTreeBuilderSmokeTest(object):

    def test_pickle_and_unpickle_identity(self):

@@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object):
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_nested_namespaces(self):
        doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
 <child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
  <grandchild ns3:attr="value" xmlns="http://ns4/"/>
 </child>
</parent>"""
        soup = self.soup(doc)
        self.assertEqual(doc, soup.encode())

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
   <script type="text/javascript">

@@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
-        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
-        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
-            unicode(soup.rss), markup)
+            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")

@@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object):
    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.p), markup)
+        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(str(soup.foo), markup)

    def test_find_by_prefixed_name(self):
        doc = """<?xml version="1.0" encoding="utf-8"?>

@@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object):
        # The two tags have the same namespace prefix.
        self.assertEqual(tag.prefix, duplicate.prefix)

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

        soup = self.soup(BAD_DOCUMENT)
        self.linkage_validator(soup)


class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""
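Outside the suite, the nested-namespace round trip can be checked by hand. A sketch assuming lxml is installed (the namespace URIs are placeholders, and the declaration is included so the bytes compare exactly):

    from bs4 import BeautifulSoup

    doc = (b'<?xml version="1.0" encoding="utf-8"?>\n'
           b'<parent xmlns="http://ns1/">'
           b'<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">'
           b'<grandchild ns3:attr="value"/>'
           b'</child></parent>')
    soup = BeautifulSoup(doc, "lxml-xml")
    print(soup.encode() == doc)   # should print True: prefixes survive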
@@ -5,7 +5,7 @@ import warnings
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
    HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (

@@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):

    @property
    def default_builder(self):
-        return HTML5TreeBuilder()
+        return HTML5TreeBuilder

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.

@@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    def test_reparented_markup(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
-        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
-        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_containing_identical_whitespace_nodes(self):

@@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    def test_foster_parenting(self):
        markup = b"""<table><td></tbody>A"""
        soup = self.soup(markup)
-        self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+        self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())

    def test_extraction(self):
        """
        Test that extraction does not destroy the tree.

        https://bugs.launchpad.net/beautifulsoup/+bug/1782928
        """

        markup = """
<html><head></head>
<style>
</style><script></script><body><p>hello</p></body></html>
"""
        soup = self.soup(markup)
        [s.extract() for s in soup('script')]
        [s.extract() for s in soup('style')]

        self.assertEqual(len(soup.find_all("p")), 1)

    def test_empty_comment(self):
        """
        Test that empty comment does not break structure.

        https://bugs.launchpad.net/beautifulsoup/+bug/1806598
        """

        markup = """
<html>
<body>
<form>
<!----><input type="text">
</form>
</body>
</html>
"""
        soup = self.soup(markup)
        inputs = []
        for form in soup.find_all('form'):
            inputs.extend(form.find_all('input'))
        self.assertEqual(len(inputs), 1)
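The launchpad bug above boils down to extract() corrupting html5lib's tree; a quick standalone check (needs html5lib installed):

    from bs4 import BeautifulSoup

    markup = "<html><head><style></style><script></script></head><body><p>hello</p></body></html>"
    soup = BeautifulSoup(markup, "html5lib")
    for tag in soup(["script", "style"]):
        tag.extract()                  # must not destroy the rest of the tree
    print(len(soup.find_all("p")))     # 1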
@@ -5,12 +5,11 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
+from bs4.builder._htmlparser import BeautifulSoupHTMLParser

class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):

-    @property
-    def default_builder(self):
-        return HTMLParserTreeBuilder()
+    default_builder = HTMLParserTreeBuilder

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.

@@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    def test_redundant_empty_element_closing_tags(self):
        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
        self.assertSoupEquals('</br></br></br>', "")

    def test_empty_element(self):
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
        self.assertSoupEquals("foo &# bar", "foo &amp;# bar")


class TestHTMLParserSubclass(SoupTest):
    def test_error(self):
        """Verify that our HTMLParser subclass implements error() in a way
        that doesn't cause a crash.
        """
        parser = BeautifulSoupHTMLParser()
        parser.error("don't crash")
@@ -7,7 +7,7 @@ try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
    LXML_PRESENT = False
    LXML_VERSION = (0,)

@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):

    @property
    def default_builder(self):
-        return LXMLTreeBuilder()
+        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        self.assertSoupEquals(

@@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
        # a string like u'\x93'
        pass

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

@@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
-        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertEqual("<b/>", str(soup.b))
        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))

    @skipIf(

@@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):

    @property
    def default_builder(self):
-        return LXMLTreeBuilderForXML()
+        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        # We should not track un-prefixed namespaces as we can only hold one
        # and it will be recognized as the default namespace by soupsieve,
        # which may be confusing in some situations. When no namespace is provided
        # for a selector, the default namespace (if defined) is assumed.

        soup = self.soup(
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '</root>'
        )
        self.assertEqual(
            soup._namespaces,
            {'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
        )
@@ -24,6 +24,7 @@ from bs4.dammit import (
    EncodingDetector,
)
from bs4.testing import (
+    default_builder,
    SoupTest,
    skipIf,
)

@@ -32,7 +33,7 @@ import warnings
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
    LXML_PRESENT = False

PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))

@@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):

    def test_short_unicode_input(self):
-        data = u"<h1>éé</h1>"
+        data = "<h1>éé</h1>"
        soup = self.soup(data)
-        self.assertEqual(u"éé", soup.h1.string)
+        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
-        data = u"<h1>foo\0bar</h1>"
+        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
-        self.assertEqual(u"foo\0bar", soup.h1.string)
+        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
            def initialize_soup(self, soup):
                pass
            def prepare_markup(self, *args, **kwargs):
                return ''

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        self.assertEqual(dict(var="value"), soup.builder.called_with)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])


class TestWarnings(SoupTest):

    def _no_parser_specified(self, s, is_there=True):
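Since extra constructor keywords are now forwarded to the tree builder, the multi_valued_attributes behavior tested above is reachable directly; a sketch:

    from bs4 import BeautifulSoup

    markup = '<a id=" an id " class=" a class "></a>'
    # Disable list-valued attributes entirely:
    soup = BeautifulSoup(markup, "html.parser", multi_valued_attributes=None)
    print(soup.a["class"])                 # ' a class '
    # Treat 'id' as list-valued instead of 'class':
    soup = BeautifulSoup(markup, "html.parser", multi_valued_attributes={"*": "id"})
    print(soup.a["id"], soup.a["class"])   # ['an', 'id'] ' a class '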
@@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
-            soup = self.soup(u"http://www.crummyunicode.com/")
+            soup = self.soup("http://www.crummyunicode.com/")
        self.assertTrue(any("looks like a URL" in str(w.message)
            for w in warning_list))

@@ -141,7 +207,7 @@ class TestWarnings(SoupTest):

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
-            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))

@@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
-        s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
+        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
-                         u"foo&forall;\N{SNOWMAN}&otilde;bar")
+                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we

@@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'

@@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest):

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(

@@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest):
            ascii = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii)
            unicode_output = soup_from_ascii.decode()
-            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertTrue(isinstance(unicode_output, str))
            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:

@@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest):
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
-        self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(soup_from_unicode.original_encoding, None)

    def test_utf8_in_unicode_out(self):

@@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest):
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
-        self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.

@@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest):
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
-        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
+        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
-        markup = u"I'm already Unicode! \N{SNOWMAN}"
+        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

@@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
-            dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
+            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"

@@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
-        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
+        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
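UnicodeDammit can be used on its own, as these tests do; a minimal sketch:

    from bs4 import UnicodeDammit

    dammit = UnicodeDammit(b"<foo>\x91\x92\x93\x94</foo>")
    print(dammit.original_encoding)   # windows-1252 (detected)
    print(dammit.unicode_markup)      # <foo>‘’“”</foo>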
@@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase):
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
-        utf8_data = u"Räksmörgås".encode("utf-8")
+        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.

@@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
-        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
+        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):

@@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase):
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)

@@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
-        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
+        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
-        utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
+        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
-            u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
+            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
+            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

@@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase):
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
-            u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
+            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending

@@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase):
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
-            u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
+            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
+            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
+            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
        ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
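detwingle() repairs the "two encodings glued together" case exercised above; a sketch:

    from bs4 import UnicodeDammit

    utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
    windows_1252 = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
    doc = utf8 + windows_1252 + utf8          # mixed encodings in one bytestring
    fixed = UnicodeDammit.detwingle(doc)      # everything re-encoded as UTF-8
    print(fixed.decode("utf8"))               # ☃☃☃“Hi!”☃☃☃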
@@ -1,4 +1,3 @@
-
# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.

@@ -26,6 +25,7 @@ from bs4.element import (
    Comment,
    Declaration,
    Doctype,
+    Formatter,
    NavigableString,
    SoupStrainer,
    Tag,

@@ -71,13 +71,13 @@ class TestFind(TreeTest):
        self.assertEqual(soup.find("b").string, "2")

    def test_unicode_text_find(self):
-        soup = self.soup(u'<h1>Räksmörgås</h1>')
-        self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
+        soup = self.soup('<h1>Räksmörgås</h1>')
+        self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')

    def test_unicode_attribute_find(self):
-        soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
+        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        str(soup)
-        self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
+        self.assertEqual("here it is", soup.find(id='Räksmörgås').text)

    def test_find_everything(self):

@@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
-        self.assertEqual(soup.find_all(string="bar"), [u"bar"])
-        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
+        self.assertEqual(soup.find_all(string="bar"), ["bar"])
+        self.assertEqual(soup.find_all(text="bar"), ["bar"])
        # Match any of a number of strings.
        self.assertEqual(
-            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
+            soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')),
-                         [u"Foo", u"bar", u'\xbb'])
+                         ["Foo", "bar", '\xbb'])
        # Match anything.
        self.assertEqual(soup.find_all(text=True),
-                         [u"Foo", u"bar", u'\xbb'])
+                         ["Foo", "bar", '\xbb'])

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""

@@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
            ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
-        peace = u"םולש".encode("utf8")
-        data = u'<a title="םולש"></a>'.encode("utf8")
+        peace = "םולש".encode("utf8")
+        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
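The string-matching forms of find_all() shown above, runnable on their own:

    import re
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html>Foo<b>bar</b></html>", "html.parser")
    print(soup.find_all(string="bar"))              # ['bar']
    print(soup.find_all(string=re.compile(".*")))   # ['Foo', 'bar']
    print(soup.find_all(string=True))               # every text node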
@@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
        self.assertEqual([], soup.find_all(id=1, text="bar"))


class TestSmooth(TreeTest):
    """Test Tag.smooth."""

    def test_smooth(self):
        soup = self.soup("<div>a</div>")
        div = soup.div
        div.append("b")
        div.append("c")
        div.append(Comment("Comment 1"))
        div.append(Comment("Comment 2"))
        div.append("d")
        builder = self.default_builder()
        span = Tag(soup, builder, 'span')
        span.append('1')
        span.append('2')
        div.append(span)

        # At this point the tree has a bunch of adjacent
        # NavigableStrings. This is normal, but it has no meaning in
        # terms of HTML, so we may want to smooth things out for
        # output.

        # Since the <span> tag has two children, its .string is None.
        self.assertEqual(None, div.span.string)

        self.assertEqual(7, len(div.contents))
        div.smooth()
        self.assertEqual(5, len(div.contents))

        # The three strings at the beginning of div.contents have been
        # merged into one string.
        self.assertEqual('abc', div.contents[0])

        # The call is recursive -- the <span> tag was also smoothed.
        self.assertEqual('12', div.span.string)

        # The two comments have _not_ been merged, even though
        # comments are strings. Merging comments would change the
        # meaning of the HTML.
        self.assertEqual('Comment 1', div.contents[1])
        self.assertEqual('Comment 2', div.contents[2])


class TestIndex(TreeTest):
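Tag.smooth(), whose test is added above, merges adjacent NavigableStrings; a sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div>a</div>", "html.parser")
    soup.div.append("b")
    soup.div.append("c")
    print(len(soup.div.contents))                    # 3 separate strings
    soup.div.smooth()                                # merge them in place
    print(len(soup.div.contents), soup.div.string)   # 1 abc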
@@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
        </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
-        markup = re.compile("\n\s*").sub("", markup)
+        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)


@@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""
    def test_new_tag(self):
        soup = self.soup("")
-        new_tag = soup.new_tag("foo", bar="baz")
+        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
        self.assertTrue(isinstance(new_tag, Tag))
        self.assertEqual("foo", new_tag.name)
-        self.assertEqual(dict(bar="baz"), new_tag.attrs)
+        self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
        self.assertEqual(None, new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")

@@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_insert_beautifulsoup_object_inserts_children(self):
        """Inserting one BeautifulSoup object into another actually inserts all
        of its children -- you'll never combine BeautifulSoup objects.
        """
        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")

        text = "<p>p2</p><p>p3</p>"
        to_insert = self.soup(text)
        soup.insert(1, to_insert)

        for i in soup.descendants:
            assert not isinstance(i, BeautifulSoup)

        p1, p2, p3, p4 = list(soup.children)
        self.assertEqual("And now, a word:", p1.string)
        self.assertEqual("p2", p2.string)
        self.assertEqual("p3", p3.string)
        self.assertEqual("And we're back.", p4.string)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a

@@ -877,7 +939,7 @@ class TestTreeModification(SoupTest):
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
-        builder = self.default_builder
+        builder = self.default_builder()
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
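The new attrs parameter to new_tag() covers attribute names that collide with new_tag's own parameters; a sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("", "html.parser")
    # 'name' is new_tag's first positional parameter, so it must be
    # passed through attrs; 'bar' can stay a plain keyword argument.
    tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
    print(tag.attrs)   # {'bar': 'baz', 'name': 'a name'}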
@@ -912,6 +974,13 @@ class TestTreeModification(SoupTest):
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_extend(self):
        data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
        soup = self.soup(data)
        l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
        soup.a.extend(l)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)

@@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest):
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

        # Can't insert an element before itself.
        b = soup.b
        self.assertRaises(ValueError, b.insert_before, b)

        # Can't insert before if an element has no parent.
        b.extract()
        self.assertRaises(ValueError, b.insert_before, "nope")

        # Can insert an identical element
        soup = self.soup("<a>")
        soup.a.insert_before(soup.new_tag("a"))

    def test_insert_multiple_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ", " ", "QUUX")
        soup.a.insert_before("QUUX", " ", "BAZ")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))

        soup.a.insert_before(soup.b, "FOO")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")

@@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest):
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

        # Can't insert an element after itself.
        b = soup.b
        self.assertRaises(ValueError, b.insert_after, b)

        # Can't insert after if an element has no parent.
        b.extract()
        self.assertRaises(ValueError, b.insert_after, "nope")

        # Can insert an identical element
        soup = self.soup("<a>")
        soup.a.insert_before(soup.new_tag("a"))

    def test_insert_multiple_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ", " ", "QUUX")
        soup.a.insert_after("QUUX", " ", "BAZ")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
        soup.b.insert_after(soup.a, "FOO ")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
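insert_before()/insert_after() now accept several nodes at once, per the tests above; a sketch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>foo</a><b>bar</b>", "html.parser")
    soup.b.insert_before("BAZ", " ", "QUUX")   # three nodes in one call
    print(soup.decode())                       # <a>foo</a>BAZ QUUX<b>bar</b>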
@@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest):
        <script>baz</script>
        </html>""")
        [soup.script.extract() for i in soup.find_all("script")]
-        self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))

    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):

@@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest):
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
-            '.bTag is deprecated, use .find("b") instead.',
+            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
            str(w[0].message))

    def test_has_attr(self):

@@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest):
        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
        encoding = soup.original_encoding
        copy = soup.__copy__()
-        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual("<p> </p>", str(copy))
        self.assertEqual(encoding, copy.original_encoding)

    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
-        html = u"<b>\N{SNOWMAN}</b>"
+        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
-        html = u"<b>Foo<a></a></b><b>Bar</b>"
+        html = "<b>Foo<a></a></b><b>Bar</b>"
        soup = self.soup(html)
        s1 = soup.find(string="Foo")
        s2 = copy.copy(s1)

@@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest):
        self.assertEqual(None, s2.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
-        html = u"<b><!--Foo--></b>"
+        html = "<b><!--Foo--></b>"
        soup = self.soup(html)
        s1 = soup.string
        s2 = copy.copy(s1)

@@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest):
        self.assertTrue(isinstance(s2, Comment))

    def test_copy_entire_soup(self):
-        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        soup_copy = copy.copy(soup)
        self.assertEqual(soup, soup_copy)

    def test_copy_tag_copies_contents(self):
-        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        div = soup.div
        div_copy = copy.copy(div)

        # The two tags look the same, and evaluate to equal.
-        self.assertEqual(unicode(div), unicode(div_copy))
+        self.assertEqual(str(div), str(div_copy))
        self.assertEqual(div, div_copy)

        # But they're not the same object.
@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
|
|||
class TestSubstitutions(SoupTest):
|
||||
|
||||
def test_default_formatter_is_minimal(self):
|
||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter="minimal")
|
||||
# The < is converted back into &lt; but the e-with-acute is left alone.
|
||||
self.assertEqual(
|
||||
decoded,
|
||||
self.document_for(
|
||||
u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
|
||||
def test_formatter_html(self):
|
||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter="html")
|
||||
self.assertEqual(
|
||||
decoded,
|
||||
self.document_for("<b><<Sacré bleu!>></b>"))
|
||||
self.document_for("<br/><b><<Sacré bleu!>></b>"))
|
||||
|
||||
def test_formatter_html5(self):
|
||||
markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter="html5")
|
||||
self.assertEqual(
|
||||
decoded,
|
||||
self.document_for("<br><b><<Sacré bleu!>></b>"))
|
||||
|
||||
def test_formatter_minimal(self):
|
||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter="minimal")
|
||||
# The < is converted back into &lt; but the e-with-acute is left alone.
|
||||
self.assertEqual(
|
||||
decoded,
|
||||
self.document_for(
|
||||
u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
|
||||
def test_formatter_null(self):
|
||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter=None)
|
||||
# Neither the angle brackets nor the e-with-acute are converted.
|
||||
# This is not valid HTML, but it's what the user wanted.
|
||||
self.assertEqual(decoded,
|
||||
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||
|
||||
def test_formatter_custom(self):
|
||||
markup = u"<b><foo></b><b>bar</b>"
|
||||
markup = "<b><foo></b><b>bar</b><br/>"
|
||||
soup = self.soup(markup)
|
||||
decoded = soup.decode(formatter = lambda x: x.upper())
|
||||
# Instead of normal entity conversion code, the custom
|
||||
# callable is called on every string.
|
||||
self.assertEqual(
|
||||
decoded,
|
||||
self.document_for(u"<b><FOO></b><b>BAR</b>"))
|
||||
self.document_for("<b><FOO></b><b>BAR</b><br/>"))
|
||||
|
||||
def test_formatter_is_run_on_attribute_values(self):
|
||||
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
|
||||
markup = '<a href="http://a.com?a=b&c=é">e</a>'
|
||||
soup = self.soup(markup)
|
||||
a = soup.a
|
||||
|
||||
expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>'
|
||||
expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
|
||||
|
||||
self.assertEqual(expect_minimal, a.decode())
|
||||
self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
|
||||
|
||||
expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
|
||||
expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
|
||||
self.assertEqual(expect_html, a.decode(formatter="html"))
|
||||
|
||||
self.assertEqual(markup, a.decode(formatter=None))
|
||||
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
||||
expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
||||
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
||||
|
||||
def test_formatter_skips_script_tag_for_html_documents(self):
|
||||
|
@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
|
|||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_prettify_leaves_preformatted_text_alone(self):
|
||||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
|
||||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
|
||||
# Everything outside the <pre> tag is reformatted, but everything
|
||||
# inside is left alone.
|
||||
self.assertEqual(
|
||||
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
|
||||
'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
|
||||
soup.div.prettify())
|
||||
|
||||
def test_prettify_accepts_formatter(self):
|
||||
def test_prettify_accepts_formatter_function(self):
|
||||
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
||||
pretty = soup.prettify(formatter = lambda x: x.upper())
|
||||
self.assertTrue("FOO" in pretty)
|
||||
|
||||
def test_prettify_outputs_unicode_by_default(self):
|
||||
soup = self.soup("<a></a>")
|
||||
self.assertEqual(unicode, type(soup.prettify()))
|
||||
self.assertEqual(str, type(soup.prettify()))
|
||||
|
||||
def test_prettify_can_encode_data(self):
|
||||
soup = self.soup("<a></a>")
|
||||
self.assertEqual(bytes, type(soup.prettify("utf-8")))
|
||||
|
||||
def test_html_entity_substitution_off_by_default(self):
|
||||
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
|
||||
markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
|
||||
soup = self.soup(markup)
|
||||
encoded = soup.b.encode("utf-8")
|
||||
self.assertEqual(encoded, markup.encode('utf-8'))
|
||||
|
@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
|
|||
"""Test the ability to encode objects into strings."""
|
||||
|
||||
def test_unicode_string_can_be_encoded(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(soup.b.string.encode("utf-8"),
|
||||
u"\N{SNOWMAN}".encode("utf-8"))
|
||||
"\N{SNOWMAN}".encode("utf-8"))
|
||||
|
||||
def test_tag_containing_unicode_string_can_be_encoded(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(
|
||||
soup.b.encode("utf-8"), html.encode("utf-8"))
|
||||
|
||||
def test_encoding_substitutes_unrecognized_characters_by_default(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>")
|
||||
|
||||
def test_encoding_can_be_made_strict(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertRaises(
|
||||
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
|
||||
|
||||
def test_decode_contents(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
|
||||
self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
|
||||
|
||||
def test_encode_contents(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(
|
||||
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
|
||||
"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
|
||||
encoding="utf8"))
|
||||
|
||||
def test_deprecated_renderContents(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
self.assertEqual(
|
||||
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
|
||||
"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
|
||||
|
||||
def test_repr(self):
|
||||
html = u"<b>\N{SNOWMAN}</b>"
|
||||
html = "<b>\N{SNOWMAN}</b>"
|
||||
soup = self.soup(html)
|
||||
if PY3K:
|
||||
self.assertEqual(html, repr(soup))
|
||||
else:
|
||||
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
|
||||
|
||||
class TestFormatter(SoupTest):
|
||||
|
||||
def test_sort_attributes(self):
|
||||
# Test the ability to override Formatter.attributes() to,
|
||||
# e.g., disable the normal sorting of attributes.
|
||||
class UnsortedFormatter(Formatter):
|
||||
def attributes(self, tag):
|
||||
self.called_with = tag
|
||||
for k, v in sorted(tag.attrs.items()):
|
||||
if k == 'ignore':
|
||||
continue
|
||||
yield k, v
|
||||
|
||||
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
|
||||
formatter = UnsortedFormatter()
|
||||
decoded = soup.decode(formatter=formatter)
|
||||
|
||||
# attributes() was called on the <p> tag. It filtered out one
|
||||
# attribute and sorted the other two.
|
||||
self.assertEqual(formatter.called_with, soup.p)
|
||||
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
|
||||
|
||||
|
||||
class TestNavigableStringSubclasses(SoupTest):
|
||||
|
||||
def test_cdata(self):
|
||||
|
@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
|
|||
els = self.soup.select('title')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].name, 'title')
|
||||
self.assertEqual(els[0].contents, [u'The title'])
|
||||
self.assertEqual(els[0].contents, ['The title'])
|
||||
|
||||
def test_one_tag_many(self):
|
||||
els = self.soup.select('div')
|
||||
|
@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertEqual(len(self.soup.select('del')), 0)
|
||||
|
||||
def test_invalid_tag(self):
|
||||
self.assertRaises(ValueError, self.soup.select, 'tag%t')
|
||||
self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
|
||||
|
||||
def test_select_dashed_tag_ids(self):
|
||||
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
|
||||
|
@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertEqual(dashed[0]['id'], 'dash2')
|
||||
|
||||
def test_dashed_tag_text(self):
|
||||
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
|
||||
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
|
||||
|
||||
def test_select_dashed_matches_find_all(self):
|
||||
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
|
||||
|
@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
|
|||
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
|
||||
|
||||
self.assertRaises(
|
||||
NotImplementedError, self.soup.select, "a:nth-of-type(a)")
|
||||
|
||||
SyntaxError, self.soup.select, "a:nth-of-type(a)")
|
||||
|
||||
def test_nth_of_type(self):
|
||||
# Try to select first paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(1)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Some text')
|
||||
self.assertEqual(els[0].string, 'Some text')
|
||||
|
||||
# Try to select third paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(3)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Another')
|
||||
self.assertEqual(els[0].string, 'Another')
|
||||
|
||||
# Try to select (non-existent!) fourth paragraph
|
||||
els = self.soup.select('div#inner p:nth-of-type(4)')
|
||||
self.assertEqual(len(els), 0)
|
||||
|
||||
# Pass in an invalid value.
|
||||
self.assertRaises(
|
||||
ValueError, self.soup.select, 'div p:nth-of-type(0)')
|
||||
# Zero will select no tags.
|
||||
els = self.soup.select('div p:nth-of-type(0)')
|
||||
self.assertEqual(len(els), 0)
|
||||
|
||||
def test_nth_of_type_direct_descendant(self):
|
||||
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
||||
self.assertEqual(len(els), 1)
|
||||
self.assertEqual(els[0].string, u'Some text')
|
||||
self.assertEqual(els[0].string, 'Some text')
|
||||
|
||||
def test_id_child_selector_nth_of_type(self):
|
||||
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
|
||||
|
@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
||||
|
||||
def test_dangling_combinator(self):
|
||||
self.assertRaises(ValueError, self.soup.select, 'h1 >')
|
||||
self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
|
||||
|
||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||
|
@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
|
|||
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
||||
|
||||
def test_invalid_multiple_select(self):
|
||||
self.assertRaises(ValueError, self.soup.select, ',x, y')
|
||||
self.assertRaises(ValueError, self.soup.select, 'x,,y')
|
||||
self.assertRaises(SyntaxError, self.soup.select, ',x, y')
|
||||
self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
|
||||
|
||||
def test_multiple_select_attrs(self):
|
||||
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
||||
|
@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
|
|||
def test_multiple_select_nested(self):
|
||||
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
|
||||
|
||||
def test_select_duplicate_elements(self):
|
||||
# When markup contains duplicate elements, a multiple select
|
||||
# will find all of them.
|
||||
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
|
||||
soup = BeautifulSoup(markup, 'html.parser')
|
||||
selected = soup.select(".c1, .c2")
|
||||
self.assertEqual(3, len(selected))
|
||||
|
||||
|
||||
# Verify that find_all finds the same elements, though because
|
||||
# of an implementation detail it finds them in a different
|
||||
# order.
|
||||
for element in soup.find_all(class_=['c1', 'c2']):
|
||||
assert element in selected
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
from pkgutil import extend_path
|
||||
|
||||
__path__ = extend_path(__path__, __name__)
|
|
@ -1,23 +0,0 @@
|
|||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Execute computations asynchronously using threads or processes."""
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
from concurrent.futures._base import (FIRST_COMPLETED,
|
||||
FIRST_EXCEPTION,
|
||||
ALL_COMPLETED,
|
||||
CancelledError,
|
||||
TimeoutError,
|
||||
Future,
|
||||
Executor,
|
||||
wait,
|
||||
as_completed)
|
||||
from concurrent.futures.thread import ThreadPoolExecutor
|
||||
|
||||
try:
|
||||
from concurrent.futures.process import ProcessPoolExecutor
|
||||
except ImportError:
|
||||
# some platforms don't have multiprocessing
|
||||
pass
|
|
@ -1,607 +0,0 @@
|
|||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
import collections
|
||||
import logging
|
||||
import threading
|
||||
import itertools
|
||||
import time
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
FIRST_COMPLETED = 'FIRST_COMPLETED'
|
||||
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
|
||||
ALL_COMPLETED = 'ALL_COMPLETED'
|
||||
_AS_COMPLETED = '_AS_COMPLETED'
|
||||
|
||||
# Possible future states (for internal use by the futures package).
|
||||
PENDING = 'PENDING'
|
||||
RUNNING = 'RUNNING'
|
||||
# The future was cancelled by the user...
|
||||
CANCELLED = 'CANCELLED'
|
||||
# ...and _Waiter.add_cancelled() was called by a worker.
|
||||
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
|
||||
FINISHED = 'FINISHED'
|
||||
|
||||
_FUTURE_STATES = [
|
||||
PENDING,
|
||||
RUNNING,
|
||||
CANCELLED,
|
||||
CANCELLED_AND_NOTIFIED,
|
||||
FINISHED
|
||||
]
|
||||
|
||||
_STATE_TO_DESCRIPTION_MAP = {
|
||||
PENDING: "pending",
|
||||
RUNNING: "running",
|
||||
CANCELLED: "cancelled",
|
||||
CANCELLED_AND_NOTIFIED: "cancelled",
|
||||
FINISHED: "finished"
|
||||
}
|
||||
|
||||
# Logger for internal use by the futures package.
|
||||
LOGGER = logging.getLogger("concurrent.futures")
|
||||
|
||||
class Error(Exception):
|
||||
"""Base class for all future-related exceptions."""
|
||||
pass
|
||||
|
||||
class CancelledError(Error):
|
||||
"""The Future was cancelled."""
|
||||
pass
|
||||
|
||||
class TimeoutError(Error):
|
||||
"""The operation exceeded the given deadline."""
|
||||
pass
|
||||
|
||||
class _Waiter(object):
|
||||
"""Provides the event that wait() and as_completed() block on."""
|
||||
def __init__(self):
|
||||
self.event = threading.Event()
|
||||
self.finished_futures = []
|
||||
|
||||
def add_result(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
def add_exception(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
def add_cancelled(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
class _AsCompletedWaiter(_Waiter):
|
||||
"""Used by as_completed()."""
|
||||
|
||||
def __init__(self):
|
||||
super(_AsCompletedWaiter, self).__init__()
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def add_result(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_result(future)
|
||||
self.event.set()
|
||||
|
||||
def add_exception(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_exception(future)
|
||||
self.event.set()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_cancelled(future)
|
||||
self.event.set()
|
||||
|
||||
class _FirstCompletedWaiter(_Waiter):
|
||||
"""Used by wait(return_when=FIRST_COMPLETED)."""
|
||||
|
||||
def add_result(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_result(future)
|
||||
self.event.set()
|
||||
|
||||
def add_exception(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_exception(future)
|
||||
self.event.set()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_cancelled(future)
|
||||
self.event.set()
|
||||
|
||||
class _AllCompletedWaiter(_Waiter):
|
||||
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
|
||||
|
||||
def __init__(self, num_pending_calls, stop_on_exception):
|
||||
self.num_pending_calls = num_pending_calls
|
||||
self.stop_on_exception = stop_on_exception
|
||||
self.lock = threading.Lock()
|
||||
super(_AllCompletedWaiter, self).__init__()
|
||||
|
||||
def _decrement_pending_calls(self):
|
||||
with self.lock:
|
||||
self.num_pending_calls -= 1
|
||||
if not self.num_pending_calls:
|
||||
self.event.set()
|
||||
|
||||
def add_result(self, future):
|
||||
super(_AllCompletedWaiter, self).add_result(future)
|
||||
self._decrement_pending_calls()
|
||||
|
||||
def add_exception(self, future):
|
||||
super(_AllCompletedWaiter, self).add_exception(future)
|
||||
if self.stop_on_exception:
|
||||
self.event.set()
|
||||
else:
|
||||
self._decrement_pending_calls()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
super(_AllCompletedWaiter, self).add_cancelled(future)
|
||||
self._decrement_pending_calls()
|
||||
|
||||
class _AcquireFutures(object):
|
||||
"""A context manager that does an ordered acquire of Future conditions."""
|
||||
|
||||
def __init__(self, futures):
|
||||
self.futures = sorted(futures, key=id)
|
||||
|
||||
def __enter__(self):
|
||||
for future in self.futures:
|
||||
future._condition.acquire()
|
||||
|
||||
def __exit__(self, *args):
|
||||
for future in self.futures:
|
||||
future._condition.release()
|
||||
|
||||
def _create_and_install_waiters(fs, return_when):
|
||||
if return_when == _AS_COMPLETED:
|
||||
waiter = _AsCompletedWaiter()
|
||||
elif return_when == FIRST_COMPLETED:
|
||||
waiter = _FirstCompletedWaiter()
|
||||
else:
|
||||
pending_count = sum(
|
||||
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
|
||||
|
||||
if return_when == FIRST_EXCEPTION:
|
||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
|
||||
elif return_when == ALL_COMPLETED:
|
||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
|
||||
else:
|
||||
raise ValueError("Invalid return condition: %r" % return_when)
|
||||
|
||||
for f in fs:
|
||||
f._waiters.append(waiter)
|
||||
|
||||
return waiter
|
||||
|
||||
def as_completed(fs, timeout=None):
|
||||
"""An iterator over the given futures that yields each as it completes.
|
||||
|
||||
Args:
|
||||
fs: The sequence of Futures (possibly created by different Executors) to
|
||||
iterate over.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
An iterator that yields the given Futures as they complete (finished or
|
||||
cancelled). If any given Futures are duplicated, they will be returned
|
||||
once.
|
||||
|
||||
Raises:
|
||||
TimeoutError: If the entire result iterator could not be generated
|
||||
before the given timeout.
|
||||
"""
|
||||
if timeout is not None:
|
||||
end_time = timeout + time.time()
|
||||
|
||||
fs = set(fs)
|
||||
with _AcquireFutures(fs):
|
||||
finished = set(
|
||||
f for f in fs
|
||||
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
|
||||
pending = fs - finished
|
||||
waiter = _create_and_install_waiters(fs, _AS_COMPLETED)
|
||||
|
||||
try:
|
||||
for future in finished:
|
||||
yield future
|
||||
|
||||
while pending:
|
||||
if timeout is None:
|
||||
wait_timeout = None
|
||||
else:
|
||||
wait_timeout = end_time - time.time()
|
||||
if wait_timeout < 0:
|
||||
raise TimeoutError(
|
||||
'%d (of %d) futures unfinished' % (
|
||||
len(pending), len(fs)))
|
||||
|
||||
waiter.event.wait(wait_timeout)
|
||||
|
||||
with waiter.lock:
|
||||
finished = waiter.finished_futures
|
||||
waiter.finished_futures = []
|
||||
waiter.event.clear()
|
||||
|
||||
for future in finished:
|
||||
yield future
|
||||
pending.remove(future)
|
||||
|
||||
finally:
|
||||
for f in fs:
|
||||
with f._condition:
|
||||
f._waiters.remove(waiter)
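# Editor's usage sketch (an assumption, not part of the upstream module):
# how as_completed() is typically consumed. ThreadPoolExecutor lives in a
# sibling module of this backport, hence the local import.
def _as_completed_example():
    from concurrent.futures.thread import ThreadPoolExecutor
    executor = ThreadPoolExecutor(max_workers=2)
    try:
        futures = [executor.submit(pow, 2, n) for n in range(5)]
        for future in as_completed(futures, timeout=10):
            print(future.result())  # completion order, not submission order
    finally:
        executor.shutdown(wait=True)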
|
||||
|
||||
DoneAndNotDoneFutures = collections.namedtuple(
|
||||
'DoneAndNotDoneFutures', 'done not_done')
|
||||
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
|
||||
"""Wait for the futures in the given sequence to complete.
|
||||
|
||||
Args:
|
||||
fs: The sequence of Futures (possibly created by different Executors) to
|
||||
wait upon.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
return_when: Indicates when this function should return. The options
|
||||
are:
|
||||
|
||||
FIRST_COMPLETED - Return when any future finishes or is
|
||||
cancelled.
|
||||
FIRST_EXCEPTION - Return when any future finishes by raising an
|
||||
exception. If no future raises an exception
|
||||
then it is equivalent to ALL_COMPLETED.
|
||||
ALL_COMPLETED - Return when all futures finish or are cancelled.
|
||||
|
||||
Returns:
|
||||
A named 2-tuple of sets. The first set, named 'done', contains the
|
||||
futures that completed (is finished or cancelled) before the wait
|
||||
completed. The second set, named 'not_done', contains uncompleted
|
||||
futures.
|
||||
"""
|
||||
with _AcquireFutures(fs):
|
||||
done = set(f for f in fs
|
||||
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
|
||||
not_done = set(fs) - done
|
||||
|
||||
if (return_when == FIRST_COMPLETED) and done:
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
elif (return_when == FIRST_EXCEPTION) and done:
|
||||
if any(f for f in done
|
||||
if not f.cancelled() and f.exception() is not None):
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
|
||||
if len(done) == len(fs):
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
|
||||
waiter = _create_and_install_waiters(fs, return_when)
|
||||
|
||||
waiter.event.wait(timeout)
|
||||
for f in fs:
|
||||
with f._condition:
|
||||
f._waiters.remove(waiter)
|
||||
|
||||
done.update(waiter.finished_futures)
|
||||
return DoneAndNotDoneFutures(done, set(fs) - done)
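# Editor's usage sketch (an assumption, not part of the upstream module):
# wait() blocks until the return_when condition holds, then reports the
# done/not_done split.
def _wait_example():
    from concurrent.futures.thread import ThreadPoolExecutor
    executor = ThreadPoolExecutor(max_workers=2)
    try:
        futures = [executor.submit(divmod, n, 3) for n in range(4)]
        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
        assert len(done) >= 1                   # at least one future finished
        assert done | not_done == set(futures)  # every future is accounted for
    finally:
        executor.shutdown(wait=True)            # lets the rest finish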
|
||||
|
||||
class Future(object):
|
||||
"""Represents the result of an asynchronous computation."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initializes the future. Should not be called by clients."""
|
||||
self._condition = threading.Condition()
|
||||
self._state = PENDING
|
||||
self._result = None
|
||||
self._exception = None
|
||||
self._traceback = None
|
||||
self._waiters = []
|
||||
self._done_callbacks = []
|
||||
|
||||
def _invoke_callbacks(self):
|
||||
for callback in self._done_callbacks:
|
||||
try:
|
||||
callback(self)
|
||||
except Exception:
|
||||
LOGGER.exception('exception calling callback for %r', self)
|
||||
|
||||
def __repr__(self):
|
||||
with self._condition:
|
||||
if self._state == FINISHED:
|
||||
if self._exception:
|
||||
return '<Future at %s state=%s raised %s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state],
|
||||
self._exception.__class__.__name__)
|
||||
else:
|
||||
return '<Future at %s state=%s returned %s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state],
|
||||
self._result.__class__.__name__)
|
||||
return '<Future at %s state=%s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state])
|
||||
|
||||
def cancel(self):
|
||||
"""Cancel the future if possible.
|
||||
|
||||
Returns True if the future was cancelled, False otherwise. A future
|
||||
cannot be cancelled if it is running or has already completed.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [RUNNING, FINISHED]:
|
||||
return False
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
return True
|
||||
|
||||
self._state = CANCELLED
|
||||
self._condition.notify_all()
|
||||
|
||||
self._invoke_callbacks()
|
||||
return True
|
||||
|
||||
def cancelled(self):
|
||||
"""Return True if the future has cancelled."""
|
||||
with self._condition:
|
||||
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]
|
||||
|
||||
def running(self):
|
||||
"""Return True if the future is currently executing."""
|
||||
with self._condition:
|
||||
return self._state == RUNNING
|
||||
|
||||
def done(self):
|
||||
"""Return True of the future was cancelled or finished executing."""
|
||||
with self._condition:
|
||||
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]
|
||||
|
||||
def __get_result(self):
|
||||
if self._exception:
|
||||
raise type(self._exception), self._exception, self._traceback
|
||||
else:
|
||||
return self._result
|
||||
|
||||
def add_done_callback(self, fn):
|
||||
"""Attaches a callable that will be called when the future finishes.
|
||||
|
||||
Args:
|
||||
fn: A callable that will be called with this future as its only
|
||||
argument when the future completes or is cancelled. The callable
|
||||
will always be called by a thread in the same process in which
|
||||
it was added. If the future has already completed or been
|
||||
cancelled then the callable will be called immediately. These
|
||||
callables are called in the order that they were added.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
|
||||
self._done_callbacks.append(fn)
|
||||
return
|
||||
fn(self)
|
||||
|
||||
def result(self, timeout=None):
|
||||
"""Return the result of the call that the future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the result if the future
|
||||
isn't done. If None, then there is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
The result of the call that the future represents.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
Exception: If the call raised then that exception will be raised.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self.__get_result()
|
||||
|
||||
self._condition.wait(timeout)
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self.__get_result()
|
||||
else:
|
||||
raise TimeoutError()
|
||||
|
||||
def exception_info(self, timeout=None):
|
||||
"""Return a tuple of (exception, traceback) raised by the call that the
|
||||
future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the exception if the
|
||||
future isn't done. If None, then there is no limit on the wait
|
||||
time.
|
||||
|
||||
Returns:
|
||||
A tuple of (exception, traceback) raised by the call that the future
|
||||
represents, or (None, None) if the call completed without raising.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self._exception, self._traceback
|
||||
|
||||
self._condition.wait(timeout)
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self._exception, self._traceback
|
||||
else:
|
||||
raise TimeoutError()
|
||||
|
||||
def exception(self, timeout=None):
|
||||
"""Return the exception raised by the call that the future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the exception if the
|
||||
future isn't done. If None, then there is no limit on the wait
|
||||
time.
|
||||
|
||||
Returns:
|
||||
The exception raised by the call that the future represents or None
|
||||
if the call completed without raising.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
"""
|
||||
return self.exception_info(timeout)[0]
|
||||
|
||||
# The following methods should only be used by Executors and in tests.
|
||||
def set_running_or_notify_cancel(self):
|
||||
"""Mark the future as running or process any cancel notifications.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
|
||||
If the future has been cancelled (cancel() was called and returned
|
||||
True) then any threads waiting on the future completing (through calls
|
||||
to as_completed() or wait()) are notified and False is returned.
|
||||
|
||||
If the future was not cancelled then it is put in the running state
|
||||
(future calls to running() will return True) and True is returned.
|
||||
|
||||
This method should be called by Executor implementations before
|
||||
executing the work associated with this future. If this method returns
|
||||
False then the work should not be executed.
|
||||
|
||||
Returns:
|
||||
False if the Future was cancelled, True otherwise.
|
||||
|
||||
Raises:
|
||||
RuntimeError: if this method was already called or if set_result()
|
||||
or set_exception() was called.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state == CANCELLED:
|
||||
self._state = CANCELLED_AND_NOTIFIED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_cancelled(self)
|
||||
# self._condition.notify_all() is not necessary because
|
||||
# self.cancel() triggers a notification.
|
||||
return False
|
||||
elif self._state == PENDING:
|
||||
self._state = RUNNING
|
||||
return True
|
||||
else:
|
||||
LOGGER.critical('Future %s in unexpected state: %s',
|
||||
id(self),
|
||||
self._state)
|
||||
raise RuntimeError('Future in unexpected state')
|
||||
|
||||
def set_result(self, result):
|
||||
"""Sets the return value of work associated with the future.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
with self._condition:
|
||||
self._result = result
|
||||
self._state = FINISHED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_result(self)
|
||||
self._condition.notify_all()
|
||||
self._invoke_callbacks()
|
||||
|
||||
def set_exception_info(self, exception, traceback):
|
||||
"""Sets the result of the future as being the given exception
|
||||
and traceback.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
with self._condition:
|
||||
self._exception = exception
|
||||
self._traceback = traceback
|
||||
self._state = FINISHED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_exception(self)
|
||||
self._condition.notify_all()
|
||||
self._invoke_callbacks()
|
||||
|
||||
def set_exception(self, exception):
|
||||
"""Sets the result of the future as being the given exception.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
self.set_exception_info(exception, None)
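# Editor's sketch (an assumption, not part of the upstream module): the
# set_* methods above form the producer half of the Future protocol; an
# executor drives a future through it in this order.
def _future_protocol_example():
    f = Future()                              # starts PENDING
    f.add_done_callback(lambda fut: None)     # stored until completion
    assert f.set_running_or_notify_cancel()   # PENDING -> RUNNING
    f.set_result(42)                          # RUNNING -> FINISHED; callbacks fire
    assert f.done() and f.result() == 42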
|
||||
|
||||
class Executor(object):
|
||||
"""This is an abstract base class for concrete asynchronous executors."""
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
"""Submits a callable to be executed with the given arguments.
|
||||
|
||||
Schedules the callable to be executed as fn(*args, **kwargs) and returns
|
||||
a Future instance representing the execution of the callable.
|
||||
|
||||
Returns:
|
||||
A Future representing the given call.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def map(self, fn, *iterables, **kwargs):
|
||||
"""Returns a iterator equivalent to map(fn, iter).
|
||||
|
||||
Args:
|
||||
fn: A callable that will take as many arguments as there are
|
||||
passed iterables.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
An iterator equivalent to map(fn, *iterables), but the calls may
|
||||
be evaluated out-of-order.
|
||||
|
||||
Raises:
|
||||
TimeoutError: If the entire result iterator could not be generated
|
||||
before the given timeout.
|
||||
Exception: If fn(*args) raises for any values.
|
||||
"""
|
||||
timeout = kwargs.get('timeout')
|
||||
if timeout is not None:
|
||||
end_time = timeout + time.time()
|
||||
|
||||
fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]
|
||||
|
||||
# Yield must be hidden in closure so that the futures are submitted
|
||||
# before the first iterator value is required.
|
||||
def result_iterator():
|
||||
try:
|
||||
for future in fs:
|
||||
if timeout is None:
|
||||
yield future.result()
|
||||
else:
|
||||
yield future.result(end_time - time.time())
|
||||
finally:
|
||||
for future in fs:
|
||||
future.cancel()
|
||||
return result_iterator()
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
"""Clean-up the resources associated with the Executor.
|
||||
|
||||
It is safe to call this method several times; once it has been called,
|
||||
no other methods may be called.
|
||||
|
||||
Args:
|
||||
wait: If True then shutdown will not return until all running
|
||||
futures have finished executing and the resources used by the
|
||||
executor have been reclaimed.
|
||||
"""
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.shutdown(wait=True)
|
||||
return False
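# Editor's sketch (an assumption, not part of the upstream module): Executor
# is abstract, so a minimal synchronous subclass shows how submit() is meant
# to drive the Future protocol, and how map() builds on submit().
class _ImmediateExecutor(Executor):
    def submit(self, fn, *args, **kwargs):
        f = Future()
        if f.set_running_or_notify_cancel():
            try:
                f.set_result(fn(*args, **kwargs))
            except BaseException as e:
                f.set_exception(e)
        return f

# e.g. list(_ImmediateExecutor().map(abs, [-1, 2, -3])) == [1, 2, 3]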
|
|
@ -1,359 +0,0 @@
|
|||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Implements ProcessPoolExecutor.
|
||||
|
||||
The following diagram and text describe the data-flow through the system:
|
||||
|
||||
|======================= In-process =====================|== Out-of-process ==|
|
||||
|
||||
+----------+ +----------+ +--------+ +-----------+ +---------+
|
||||
| | => | Work Ids | => | | => | Call Q | => | |
|
||||
| | +----------+ | | +-----------+ | |
|
||||
| | | ... | | | | ... | | |
|
||||
| | | 6 | | | | 5, call() | | |
|
||||
| | | 7 | | | | ... | | |
|
||||
| Process | | ... | | Local | +-----------+ | Process |
|
||||
| Pool | +----------+ | Worker | | #1..n |
|
||||
| Executor | | Thread | | |
|
||||
| | +----------- + | | +-----------+ | |
|
||||
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
|
||||
| | +------------+ | | +-----------+ | |
|
||||
| | | 6: call() | | | | ... | | |
|
||||
| | | future | | | | 4, result | | |
|
||||
| | | ... | | | | 3, except | | |
|
||||
+----------+ +------------+ +--------+ +-----------+ +---------+
|
||||
|
||||
Executor.submit() called:
|
||||
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
|
||||
- adds the id of the _WorkItem to the "Work Ids" queue
|
||||
|
||||
Local worker thread:
|
||||
- reads work ids from the "Work Ids" queue and looks up the corresponding
|
||||
WorkItem from the "Work Items" dict: if the work item has been cancelled then
|
||||
it is simply removed from the dict, otherwise it is repackaged as a
|
||||
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
|
||||
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
|
||||
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
|
||||
- reads _ResultItems from "Result Q", updates the future stored in the
|
||||
"Work Items" dict and deletes the dict entry
|
||||
|
||||
Process #1..n:
|
||||
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
|
||||
_ResultItems in "Request Q"
|
||||
"""
|
||||
|
||||
import atexit
|
||||
from concurrent.futures import _base
|
||||
import Queue as queue
|
||||
import multiprocessing
|
||||
import threading
|
||||
import weakref
|
||||
import sys
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
# Workers are created as daemon threads and processes. This is done to allow the
|
||||
# interpreter to exit when there are still idle processes in a
|
||||
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
|
||||
# allowing workers to die with the interpreter has two undesirable properties:
|
||||
# - The workers would still be running during interpreter shutdown,
|
||||
# meaning that they would fail in unpredictable ways.
|
||||
# - The workers could be killed while evaluating a work item, which could
|
||||
# be bad if the callable being evaluated has external side-effects e.g.
|
||||
# writing to a file.
|
||||
#
|
||||
# To work around this problem, an exit handler is installed which tells the
|
||||
# workers to exit when their work queues are empty and then waits until the
|
||||
# threads/processes finish.
|
||||
|
||||
_threads_queues = weakref.WeakKeyDictionary()
|
||||
_shutdown = False
|
||||
|
||||
def _python_exit():
|
||||
global _shutdown
|
||||
_shutdown = True
|
||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
||||
for t, q in items:
|
||||
q.put(None)
|
||||
for t, q in items:
|
||||
t.join(sys.maxint)
|
||||
|
||||
# Controls how many more calls than processes will be queued in the call queue.
|
||||
# A smaller number will mean that processes spend more time idle waiting for
|
||||
# work while a larger number will make Future.cancel() succeed less frequently
|
||||
# (Futures in the call queue cannot be cancelled).
|
||||
EXTRA_QUEUED_CALLS = 1
|
||||
|
||||
class _WorkItem(object):
|
||||
def __init__(self, future, fn, args, kwargs):
|
||||
self.future = future
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
class _ResultItem(object):
|
||||
def __init__(self, work_id, exception=None, result=None):
|
||||
self.work_id = work_id
|
||||
self.exception = exception
|
||||
self.result = result
|
||||
|
||||
class _CallItem(object):
|
||||
def __init__(self, work_id, fn, args, kwargs):
|
||||
self.work_id = work_id
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def _process_worker(call_queue, result_queue):
|
||||
"""Evaluates calls from call_queue and places the results in result_queue.
|
||||
|
||||
This worker is run in a separate process.
|
||||
|
||||
Args:
|
||||
call_queue: A multiprocessing.Queue of _CallItems that will be read and
|
||||
evaluated by the worker.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems that will be written
|
||||
to by the worker.
|
||||
The worker exits when it reads a None sentinel from call_queue.
|
||||
"""
|
||||
while True:
|
||||
call_item = call_queue.get(block=True)
|
||||
if call_item is None:
|
||||
# Wake up queue management thread
|
||||
result_queue.put(None)
|
||||
return
|
||||
try:
|
||||
r = call_item.fn(*call_item.args, **call_item.kwargs)
|
||||
except BaseException:
|
||||
e = sys.exc_info()[1]
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
exception=e))
|
||||
else:
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
result=r))
|
||||
|
||||
def _add_call_item_to_queue(pending_work_items,
|
||||
work_ids,
|
||||
call_queue):
|
||||
"""Fills call_queue with _WorkItems from pending_work_items.
|
||||
|
||||
This function never blocks.
|
||||
|
||||
Args:
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
|
||||
are consumed and the corresponding _WorkItems from
|
||||
pending_work_items are transformed into _CallItems and put in
|
||||
call_queue.
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems.
|
||||
"""
|
||||
while True:
|
||||
if call_queue.full():
|
||||
return
|
||||
try:
|
||||
work_id = work_ids.get(block=False)
|
||||
except queue.Empty:
|
||||
return
|
||||
else:
|
||||
work_item = pending_work_items[work_id]
|
||||
|
||||
if work_item.future.set_running_or_notify_cancel():
|
||||
call_queue.put(_CallItem(work_id,
|
||||
work_item.fn,
|
||||
work_item.args,
|
||||
work_item.kwargs),
|
||||
block=True)
|
||||
else:
|
||||
del pending_work_items[work_id]
|
||||
continue
|
||||
|
||||
def _queue_management_worker(executor_reference,
|
||||
processes,
|
||||
pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue,
|
||||
result_queue):
|
||||
"""Manages the communication between this process and the worker processes.
|
||||
|
||||
This function is run in a local thread.
|
||||
|
||||
Args:
|
||||
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
|
||||
this thread. Used to determine if the ProcessPoolExecutor has been
|
||||
garbage collected and that this function can exit.
|
||||
processes: A list of the multiprocessing.Process instances used as
|
||||
workers.
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems for processing by the process workers.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems generated by the
|
||||
process workers.
|
||||
"""
|
||||
nb_shutdown_processes = [0]
|
||||
def shutdown_one_process():
|
||||
"""Tell a worker to terminate, which will in turn wake us again"""
|
||||
call_queue.put(None)
|
||||
nb_shutdown_processes[0] += 1
|
||||
while True:
|
||||
_add_call_item_to_queue(pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue)
|
||||
|
||||
result_item = result_queue.get(block=True)
|
||||
if result_item is not None:
|
||||
work_item = pending_work_items[result_item.work_id]
|
||||
del pending_work_items[result_item.work_id]
|
||||
|
||||
if result_item.exception:
|
||||
work_item.future.set_exception(result_item.exception)
|
||||
else:
|
||||
work_item.future.set_result(result_item.result)
|
||||
# Delete references to object. See issue16284
|
||||
del work_item
|
||||
# Check whether we should start shutting down.
|
||||
executor = executor_reference()
|
||||
# No more work items can be added if:
|
||||
# - The interpreter is shutting down OR
|
||||
# - The executor that owns this worker has been collected OR
|
||||
# - The executor that owns this worker has been shutdown.
|
||||
if _shutdown or executor is None or executor._shutdown_thread:
|
||||
# Since no new work items can be added, it is safe to shutdown
|
||||
# this thread if there are no pending work items.
|
||||
if not pending_work_items:
|
||||
while nb_shutdown_processes[0] < len(processes):
|
||||
shutdown_one_process()
|
||||
# If .join() is not called on the created processes then
|
||||
# some multiprocessing.Queue methods may deadlock on Mac OS
|
||||
# X.
|
||||
for p in processes:
|
||||
p.join()
|
||||
call_queue.close()
|
||||
return
|
||||
del executor
|
||||
|
||||
_system_limits_checked = False
|
||||
_system_limited = None
|
||||
def _check_system_limits():
|
||||
global _system_limits_checked, _system_limited
|
||||
if _system_limits_checked:
|
||||
if _system_limited:
|
||||
raise NotImplementedError(_system_limited)
|
||||
_system_limits_checked = True
|
||||
try:
|
||||
import os
|
||||
nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
|
||||
except (AttributeError, ValueError):
|
||||
# sysconf not available or setting not available
|
||||
return
|
||||
if nsems_max == -1:
|
||||
# indeterminate limit; assume the limit is determined
|
||||
# by available memory only
|
||||
return
|
||||
if nsems_max >= 256:
|
||||
# minimum number of semaphores available
|
||||
# according to POSIX
|
||||
return
|
||||
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
|
||||
raise NotImplementedError(_system_limited)
|
||||
|
||||
class ProcessPoolExecutor(_base.Executor):
|
||||
def __init__(self, max_workers=None):
|
||||
"""Initializes a new ProcessPoolExecutor instance.
|
||||
|
||||
Args:
|
||||
max_workers: The maximum number of processes that can be used to
|
||||
execute the given calls. If None or not given then as many
|
||||
worker processes will be created as the machine has processors.
|
||||
"""
|
||||
_check_system_limits()
|
||||
|
||||
if max_workers is None:
|
||||
self._max_workers = multiprocessing.cpu_count()
|
||||
else:
|
||||
self._max_workers = max_workers
|
||||
|
||||
# Make the call queue slightly larger than the number of processes to
|
||||
# prevent the worker processes from idling. But don't make it too big
|
||||
# because futures in the call queue cannot be cancelled.
|
||||
self._call_queue = multiprocessing.Queue(self._max_workers +
|
||||
EXTRA_QUEUED_CALLS)
|
||||
self._result_queue = multiprocessing.Queue()
|
||||
self._work_ids = queue.Queue()
|
||||
self._queue_management_thread = None
|
||||
self._processes = set()
|
||||
|
||||
# Shutdown is a two-step process.
|
||||
self._shutdown_thread = False
|
||||
self._shutdown_lock = threading.Lock()
|
||||
self._queue_count = 0
|
||||
self._pending_work_items = {}
|
||||
|
||||
def _start_queue_management_thread(self):
|
||||
# When the executor gets lost, the weakref callback will wake up
|
||||
# the queue management thread.
|
||||
def weakref_cb(_, q=self._result_queue):
|
||||
q.put(None)
|
||||
if self._queue_management_thread is None:
|
||||
self._queue_management_thread = threading.Thread(
|
||||
target=_queue_management_worker,
|
||||
args=(weakref.ref(self, weakref_cb),
|
||||
self._processes,
|
||||
self._pending_work_items,
|
||||
self._work_ids,
|
||||
self._call_queue,
|
||||
self._result_queue))
|
||||
self._queue_management_thread.daemon = True
|
||||
self._queue_management_thread.start()
|
||||
_threads_queues[self._queue_management_thread] = self._result_queue
|
||||
|
||||
def _adjust_process_count(self):
|
||||
for _ in range(len(self._processes), self._max_workers):
|
||||
p = multiprocessing.Process(
|
||||
target=_process_worker,
|
||||
args=(self._call_queue,
|
||||
self._result_queue))
|
||||
p.start()
|
||||
self._processes.add(p)
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
with self._shutdown_lock:
|
||||
if self._shutdown_thread:
|
||||
raise RuntimeError('cannot schedule new futures after shutdown')
|
||||
|
||||
f = _base.Future()
|
||||
w = _WorkItem(f, fn, args, kwargs)
|
||||
|
||||
self._pending_work_items[self._queue_count] = w
|
||||
self._work_ids.put(self._queue_count)
|
||||
self._queue_count += 1
|
||||
# Wake up queue management thread
|
||||
self._result_queue.put(None)
|
||||
|
||||
self._start_queue_management_thread()
|
||||
self._adjust_process_count()
|
||||
return f
|
||||
submit.__doc__ = _base.Executor.submit.__doc__
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
with self._shutdown_lock:
|
||||
self._shutdown_thread = True
|
||||
if self._queue_management_thread:
|
||||
# Wake up queue management thread
|
||||
self._result_queue.put(None)
|
||||
if wait:
|
||||
self._queue_management_thread.join(sys.maxint)
|
||||
# To reduce the risk of opening too many files, remove references to
|
||||
# objects that use file descriptors.
|
||||
self._queue_management_thread = None
|
||||
self._call_queue = None
|
||||
self._result_queue = None
|
||||
self._processes = None
|
||||
shutdown.__doc__ = _base.Executor.shutdown.__doc__
|
||||
|
||||
atexit.register(_python_exit)
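# Editor's usage sketch (an assumption, not part of the upstream module).
# Submitted callables must be picklable, so the task is defined at module
# level rather than inline.
def _square(x):
    return x * x

def _process_pool_example():
    with ProcessPoolExecutor(max_workers=2) as executor:
        results = list(executor.map(_square, range(5)))
        assert results == [0, 1, 4, 9, 16]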
|
|
@ -1,134 +0,0 @@
|
|||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Implements ThreadPoolExecutor."""
|
||||
|
||||
import atexit
|
||||
from concurrent.futures import _base
|
||||
import Queue as queue
|
||||
import threading
|
||||
import weakref
|
||||
import sys
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
# Workers are created as daemon threads. This is done to allow the interpreter
|
||||
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
|
||||
# pool (i.e. shutdown() was not called). However, allowing workers to die with
|
||||
# the interpreter has two undesirable properties:
|
||||
# - The workers would still be running during interpreter shutdown,
|
||||
# meaning that they would fail in unpredictable ways.
|
||||
# - The workers could be killed while evaluating a work item, which could
|
||||
# be bad if the callable being evaluated has external side-effects e.g.
|
||||
# writing to a file.
|
||||
#
|
||||
# To work around this problem, an exit handler is installed which tells the
|
||||
# workers to exit when their work queues are empty and then waits until the
|
||||
# threads finish.
|
||||
|
||||
_threads_queues = weakref.WeakKeyDictionary()
|
||||
_shutdown = False
|
||||
|
||||
def _python_exit():
|
||||
global _shutdown
|
||||
_shutdown = True
|
||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
||||
for t, q in items:
|
||||
q.put(None)
|
||||
for t, q in items:
|
||||
t.join(sys.maxint)
|
||||
|
||||
atexit.register(_python_exit)
|
||||
|
||||
class _WorkItem(object):
|
||||
def __init__(self, future, fn, args, kwargs):
|
||||
self.future = future
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def run(self):
|
||||
if not self.future.set_running_or_notify_cancel():
|
||||
return
|
||||
|
||||
try:
|
||||
result = self.fn(*self.args, **self.kwargs)
|
||||
except BaseException:
|
||||
e, tb = sys.exc_info()[1:]
|
||||
self.future.set_exception_info(e, tb)
|
||||
else:
|
||||
self.future.set_result(result)
|
||||
|
||||
def _worker(executor_reference, work_queue):
|
||||
try:
|
||||
while True:
|
||||
work_item = work_queue.get(block=True)
|
||||
if work_item is not None:
|
||||
work_item.run()
|
||||
# Delete references to object. See issue16284
|
||||
del work_item
|
||||
continue
|
||||
executor = executor_reference()
|
||||
# Exit if:
|
||||
# - The interpreter is shutting down OR
|
||||
# - The executor that owns the worker has been collected OR
|
||||
# - The executor that owns the worker has been shutdown.
|
||||
if _shutdown or executor is None or executor._shutdown:
|
||||
# Notice other workers
|
||||
work_queue.put(None)
|
||||
return
|
||||
del executor
|
||||
except BaseException:
|
||||
_base.LOGGER.critical('Exception in worker', exc_info=True)
|
||||
|
||||
class ThreadPoolExecutor(_base.Executor):
|
||||
def __init__(self, max_workers):
|
||||
"""Initializes a new ThreadPoolExecutor instance.
|
||||
|
||||
Args:
|
||||
max_workers: The maximum number of threads that can be used to
|
||||
execute the given calls.
|
||||
"""
|
||||
self._max_workers = max_workers
|
||||
self._work_queue = queue.Queue()
|
||||
self._threads = set()
|
||||
self._shutdown = False
|
||||
self._shutdown_lock = threading.Lock()
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
with self._shutdown_lock:
|
||||
if self._shutdown:
|
||||
raise RuntimeError('cannot schedule new futures after shutdown')
|
||||
|
||||
f = _base.Future()
|
||||
w = _WorkItem(f, fn, args, kwargs)
|
||||
|
||||
self._work_queue.put(w)
|
||||
self._adjust_thread_count()
|
||||
return f
|
||||
submit.__doc__ = _base.Executor.submit.__doc__
|
||||
|
||||
def _adjust_thread_count(self):
|
||||
# When the executor gets lost, the weakref callback will wake up
|
||||
# the worker threads.
|
||||
def weakref_cb(_, q=self._work_queue):
|
||||
q.put(None)
|
||||
# TODO(bquinlan): Should avoid creating new threads if there are more
|
||||
# idle threads than items in the work queue.
|
||||
if len(self._threads) < self._max_workers:
|
||||
t = threading.Thread(target=_worker,
|
||||
args=(weakref.ref(self, weakref_cb),
|
||||
self._work_queue))
|
||||
t.daemon = True
|
||||
t.start()
|
||||
self._threads.add(t)
|
||||
_threads_queues[t] = self._work_queue
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
with self._shutdown_lock:
|
||||
self._shutdown = True
|
||||
self._work_queue.put(None)
|
||||
if wait:
|
||||
for t in self._threads:
|
||||
t.join(sys.maxint)
|
||||
shutdown.__doc__ = _base.Executor.shutdown.__doc__
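# Editor's usage sketch (an assumption, not part of the upstream module):
# a typical fan-out/fan-in over blocking calls; the URLs are placeholders.
def _thread_pool_example(urls=('http://example.com',)):
    import urllib2  # this backport targets Python 2
    pool = ThreadPoolExecutor(max_workers=4)
    try:
        futures = [(url, pool.submit(urllib2.urlopen, url)) for url in urls]
        for url, future in futures:
            print('%s -> %s' % (url, future.result().getcode()))
    finally:
        pool.shutdown(wait=True)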
|
|
@ -1,73 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Death by Captcha HTTP and socket API clients.
|
||||
|
||||
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
|
||||
socket ones. Both offer the same functionality, with the socket API
|
||||
sporting faster responses and using far fewer connections.
|
||||
|
||||
To access the socket API, use SocketClient class; for the HTTP API, use
|
||||
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
|
||||
connection opened and serializes all API requests sent through it, thus
|
||||
it is advised to keep a pool of them if your script is heavily
|
||||
multithreaded.
|
||||
|
||||
Both SocketClient and HttpClient give you the following methods:
|
||||
|
||||
get_user()
|
||||
Returns your DBC account details as a dict with the following keys:
|
||||
|
||||
"user": your account numeric ID; if login fails, it will be the only
|
||||
item with the value of 0;
|
||||
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
|
||||
solved CAPTCHA in US cents;
|
||||
"balance": your DBC account balance in US cents;
|
||||
"is_banned": flag indicating whether your account is suspended or not.
|
||||
|
||||
get_balance()
|
||||
Returns your DBC account balance in US cents.
|
||||
|
||||
get_captcha(cid)
|
||||
Returns an uploaded CAPTCHA details as a dict with the following keys:
|
||||
|
||||
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
|
||||
be the only item with the value of 0;
|
||||
"text": the CAPTCHA text, if solved, otherwise None;
|
||||
"is_correct": flag indicating whether the CAPTCHA was solved correctly
|
||||
(DBC can detect that in rare cases).
|
||||
|
||||
The only argument `cid` is the CAPTCHA numeric ID.
|
||||
|
||||
get_text(cid)
|
||||
Returns an uploaded CAPTCHA text (None if not solved). The only argument
|
||||
`cid` is the CAPTCHA numeric ID.
|
||||
|
||||
report(cid)
|
||||
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
|
||||
CAPTCHA numeric ID. Returns True on success, False otherwise.
|
||||
|
||||
upload(captcha)
|
||||
Uploads a CAPTCHA. The only argument `captcha` can be either file-like
|
||||
object (any object with `read` method defined, actually, so StringIO
|
||||
will do), or CAPTCHA image file name. On successul upload you'll get
|
||||
the CAPTCHA details dict (see get_captcha() method).
|
||||
|
||||
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
|
||||
to poll for its status periodically using get_captcha() or get_text()
|
||||
method until the CAPTCHA is solved and you get the text.
|
||||
|
||||
decode(captcha, timeout=DEFAULT_TIMEOUT)
|
||||
A convenient method that uploads a CAPTCHA and polls for its status
|
||||
periodically, but no longer than `timeout` (defaults to 60 seconds).
|
||||
If solved, you'll get the CAPTCHA details dict (see get_captcha()
|
||||
method for details). See upload() method for details on `captcha`
|
||||
argument.
|
||||
|
||||
Visit http://www.deathbycaptcha.com/user/api for updates.
|
||||
|
||||
"""
|
||||
|
||||
import base64
|
||||
import binascii
|
||||
import errno
|
||||
|
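Per the docstring above, a minimal usage sketch (credentials and file name are
placeholders, not part of this commit):

client = HttpClient('your-username', 'your-password')
print(client.get_balance())                # account balance in US cents
solved = client.decode('captcha.png', 60)  # upload, then poll for up to 60 s
if solved:
    print(solved['captcha'], solved['text'])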
@@ -79,8 +12,7 @@ import socket
import sys
import threading
import time
import urllib
import urllib2

try:
    from json import read as json_decode, write as json_encode
except ImportError:

@@ -89,64 +21,71 @@ except ImportError:
    except ImportError:
        from simplejson import loads as json_decode, dumps as json_encode

try:
    from urllib2 import build_opener, HTTPRedirectHandler, Request, HTTPError
    from urllib import urlencode, urlopen
except ImportError:
    from urllib.request import build_opener, HTTPRedirectHandler, Request, urlopen
    from urllib.error import HTTPError
    from urllib.parse import urlencode

# API version and unique software ID
API_VERSION = 'DBC/Python v4.6'
API_VERSION = 'DBC/Python v4.0.11'
SOFTWARE_VENDOR_ID = 0

# Default CAPTCHA timeout and decode() polling interval
DEFAULT_TIMEOUT = 60
DEFAULT_TOKEN_TIMEOUT = 120
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3
POLLS_INTERVAL = 5

# Base HTTP API url
HTTP_BASE_URL = 'http://api.dbcapi.me/api'
HTTP_BASE_URL = 'http://api.deathbycaptcha.com/api'

# Preferred HTTP API server's response content type, do not change
HTTP_RESPONSE_TYPE = 'application/json'

# Socket API server's host & ports range
SOCKET_HOST = 'api.dbcapi.me'
SOCKET_HOST = 'api.deathbycaptcha.com'
SOCKET_PORTS = range(8123, 8131)


def _load_image(captcha):
    if hasattr(captcha, 'read'):
        img = captcha.read()
    elif type(captcha) == bytearray:
        img = captcha
    else:
        img = ''
        try:
            captcha_file = open(captcha, 'rb')
        except Exception:
            raise
        else:
            img = captcha_file.read()
            captcha_file.close()
    if not len(img):
        raise ValueError('CAPTCHA image is empty')
    elif imghdr.what(None, img) is None:
        raise TypeError('Unknown CAPTCHA image type')
    else:
        return img


class AccessDeniedException(Exception):
    pass


class Client(object):

    """Death by Captcha API Client."""
    """Death by Captcha API Client"""

    def __init__(self, username, password):
        self.is_verbose = False
        self.userpwd = {'username': username, 'password': password}
        self.userpwd = {'username': username,
                        'password': password}

    def _load_file(self, captcha):
        if hasattr(captcha, 'read'):
            raw_captcha = captcha.read()
        elif isinstance(captcha, bytearray):
            raw_captcha = captcha
        elif os.path.isfile(captcha):
            raw_captcha = ''
            try:
                f = open(captcha, 'rb')
            except Exception as e:
                raise e
            else:
                raw_captcha = f.read()
                f.close()
        else:
            f_stream = urlopen(captcha)
            raw_captcha = f_stream.read()

        if not len(raw_captcha):
            raise ValueError('CAPTCHA image is empty')
        elif imghdr.what(None, raw_captcha) is None:
            raise TypeError('Unknown CAPTCHA image type')
        else:
            return raw_captcha

    def _log(self, cmd, msg=''):
        if self.is_verbose:
            print '%d %s %s' % (time.time(), cmd, msg.rstrip())
            print('%d %s %s' % (time.time(), cmd, msg.rstrip()))
        return self

    def close(self):

@@ -156,16 +95,16 @@ class Client(object):
        pass

    def get_user(self):
        """Fetch user details -- ID, balance, rate and banned status."""
        raise NotImplementedError()
        """Fetch the user's details dict -- balance, rate and banned status."""
        raise NotImplemented()

    def get_balance(self):
        """Fetch user balance (in US cents)."""
        """Fetch the user's balance (in US cents)."""
        return self.get_user().get('balance')

    def get_captcha(self, cid):
        """Fetch a CAPTCHA details -- ID, text and correctness flag."""
        raise NotImplementedError()
        """Fetch a CAPTCHA details dict -- its ID, text and correctness."""
        raise NotImplemented()

    def get_text(self, cid):
        """Fetch a CAPTCHA text."""

@@ -173,7 +112,11 @@ class Client(object):

    def report(self, cid):
        """Report a CAPTCHA as incorrectly solved."""
        raise NotImplementedError()
        raise NotImplemented()

    def remove(self, cid):
        """Remove an unsolved CAPTCHA."""
        raise NotImplemented()

    def upload(self, captcha):
        """Upload a CAPTCHA.

@@ -182,56 +125,32 @@ class Client(object):
        dict on success.

        """
        raise NotImplementedError()
        raise NotImplemented()

    def decode(self, captcha=None, timeout=None, **kwargs):
        """
        Try to solve a CAPTCHA.
    def decode(self, captcha, timeout=DEFAULT_TIMEOUT):
        """Try to solve a CAPTCHA.

        See Client.upload() for arguments details.

        Uploads a CAPTCHA, polls for its status periodically with arbitrary
        timeout (in seconds), returns CAPTCHA details if (correctly) solved.

        """
        if not timeout:
            if not captcha:
                timeout = DEFAULT_TOKEN_TIMEOUT
            else:
                timeout = DEFAULT_TIMEOUT

        deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
        uploaded_captcha = self.upload(captcha, **kwargs)
        if uploaded_captcha:
            intvl_idx = 0  # POLL_INTERVAL index
            while deadline > time.time() and not uploaded_captcha.get('text'):
                intvl, intvl_idx = self._get_poll_interval(intvl_idx)
                time.sleep(intvl)
                pulled = self.get_captcha(uploaded_captcha['captcha'])
                if pulled['captcha'] == uploaded_captcha['captcha']:
                    uploaded_captcha = pulled
            if uploaded_captcha.get('text') and \
                    uploaded_captcha.get('is_correct'):
                return uploaded_captcha

    def _get_poll_interval(self, idx):
        """Returns poll interval and next index depending on index provided"""

        if len(POLLS_INTERVAL) > idx:
            intvl = POLLS_INTERVAL[idx]
        else:
            intvl = DFLT_POLL_INTERVAL
        idx += 1

        return intvl, idx

        c = self.upload(captcha)
        if c:
            while deadline > time.time() and not c.get('text'):
                time.sleep(POLLS_INTERVAL)
                c = self.get_captcha(c['captcha'])
            if c.get('text') and c.get('is_correct'):
                return c
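The v4.6 decode() walks POLLS_INTERVAL and then falls back to
DFLT_POLL_INTERVAL between get_captcha() polls; a small sketch of the schedule
it produces (illustrative only):

POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3

def poll_schedule(n):
    # the first n sleep intervals used between polls
    return [POLLS_INTERVAL[i] if i < len(POLLS_INTERVAL) else DFLT_POLL_INTERVAL
            for i in range(n)]

print(poll_schedule(12))  # [1, 1, 2, 3, 2, 2, 3, 2, 2, 3, 3, 3]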
class HttpClient(Client):

    """Death by Captcha HTTP API client."""

    def __init__(self, *args):
        Client.__init__(self, *args)
        self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
        self.opener = build_opener(HTTPRedirectHandler())

    def _call(self, cmd, payload=None, headers=None):
        if headers is None:

@@ -239,30 +158,22 @@ class HttpClient(Client):
        headers['Accept'] = HTTP_RESPONSE_TYPE
        headers['User-Agent'] = API_VERSION
        if hasattr(payload, 'items'):
            payload = urllib.urlencode(payload)
            payload = urlencode(payload)
            self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
        else:
            self._log('SEND', '%s' % cmd)
        if payload is not None:
            headers['Content-Length'] = len(payload)
        try:
            response = self.opener.open(urllib2.Request(
            response = self.opener.open(Request(
                HTTP_BASE_URL + '/' + cmd.strip('/'),
                data=payload,
                headers=headers
            )).read()
        except urllib2.HTTPError, err:
            if 403 == err.code:
                raise AccessDeniedException('Access denied, please check'
                                            ' your credentials and/or balance')
            elif 400 == err.code or 413 == err.code:
                raise ValueError("CAPTCHA was rejected by the service, check"
                                 " if it's a valid image")
            elif 503 == err.code:
                raise OverflowError("CAPTCHA was rejected due to service"
                                    " overload, try again later")
            else:
                raise err
        except HTTPError as e:
            if 403 == e.code:
                raise AccessDeniedException(
                    'Access denied, please check your credentials and/or balance')
            elif 400 == e.code or 413 == e.code:
                raise ValueError("CAPTCHA was rejected by the service, check if it's a valid image")
        else:
            self._log('RECV', '%d %s' % (len(response), response))
        try:

@@ -281,53 +192,38 @@ class HttpClient(Client):
        return not self._call('captcha/%d/report' % cid,
                              self.userpwd.copy()).get('is_correct')

    def upload(self, captcha=None, **kwargs):
    def remove(self, cid):
        return not self._call('captcha/%d/remove' % cid,
                              self.userpwd.copy()).get('captcha')

    def upload(self, captcha):
        boundary = binascii.hexlify(os.urandom(16))
        banner = kwargs.get('banner', '')
        if banner:
            kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
        body = '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in self.userpwd.items())

        body += '\r\n'.join(('\r\n'.join((
            '--%s' % boundary,
            'Content-Disposition: form-data; name="%s"' % k,
            'Content-Type: text/plain',
            'Content-Length: %d' % len(str(v)),
            '',
            str(v)
        ))) for k, v in kwargs.items())

        if captcha:
            img = _load_image(captcha)
            body += '\r\n'.join((
                '',
                '--%s' % boundary,
                'Content-Disposition: form-data; name="captchafile"; '
                'filename="captcha"',
                'Content-Type: application/octet-stream',
                'Content-Length: %d' % len(img),
                '',
                img,
                '--%s--' % boundary,
                ''
            ))

        data = self.userpwd.copy()
        data['swid'] = SOFTWARE_VENDOR_ID
        body = '\r\n'.join(('\r\n'.join(('--%s' % boundary,
                                         'Content-Disposition: form-data; name="%s"' % k,
                                         'Content-Type: text/plain',
                                         'Content-Length: %d' % len(str(v)),
                                         '',
                                         str(v))))
                           for k, v in data.items())
        captcha = self._load_file(captcha)
        body += '\r\n'.join(('',
                             '--%s' % boundary,
                             'Content-Disposition: form-data; name="captchafile"; filename="captcha"',
                             'Content-Type: application/octet-stream',
                             'Content-Length: %d' % len(captcha),
                             '',
                             captcha,
                             '--%s--' % boundary,
                             ''))
        response = self._call('captcha', body, {
            'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
        }) or {}
        if response.get('captcha'):
            return response
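The hand-rolled multipart body above is equivalent in spirit to a
requests-based upload; a hypothetical sketch (endpoint and field names taken
from the code above, credentials are placeholders):

import requests

resp = requests.post(
    'http://api.deathbycaptcha.com/api/captcha',
    headers={'Accept': 'application/json'},
    data={'username': 'user', 'password': 'pass', 'swid': 0},
    files={'captchafile': ('captcha', open('captcha.png', 'rb'))})
print(resp.json())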
class SocketClient(Client):

    """Death by Captcha socket API client."""

    TERMINATOR = '\r\n'

@@ -357,11 +253,12 @@ class SocketClient(Client):
        self.socket.settimeout(0)
        try:
            self.socket.connect(host)
        except socket.error, err:
            if (err.args[0] not in
                    (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
        except socket.error as e:
            if errno.EINPROGRESS == e[0]:
                pass
            else:
                self.close()
                raise err
                raise e
        return self.socket

    def __del__(self):

@@ -372,30 +269,27 @@ class SocketClient(Client):
        fds = [sock]
        buf += self.TERMINATOR
        response = ''
        intvl_idx = 0
        while True:
            intvl, intvl_idx = self._get_poll_interval(intvl_idx)
            rds, wrs, exs = select.select((not buf and fds) or [],
                                          (buf and fds) or [],
                                          fds,
                                          intvl)
            if exs:
            rd, wr, ex = select.select((not buf and fds) or [],
                                       (buf and fds) or [],
                                       fds,
                                       POLLS_INTERVAL)
            if ex:
                raise IOError('select() failed')
            try:
                if wrs:
                if wr:
                    while buf:
                        buf = buf[wrs[0].send(buf):]
                elif rds:
                        buf = buf[wr[0].send(buf):]
                elif rd:
                    while True:
                        s = rds[0].recv(256)
                        s = rd[0].recv(256)
                        if not s:
                            raise IOError('recv(): connection lost')
                        else:
                            response += s
            except socket.error, err:
                if (err.args[0] not in
                        (errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
                    raise err
            except socket.error as e:
                if e[0] not in (errno.EAGAIN, errno.EINPROGRESS):
                    raise e
            if response.endswith(self.TERMINATOR):
                self._log('RECV', response)
                return response.rstrip(self.TERMINATOR)

@@ -409,18 +303,16 @@ class SocketClient(Client):
        request = json_encode(data)

        response = None
        for _ in range(2):
            if not self.socket and cmd != 'login':
                self._call('login', self.userpwd.copy())
        for i in range(2):
            self.socket_lock.acquire()
            try:
                sock = self.connect()
                response = self._sendrecv(sock, request)
            except IOError, err:
                sys.stderr.write(str(err) + "\n")
            except IOError as e:
                sys.stderr.write(str(e) + "\n")
                self.close()
            except socket.error, err:
                sys.stderr.write(str(err) + "\n")
            except socket.error as e:
                sys.stderr.write(str(e) + "\n")
                self.close()
                raise IOError('Connection refused')
            else:

@@ -428,89 +320,84 @@ class SocketClient(Client):
            finally:
                self.socket_lock.release()

        if response is None:
            raise IOError('Connection lost or timed out during API request')

        try:
            response = json_decode(response)
        except Exception:
            raise RuntimeError('Invalid API response')

        if not response.get('error'):
            return response

        error = response['error']
        if error in ('not-logged-in', 'invalid-credentials'):
            raise AccessDeniedException('Access denied, check your credentials')
        elif 'banned' == error:
            raise AccessDeniedException('Access denied, account is suspended')
        elif 'insufficient-funds' == error:
            raise AccessDeniedException(
                'CAPTCHA was rejected due to low balance')
        elif 'invalid-captcha' == error:
            raise ValueError('CAPTCHA is not a valid image')
        elif 'service-overload' == error:
            raise OverflowError(
                'CAPTCHA was rejected due to service overload, try again later')
        else:
            if response is None:
                raise IOError('Connection lost timed out during API request')
            try:
                response = json_decode(response)
            except Exception:
                raise RuntimeError('Invalid API response')
            if 'error' in response:
                error = response['error']
                if 'not-logged-in' == error:
                    raise AccessDeniedException('Access denied, check your credentials')
                elif 'banned' == error:
                    raise AccessDeniedException('Access denied, account is suspended')
                elif 'insufficient-funds' == error:
                    raise AccessDeniedException('CAPTCHA was rejected due to low balance')
                elif 'invalid-captcha' == error:
                    raise ValueError('CAPTCHA is not a valid image')
                elif 'service-overload' == error:
                    raise ValueError(
                        'CAPTCHA was rejected due to service overload, try again later')
                else:
                    raise RuntimeError('API server error occurred: %s' % error)
        except Exception as e:
            self.socket_lock.acquire()
            self.close()
            self.socket_lock.release()
            raise RuntimeError('API server error occurred: %s' % error)
            raise e
        else:
            return response

    def get_user(self):
        return self._call('user') or {'user': 0}
        return self._call('user', self.userpwd.copy()) or {'user': 0}

    def get_captcha(self, cid):
        return self._call('captcha', {'captcha': cid}) or {'captcha': 0}

    def upload(self, captcha=None, **kwargs):
        data = {}
        if captcha:
            data['captcha'] = base64.b64encode(_load_image(captcha))
        if kwargs:
            banner = kwargs.get('banner', '')
            if banner:
                kwargs['banner'] = base64.b64encode(_load_image(banner))
            data.update(kwargs)
    def upload(self, captcha):
        data = self.userpwd.copy()
        data['captcha'] = base64.b64encode(self._load_file(captcha))
        response = self._call('upload', data)
        if response.get('captcha'):
            uploaded_captcha = dict(
                (k, response.get(k))
                for k in ('captcha', 'text', 'is_correct')
            )
            if not uploaded_captcha['text']:
                uploaded_captcha['text'] = None
            return uploaded_captcha
        return dict((k, response.get(k)) for k in ('captcha', 'text', 'is_correct'))

    def report(self, cid):
        return not self._call('report', {'captcha': cid}).get('is_correct')
        data = self.userpwd.copy()
        data['captcha'] = cid
        return not self._call('report', data).get('is_correct')

    def remove(self, cid):
        data = self.userpwd.copy()
        data['captcha'] = cid
        return not self._call('remove', data).get('captcha')


if '__main__' == __name__:
    import sys

    # Put your DBC username & password here:
    # client = HttpClient(sys.argv[1], sys.argv[2])
    #client = HttpClient(sys.argv[1], sys.argv[2])
    client = SocketClient(sys.argv[1], sys.argv[2])
    client.is_verbose = True

    print 'Your balance is %s US cents' % client.get_balance()
    print('Your balance is %s US cents' % client.get_balance())

    for fn in sys.argv[3:]:
        try:
            # Put your CAPTCHA image file name or file-like object, and optional
            # solving timeout (in seconds) here:
            captcha = client.decode(fn, DEFAULT_TIMEOUT)
        except Exception, e:
        except Exception as e:
            sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
            captcha = None

        if captcha:
            print 'CAPTCHA %d solved: %s' % \
                (captcha['captcha'], captcha['text'])
            print('CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text']))

            # Report as incorrectly solved if needed. Make sure the CAPTCHA was
            # in fact incorrectly solved!
            # try:
            #     client.report(captcha['captcha'])
            # except Exception, e:
            #     sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
            try:
                client.report(captcha['captcha'])
            except Exception as e:
                sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
@@ -40,7 +40,7 @@ import operator
import itertools
import collections

__version__ = '4.3.0'
__version__ = '4.4.0'

if sys.version >= '3':
    from inspect import getfullargspec

@@ -65,6 +65,12 @@ except AttributeError:
    # let's assume there are no coroutine functions in old Python
    def iscoroutinefunction(f):
        return False
try:
    from inspect import isgeneratorfunction
except ImportError:
    # assume no generator function in old Python versions
    def isgeneratorfunction(caller):
        return False


DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(')

@@ -173,7 +179,8 @@ class FunctionMaker(object):
        # Ensure each generated function has a unique filename for profilers
        # (such as cProfile) that depend on the tuple of (<filename>,
        # <definition line>, <function name>) being unique.
        filename = '<decorator-gen-%d>' % (next(self._compile_count),)
        filename = '<%s:decorator-gen-%d>' % (
            __file__, next(self._compile_count))
        try:
            code = compile(src, filename, 'single')
            exec(code, evaldict)

@@ -218,6 +225,8 @@ class FunctionMaker(object):
def decorate(func, caller, extras=()):
    """
    decorate(func, caller) decorates a function using a caller.
    If the caller is a generator function, the resulting function
    will be a generator function.
    """
    evaldict = dict(_call_=caller, _func_=func)
    es = ''

@@ -225,9 +234,23 @@ def decorate(func, caller, extras=()):
        ex = '_e%d_' % i
        evaldict[ex] = extra
        es += ex + ', '
    fun = FunctionMaker.create(
        func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
        evaldict, __wrapped__=func)

    if '3.5' <= sys.version < '3.6':
        # with Python 3.5 isgeneratorfunction returns True for all coroutines
        # however we know that it is NOT possible to have a generator
        # coroutine in python 3.5: PEP525 was not there yet
        generatorcaller = isgeneratorfunction(
            caller) and not iscoroutinefunction(caller)
    else:
        generatorcaller = isgeneratorfunction(caller)
    if generatorcaller:
        fun = FunctionMaker.create(
            func, "for res in _call_(_func_, %s%%(shortsignature)s):\n"
                  "    yield res" % es, evaldict, __wrapped__=func)
    else:
        fun = FunctionMaker.create(
            func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
            evaldict, __wrapped__=func)
    if hasattr(func, '__qualname__'):
        fun.__qualname__ = func.__qualname__
    return fun
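decorate(func, caller) is the library's public helper; a minimal sketch of how
it is used (the trace caller is illustrative, not part of this commit):

from decorator import decorate

def trace(func, *args, **kw):
    # runs around every call to the decorated function
    print('calling %s' % func.__name__)
    return func(*args, **kw)

def add(a, b):
    return a + b

add = decorate(add, trace)
add(1, 2)  # prints "calling add", returns 3; add's signature is preserved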
@@ -261,12 +284,12 @@ def decorator(caller, _func=None):
        doc = caller.__call__.__doc__
    evaldict = dict(_call=caller, _decorate_=decorate)
    dec = FunctionMaker.create(
        '%s(%s func)' % (name, defaultargs),
        '%s(func, %s)' % (name, defaultargs),
        'if func is None: return lambda func:  _decorate_(func, _call, (%s))\n'
        'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs),
        evaldict, doc=doc, module=caller.__module__, __wrapped__=caller)
    if defaults:
        dec.__defaults__ = defaults + (None,)
        dec.__defaults__ = (None,) + defaults
    return dec


@@ -1,4 +1,4 @@
__version__ = '0.6.5'
__version__ = '0.7.1'

from .lock import Lock  # noqa
from .lock import NeedRegenerationException  # noqa
libs/dogpile/cache/region.py
@@ -10,8 +10,9 @@ from ..util import compat
import time
import datetime
from numbers import Number
from functools import wraps
from functools import wraps, partial
import threading
from decorator import decorate

_backend_loader = PluginLoader("dogpile.cache")
register_backend = _backend_loader.register

@@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy):


class CacheRegion(object):
    """A front end to a particular cache backend.
    r"""A front end to a particular cache backend.

    :param name: Optional, a string name for the region.
     This isn't used internally

@@ -484,6 +485,26 @@ class CacheRegion(object):
        else:
            return self._LockWrapper()

    # cached value
    _actual_backend = None

    @property
    def actual_backend(self):
        """Return the ultimate backend underneath any proxies.

        The backend might be the result of one or more ``proxy.wrap``
        applications.  If so, derive the actual underlying backend.

        .. versionadded:: 0.6.6

        """
        if self._actual_backend is None:
            _backend = self.backend
            while hasattr(_backend, 'proxied'):
                _backend = _backend.proxied
            self._actual_backend = _backend
        return self._actual_backend

    def invalidate(self, hard=True):
        """Invalidate this :class:`.CacheRegion`.


@@ -723,7 +744,8 @@ class CacheRegion(object):
        ]

    def get_or_create(
            self, key, creator, expiration_time=None, should_cache_fn=None):
            self, key, creator, expiration_time=None, should_cache_fn=None,
            creator_args=None):
        """Return a cached value based on the given key.

        If the value does not exist or is considered to be expired

@@ -759,6 +781,11 @@ class CacheRegion(object):

        :param creator: function which creates a new value.

        :param creator_args: optional tuple of (args, kwargs) that will be
         passed to the creator function if present.

         .. versionadded:: 0.7.0

        :param expiration_time: optional expiration time which will override
         the expiration time already configured on this :class:`.CacheRegion`
         if not None.   To set no expiration, use the value -1.
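A hypothetical use of the new creator_args parameter (the loader function and
configured region are placeholders):

def load_user(user_id, include_profile=False):
    return fetch_from_db(user_id, include_profile)  # stand-in for real work

value = region.get_or_create(
    'user:42', load_user,
    creator_args=((42,), {'include_profile': True}))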
@@ -799,7 +826,7 @@ class CacheRegion(object):
            value = self.backend.get(key)
            if (value is NO_VALUE or value.metadata['v'] != value_version or
                    self.region_invalidator.is_hard_invalidated(
                        value.metadata["ct"])):
                    value.metadata["ct"])):
                raise NeedRegenerationException()
            ct = value.metadata["ct"]
            if self.region_invalidator.is_soft_invalidated(ct):

@@ -808,7 +835,10 @@ class CacheRegion(object):
            return value.payload, ct

        def gen_value():
            created_value = creator()
            if creator_args:
                created_value = creator(*creator_args[0], **creator_args[1])
            else:
                created_value = creator()
            value = self._value(created_value)

            if not should_cache_fn or \

@@ -831,8 +861,13 @@ class CacheRegion(object):

        if self.async_creation_runner:
            def async_creator(mutex):
                return self.async_creation_runner(
                    self, orig_key, creator, mutex)
                if creator_args:
                    @wraps(creator)
                    def go():
                        return creator(*creator_args[0], **creator_args[1])
                else:
                    go = creator
                return self.async_creation_runner(self, orig_key, go, mutex)
        else:
            async_creator = None

@@ -896,7 +931,7 @@ class CacheRegion(object):

        if (value is NO_VALUE or value.metadata['v'] != value_version or
                self.region_invalidator.is_hard_invalidated(
                    value.metadata['v'])):
                    value.metadata['ct'])):
            # dogpile.core understands a 0 here as
            # "the value is not available", e.g.
            # _has_value() will return False.

@@ -1228,26 +1263,31 @@ class CacheRegion(object):
        if function_key_generator is None:
            function_key_generator = self.function_key_generator

        def decorator(fn):
        def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
            key = key_generator(*arg, **kw)

            timeout = expiration_time() if expiration_time_is_callable \
                else expiration_time
            return self.get_or_create(key, user_func, timeout,
                                      should_cache_fn, (arg, kw))

        def cache_decorator(user_func):
            if to_str is compat.string_type:
                # backwards compatible
                key_generator = function_key_generator(namespace, fn)
                key_generator = function_key_generator(namespace, user_func)
            else:
                key_generator = function_key_generator(
                    namespace, fn,
                    namespace, user_func,
                    to_str=to_str)

            @wraps(fn)
            def decorate(*arg, **kw):
            def refresh(*arg, **kw):
                """
                Like invalidate, but regenerates the value instead
                """
                key = key_generator(*arg, **kw)

                @wraps(fn)
                def creator():
                    return fn(*arg, **kw)
                timeout = expiration_time() if expiration_time_is_callable \
                    else expiration_time
                return self.get_or_create(key, creator, timeout,
                                          should_cache_fn)
                value = user_func(*arg, **kw)
                self.set(key, value)
                return value

            def invalidate(*arg, **kw):
                key = key_generator(*arg, **kw)

@@ -1261,20 +1301,18 @@ class CacheRegion(object):
                key = key_generator(*arg, **kw)
                return self.get(key)

            def refresh(*arg, **kw):
                key = key_generator(*arg, **kw)
                value = fn(*arg, **kw)
                self.set(key, value)
                return value
            user_func.set = set_
            user_func.invalidate = invalidate
            user_func.get = get
            user_func.refresh = refresh
            user_func.original = user_func

            decorate.set = set_
            decorate.invalidate = invalidate
            decorate.refresh = refresh
            decorate.get = get
            decorate.original = fn
            # Use `decorate` to preserve the signature of :param:`user_func`.

            return decorate
        return decorator
            return decorate(user_func, partial(
                get_or_create_for_user_func, key_generator))

        return cache_decorator

    def cache_multi_on_arguments(
            self, namespace=None, expiration_time=None,

@@ -1402,51 +1440,50 @@ class CacheRegion(object):
        if function_multi_key_generator is None:
            function_multi_key_generator = self.function_multi_key_generator

        def decorator(fn):
        def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
            cache_keys = arg
            keys = key_generator(*arg, **kw)
            key_lookup = dict(zip(keys, cache_keys))

            @wraps(user_func)
            def creator(*keys_to_create):
                return user_func(*[key_lookup[k] for k in keys_to_create])

            timeout = expiration_time() if expiration_time_is_callable \
                else expiration_time

            if asdict:
                def dict_create(*keys):
                    d_values = creator(*keys)
                    return [
                        d_values.get(key_lookup[k], NO_VALUE)
                        for k in keys]

                def wrap_cache_fn(value):
                    if value is NO_VALUE:
                        return False
                    elif not should_cache_fn:
                        return True
                    else:
                        return should_cache_fn(value)

                result = self.get_or_create_multi(
                    keys, dict_create, timeout, wrap_cache_fn)
                result = dict(
                    (k, v) for k, v in zip(cache_keys, result)
                    if v is not NO_VALUE)
            else:
                result = self.get_or_create_multi(
                    keys, creator, timeout,
                    should_cache_fn)

            return result

        def cache_decorator(user_func):
            key_generator = function_multi_key_generator(
                namespace, fn,
                namespace, user_func,
                to_str=to_str)

            @wraps(fn)
            def decorate(*arg, **kw):
                cache_keys = arg
                keys = key_generator(*arg, **kw)
                key_lookup = dict(zip(keys, cache_keys))

                @wraps(fn)
                def creator(*keys_to_create):
                    return fn(*[key_lookup[k] for k in keys_to_create])

                timeout = expiration_time() if expiration_time_is_callable \
                    else expiration_time

                if asdict:
                    def dict_create(*keys):
                        d_values = creator(*keys)
                        return [
                            d_values.get(key_lookup[k], NO_VALUE)
                            for k in keys]

                    def wrap_cache_fn(value):
                        if value is NO_VALUE:
                            return False
                        elif not should_cache_fn:
                            return True
                        else:
                            return should_cache_fn(value)

                    result = self.get_or_create_multi(
                        keys, dict_create, timeout, wrap_cache_fn)
                    result = dict(
                        (k, v) for k, v in zip(cache_keys, result)
                        if v is not NO_VALUE)
                else:
                    result = self.get_or_create_multi(
                        keys, creator, timeout,
                        should_cache_fn)

                return result

            def invalidate(*arg):
                keys = key_generator(*arg)
                self.delete_multi(keys)

@@ -1466,7 +1503,7 @@ class CacheRegion(object):

            def refresh(*arg):
                keys = key_generator(*arg)
                values = fn(*arg)
                values = user_func(*arg)
                if asdict:
                    self.set_multi(
                        dict(zip(keys, [values[a] for a in arg]))

@@ -1478,13 +1515,18 @@ class CacheRegion(object):
                    )
                return values

            decorate.set = set_
            decorate.invalidate = invalidate
            decorate.refresh = refresh
            decorate.get = get
            user_func.set = set_
            user_func.invalidate = invalidate
            user_func.refresh = refresh
            user_func.get = get

            # Use `decorate` to preserve the signature of :param:`user_func`.

            return decorate(user_func, partial(get_or_create_for_user_func, key_generator))

        return cache_decorator


            return decorate
        return decorator


def make_region(*arg, **kw):
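The reworked decorator keeps the set/get/invalidate/refresh helpers while
delegating to decorate() to preserve the wrapped signature; an illustrative use
(assuming an already-configured region):

@region.cache_on_arguments()
def load_settings(section):
    return expensive_lookup(section)    # stand-in for real work

load_settings('general')                # computed once, then served from cache
load_settings.refresh('general')        # regenerate and re-store the value
load_settings.invalidate('general')     # drop the cached value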
libs/dogpile/cache/util.py
@@ -1,5 +1,4 @@
from hashlib import sha1
import inspect
from ..util import compat
from ..util import langhelpers


@@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type):
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    args = inspect.getargspec(fn)
    args = compat.inspect_getargspec(fn)
    has_self = args[0] and args[0][0] in ('self', 'cls')

    def generate_key(*args, **kw):

@@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type):
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    args = inspect.getargspec(fn)
    args = compat.inspect_getargspec(fn)
    has_self = args[0] and args[0][0] in ('self', 'cls')

    def generate_keys(*args, **kw):

@@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
    else:
        namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)

    argspec = inspect.getargspec(fn)
    argspec = compat.inspect_getargspec(fn)
    default_list = list(argspec.defaults or [])
    # Reverse the list, as we want to compare the argspec by negative index,
    # meaning default_list[0] should be args[-1], which works well with


@@ -69,11 +69,10 @@ class Lock(object):
        """Return true if the expiration time is reached, or no
        value is available."""

        return not self._has_value(createdtime) or \
            (
                self.expiretime is not None and
                time.time() - createdtime > self.expiretime
            )
        return not self._has_value(createdtime) or (
            self.expiretime is not None and
            time.time() - createdtime > self.expiretime
        )

    def _has_value(self, createdtime):
        """Return true if the creation function has proceeded

@@ -91,68 +90,100 @@ class Lock(object):
        value = NOT_REGENERATED
        createdtime = -1

        generated = self._enter_create(createdtime)
        generated = self._enter_create(value, createdtime)

        if generated is not NOT_REGENERATED:
            generated, createdtime = generated
            return generated
        elif value is NOT_REGENERATED:
            # we called upon the creator, and it said that it
            # didn't regenerate.  this typically means another
            # thread is running the creation function, and that the
            # cache should still have a value.  However,
            # we don't have a value at all, which is unusual since we just
            # checked for it, so check again (TODO: is this a real codepath?)
            try:
                value, createdtime = value_fn()
                return value
            except NeedRegenerationException:
                raise Exception("Generation function should "
                                "have just been called by a concurrent "
                                "thread.")
                raise Exception(
                    "Generation function should "
                    "have just been called by a concurrent "
                    "thread.")
        else:
            return value

    def _enter_create(self, createdtime):

    def _enter_create(self, value, createdtime):
        if not self._is_expired(createdtime):
            return NOT_REGENERATED

        async = False
        _async = False

        if self._has_value(createdtime):
            has_value = True
            if not self.mutex.acquire(False):
                log.debug("creation function in progress "
                          "elsewhere, returning")
                log.debug(
                    "creation function in progress "
                    "elsewhere, returning")
                return NOT_REGENERATED
        else:
            has_value = False
            log.debug("no value, waiting for create lock")
            self.mutex.acquire()

        try:
            log.debug("value creation lock %r acquired" % self.mutex)

            # see if someone created the value already
            try:
                value, createdtime = self.value_and_created_fn()
            except NeedRegenerationException:
                pass
            else:
                if not self._is_expired(createdtime):
                    log.debug("value already present")
                    return value, createdtime
                elif self.async_creator:
                    log.debug("Passing creation lock to async runner")
                    self.async_creator(self.mutex)
                    async = True
                    return value, createdtime
            if not has_value:
                # we entered without a value, or at least with "creationtime ==
                # 0".   Run the "getter" function again, to see if another
                # thread has already generated the value while we waited on the
                # mutex,  or if the caller is otherwise telling us there is a
                # value already which allows us to use async regeneration. (the
                # latter is used by the multi-key routine).
                try:
                    value, createdtime = self.value_and_created_fn()
                except NeedRegenerationException:
                    # nope, nobody created the value, we're it.
                    # we must create it right now
                    pass
                else:
                    has_value = True
                    # caller is telling us there is a value and that we can
                    # use async creation if it is expired.
                    if not self._is_expired(createdtime):
                        # it's not expired, return it
                        log.debug("Concurrent thread created the value")
                        return value, createdtime

            log.debug("Calling creation function")
            created = self.creator()
            return created
                    # otherwise it's expired, call creator again

            if has_value and self.async_creator:
                # we have a value we can return, safe to use async_creator
                log.debug("Passing creation lock to async runner")

                # so...run it!
                self.async_creator(self.mutex)
                _async = True

                # and return the expired value for now
                return value, createdtime

            # it's expired, and it's our turn to create it synchronously, *or*,
            # there's no value at all, and we have to create it synchronously
            log.debug(
                "Calling creation function for %s value",
                "not-yet-present" if not has_value else
                "previously expired"
            )
            return self.creator()
        finally:
            if not async:
            if not _async:
                self.mutex.release()
                log.debug("Released creation lock")


    def __enter__(self):
        return self._enter()

    def __exit__(self, type, value, traceback):
        pass


@@ -51,11 +51,33 @@ else:
        import thread  # noqa


if py3k:
    import collections
    ArgSpec = collections.namedtuple(
        "ArgSpec",
        ["args", "varargs", "keywords", "defaults"])

    from inspect import getfullargspec as inspect_getfullargspec

    def inspect_getargspec(func):
        return ArgSpec(
            *inspect_getfullargspec(func)[0:4]
        )
else:
    from inspect import getargspec as inspect_getargspec  # noqa

if py3k or jython:
    import pickle
else:
    import cPickle as pickle  # noqa

if py3k:
    def read_config_file(config, fileobj):
        return config.read_file(fileobj)
else:
    def read_config_file(config, fileobj):
        return config.readfp(fileobj)


def timedelta_total_seconds(td):
    if py27:
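A quick sketch of what the new shim returns on Python 3 (import path assumed
from the vendored dogpile layout):

from dogpile.util.compat import inspect_getargspec

def f(a, b=1, *args, **kw):
    pass

print(inspect_getargspec(f))
# ArgSpec(args=['a', 'b'], varargs='args', keywords='kw', defaults=(1,))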
@@ -50,7 +50,7 @@ class NameRegistry(object):
        self.creator = creator

    def get(self, identifier, *args, **kw):
        """Get and possibly create the value.
        r"""Get and possibly create the value.

        :param identifier: Hash key for the value.
         If the creation function is called, this identifier

@@ -75,10 +75,12 @@ class NameRegistry(object):
            if identifier in self._values:
                return self._values[identifier]
            else:
                self._values[identifier] = value = self.creator(identifier, *args, **kw)
                self._values[identifier] = value = self.creator(
                    identifier, *args, **kw)
                return value
        except KeyError:
            self._values[identifier] = value = self.creator(identifier, *args, **kw)
            self._values[identifier] = value = self.creator(
                identifier, *args, **kw)
            return value
        finally:
            self._mutex.release()


@@ -23,7 +23,7 @@ class ReadWriteMutex(object):

    def __init__(self):
        # counts how many asynchronous methods are executing
        self.async = 0
        self.async_ = 0

        # pointer to thread that is the current sync operation
        self.current_sync_operation = None

@@ -31,7 +31,7 @@ class ReadWriteMutex(object):
        # condition object to lock on
        self.condition = threading.Condition(threading.Lock())

    def acquire_read_lock(self, wait = True):
    def acquire_read_lock(self, wait=True):
        """Acquire the 'read' lock."""
        self.condition.acquire()
        try:

@@ -45,7 +45,7 @@ class ReadWriteMutex(object):
            if self.current_sync_operation is not None:
                return False

            self.async += 1
            self.async_ += 1
            log.debug("%s acquired read lock", self)
        finally:
            self.condition.release()

@@ -57,23 +57,23 @@ class ReadWriteMutex(object):
        """Release the 'read' lock."""
        self.condition.acquire()
        try:
            self.async -= 1
            self.async_ -= 1

            # check if we are the last asynchronous reader thread
            # out the door.
            if self.async == 0:
            if self.async_ == 0:
                # yes. so if a sync operation is waiting, notifyAll to wake
                # it up
                if self.current_sync_operation is not None:
                    self.condition.notifyAll()
            elif self.async < 0:
            elif self.async_ < 0:
                raise LockError("Synchronizer error - too many "
                                "release_read_locks called")
            log.debug("%s released read lock", self)
        finally:
            self.condition.release()

    def acquire_write_lock(self, wait = True):
    def acquire_write_lock(self, wait=True):
        """Acquire the 'write' lock."""
        self.condition.acquire()
        try:

@@ -96,7 +96,7 @@ class ReadWriteMutex(object):
            self.current_sync_operation = threading.currentThread()

            # now wait again for asyncs to finish
            if self.async > 0:
            if self.async_ > 0:
                if wait:
                    # wait
                    self.condition.wait()


@@ -6,8 +6,16 @@
# s/class \(\w\+\):/class \1(object):/

# Use iterator versions of map and range:
from itertools import imap as map
range = xrange
try:
    from itertools import imap as map
except ImportError:
    imap = map

try:
    import xrange
    range = xrange
except ImportError:
    pass

# Except that xrange only supports machine integers, not longs, so...
def long_range(start, end):
@@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

# Bazarr patch to use custom ConfigParser2:
from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError
#try:
#    from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
#except ImportError:
#    from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
try:
    from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
except ImportError:
    from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError


class simpleconfigparser(configparser):
libs/six.py
@@ -1,4 +1,4 @@
# Copyright (c) 2010-2017 Benjamin Peterson
# Copyright (c) 2010-2018 Benjamin Peterson
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal

@@ -29,7 +29,7 @@ import sys
import types

__author__ = "Benjamin Peterson <benjamin@python.org>"
__version__ = "1.11.0"
__version__ = "1.12.0"


# Useful for very coarse version differentiation.

@@ -844,10 +844,71 @@ def add_metaclass(metaclass):
        orig_vars.pop(slots_var)
        orig_vars.pop('__dict__', None)
        orig_vars.pop('__weakref__', None)
        if hasattr(cls, '__qualname__'):
            orig_vars['__qualname__'] = cls.__qualname__
        return metaclass(cls.__name__, cls.__bases__, orig_vars)
    return wrapper


def ensure_binary(s, encoding='utf-8', errors='strict'):
    """Coerce **s** to six.binary_type.

    For Python 2:
      - `unicode` -> encoded to `str`
      - `str` -> `str`

    For Python 3:
      - `str` -> encoded to `bytes`
      - `bytes` -> `bytes`
    """
    if isinstance(s, text_type):
        return s.encode(encoding, errors)
    elif isinstance(s, binary_type):
        return s
    else:
        raise TypeError("not expecting type '%s'" % type(s))


def ensure_str(s, encoding='utf-8', errors='strict'):
    """Coerce *s* to `str`.

    For Python 2:
      - `unicode` -> encoded to `str`
      - `str` -> `str`

    For Python 3:
      - `str` -> `str`
      - `bytes` -> decoded to `str`
    """
    if not isinstance(s, (text_type, binary_type)):
        raise TypeError("not expecting type '%s'" % type(s))
    if PY2 and isinstance(s, text_type):
        s = s.encode(encoding, errors)
    elif PY3 and isinstance(s, binary_type):
        s = s.decode(encoding, errors)
    return s


def ensure_text(s, encoding='utf-8', errors='strict'):
    """Coerce *s* to six.text_type.

    For Python 2:
      - `unicode` -> `unicode`
      - `str` -> `unicode`

    For Python 3:
      - `str` -> `str`
      - `bytes` -> decoded to `str`
    """
    if isinstance(s, binary_type):
        return s.decode(encoding, errors)
    elif isinstance(s, text_type):
        return s
    else:
        raise TypeError("not expecting type '%s'" % type(s))


def python_2_unicode_compatible(klass):
    """
    A decorator that defines __unicode__ and __str__ methods under Python 2.
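The new ensure_* helpers normalize str/bytes across Python versions; on
Python 3, for instance (illustrative):

import six

six.ensure_binary(u'caf\xe9')    # b'caf\xc3\xa9'
six.ensure_str(b'abc')           # 'abc'
six.ensure_text(b'caf\xc3\xa9')  # u'caf\xe9'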
@ -1,6 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
__title__ = 'subliminal'
|
||||
__version__ = '2.1.0.dev'
|
||||
__version__ = '2.0.5'
|
||||
__short_version__ = '.'.join(__version__.split('.')[:2])
|
||||
__author__ = 'Antoine Bertin'
|
||||
__license__ = 'MIT'
|
||||
|
|
|
@ -219,12 +219,13 @@ config_file = 'config.ini'
|
|||
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
|
||||
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
|
||||
help='OpenSubtitles configuration.')
|
||||
@click.option('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.')
|
||||
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
|
||||
show_default=True, expose_value=True, help='Path to the cache directory.')
|
||||
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
|
||||
@click.version_option(__version__)
|
||||
@click.pass_context
|
||||
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
|
||||
def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug):
|
||||
"""Subtitles, faster than your thoughts."""
|
||||
# create cache directory
|
||||
try:
|
||||
|
@ -252,6 +253,8 @@ def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
|
|||
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
|
||||
if opensubtitles:
|
||||
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
|
||||
if subscenter:
|
||||
ctx.obj['provider_configs']['subscenter'] = {'username': subscenter[0], 'password': subscenter[1]}
|
||||
|
||||
|
||||
@subliminal.command()
|
||||
|
|
|
@ -1,38 +1,19 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from collections import defaultdict
|
||||
|
||||
import platform
|
||||
is_windows_special_path = False
|
||||
|
||||
if platform.system() == "Windows":
|
||||
try:
|
||||
__file__.decode("ascii")
|
||||
except UnicodeDecodeError:
|
||||
is_windows_special_path = True
|
||||
|
||||
if not is_windows_special_path:
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
else:
|
||||
ThreadPoolExecutor = object
|
||||
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
import io
|
||||
import itertools
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
import os.path
|
||||
import socket
|
||||
|
||||
from babelfish import Language, LanguageReverseError
|
||||
from guessit import guessit
|
||||
from six.moves.xmlrpc_client import ProtocolError
|
||||
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
|
||||
from zipfile import BadZipfile
|
||||
from ssl import SSLError
|
||||
from rarfile import NotRarFile, RarCannotExec, RarFile
|
||||
import requests
|
||||
|
||||
from .exceptions import ServiceUnavailable
|
||||
from .extensions import provider_manager, refiner_manager
|
||||
from .score import compute_score as default_compute_score
|
||||
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
|
||||
|
@ -98,18 +79,6 @@ class ProviderPool(object):
|
|||
self.initialized_providers[name].terminate()
|
||||
except (requests.Timeout, socket.timeout):
|
||||
logger.error('Provider %r timed out, improperly terminated', name)
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
|
||||
except:
|
||||
logger.exception('Provider %r terminated unexpectedly', name)
|
||||
|
||||
|
@ -149,18 +118,6 @@ class ProviderPool(object):
|
|||
return self[provider].list_subtitles(video, provider_languages)
|
||||
except (requests.Timeout, socket.timeout):
|
||||
logger.error('Provider %r timed out', provider)
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r', provider, e.response.status_code)
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable', provider)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r', provider, e.args[0])
|
||||
except:
|
||||
logger.exception('Unexpected error in provider %r', provider)
|
||||
|
||||
|
@ -216,28 +173,6 @@ class ProviderPool(object):
|
|||
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code in range(500, 600):
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
else:
|
||||
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
|
||||
e.response.status_code)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except SSLError as e:
|
||||
if e.args[0] == 'The read operation timed out':
|
||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
||||
else:
|
||||
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
return False
|
||||
except (BadRarFile, BadZipfile):
|
||||
logger.error('Bad archive for %r', subtitle)
|
||||
return False
|
||||
except:
|
||||
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
|
||||
self.discarded_providers.add(subtitle.provider_name)
|
||||
|
@ -557,15 +492,9 @@ def scan_videos(path, age=None, archives=True):
|
|||
continue
|
||||
|
||||
# skip old files
|
||||
try:
|
||||
file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath))
|
||||
except ValueError:
|
||||
logger.warning('Could not get age of file %r in %r', filename, dirpath)
|
||||
if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age:
|
||||
logger.debug('Skipping old file %r in %r', filename, dirpath)
|
||||
continue
|
||||
else:
|
||||
if age and datetime.utcnow() - file_age > age:
|
||||
logger.debug('Skipping old file %r in %r', filename, dirpath)
|
||||
continue
|
||||
|
||||
# scan
|
||||
if filename.endswith(VIDEO_EXTENSIONS): # video
|
||||
|
@ -612,8 +541,7 @@ def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
|
|||
try:
|
||||
refiner_manager[refiner].plugin(video, **kwargs)
|
||||
except:
|
||||
logger.error('Failed to refine video %r', video.name)
|
||||
logger.debug('Refiner exception:', exc_info=True)
|
||||
logger.exception('Failed to refine video')
|
||||
|
||||
|
||||
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
|
||||
|
|
|
@ -19,8 +19,8 @@ class AuthenticationError(ProviderError):
|
|||
pass
|
||||
|
||||
|
||||
class ServiceUnavailable(ProviderError):
|
||||
"""Exception raised when status is '503 Service Unavailable'."""
|
||||
class TooManyRequests(ProviderError):
|
||||
"""Exception raised by providers when too many requests are made."""
|
||||
pass
|
||||
|
||||
|
||||
|
|
|
@@ -29,9 +29,9 @@ class RegistrableExtensionManager(ExtensionManager):

        super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)

    def list_entry_points(self):
    def _find_entry_points(self, namespace):
        # copy of default extensions
        eps = list(super(RegistrableExtensionManager, self).list_entry_points())
        eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace))

        # internal extensions
        for iep in self.internal_extensions:

@@ -93,6 +93,7 @@ provider_manager = RegistrableExtensionManager('subliminal.providers', [
    'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
    'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
    'shooter = subliminal.providers.shooter:ShooterProvider',
    'subscenter = subliminal.providers.subscenter:SubsCenterProvider',
    'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
    'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
])

@@ -68,9 +68,6 @@ class Provider(object):
    #: Required hash, if any
    required_hash = None

    #: Subtitle class to use
    subtitle_class = None

    def __enter__(self):
        self.initialize()
        return self

@@ -9,7 +9,7 @@ from requests import Session
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group

@@ -19,11 +19,8 @@ logger = logging.getLogger(__name__)

language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')

# Series cell matching regex
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)

#: Series header parsing regex
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$')
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')


class Addic7edSubtitle(Subtitle):

@@ -32,7 +29,7 @@ class Addic7edSubtitle(Subtitle):

    def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
                 download_link):
        super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
        super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link)
        self.series = series
        self.season = season
        self.episode = episode

@@ -48,9 +45,8 @@ class Addic7edSubtitle(Subtitle):
    def get_matches(self, video):
        matches = set()

        # series name
        if video.series and sanitize(self.series) in (
                sanitize(name) for name in [video.series] + video.alternative_series):
        # series
        if video.series and sanitize(self.series) == sanitize(video.series):
            matches.add('series')
        # season
        if video.season and self.season == video.season:

@@ -58,7 +54,7 @@ class Addic7edSubtitle(Subtitle):
        # episode
        if video.episode and self.episode == video.episode:
            matches.add('episode')
        # title of the episode
        # title
        if video.title and sanitize(self.title) == sanitize(video.title):
            matches.add('title')
        # year

@@ -90,23 +86,21 @@ class Addic7edProvider(Provider):
    ]}
    video_types = (Episode,)
    server_url = 'http://www.addic7ed.com/'
    subtitle_class = Addic7edSubtitle

    def __init__(self, username=None, password=None):
        if any((username, password)) and not all((username, password)):
        if username is not None and password is None or username is None and password is not None:
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

        # login
        if self.username and self.password:
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
            r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)

@@ -140,16 +134,7 @@ class Addic7edProvider(Provider):
        logger.info('Getting show ids')
        r = self.session.get(self.server_url + 'shows.php', timeout=10)
        r.raise_for_status()

        # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
        # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
        # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
        show_cells = re.findall(show_cells_re, r.content)
        if show_cells:
            soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
        else:
            # If RegEx fails, fall back to original r.content and use 'html.parser'
            soup = ParserBeautifulSoup(r.content, ['html.parser'])
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # populate the show ids
        show_ids = {}

@@ -181,6 +166,8 @@ class Addic7edProvider(Provider):
        logger.info('Searching show ids with %r', params)
        r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
        r.raise_for_status()
        if r.status_code == 304:
            raise TooManyRequests()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get the suggestion

@@ -231,23 +218,24 @@ class Addic7edProvider(Provider):

        # search as last resort
        if not show_id:
            logger.warning('Series %s not found in show ids', series)
            logger.warning('Series not found in show ids')
            show_id = self._search_show_id(series)

        return show_id

    def query(self, show_id, series, season, year=None, country=None):
    def query(self, series, season, year=None, country=None):
        # get the show id
        show_id = self.get_show_id(series, year, country)
        if show_id is None:
            logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
            return []

        # get the page of the season of the show
        logger.info('Getting the page of show id %d, season %d', show_id, season)
        r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug('No data returned from provider')
            return []

        if r.status_code == 304:
            raise TooManyRequests()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # loop over subtitle rows

@@ -274,32 +262,16 @@ class Addic7edProvider(Provider):
            version = cells[4].text
            download_link = cells[9].a['href'][1:]

            subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year,
                                           version, download_link)
            subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year,
                                        version, download_link)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        # lookup show_id
        titles = [video.series] + video.alternative_series
        show_id = None
        for title in titles:
            show_id = self.get_show_id(title, video.year)
            if show_id is not None:
                break

        # query for subtitles with the show_id
        if show_id is not None:
            subtitles = [s for s in self.query(show_id, title, video.season, video.year)
                         if s.language in languages and s.episode == video.episode]
            if subtitles:
                return subtitles
        else:
            logger.error('No show id found for %r (%r)', video.series, {'year': video.year})

        return []
        return [s for s in self.query(video.series, video.season, video.year)
                if s.language in languages and s.episode == video.episode]

    def download_subtitle(self, subtitle):
        # download the subtitle

@@ -308,12 +280,6 @@ class Addic7edProvider(Provider):
                             timeout=10)
        r.raise_for_status()

        if not r.content:
            # Provider returns a status of 304 Not Modified with an empty content
            # raise_for_status won't raise exception for that status code
            logger.debug('Unable to download subtitle. No data returned from provider')
            return

        # detect download limit exceeded
        if r.headers['Content-Type'] == 'text/html':
            raise DownloadLimitExceeded

@@ -18,7 +18,7 @@ from zipfile import ZipFile, is_zipfile
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
from ..video import Episode, Movie

@@ -44,11 +44,8 @@ rating_re = re.compile(r'nota (?P<rating>\d+)')
#: Timestamp parsing regex
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')

#: Title with year/country regex
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')

#: Cache key for releases
releases_key = __name__ + ':releases|{archive_id}|{archive_name}'
releases_key = __name__ + ':releases|{archive_id}'


class LegendasTVArchive(object):

@@ -63,8 +60,8 @@ class LegendasTVArchive(object):
    :param int rating: rating (0-10).
    :param timestamp: timestamp.
    :type timestamp: datetime.datetime
    """

    """
    def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
        #: Identifier
        self.id = id

@@ -99,11 +96,10 @@ class LegendasTVArchive(object):

class LegendasTVSubtitle(Subtitle):
    """LegendasTV Subtitle."""

    provider_name = 'legendastv'

    def __init__(self, language, type, title, year, imdb_id, season, archive, name):
        super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link)
        super(LegendasTVSubtitle, self).__init__(language, archive.link)
        self.type = type
        self.title = title
        self.year = year

@@ -122,12 +118,11 @@ class LegendasTVSubtitle(Subtitle):
        # episode
        if isinstance(video, Episode) and self.type == 'episode':
            # series
            if video.series and (sanitize(self.title) in (
                    sanitize(name) for name in [video.series] + video.alternative_series)):
            if video.series and sanitize(self.title) == sanitize(video.series):
                matches.add('series')

            # year
            if video.original_series and self.year is None or video.year and video.year == self.year:
            # year (year is based on season air date hence the adjustment)
            if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1:
                matches.add('year')

            # imdb_id

@@ -137,8 +132,7 @@ class LegendasTVSubtitle(Subtitle):
        # movie
        elif isinstance(video, Movie) and self.type == 'movie':
            # title
            if video.title and (sanitize(self.title) in (
                    sanitize(name) for name in [video.title] + video.alternative_titles)):
            if video.title and sanitize(self.title) == sanitize(video.title):
                matches.add('title')

            # year

@@ -149,6 +143,9 @@ class LegendasTVSubtitle(Subtitle):
            if video.imdb_id and self.imdb_id == video.imdb_id:
                matches.add('imdb_id')

        # archive name
        matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type}))

        # name
        matches |= guess_matches(video, guessit(self.name, {'type': self.type}))

@@ -160,38 +157,29 @@ class LegendasTVProvider(Provider):

    :param str username: username.
    :param str password: password.
    """

    """
    languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
    server_url = 'http://legendas.tv/'
    subtitle_class = LegendasTVSubtitle

    def __init__(self, username=None, password=None):

        # Provider needs UNRAR installed. If not available raise ConfigurationError
        try:
            rarfile.custom_check(rarfile.UNRAR_TOOL)
        except rarfile.RarExecError:
            raise ConfigurationError('UNRAR tool not available')

        if any((username, password)) and not all((username, password)):
        if username and not password or not username and password:
            raise ConfigurationError('Username and password must be specified')

        self.username = username
        self.password = password
        self.logged_in = False
        self.session = None

    def initialize(self):
        self.session = Session()
        self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

        # login
        if self.username and self.password:
        if self.username is not None and self.password is not None:
            logger.info('Logging in')
            data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
            r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
            raise_for_status(r)
            r.raise_for_status()

            soup = ParserBeautifulSoup(r.content, ['html.parser'])
            if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):

@@ -205,174 +193,94 @@ class LegendasTVProvider(Provider):
        if self.logged_in:
            logger.info('Logging out')
            r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
            raise_for_status(r)
            r.raise_for_status()
            logger.debug('Logged out')
            self.logged_in = False

        self.session.close()

    @staticmethod
    def is_valid_title(title, title_id, sanitized_title, season, year):
        """Check if is a valid title."""
        sanitized_result = sanitize(title['title'])
        if sanitized_result != sanitized_title:
            logger.debug("Mismatched title, discarding title %d (%s)",
                         title_id, sanitized_result)
            return

        # episode type
        if season:
            # discard mismatches on type
            if title['type'] != 'episode':
                logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
                return

            # discard mismatches on season
            if 'season' not in title or title['season'] != season:
                logger.debug('Mismatched season %s, discarding title %d (%s)',
                             title.get('season'), title_id, sanitized_result)
                return
        # movie type
        else:
            # discard mismatches on type
            if title['type'] != 'movie':
                logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
                return

            # discard mismatches on year
            if year is not None and 'year' in title and title['year'] != year:
                logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
                return
        return True

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
    def search_titles(self, title, season, title_year):
    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_titles(self, title):
        """Search for titles matching the `title`.

        For episodes, each season has it own title
        :param str title: the title to search for.
        :param int season: season of the title
        :param int title_year: year of the title
        :return: found titles.
        :rtype: dict

        """
        # make the query
        logger.info('Searching title %r', title)
        r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10)
        r.raise_for_status()
        results = json.loads(r.text)

        # loop over results
        titles = {}
        sanitized_titles = [sanitize(title)]
        ignore_characters = {'\'', '.'}
        if any(c in title for c in ignore_characters):
            sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
        for result in results:
            source = result['_source']

        for sanitized_title in sanitized_titles:
            # make the query
            if season:
                logger.info('Searching episode title %r for season %r', sanitized_title, season)
            else:
                logger.info('Searching movie title %r', sanitized_title)
            # extract id
            title_id = int(source['id_filme'])

            r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
            raise_for_status(r)
            results = json.loads(r.text)
            # extract type and title
            title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']}

            # loop over results
            for result in results:
                source = result['_source']
            # extract year
            if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
                title['year'] = int(source['dsc_data_lancamento'])

                # extract id
                title_id = int(source['id_filme'])
            # extract imdb_id
            if source['id_imdb'] != '0':
                if not source['id_imdb'].startswith('tt'):
                    title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
                else:
                    title['imdb_id'] = source['id_imdb']

                # extract type
                title = {'type': type_map[source['tipo']]}

                # extract title, year and country
                name, year, country = title_re.match(source['dsc_nome']).groups()
                title['title'] = name

                # extract imdb_id
                if source['id_imdb'] != '0':
                    if not source['id_imdb'].startswith('tt'):
                        title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
            # extract season
            if title['type'] == 'episode':
                if source['temporada'] and source['temporada'].isdigit():
                    title['season'] = int(source['temporada'])
                else:
                    match = season_re.search(source['dsc_nome_br'])
                    if match:
                        title['season'] = int(match.group('season'))
                    else:
                        title['imdb_id'] = source['id_imdb']
                        logger.warning('No season detected for title %d', title_id)

                # extract season
                if title['type'] == 'episode':
                    if source['temporada'] and source['temporada'].isdigit():
                        title['season'] = int(source['temporada'])
                    else:
                        match = season_re.search(source['dsc_nome_br'])
                        if match:
                            title['season'] = int(match.group('season'))
                        else:
                            logger.debug('No season detected for title %d (%s)', title_id, name)
            # add title
            titles[title_id] = title

                # extract year
                if year:
                    title['year'] = int(year)
                elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
                    # year is based on season air date hence the adjustment
                    title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1

                # add title only if is valid
                # Check against title without ignored chars
                if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
                    titles[title_id] = title

            logger.debug('Found %d titles', len(titles))
        logger.debug('Found %d titles', len(titles))

        return titles

    @region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
    def get_archives(self, title_id, language_code, title_type, season, episode):
        """Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`.
    def get_archives(self, title_id, language_code):
        """Get the archive list from a given `title_id` and `language_code`.

        :param int title_id: title id.
        :param int language_code: language code.
        :param str title_type: episode or movie
        :param int season: season
        :param int episode: episode
        :return: the archives.
        :rtype: list of :class:`LegendasTVArchive`

        """
        logger.info('Getting archives for title %d and language %d', title_id, language_code)
        archives = []
        page = 0
        page = 1
        while True:
            # get the archive page
            url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
                language=language_code, page=page, title=title_id)
            url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format(
                title=title_id, language=language_code, page=page)
            r = self.session.get(url)
            raise_for_status(r)
            r.raise_for_status()

            # parse the results
            soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
            for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
            for archive_soup in soup.select('div.list_element > article > div'):
                # create archive
                archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
                                            archive_soup.a.text,
                                            'pack' in archive_soup.parent['class'],
                                            'destaque' in archive_soup.parent['class'],
                archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text,
                                            'pack' in archive_soup['class'], 'destaque' in archive_soup['class'],
                                            self.server_url + archive_soup.a['href'][1:])
                # clean name of path separators and pack flags
                clean_name = archive.name.replace('/', '-')
                if archive.pack and clean_name.startswith('(p)'):
                    clean_name = clean_name[3:]

                # guess from name
                guess = guessit(clean_name, {'type': title_type})

                # episode
                if season and episode:
                    # discard mismatches on episode in non-pack archives

                    # Guessit may return int for single episode or list for multi-episode
                    # Check if archive name has multiple episodes releases on it
                    if not archive.pack and 'episode' in guess:
                        wanted_episode = set(episode) if isinstance(episode, list) else {episode}
                        archive_episode = guess['episode'] if isinstance(guess['episode'], list) else {guess['episode']}

                        if not wanted_episode.intersection(archive_episode):
                            logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
                            continue

                # extract text containing downloads, rating and timestamp
                data_text = archive_soup.find('p', class_='data').text

@@ -392,8 +300,6 @@ class LegendasTVProvider(Provider):
                    raise ProviderError('Archive timestamp is in the future')

                # add archive
                logger.info('Found archive for title %d and language %d at page %s: %s',
                            title_id, language_code, page, archive)
                archives.append(archive)

            # stop on last page

@@ -416,7 +322,7 @@ class LegendasTVProvider(Provider):
        """
        logger.info('Downloading archive %s', archive.id)
        r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
        raise_for_status(r)
        r.raise_for_status()

        # open the archive
        archive_stream = io.BytesIO(r.content)

@@ -431,26 +337,60 @@ class LegendasTVProvider(Provider):

    def query(self, language, title, season=None, episode=None, year=None):
        # search for titles
        titles = self.search_titles(title, season, year)
        titles = self.search_titles(sanitize(title))

        # search for titles with the quote or dot character
        ignore_characters = {'\'', '.'}
        if any(c in title for c in ignore_characters):
            titles.update(self.search_titles(sanitize(title, ignore_characters=ignore_characters)))

        subtitles = []
        # iterate over titles
        for title_id, t in titles.items():
            # discard mismatches on title
            if sanitize(t['title']) != sanitize(title):
                continue

            logger.info('Getting archives for title %d and language %d', title_id, language.legendastv)
            archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode)
            if not archives:
                logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
            # episode
            if season and episode:
                # discard mismatches on type
                if t['type'] != 'episode':
                    continue

                # discard mismatches on season
                if 'season' not in t or t['season'] != season:
                    continue
            # movie
            else:
                # discard mismatches on type
                if t['type'] != 'movie':
                    continue

                # discard mismatches on year
                if year is not None and 'year' in t and t['year'] != year:
                    continue

            # iterate over title's archives
            for a in archives:
            for a in self.get_archives(title_id, language.legendastv):
                # clean name of path separators and pack flags
                clean_name = a.name.replace('/', '-')
                if a.pack and clean_name.startswith('(p)'):
                    clean_name = clean_name[3:]

                # guess from name
                guess = guessit(clean_name, {'type': t['type']})

                # episode
                if season and episode:
                    # discard mismatches on episode in non-pack archives
                    if not a.pack and 'episode' in guess and guess['episode'] != episode:
                        continue

                # compute an expiration time based on the archive timestamp
                expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()

                # attempt to get the releases from the cache
                cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
                releases = region.get(cache_key, expiration_time=expiration_time)
                releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time)

                # the releases are not in cache or cache is expired
                if releases == NO_VALUE:

@@ -477,12 +417,12 @@ class LegendasTVProvider(Provider):
                        releases.append(name)

                # cache the releases
                region.set(cache_key, releases)
                region.set(releases_key.format(archive_id=a.id), releases)

                # iterate over releases
                for r in releases:
                    subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
                                                   t.get('season'), a, r)
                    subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
                                                  t.get('season'), a, r)
                    logger.debug('Found subtitle %r', subtitle)
                    subtitles.append(subtitle)

@@ -491,19 +431,13 @@ class LegendasTVProvider(Provider):
    def list_subtitles(self, video, languages):
        season = episode = None
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
            title = video.series
            season = video.season
            episode = video.episode
        else:
            titles = [video.title] + video.alternative_titles
            title = video.title

        for title in titles:
            subtitles = [s for l in languages for s in
                         self.query(l, title, season=season, episode=episode, year=video.year)]
            if subtitles:
                return subtitles

        return []
        return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)]

    def download_subtitle(self, subtitle):
        # download archive in case we previously hit the releases cache and didn't download it

@@ -512,11 +446,3 @@ class LegendasTVProvider(Provider):

        # extract subtitle's content
        subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))


def raise_for_status(r):
    # When site is under maintaince and http status code 200.
    if 'Em breve estaremos de volta' in r.text:
        raise ServiceUnavailable
    else:
        r.raise_for_status()

@ -42,7 +42,6 @@ class NapiProjektSubtitle(Subtitle):
|
|||
def __init__(self, language, hash):
|
||||
super(NapiProjektSubtitle, self).__init__(language)
|
||||
self.hash = hash
|
||||
self.content = None
|
||||
|
||||
@property
|
||||
def id(self):
|
||||
|
@ -63,10 +62,6 @@ class NapiProjektProvider(Provider):
|
|||
languages = {Language.fromalpha2(l) for l in ['pl']}
|
||||
required_hash = 'napiprojekt'
|
||||
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
|
||||
subtitle_class = NapiProjektSubtitle
|
||||
|
||||
def __init__(self):
|
||||
self.session = None
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
|
@ -86,16 +81,16 @@ class NapiProjektProvider(Provider):
|
|||
'f': hash,
|
||||
't': get_subhash(hash)}
|
||||
logger.info('Searching subtitle %r', params)
|
||||
r = self.session.get(self.server_url, params=params, timeout=10)
|
||||
r.raise_for_status()
|
||||
response = self.session.get(self.server_url, params=params, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
# handle subtitles not found and errors
|
||||
if r.content[:4] == b'NPc0':
|
||||
if response.content[:4] == b'NPc0':
|
||||
logger.debug('No subtitles found')
|
||||
return None
|
||||
|
||||
subtitle = self.subtitle_class(language, hash)
|
||||
subtitle.content = r.content
|
||||
subtitle = NapiProjektSubtitle(language, hash)
|
||||
subtitle.content = response.content
|
||||
logger.debug('Found subtitle %r', subtitle)
|
||||
|
||||
return subtitle
|
||||
|
|
|
@ -11,8 +11,7 @@ from six.moves.xmlrpc_client import ServerProxy
|
|||
|
||||
from . import Provider, TimeoutSafeTransport
|
||||
from .. import __short_version__
|
||||
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError,
|
||||
ServiceUnavailable)
|
||||
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError
|
||||
from ..subtitle import Subtitle, fix_line_ending, guess_matches
|
||||
from ..utils import sanitize
|
||||
from ..video import Episode, Movie
|
||||
|
@ -27,8 +26,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
|||
|
||||
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
|
||||
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
|
||||
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired,
|
||||
page_link=page_link, encoding=encoding)
|
||||
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding)
|
||||
self.subtitle_id = subtitle_id
|
||||
self.matched_by = matched_by
|
||||
self.movie_kind = movie_kind
|
||||
|
@ -60,8 +58,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
|||
if isinstance(video, Episode) and self.movie_kind == 'episode':
|
||||
# tag match, assume series, year, season and episode matches
|
||||
if self.matched_by == 'tag':
|
||||
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
|
||||
matches |= {'series', 'year', 'season', 'episode'}
|
||||
matches |= {'series', 'year', 'season', 'episode'}
|
||||
# series
|
||||
if video.series and sanitize(self.series_name) == sanitize(video.series):
|
||||
matches.add('series')
|
||||
|
@ -90,8 +87,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
|||
elif isinstance(video, Movie) and self.movie_kind == 'movie':
|
||||
# tag match, assume title and year matches
|
||||
if self.matched_by == 'tag':
|
||||
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
|
||||
matches |= {'title', 'year'}
|
||||
matches |= {'title', 'year'}
|
||||
# title
|
||||
if video.title and sanitize(self.movie_name) == sanitize(video.title):
|
||||
matches.add('title')
|
||||
|
@ -126,11 +122,10 @@ class OpenSubtitlesProvider(Provider):
|
|||
|
||||
"""
|
||||
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
|
||||
subtitle_class = OpenSubtitlesSubtitle
|
||||
|
||||
def __init__(self, username=None, password=None):
|
||||
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
|
||||
if any((username, password)) and not all((username, password)):
|
||||
if username and not password or not username and password:
|
||||
raise ConfigurationError('Username and password must be specified')
|
||||
# None values not allowed for logging in, so replace it by ''
|
||||
self.username = username or ''
|
||||
|
@ -161,10 +156,7 @@ class OpenSubtitlesProvider(Provider):
|
|||
if hash and size:
|
||||
criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
|
||||
if imdb_id:
|
||||
if season and episode:
|
||||
criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
|
||||
else:
|
||||
criteria.append({'imdbid': imdb_id[2:]})
|
||||
criteria.append({'imdbid': imdb_id[2:]})
|
||||
if tag:
|
||||
criteria.append({'tag': tag})
|
||||
if query and season and episode:
|
||||
|
@ -207,9 +199,9 @@ class OpenSubtitlesProvider(Provider):
|
|||
filename = subtitle_item['SubFileName']
|
||||
encoding = subtitle_item.get('SubEncoding') or None
|
||||
|
||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
|
||||
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
|
||||
series_season, series_episode, filename, encoding)
|
||||
subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
|
||||
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
|
||||
series_season, series_episode, filename, encoding)
|
||||
logger.debug('Found subtitle %r by %s', subtitle, matched_by)
|
||||
subtitles.append(subtitle)
|
||||
|
||||
|
@ -268,6 +260,11 @@ class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
|
|||
pass
|
||||
|
||||
|
||||
class ServiceUnavailable(OpenSubtitlesError):
|
||||
"""Exception raised when status is '503 Service Unavailable'."""
|
||||
pass
|
||||
|
||||
|
||||
def checked(response):
|
||||
"""Check a response status before returning it.
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class PodnapisiSubtitle(Subtitle):
|
|||
|
||||
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
|
||||
year=None):
|
||||
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
|
||||
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link)
|
||||
self.pid = pid
|
||||
self.releases = releases
|
||||
self.title = title
|
||||
|
@ -49,8 +49,7 @@ class PodnapisiSubtitle(Subtitle):
|
|||
# episode
|
||||
if isinstance(video, Episode):
|
||||
# series
|
||||
if video.series and (sanitize(self.title) in (
|
||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
||||
if video.series and sanitize(self.title) == sanitize(video.series):
|
||||
matches.add('series')
|
||||
# year
|
||||
if video.original_series and self.year is None or video.year and video.year == self.year:
|
||||
|
@ -67,8 +66,7 @@ class PodnapisiSubtitle(Subtitle):
|
|||
# movie
|
||||
elif isinstance(video, Movie):
|
||||
# title
|
||||
if video.title and (sanitize(self.title) in (
|
||||
sanitize(name) for name in [video.title] + video.alternative_titles)):
|
||||
if video.title and sanitize(self.title) == sanitize(video.title):
|
||||
matches.add('title')
|
||||
# year
|
||||
if video.year and self.year == video.year:
|
||||
|
@ -84,11 +82,7 @@ class PodnapisiProvider(Provider):
|
|||
"""Podnapisi Provider."""
|
||||
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
|
||||
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
|
||||
server_url = 'https://www.podnapisi.net/subtitles/'
|
||||
subtitle_class = PodnapisiSubtitle
|
||||
|
||||
def __init__(self):
|
||||
self.session = None
|
||||
server_url = 'http://podnapisi.net/subtitles/'
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
|
@ -114,9 +108,7 @@ class PodnapisiProvider(Provider):
|
|||
pids = set()
|
||||
while True:
|
||||
# query the server
|
||||
r = self.session.get(self.server_url + 'search/old', params=params, timeout=10)
|
||||
r.raise_for_status()
|
||||
xml = etree.fromstring(r.content)
|
||||
xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content)
|
||||
|
||||
# exit if no results
|
||||
if not int(xml.find('pagination/results').text):
|
||||
|
@ -126,14 +118,10 @@ class PodnapisiProvider(Provider):
|
|||
# loop over subtitles
|
||||
for subtitle_xml in xml.findall('subtitle'):
|
||||
# read xml elements
|
||||
pid = subtitle_xml.find('pid').text
|
||||
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
|
||||
if pid in pids:
|
||||
continue
|
||||
|
||||
language = Language.fromietf(subtitle_xml.find('language').text)
|
||||
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
|
||||
page_link = subtitle_xml.find('url').text
|
||||
pid = subtitle_xml.find('pid').text
|
||||
releases = []
|
||||
if subtitle_xml.find('release').text:
|
||||
for release in subtitle_xml.find('release').text.split():
|
||||
|
@ -146,11 +134,15 @@ class PodnapisiProvider(Provider):
|
|||
year = int(subtitle_xml.find('year').text)
|
||||
|
||||
if is_episode:
|
||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
|
||||
season=season, episode=episode, year=year)
|
||||
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
|
||||
season=season, episode=episode, year=year)
|
||||
else:
|
||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
|
||||
year=year)
|
||||
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
|
||||
year=year)
|
||||
|
||||
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
|
||||
if pid in pids:
|
||||
continue
|
||||
|
||||
logger.debug('Found subtitle %r', subtitle)
|
||||
subtitles.append(subtitle)
|
||||
|
@ -167,21 +159,11 @@ class PodnapisiProvider(Provider):
|
|||
return subtitles
|
||||
|
||||
def list_subtitles(self, video, languages):
|
||||
season = episode = None
|
||||
if isinstance(video, Episode):
|
||||
titles = [video.series] + video.alternative_series
|
||||
season = video.season
|
||||
episode = video.episode
|
||||
else:
|
||||
titles = [video.title] + video.alternative_titles
|
||||
|
||||
for title in titles:
|
||||
subtitles = [s for l in languages for s in
|
||||
self.query(l, title, season=season, episode=episode, year=video.year)]
|
||||
if subtitles:
|
||||
return subtitles
|
||||
|
||||
return []
|
||||
return [s for l in languages for s in self.query(l, video.series, season=video.season,
|
||||
episode=video.episode, year=video.year)]
|
||||
elif isinstance(video, Movie):
|
||||
return [s for l in languages for s in self.query(l, video.title, year=video.year)]
|
||||
|
||||
def download_subtitle(self, subtitle):
|
||||
# download as a zip
|
||||
|
|
|
@ -42,10 +42,6 @@ class ShooterProvider(Provider):
|
|||
"""Shooter Provider."""
|
||||
languages = {Language(l) for l in ['eng', 'zho']}
|
||||
server_url = 'https://www.shooter.cn/api/subapi.php'
|
||||
subtitle_class = ShooterSubtitle
|
||||
|
||||
def __init__(self):
|
||||
self.session = None
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
|
@ -68,7 +64,7 @@ class ShooterProvider(Provider):
|
|||
|
||||
# parse the subtitles
|
||||
results = json.loads(r.text)
|
||||
subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']]
|
||||
subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']]
|
||||
|
||||
return subtitles
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ class SubsCenterSubtitle(Subtitle):
|
|||
provider_name = 'subscenter'
|
||||
|
||||
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
|
||||
subtitle_version, downloaded, releases):
|
||||
downloaded, releases):
|
||||
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
|
||||
self.series = series
|
||||
self.season = season
|
||||
|
@ -34,7 +34,6 @@ class SubsCenterSubtitle(Subtitle):
|
|||
self.title = title
|
||||
self.subtitle_id = subtitle_id
|
||||
self.subtitle_key = subtitle_key
|
||||
self.subtitle_version = subtitle_version
|
||||
self.downloaded = downloaded
|
||||
self.releases = releases
|
||||
|
||||
|
@ -75,8 +74,7 @@ class SubsCenterSubtitle(Subtitle):
|
|||
class SubsCenterProvider(Provider):
|
||||
"""SubsCenter Provider."""
|
||||
languages = {Language.fromalpha2(l) for l in ['he']}
|
||||
server_url = 'http://www.subscenter.org/he/'
|
||||
subtitle_class = SubsCenterSubtitle
|
||||
server_url = 'http://www.subscenter.co/he/'
|
||||
|
||||
def __init__(self, username=None, password=None):
|
||||
if username is not None and password is None or username is None and password is not None:
|
||||
|
@ -191,7 +189,6 @@ class SubsCenterProvider(Provider):
|
|||
hearing_impaired = bool(subtitle_item['hearing_impaired'])
|
||||
subtitle_id = subtitle_item['id']
|
||||
subtitle_key = subtitle_item['key']
|
||||
subtitle_version = subtitle_item['h_version']
|
||||
downloaded = subtitle_item['downloaded']
|
||||
release = subtitle_item['subtitle_version']
|
||||
|
||||
|
@ -203,9 +200,8 @@ class SubsCenterProvider(Provider):
|
|||
continue
|
||||
|
||||
# otherwise create it
|
||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
|
||||
title, subtitle_id, subtitle_key, subtitle_version, downloaded,
|
||||
[release])
|
||||
subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode,
|
||||
title, subtitle_id, subtitle_key, downloaded, [release])
|
||||
logger.debug('Found subtitle %r', subtitle)
|
||||
subtitles[subtitle_id] = subtitle
|
||||
|
||||
|
@ -225,19 +221,15 @@ class SubsCenterProvider(Provider):
|
|||
def download_subtitle(self, subtitle):
|
||||
# download
|
||||
url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
|
||||
params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
|
||||
params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key}
|
||||
r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
|
||||
r.raise_for_status()
|
||||
|
||||
# open the zip
|
||||
try:
|
||||
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
|
||||
# remove some filenames from the namelist
|
||||
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
|
||||
if len(namelist) > 1:
|
||||
raise ProviderError('More than one file to unzip')
|
||||
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
|
||||
# remove some filenames from the namelist
|
||||
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
|
||||
if len(namelist) > 1:
|
||||
raise ProviderError('More than one file to unzip')
|
||||
|
||||
subtitle.content = fix_line_ending(zf.read(namelist[0]))
|
||||
except zipfile.BadZipfile:
|
||||
# if no zip file was retrieved, daily downloads limit has exceeded
|
||||
raise ProviderError('Daily limit exceeded')
|
||||
subtitle.content = fix_line_ending(zf.read(namelist[0]))
|
||||
|
|
|
@ -40,10 +40,6 @@ class TheSubDBProvider(Provider):
|
|||
languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
|
||||
required_hash = 'thesubdb'
|
||||
server_url = 'http://api.thesubdb.com/'
|
||||
subtitle_class = TheSubDBSubtitle
|
||||
|
||||
def __init__(self):
|
||||
self.session = None
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
|
@ -70,7 +66,7 @@ class TheSubDBProvider(Provider):
|
|||
for language_code in r.text.split(','):
|
||||
language = Language.fromthesubdb(language_code)
|
||||
|
||||
subtitle = self.subtitle_class(language, hash)
|
||||
subtitle = TheSubDBSubtitle(language, hash)
|
||||
logger.debug('Found subtitle %r', subtitle)
|
||||
subtitles.append(subtitle)
|
||||
|
||||
|
|
|
@ -47,8 +47,7 @@ class TVsubtitlesSubtitle(Subtitle):
|
|||
matches = set()
|
||||
|
||||
# series
|
||||
if video.series and (sanitize(self.series) in (
|
||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
||||
if video.series and sanitize(self.series) == sanitize(video.series):
|
||||
matches.add('series')
|
||||
# season
|
||||
if video.season and self.season == video.season:
|
||||
|
@ -81,10 +80,6 @@ class TVsubtitlesProvider(Provider):
|
|||
]}
|
||||
video_types = (Episode,)
|
||||
server_url = 'http://www.tvsubtitles.net/'
|
||||
subtitle_class = TVsubtitlesSubtitle
|
||||
|
||||
def __init__(self):
|
||||
self.session = None
|
||||
|
||||
def initialize(self):
|
||||
self.session = Session()
|
||||
|
@ -163,7 +158,13 @@ class TVsubtitlesProvider(Provider):
|
|||
|
||||
return episode_ids
|
||||
|
||||
def query(self, show_id, series, season, episode, year=None):
|
||||
def query(self, series, season, episode, year=None):
|
||||
# search the show id
|
||||
show_id = self.search_show_id(series, year)
|
||||
if show_id is None:
|
||||
logger.error('No show id found for %r (%r)', series, {'year': year})
|
||||
return []
|
||||
|
||||
# get the episode ids
|
||||
episode_ids = self.get_episode_ids(show_id, season)
|
||||
if episode not in episode_ids:
|
||||
|
@ -183,9 +184,9 @@ class TVsubtitlesProvider(Provider):
|
|||
subtitle_id = int(row.parent['href'][10:-5])
|
||||
page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
|
||||
rip = row.find('p', title='rip').text.strip() or None
|
||||
release = row.find('h5').text.strip() or None
|
||||
release = row.find('p', title='release').text.strip() or None
|
||||
|
||||
subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
|
||||
subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip,
|
||||
release)
|
||||
logger.debug('Found subtitle %s', subtitle)
|
||||
subtitles.append(subtitle)
|
||||
|
@ -193,24 +194,7 @@ class TVsubtitlesProvider(Provider):
|
|||
return subtitles
|
||||
|
||||
def list_subtitles(self, video, languages):
|
||||
# lookup show_id
|
||||
titles = [video.series] + video.alternative_series
|
||||
show_id = None
|
||||
for title in titles:
|
||||
show_id = self.search_show_id(title, video.year)
|
||||
if show_id is not None:
|
||||
break
|
||||
|
||||
# query for subtitles with the show_id
|
||||
if show_id is not None:
|
||||
subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
|
||||
if s.language in languages and s.episode == video.episode]
|
||||
if subtitles:
|
||||
return subtitles
|
||||
else:
|
||||
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
|
||||
|
||||
return []
|
||||
return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages]
|
||||
|
||||
def download_subtitle(self, subtitle):
|
||||
# download as a zip
|
||||
|
|
|
@ -3,7 +3,7 @@ from datetime import datetime, timedelta
|
|||
from functools import wraps
|
||||
import logging
|
||||
import re
|
||||
import _strptime
|
||||
|
||||
import requests
|
||||
|
||||
from .. import __short_version__
|
||||
|
@ -331,7 +331,6 @@ def refine(video, **kwargs):
|
|||
# add series information
|
||||
logger.debug('Found series %r', series)
|
||||
video.series = matching_result['match']['series']
|
||||
video.alternative_series.extend(series['aliases'])
|
||||
video.year = matching_result['match']['year']
|
||||
video.original_series = matching_result['match']['original_series']
|
||||
video.series_tvdb_id = series['id']
|
||||
|
|
|
@ -44,7 +44,7 @@ movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
|
|||
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
|
||||
|
||||
#: Equivalent release groups
|
||||
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})
|
||||
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'})
|
||||
|
||||
|
||||
def get_equivalent_release_groups(release_group):
|
||||
|
|
|
@ -208,14 +208,8 @@ def guess_matches(video, guess, partial=False):
|
|||
if video.season and 'season' in guess and guess['season'] == video.season:
|
||||
matches.add('season')
|
||||
# episode
|
||||
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
|
||||
# Most providers only support single-ep, so make sure it contains only 1 episode
|
||||
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
|
||||
if video.episode and 'episode' in guess:
|
||||
episode_guess = guess['episode']
|
||||
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
|
||||
if episode == video.episode:
|
||||
matches.add('episode')
|
||||
if video.episode and 'episode' in guess and guess['episode'] == video.episode:
|
||||
matches.add('episode')
|
||||
# year
|
||||
if video.year and 'year' in guess and guess['year'] == video.year:
|
||||
matches.add('year')
|
||||
|
@ -258,4 +252,4 @@ def fix_line_ending(content):
|
|||
:rtype: bytes
|
||||
|
||||
"""
|
||||
return content.replace(b'\r\n', b'\n')
|
||||
return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
||||
|
|
|
@ -13,9 +13,9 @@ VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.
|
|||
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
|
||||
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
|
||||
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
|
||||
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
|
||||
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm' '.ogv', '.omf',
|
||||
'.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
|
||||
'.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
|
||||
'.vob', '.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
|
||||
|
||||
|
||||
class Video(object):
|
||||
|
@ -123,12 +123,11 @@ class Episode(Video):
|
|||
:param int year: year of the series.
|
||||
:param bool original_series: whether the series is the first with this name.
|
||||
:param int tvdb_id: TVDB id of the episode.
|
||||
:param list alternative_series: alternative names of the series
|
||||
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
||||
|
||||
"""
|
||||
def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
|
||||
series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
|
||||
series_tvdb_id=None, series_imdb_id=None, **kwargs):
|
||||
super(Episode, self).__init__(name, **kwargs)
|
||||
|
||||
#: Series of the episode
|
||||
|
@ -158,9 +157,6 @@ class Episode(Video):
|
|||
#: IMDb id of the series
|
||||
self.series_imdb_id = series_imdb_id
|
||||
|
||||
#: Alternative names of the series
|
||||
self.alternative_series = alternative_series or []
|
||||
|
||||
@classmethod
|
||||
def fromguess(cls, name, guess):
|
||||
if guess['type'] != 'episode':
|
||||
|
@ -169,13 +165,7 @@ class Episode(Video):
|
|||
if 'title' not in guess or 'episode' not in guess:
|
||||
raise ValueError('Insufficient data to process the guess')
|
||||
|
||||
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
|
||||
# Most providers only support single-ep, so make sure it contains only 1 episode
|
||||
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
|
||||
episode_guess = guess.get('episode')
|
||||
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
|
||||
|
||||
return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
|
||||
return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'),
|
||||
year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
|
||||
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
|
||||
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
|
||||
|
@ -196,11 +186,10 @@ class Movie(Video):
|
|||
|
||||
:param str title: title of the movie.
|
||||
:param int year: year of the movie.
|
||||
:param list alternative_titles: alternative titles of the movie
|
||||
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
||||
|
||||
"""
|
||||
def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
|
||||
def __init__(self, name, title, year=None, **kwargs):
|
||||
super(Movie, self).__init__(name, **kwargs)
|
||||
|
||||
#: Title of the movie
|
||||
|
@ -209,9 +198,6 @@ class Movie(Video):
|
|||
#: Year of the movie
|
||||
self.year = year
|
||||
|
||||
#: Alternative titles of the movie
|
||||
self.alternative_titles = alternative_titles or []
|
||||
|
||||
@classmethod
|
||||
def fromguess(cls, name, guess):
|
||||
if guess['type'] != 'movie':
|
||||
|
@ -220,13 +206,9 @@ class Movie(Video):
|
|||
if 'title' not in guess:
|
||||
raise ValueError('Insufficient data to process the guess')
|
||||
|
||||
alternative_titles = []
|
||||
if 'alternative_title' in guess:
|
||||
alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))
|
||||
|
||||
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
|
||||
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
|
||||
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
|
||||
audio_codec=guess.get('audio_codec'), year=guess.get('year'))
|
||||
|
||||
@classmethod
|
||||
def fromname(cls, name):
|
||||
|
|
|
@ -10,7 +10,7 @@ import time
|
|||
import operator
|
||||
|
||||
import itertools
|
||||
from httplib import ResponseNotReady
|
||||
from http.client import ResponseNotReady
|
||||
|
||||
import rarfile
|
||||
import requests
|
||||
|
@ -21,14 +21,13 @@ from babelfish import LanguageReverseError
from guessit.jsonutils import GuessitEncoder
from subliminal import ProviderError, refiner_manager

-from extensions import provider_registry
-from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded
+from subliminal_patch.extensions import provider_registry
from subliminal.score import compute_score as default_compute_score
from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
ThreadPoolExecutor, check_video
-from subliminal_patch.exceptions import TooManyRequests, APIThrottled
+from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ServiceUnavailable, DownloadLimitExceeded

from subzero.language import Language
from scandir import scandir, scandir_generic as _scandir_generic
@ -186,7 +185,7 @@ class SZProviderPool(ProviderPool):
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out', provider)

-except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
+except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
self.throttle_callback(provider, e)
return

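The `except X, e:` form is Python 2-only syntax; Python 3 requires the `as` keyword, which this hunk (and several below) adopts. A minimal, self-contained illustration of the change:

    # Python 2 only (SyntaxError on Python 3):
    #     except (TooManyRequests, APIThrottled), e:
    # Valid on both Python 2.6+ and Python 3:
    try:
        raise ValueError("provider error")
    except (ValueError, KeyError) as e:
        print(e)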
@ -283,7 +282,7 @@ class SZProviderPool(ProviderPool):
logger.debug("RAR Traceback: %s", traceback.format_exc())
return False

-except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
+except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
self.throttle_callback(subtitle.provider_name, e)
self.discarded_providers.add(subtitle.provider_name)
return False
@ -648,7 +647,7 @@ def search_external_subtitles(path, languages=None, only_one=False):
abspath = unicode(os.path.abspath(
os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder,
video_filename])))
-except Exception, e:
+except Exception as e:
logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e)
continue
logger.debug("external subs: scanning path %s", abspath)
@ -9,3 +9,13 @@ class TooManyRequests(ProviderError):

class APIThrottled(ProviderError):
pass
+
+
+class ServiceUnavailable(ProviderError):
+"""Exception raised when status is '503 Service Unavailable'."""
+pass
+
+
+class DownloadLimitExceeded(ProviderError):
+"""Exception raised by providers when download limit is exceeded."""
+pass
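A minimal sketch of how these new exception classes are meant to be used. `ProviderError` comes from subliminal in the real module; a stand-in class is defined here so the example runs on its own:

    class ProviderError(Exception):  # stand-in for subliminal's ProviderError
        pass

    class ServiceUnavailable(ProviderError):
        """Exception raised when status is '503 Service Unavailable'."""

    try:
        raise ServiceUnavailable("provider returned HTTP 503")
    except ProviderError as e:  # callers can still catch the base class
        print(type(e).__name__, e)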
@ -8,7 +8,7 @@ import os
import socket
import logging
import requests
-import xmlrpclib
+import xmlrpc.client
import dns.resolver
import ipaddress
import re
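`xmlrpclib` was renamed to `xmlrpc.client` in Python 3. A small compatibility shim that would work during a transition like this one (the alias name is illustrative):

    try:
        import xmlrpc.client as xmlrpc_client  # Python 3
    except ImportError:
        import xmlrpclib as xmlrpc_client      # Python 2

    print(xmlrpc_client.SafeTransport)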
@ -16,7 +16,7 @@ import re
from requests import exceptions
from urllib3.util import connection
from retry.api import retry_call
-from exceptions import APIThrottled
+from .exceptions import APIThrottled
from dogpile.cache.api import NO_VALUE
from subliminal.cache import region
from subliminal_patch.pitcher import pitchers
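Dropping the bare `from exceptions import ...` is required because Python 3 removed implicit relative imports (PEP 328): a sibling module must be imported with an explicit leading dot, or via its full package path. Python 2 can opt into the same strictness, which this commit does elsewhere:

    # At the top of a Python 2 module, this makes "import exceptions" resolve
    # to a top-level module, never a sibling, matching Python 3 behavior:
    from __future__ import absolute_import

    # Explicit alternatives inside the package (paths illustrative):
    # from .exceptions import APIThrottled
    # from subliminal_patch.exceptions import APIThrottled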
@ -32,10 +32,8 @@ try:
except ImportError:
from urllib.parse import urlparse

-from subzero.lib.io import get_viable_encoding
-
logger = logging.getLogger(__name__)
-pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where()))
+pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where()))
try:
default_ssl_context = ssl.create_default_context(cafile=pem_file)
except AttributeError:
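The `unicode(__file__, get_viable_encoding())` dance is unnecessary on Python 3, where `__file__` is already text. A sketch of the simplified path computation (runnable when saved as a file; `certifi` assumed installed, as in this module):

    import os
    import certifi

    # __file__ is a str on Python 3; no decoding step is needed.
    pem_file = os.path.normpath(os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", certifi.where()))
    print(pem_file)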
@ -99,7 +97,7 @@ class CFSession(CloudScraper):
# Solve Challenge
resp = self.sendChallengeResponse(resp, **kwargs)

-except ValueError, e:
+except ValueError as e:
if e.message == "Captcha":
parsed_url = urlparse(url)
domain = parsed_url.netloc
@ -231,7 +229,7 @@ class RetryingCFSession(RetryingSession, CFSession):
pass


-class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
+class SubZeroRequestsTransport(xmlrpc.client.SafeTransport):
"""
Drop in Transport for xmlrpclib that uses Requests instead of httplib

@ -8,7 +8,7 @@ from subliminal.cache import region
from dogpile.cache.api import NO_VALUE
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
Proxy
-from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT
+from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TIMEOUT


logger = logging.getLogger(__name__)
@ -185,7 +185,7 @@ class DBCProxyLessPitcher(Pitcher):
password = None

def __init__(self, website_name, website_url, website_key,
-timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs):
+timeout=DEFAULT_TIMEOUT, tries=3, *args, **kwargs):
super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries)

self.username, self.password = self.client_key.split(":", 1)
@ -5,7 +5,7 @@ import datetime
from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
get_series_episode, region, tvdb_client

-from util import fix_session_bases
+from .util import fix_session_bases

TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()

@ -272,9 +272,9 @@ class Subtitle(Subtitle_):
def prepare_text(text, style):
body = []
for fragment, sty in parse_tags(text, style, sub.styles):
-fragment = fragment.replace(ur"\h", u" ")
-fragment = fragment.replace(ur"\n", u"\n")
-fragment = fragment.replace(ur"\N", u"\n")
+fragment = fragment.replace(r"\h", u" ")
+fragment = fragment.replace(r"\n", u"\n")
+fragment = fragment.replace(r"\N", u"\n")
if format == "srt":
if sty.italic:
fragment = u"<i>%s</i>" % fragment
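The `ur""` prefix is a SyntaxError on Python 3; a plain `r""` literal is already unicode there, and the raw flag is what actually matters for these ASS/SSA escape sequences:

    # r"\h" keeps the backslash literal, so .replace() targets the two-character
    # ASS escape rather than an actual control character:
    fragment = "hard\\Nbreak and non-breaking\\hspace"
    fragment = fragment.replace(r"\h", " ")
    fragment = fragment.replace(r"\N", "\n")
    print(fragment)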
@ -1,2 +1,8 @@

-import dict, geezip, httpfake, io, json, rar, which
+from .dict import *
+from .geezip import *
+from .httpfake import *
+from .io import *
+from .json import *
+from .rar import *
+from .which import *
@ -28,7 +28,7 @@ class GeezipFile(gzip.GzipFile):
fileobj.write(self.compress.flush(Z_FINISH))
gzip.write32u(fileobj, self.crc)
# self.size may exceed 2GB, or even 4GB
-gzip.write32u(fileobj, self.size & 0xffffffffL)
+gzip.write32u(fileobj, self.size & 0xffffffff)
fileobj.flush()
finally:
myfileobj = self.myfileobj
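Python 3 dropped the separate `long` type and its `L` literal suffix; plain `int` is unbounded, and masking with `0xffffffff` still truncates the size to the 32 bits the gzip trailer stores:

    size = 5 * 1024 ** 3           # 5 GB, larger than 32 bits can hold
    print(hex(size & 0xffffffff))  # -> 0x40000000 (size modulo 2**32)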
@ -1,5 +1,5 @@
# coding=utf-8

-from registry import registry
-from mods import hearing_impaired, ocr_fixes, fps, offset, common, color
-from main import SubtitleModifications, SubMod
+from .registry import registry
+from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color
+from .main import SubtitleModifications, SubMod
@ -1,3 +1,3 @@
# coding=utf-8

-from data import data
+from .data import data
File diff suppressed because one or more lines are too long
@ -6,14 +6,14 @@ import pysubs2
import logging
import time

-from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
-from registry import registry
+from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
+from .registry import registry
from subzero.language import Language

logger = logging.getLogger(__name__)


-lowercase_re = re.compile(ur'(?sux)[a-zà-ž]')
+lowercase_re = re.compile(r'(?sux)[a-zà-ž]')


class SubtitleModifications(object):
@ -143,7 +143,7 @@ class SubtitleModifications(object):
continue

# clear empty args
-final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))
+final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.iteritems()))

_data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
if _data == mods_merged_log[identifier]["final_identifier"]:
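Tuple parameters were removed by PEP 3113, so `lambda (k, v): ...` no longer parses on Python 3; the pair has to be indexed instead. Note the patched line still calls `args.iteritems()`, which will also need to become `items()` before this runs on Python 3. Equivalent, self-contained forms:

    args = {"fps": 25.0, "empty": "", "lang": "en"}

    # indexing the (key, value) pair, as the patched line does:
    final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.items()))

    # an arguably clearer dict comprehension doing the same filtering:
    final_mod_args = {k: v for k, v in args.items() if v}
    print(final_mod_args)  # -> {'fps': 25.0, 'lang': 'en'}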
@ -180,7 +180,7 @@ class SubtitleModifications(object):
entries_used = 0
for entry in self.f:
entry_used = False
-for sub in entry.text.strip().split("\N"):
+for sub in entry.text.strip().split(r"\N"):
# skip HI bracket entries, those might actually be lowercase
sub = sub.strip()
for processor in registry.mods["remove_HI"].processors[:4]:
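Switching to `r"\N"` here is not just cosmetic: on Python 3, `"\N"` in a string literal must introduce a named escape like `"\N{BULLET}"` and is otherwise a SyntaxError, so the ASS/SSA line-break marker has to be spelled raw:

    text = "first line\\Nsecond line"  # \N written escaped so this runs anywhere
    for sub in text.split(r"\N"):      # raw string: matches backslash + N
        print(sub.strip())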
@ -272,7 +272,7 @@ class SubtitleModifications(object):
continue

skip_entry = False
-for line in t.split(ur"\N"):
+for line in t.split(r"\N"):
# don't bother the mods with surrounding tags
old_line = line
line = line.strip()
@ -377,7 +377,7 @@ class SubtitleModifications(object):
logger.debug(u"%d: %r -> ''", index, entry.text)
continue

-new_text = ur"\N".join(lines)
+new_text = r"\N".join(lines)

# cheap man's approach to avoid open tags
add_start_tags = []
@ -95,7 +95,7 @@ class SubtitleTextModification(SubtitleModification):
pass


-TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*"
+TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"
EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")

empty_line_post_processors = [
@ -22,10 +22,10 @@ class CommonFixes(SubtitleTextModification):

processors = [
# normalize hyphens
-NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
+NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),

# -- = em dash
-NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),
+NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"),

# line = _/-/\s
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
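For readers unfamiliar with subzero's processor classes: `NReProcessor` essentially wraps a compiled pattern plus a replacement under a name (exact wrapper behavior assumed from context). The hyphen normalization above boils down to a plain re.sub:

    import re

    # same pattern as CM_hyphens: map exotic hyphen codepoints to ASCII "-"
    hyphens = re.compile(r'(?u)([‑‐﹘﹣])')
    print(hyphens.sub("-", "non‑breaking‐hyphens﹘here"))  # -> non-breaking-hyphens-here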
@ -37,23 +37,23 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),

# fix music symbols
-NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
+NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
lambda x: u"♪ " if x.group(1) else u" ♪",
name="CM_music_symbols"),

# '' = "
-NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
+NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),

# double quotes instead of single quotes inside words
-NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"),
+NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),

# normalize quotes
-NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
+NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
name="CM_normalize_quotes"),

# normalize single quotes
-NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
+NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),

# remove leading ...
NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
@ -89,8 +89,8 @@ class CommonFixes(SubtitleTextModification):
# space before ending doublequote?

# replace uppercase I with lowercase L in words
-NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
-lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
+NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
+lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
name="CM_uppercase_i_in_word"),

# fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
@ -101,11 +101,11 @@ class CommonFixes(SubtitleTextModification):
name="CM_spaces_in_numbers"),

# uppercase after dot
-NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
-lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
+NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
+lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),

# remove double interpunction
-NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
+NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"),

@ -149,14 +149,14 @@ class ReverseRTL(SubtitleModification):

processors = [
# new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
-#NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
+#NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
# name="CM_RTL_reverse")
-NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
+NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
name="CM_RTL_reverse")
]


-split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)")
+split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")


class FixUppercase(SubtitleModification):
@ -26,71 +26,71 @@ class HearingImpaired(SubtitleTextModification):

processors = [
# full bracket entry, single or multiline; starting with brackets and ending with brackets
-FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
+FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
"", name="HI_brackets_full"),

# uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
# possibly with a dash in front; ignore anything ending with a quote
-NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
-ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
+NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
+r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
name="HI_before_colon_caps"),

# any text before colon (at least 3 chars); at start or after a sentence,
# possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
# a space is inside the text; ignore anything ending with a quote
-NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
-ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
+NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
+r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
lambda match:
match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
else "" if not match.group(1).startswith(" ") else " ",
name="HI_before_colon_noncaps"),

# brackets (only remove if at least 3 chars in brackets)
-NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
+NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
{"t": TAG}), "", name="HI_brackets"),

-#NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
+#NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
# "", name="HI_bracket_open_start"),

-#NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
+#NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
# name="HI_bracket_open_end"),

# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),

# starting text before colon (at least 3 chars)
-#NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
+#NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
# name="HI_before_colon"),


# text in brackets at start, after optional dash, before colon or at end of line
# fixme: may be too aggressive
-#NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
+#NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
# name="HI_brackets_special"),

# all caps line (at least 4 consecutive uppercase chars)
-NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
+NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
supported=lambda p: not p.only_uppercase),

# remove MAN:
-NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
+NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),

# dash in front
# NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),

# all caps at start before new sentence
-NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
+NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
]

post_processors = empty_line_post_processors
last_processors = [
# remove music symbols
-NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
+NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
"", name="HI_music_symbols_only"),

# remove music entries
-NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
+NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
"", name="HI_music"),
]

Some files were not shown because too many files have changed in this diff.