Mirror of https://github.com/morpheus65535/bazarr.git (synced 2025-04-23 22:27:17 -04:00)

Commit a7b40eaf79 - "WIP"
Parent: e7cb2a71e2
17 changed files with 75 additions and 99 deletions
@@ -202,7 +202,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
                                              directory=fld,
                                              chmod=chmod,
                                              # formats=("srt", "vtt")
-                                             path_decoder=force_unicode
+                                             path_decoder=None
                                              )
         except Exception as e:
             logging.exception('BAZARR Error saving subtitles file to disk for this file:' + path)
@@ -419,7 +419,6 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
         if not subtitle.is_valid():
             logging.exception('BAZARR No valid subtitles file found for this file: ' + path)
             return
         logging.debug('BAZARR Subtitles file downloaded for this file:' + path)
         try:
             score = round(subtitle.score / max_score * 100, 2)
             fld = get_target_folder(path)
@@ -17,7 +17,7 @@ class Color(_Color):
         return _Color.__new__(cls, r, g, b, a)
 
 #: Version of the pysubs2 library.
-VERSION = "0.2.3"
+VERSION = "0.2.4"
 
 
 PY3 = sys.version_info.major == 3
@@ -4,6 +4,7 @@ from .subrip import SubripFormat
 from .jsonformat import JSONFormat
 from .substation import SubstationFormat
 from .mpl2 import MPL2Format
+from .tmp import TmpFormat
 from .exceptions import *
 
 #: Dict mapping file extensions to format identifiers.

@@ -13,6 +14,7 @@ FILE_EXTENSION_TO_FORMAT_IDENTIFIER = {
     ".ssa": "ssa",
     ".sub": "microdvd",
     ".json": "json",
+    ".txt": "tmp",
 }
 
 #: Dict mapping format identifiers to implementations (FormatBase subclasses).

@@ -23,6 +25,7 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
     "microdvd": MicroDVDFormat,
     "json": JSONFormat,
     "mpl2": MPL2Format,
+    "tmp": TmpFormat,
 }
 
 def get_format_class(format_):
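Note: the two dicts above are pysubs2's whole format registry; a file extension resolves to a format identifier, and the identifier resolves to a FormatBase subclass. A minimal sketch of the lookup chain the new ".txt"/"tmp" entries plug into (assuming the vanilla pysubs2 0.2.4 layout):

    from pysubs2.formats import FILE_EXTENSION_TO_FORMAT_IDENTIFIER, get_format_class

    identifier = FILE_EXTENSION_TO_FORMAT_IDENTIFIER[".txt"]  # "tmp" after this change
    format_class = get_format_class(identifier)               # TmpFormat
    print(identifier, format_class.__name__)                  # tmp TmpFormat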
@@ -66,7 +66,14 @@ class SSAFile(MutableSequence):
                 be detected from the file, in which case you don't need
                 to specify it here (when given, this argument overrides
                 autodetection).
             kwargs: Extra options for the parser.
+                keep_unknown_html_tags (bool): This affects SubRip only (SRT),
+                    for other formats this argument is ignored.
+                    By default, HTML tags are converted to equivalent SubStation tags
+                    (eg. ``<i>`` to ``{\\i1}`` and any remaining tags are removed
+                    to keep the text clean. Set this parameter to ``True``
+                    if you want to pass through these tags (eg. ``<sub>``).
+                    This is useful if your output format is SRT and your player
+                    supports these tags.
 
         Returns:
             SSAFile

@@ -86,6 +93,7 @@ class SSAFile(MutableSequence):
         Example:
             >>> subs1 = pysubs2.load("subrip-subtitles.srt")
             >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976)
+            >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True)
 
         """
         with open(path, encoding=encoding) as fp:
@@ -56,7 +56,7 @@ class SSAStyle(object):
         self.encoding = 1 #: Charset
 
         for k, v in fields.items():
-            if k in self.FIELDS and v is not None:
+            if k in self.FIELDS:
                 setattr(self, k, v)
             else:
                 raise ValueError("SSAStyle has no field named %r" % k)
@@ -31,7 +31,7 @@ class SubripFormat(FormatBase):
         return "srt"
 
     @classmethod
-    def from_file(cls, subs, fp, format_, **kwargs):
+    def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
         timestamps = [] # (start, end)
         following_lines = [] # contains lists of lines following each timestamp
@@ -56,15 +56,15 @@ class SubripFormat(FormatBase):
             # Handle the general case.
             s = "".join(lines).strip()
             s = re.sub(r"\n+ *\d+ *$", "", s) # strip number of next subtitle
-            s = re.sub(r"< *i *>", r"{\i1}", s)
-            s = re.sub(r"< */ *i *>", r"{\i0}", s)
-            s = re.sub(r"< *s *>", r"{\s1}", s)
-            s = re.sub(r"< */ *s *>", r"{\s0}", s)
-            s = re.sub(r"< *u *>", "{\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
-            s = re.sub(r"< */ *u *>", "{\\u0}", s)
-            s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
             s = re.sub(r"\r", "", s) # convert newlines
-            s = re.sub(r"\n", r"\N", s) # convert newlines
+            s = re.sub(r"< *i *>", r"{\\i1}", s)
+            s = re.sub(r"< */ *i *>", r"{\\i0}", s)
+            s = re.sub(r"< *s *>", r"{\\s1}", s)
+            s = re.sub(r"< */ *s *>", r"{\\s0}", s)
+            s = re.sub(r"< *u *>", "{\\\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
+            s = re.sub(r"< */ *u *>", "{\\\\u0}", s)
+            if not keep_unknown_html_tags:
+                s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
+            s = re.sub(r"\n", r"\\N", s) # convert newlines
             return s
 
         subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
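Likely relevant context for the doubled backslashes: on Python 3.7+, re.sub() treats an unrecognized escape such as \i in the replacement template as an error ("bad escape"), and a doubled backslash in the template emits a single literal backslash in the output. A small sketch of the resulting behavior, including the new keep_unknown_html_tags switch (reduced to the italics rule only):

    import re

    def prepare_text(s, keep_unknown_html_tags=False):
        # "\\i1" in the template yields a literal "\i1" in the output; a bare
        # "\i1" raises re.error("bad escape \i") on Python 3.7 and newer.
        s = re.sub(r"< *i *>", r"{\\i1}", s)
        s = re.sub(r"< */ *i *>", r"{\\i0}", s)
        if not keep_unknown_html_tags:
            s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s)  # strip other HTML tags
        return s

    print(prepare_text("<i>Hi</i> <sub>there</sub>"))
    # {\i1}Hi{\i0} there
    print(prepare_text("<i>Hi</i> <sub>there</sub>", keep_unknown_html_tags=True))
    # {\i1}Hi{\i0} <sub>there</sub>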
@@ -145,7 +145,12 @@ class SubstationFormat(FormatBase):
 
     def string_to_field(f, v):
         if f in {"start", "end"}:
-            return timestamp_to_ms(TIMESTAMP.match(v).groups())
+            if v.startswith("-"):
+                # handle negative timestamps
+                v = v[1:]
+                return -timestamp_to_ms(TIMESTAMP.match(v).groups())
+            else:
+                return timestamp_to_ms(TIMESTAMP.match(v).groups())
         elif "color" in f:
             if format_ == "ass":
                 return ass_rgba_to_color(v)
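A worked example of the new negative-timestamp branch: strip the sign, parse the rest, negate the result. TIMESTAMP and timestamp_to_ms below are simplified stand-ins for the real ones in pysubs2.time:

    import re

    TIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[.,](\d{2})")  # h:mm:ss.cs

    def timestamp_to_ms(groups):
        h, m, s, cs = map(int, groups)
        return ((h * 60 + m) * 60 + s) * 1000 + cs * 10

    def parse(v):
        if v.startswith("-"):
            return -timestamp_to_ms(TIMESTAMP.match(v[1:]).groups())
        return timestamp_to_ms(TIMESTAMP.match(v).groups())

    print(parse("0:00:05.00"))    # 5000
    print(parse("-0:00:05.00"))   # -5000; previously TIMESTAMP.match() returned
                                  # None here and .groups() raised AttributeError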
@@ -184,22 +189,22 @@ class SubstationFormat(FormatBase):
             elif inside_info_section or inside_aegisub_section:
                 if line.startswith(";"): continue # skip comments
                 try:
-                    k, v = line.split(": ", 1)
+                    k, v = line.split(":", 1)
                     if inside_info_section:
-                        subs.info[k] = v
+                        subs.info[k] = v.strip()
                     elif inside_aegisub_section:
-                        subs.aegisub_project[k] = v
+                        subs.aegisub_project[k] = v.strip()
                 except ValueError:
                     pass
             elif line.startswith("Style:"):
-                _, rest = line.split(": ", 1)
+                _, rest = line.split(":", 1)
                 buf = rest.strip().split(",")
                 name, raw_fields = buf[0], buf[1:] # splat workaround for Python 2.7
                 field_dict = {f: string_to_field(f, v) for f, v in zip(STYLE_FIELDS[format_], raw_fields)}
                 sty = SSAStyle(**field_dict)
                 subs.styles[name] = sty
             elif line.startswith("Dialogue:") or line.startswith("Comment:"):
-                ev_type, rest = line.split(": ", 1)
+                ev_type, rest = line.split(":", 1)
                 raw_fields = rest.strip().split(",", len(EVENT_FIELDS[format_])-1)
                 field_dict = {f: string_to_field(f, v) for f, v in zip(EVENT_FIELDS[format_], raw_fields)}
                 field_dict["type"] = ev_type
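The split(": ", 1) → split(":", 1) plus strip() change makes section parsing tolerant of lines with no space after the colon; previously such lines raised ValueError (silently skipped in the branch wrapped in try/except, a crash elsewhere). For instance:

    line = "Title:Some Movie"      # no space after the colon

    # old: line.split(": ", 1) returns one element, so unpacking raises
    # ValueError: not enough values to unpack (expected 2, got 1)

    # new: split on the colon alone, then trim the value
    k, v = line.split(":", 1)
    print(k, v.strip())            # Title Some Movie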
@@ -49,6 +49,20 @@ def timestamp_to_ms(groups):
     ms += h * 3600000
     return ms
 
+def tmptimestamp_to_ms(groups):
+    """
+    Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.
+
+    Example:
+        >>> timestamp_to_ms(TIMESTAMP.match("0:00:01").groups())
+        1000
+
+    """
+    h, m, s = map(int, groups)
+    ms = s * 1000
+    ms += m * 60000
+    ms += h * 3600000
+    return ms
 def times_to_ms(h=0, m=0, s=0, ms=0):
     """
     Convert hours, minutes, seconds to milliseconds.
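tmptimestamp_to_ms is straight arithmetic on the (h, m, s) match groups; TMP timestamps carry no fractional part, hence no milliseconds term. For example, for "0:01:30":

    h, m, s = 0, 1, 30                         # groups of a TMP timestamp "0:01:30"
    print(s * 1000 + m * 60000 + h * 3600000)  # 90000 ms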
@@ -1,45 +0,0 @@
-# coding=utf-8
-
-from __future__ import print_function, division, unicode_literals
-import re
-from numbers import Number
-
-from pysubs2.time import times_to_ms
-from .formatbase import FormatBase
-from .ssaevent import SSAEvent
-from .ssastyle import SSAStyle
-
-
-# thanks to http://otsaloma.io/gaupol/doc/api/aeidon.files.mpl2_source.html
-MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*?)$")
-
-
-class TXTGenericFormat(FormatBase):
-    @classmethod
-    def guess_format(cls, text):
-        if MPL2_FORMAT.match(text):
-            return "mpl2"
-
-
-class MPL2Format(FormatBase):
-    @classmethod
-    def guess_format(cls, text):
-        return TXTGenericFormat.guess_format(text)
-
-    @classmethod
-    def from_file(cls, subs, fp, format_, **kwargs):
-        def prepare_text(lines):
-            out = []
-            for s in lines.split("|"):
-                if s.startswith("/"):
-                    out.append(r"{\i1}%s{\i0}" % s[1:])
-                    continue
-                out.append(s)
-            return "\n".join(out)
-
-        subs.events = [SSAEvent(start=times_to_ms(s=float(start) / 10), end=times_to_ms(s=float(end) / 10),
-                                text=prepare_text(text)) for start, end, text in MPL2_FORMAT.findall(fp.getvalue())]
-
-    @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
-        raise NotImplemented
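For reference, the deleted module parsed MPL2, where times are deciseconds and a leading slash marks an italic line. A standalone sketch of the same logic (inline regex flags moved to the front of the pattern, which Python 3.11+ requires):

    import re

    MPL2_FORMAT = re.compile(r"(?um)^\[(-?\d+)\]\[(-?\d+)\](.*?)$")

    def mpl2_to_events(text):
        events = []
        for start, end, body in MPL2_FORMAT.findall(text):
            lines = []
            for part in body.split("|"):          # "|" separates on-screen lines
                if part.startswith("/"):          # leading "/" means italics
                    part = r"{\i1}%s{\i0}" % part[1:]
                lines.append(part)
            # deciseconds -> milliseconds
            events.append((int(start) * 100, int(end) * 100, "\n".join(lines)))
        return events

    print(mpl2_to_events("[100][150]/Hello|world"))
    # [(10000, 15000, '{\\i1}Hello{\\i0}\nworld')]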
@@ -854,8 +854,8 @@ def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=Non
         logger.debug(u"Saving %r to %r", subtitle, subtitle_path)
         content = subtitle.get_modified_content(format=format, debug=debug_mods)
         if content:
-            with open(subtitle_path, 'w') as f:
-                f.write(content.decode('utf-8'))
+            with open(subtitle_path, 'wb') as f:
+                f.write(content)
             subtitle.storage_path = subtitle_path
         else:
             logger.error(u"Something went wrong when getting modified subtitle for %s", subtitle)
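The 'w' → 'wb' switch writes the subtitle bytes exactly as get_modified_content() produced them. Decoding first and re-encoding through a text-mode handle depends on the platform default encoding, which can reject or corrupt non-ASCII subtitles; a small illustration, assuming UTF-8 content:

    content = "Caf\u00e9\n".encode("utf-8")   # bytes, as produced by the pipeline

    # old: text mode re-encodes with the locale encoding (cp1252, ascii, ...)
    with open("subtitle.srt", "w") as f:      # may raise UnicodeEncodeError
        f.write(content.decode("utf-8"))

    # new: the bytes go to disk unchanged
    with open("subtitle.srt", "wb") as f:
        f.write(content)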
@@ -148,7 +148,7 @@ class CFSession(CloudScraper):
             cache_key = "cf_data3_%s" % domain
 
             if not self.cookies.get("cf_clearance", "", domain=domain):
-                cf_data = region.get(cache_key)
+                cf_data = str(region.get(cache_key))
                 if cf_data is not NO_VALUE:
                     cf_cookies, hdrs = cf_data
                     logger.debug("Trying to use old cf data for %s: %s", domain, cf_data)
@@ -165,9 +165,9 @@ class CFSession(CloudScraper):
                     pass
                 else:
                     if cf_data and "cf_clearance" in cf_data[0] and cf_data[0]["cf_clearance"]:
-                        if cf_data != region.get(cache_key):
+                        if cf_data != str(region.get(cache_key)):
                             logger.debug("Storing cf data for %s: %s", domain, cf_data)
-                            region.set(cache_key, cf_data)
+                            region.set(cache_key, bytearray(cf_data, encoding='utf-8'))
                         elif cf_data[0]["cf_clearance"]:
                             logger.debug("CF Live tokens not updated")
 
@@ -257,4 +257,4 @@ def load_verification(site_name, session, callback=lambda x: None):
 
 
 def store_verification(site_name, session):
-    region.set("%s_data" % site_name, session.cookies._cookies, session.headers["User-Agent"])
+    region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"]))
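dogpile.cache's CacheRegion.set() takes one key and one value, so the old three-argument call would raise a TypeError; packing the cookie jar and User-Agent into a tuple stores both under a single key. A sketch of the intended round-trip with an in-memory region (key name illustrative):

    from dogpile.cache import make_region

    region = make_region().configure("dogpile.cache.memory")

    # store_verification packs both values into one tuple
    region.set("example_data", ({"cookies": "..."}, "Mozilla/5.0"))

    # load_verification unpacks them again
    cookies, user_agent = region.get("example_data")
    print(user_agent)   # Mozilla/5.0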
@@ -104,11 +104,11 @@ class Addic7edProvider(_Addic7edProvider):
         tries = 0
         while tries < 3:
             r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url})
-            if "grecaptcha" in r.content:
+            if "grecaptcha" in r.text:
                 logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
                             'happen once every so often')
 
-                site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1)
+                site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.text).group(1)
                 if not site_key:
                     logger.error("Addic7ed: Captcha site-key not found!")
                     return
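All of the r.content → r.text changes in this file are Python 3 fixes: in requests, Response.content is bytes while Response.text is str, so a str membership test against bytes raises TypeError. Quick demonstration with plain objects:

    content = b"<html>grecaptcha</html>"   # what r.content returns
    text = content.decode("utf-8")         # roughly what r.text returns

    try:
        "grecaptcha" in content
    except TypeError as e:
        print(e)                           # a bytes-like object is required, not 'str'

    print("grecaptcha" in text)            # True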
@@ -127,11 +127,11 @@ class Addic7edProvider(_Addic7edProvider):
             r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10,
                                   headers={"Referer": self.server_url + "login.php"})
 
-            if "relax, slow down" in r.content:
+            if "relax, slow down" in r.text:
                 raise TooManyRequests(self.username)
 
             if r.status_code != 302:
-                if "User <b></b> doesn't exist" in r.content and tries <= 2:
+                if "User <b></b> doesn't exist" in r.text and tries <= 2:
                     logger.info("Addic7ed: Error, trying again. (%s/%s)", tries+1, 3)
                     tries += 1
                     continue
@@ -208,8 +208,8 @@ class Addic7edProvider(_Addic7edProvider):
         if show_cells:
             soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
         else:
-            # If RegEx fails, fall back to original r.content and use 'html.parser'
-            soup = ParserBeautifulSoup(r.content, ['html.parser'])
+            # If RegEx fails, fall back to original r.text and use 'html.parser'
+            soup = ParserBeautifulSoup(r.text, ['html.parser'])
 
         # populate the show ids
         show_ids = {}
@@ -265,7 +265,7 @@ class Addic7edProvider(_Addic7edProvider):
             r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers)
             r.raise_for_status()
 
-            if r.content and "Sorry, your search" not in r.content:
+            if r.text and "Sorry, your search" not in r.text:
                 break
 
             time.sleep(4)
@@ -273,7 +273,7 @@ class Addic7edProvider(_Addic7edProvider):
         if r.status_code == 304:
             raise TooManyRequests()
 
-        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
+        soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])
 
         suggestion = None
 
@@ -315,13 +315,13 @@ class Addic7edProvider(_Addic7edProvider):
         if r.status_code == 304:
             raise TooManyRequests()
 
-        if not r.content:
+        if not r.text:
             # Provider wrongful return a status of 304 Not Modified with an empty content
             # raise_for_status won't raise exception for that status code
             logger.error('No data returned from provider')
             return []
 
-        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
+        soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])
 
         # loop over subtitle rows
         subtitles = []
@@ -364,7 +364,7 @@ class Addic7edProvider(_Addic7edProvider):
         if r.status_code == 304:
             raise TooManyRequests()
 
-        if not r.content:
+        if not r.text:
             # Provider wrongful return a status of 304 Not Modified with an empty content
             # raise_for_status won't raise exception for that status code
             logger.error('Unable to download subtitle. No data returned from provider')
@@ -116,7 +116,7 @@ class HosszupuskaSubtitle(Subtitle):
         if video.format and self.version and video.format.lower() in self.version.lower():
             matches.add('format')
         # other properties
-        matches |= guess_matches(video, guessit(self.release_info.encode("utf-8")))
+        matches |= guess_matches(video, guessit(self.release_info))
 
         return matches
 
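Same bytes-versus-str cleanup: guessit expects a text string on Python 3, so the explicit .encode("utf-8") is dropped. Roughly:

    from guessit import guessit

    print(guessit("Movie.Name.2018.720p.WEB-DL.x264"))
    # e.g. MatchesDict([('title', 'Movie Name'), ('year', 2018), ...])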
@@ -199,7 +199,7 @@ class LegendasTVProvider(_LegendasTVProvider):
 
             # attempt to get the releases from the cache
             cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
-            releases = region.get(cache_key, expiration_time=expiration_time)
+            releases = str(region.get(cache_key, expiration_time=expiration_time))
 
             # the releases are not in cache or cache is expired
             if releases == NO_VALUE:
@@ -226,7 +226,7 @@ class LegendasTVProvider(_LegendasTVProvider):
                     releases.append(name)
 
                 # cache the releases
-                region.set(cache_key, releases)
+                region.set(cache_key, bytearray(releases, encoding='utf-8'))
 
             # iterate over releases
             for r in releases:
@@ -158,13 +158,5 @@ class ProviderSubtitleArchiveMixin(object):
         elif subs_fallback:
             matching_sub = subs_fallback[0]
 
-        try:
-            matching_sub_unicode = matching_sub.decode("utf-8")
-        except UnicodeDecodeError:
-            try:
-                matching_sub_unicode = matching_sub.decode("cp437")
-            except UnicodeDecodeError:
-                matching_sub_unicode = matching_sub.decode("utf-8", errors='replace')
-
-        logger.info(u"Using %s from the archive", matching_sub_unicode)
+        logger.info(u"Using %s from the archive", matching_sub)
         return fix_line_ending(archive.read(matching_sub))
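On Python 3 the archive member names returned by zipfile (and rarfile) are already str, so the removed .decode() ladder would fail with AttributeError; the name can be logged as-is. For example:

    import io, zipfile

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("movie.srt", "1\n00:00:01,000 --> 00:00:02,000\nHi\n")

    with zipfile.ZipFile(buf) as archive:
        name = archive.namelist()[0]        # str on Python 3, no .decode() needed
        print(type(name).__name__, name)    # str movie.srt
        data = archive.read(name)           # the file payload is still bytes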
@@ -141,7 +141,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
         logger.info("Creating session")
         self.session = RetryingCFSession()
 
-        prev_cookies = region.get("subscene_cookies2")
+        prev_cookies = str(region.get("subscene_cookies2"))
         if prev_cookies != NO_VALUE:
             logger.debug("Re-using old subscene cookies: %r", prev_cookies)
             self.session.cookies.update(prev_cookies)
@@ -194,7 +194,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
                     del cj[cn]
 
                 logger.debug("Storing cookies: %r", cj)
-                region.set("subscene_cookies2", cj)
+                region.set("subscene_cookies2", bytearray(cj, encoding='utf-8'))
                 return
         raise ProviderError("Something went wrong when trying to log in #1")
 
@@ -219,9 +219,9 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
             acc_filters["SelectedIds"] = selected_ids
             self.filters["LanguageFilter"] = ",".join(acc_filters["SelectedIds"])
 
-            last_filters = region.get("subscene_filters")
+            last_filters = str(region.get("subscene_filters"))
             if last_filters != acc_filters:
-                region.set("subscene_filters", acc_filters)
+                region.set("subscene_filters", bytearray(acc_filters, encoding='utf-8'))
                 logger.debug("Setting account filters to %r", acc_filters)
                 self.session.post("https://u.subscene.com/filter", acc_filters, allow_redirects=False)
 