py3: More unicode porting

2025-08-30 23:00:21 -04:00 · 2019-05-29 17:55:26 +05:30 · 2019-05-29 17:55:26 +05:30 · 4730fce41b
commit 4730fce41b
parent 06dc7dd15b
8 changed files with 106 additions and 96 deletions
--- a/src/calibre/utils/formatter.py
+++ b/src/calibre/utils/formatter.py
@ -1,3 +1,4 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
 '''
 Created on 23 Sep 2010

@ -23,7 +24,7 @@ class _Parser(object):
    LEX_NUM = 4
    LEX_EOF = 5

-    LEX_CONSTANTS = frozenset([LEX_STR, LEX_NUM])
+    LEX_CONSTANTS = frozenset((LEX_STR, LEX_NUM))

    def __init__(self, val, prog, funcs, parent):
        self.lex_pos = 0
@ -205,7 +206,7 @@ class TemplateFormatter(string.Formatter):
        elif 'bcdoxXn'.find(typ) >= 0:
            try:
                val = int(val)
-            except:
+            except Exception:
                raise ValueError(
                    _('format: type {0} requires an integer value, got {1}').format(typ, val))
        elif 'eEfFgGn%'.find(typ) >= 0:
--- a/src/calibre/utils/img.py
+++ b/src/calibre/utils/img.py
@ -18,12 +18,12 @@ from PyQt5.QtCore import QBuffer, QByteArray, Qt
 from PyQt5.QtGui import QColor, QImage, QImageReader, QImageWriter, QPixmap, QTransform

 from calibre import fit_image, force_unicode
-from calibre.constants import iswindows, plugins
+from calibre.constants import iswindows, plugins, ispy3
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.config_base import tweaks
 from calibre.utils.filenames import atomic_rename
 from calibre.utils.imghdr import what
-from polyglot.builtins import string_or_bytes
+from polyglot.builtins import string_or_bytes, unicode_type

 # Utilities {{{
 imageops, imageops_err = plugins['imageops']
@ -465,11 +465,11 @@ def run_optimizer(file_path, cmd, as_filter=False, input_data=None):
            cmd[cmd.index(q)] = r
        if not as_filter:
            repl(True, iname), repl(False, oname)
-        if iswindows:
+        if iswindows and not ispy3:
            # subprocess in python 2 cannot handle unicode strings that are not
            # encodeable in mbcs, so we fail here, where it is more explicit,
            # instead.
-            cmd = [x.encode('mbcs') if isinstance(x, type('')) else x for x in cmd]
+            cmd = [x.encode('mbcs') if isinstance(x, unicode_type) else x for x in cmd]
            if isinstance(cwd, type('')):
                cwd = cwd.encode('mbcs')
        stdin = subprocess.PIPE if as_filter else None
@ -534,7 +534,7 @@ def encode_jpeg(file_path, quality=80):
    from calibre.utils.speedups import ReadOnlyFileBuffer
    quality = max(0, min(100, int(quality)))
    exe = get_exe_path('cjpeg')
-    cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [str(quality)]
+    cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [unicode_type(quality)]
    img = QImage()
    if not img.load(file_path):
        raise ValueError('%s is not a valid image file' % file_path)
--- a/src/calibre/utils/matcher.py
+++ b/src/calibre/utils/matcher.py
@ -16,7 +16,7 @@ from itertools import islice
 from calibre import detect_ncpus as cpu_count, as_unicode
 from calibre.constants import plugins, filesystem_encoding
 from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
-from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd
+from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd, unicode_type
 from polyglot.queue import Queue

 DEFAULT_LEVEL1 = '/'
@ -294,12 +294,12 @@ def test(return_tests=False):

            start = memory()
            for i in range(10):
-                doit(str(i))
+                doit(unicode_type(i))
            gc.collect()
            used10 = memory() - start
            start = memory()
            for i in range(100):
-                doit(str(i))
+                doit(unicode_type(i))
            gc.collect()
            used100 = memory() - start
            if used100 > 0 and used10 > 0:
--- a/src/calibre/utils/monotonic.py
+++ b/src/calibre/utils/monotonic.py
@ -1,4 +1,5 @@
 # vim:fileencoding=utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals

 try:
    from time import monotonic
--- a/src/calibre/utils/serve_coffee.py
+++ b/src/calibre/utils/serve_coffee.py
@ -16,7 +16,7 @@ from threading import Lock, local

 from polyglot import socketserver
 from polyglot.http_server import HTTPServer, SimpleHTTPRequestHandler
-from polyglot.builtins import error_message, getcwd
+from polyglot.builtins import error_message, getcwd, unicode_type

 # Compiler {{{

@ -107,9 +107,9 @@ class HTTPRequestHandler(SimpleHTTPRequestHandler):  # {{{
        self.send_response(rtype)
        self.send_header("Accept-Ranges", "bytes")
        self.send_header("Content-Range", 'bytes ' +
-                         str(start_range) + '-' + str(end_range - 1) + '/' + str(size))
-        self.send_header("Content-Type", str(mimetype))
-        self.send_header("Content-Length", str(end_range - start_range))
+                         unicode_type(start_range) + '-' + unicode_type(end_range - 1) + '/' + unicode_type(size))
+        self.send_header("Content-Type", unicode_type(mimetype))
+        self.send_header("Content-Length", unicode_type(end_range - start_range))
        self.send_header("Last-Modified", self.date_time_string(int(mtime)))
        self.end_headers()
        return f, start_range, end_range
--- a/src/calibre/utils/smartypants.py
+++ b/src/calibre/utils/smartypants.py
@ -1,6 +1,8 @@
 #!/usr/bin/python2
 # vim:fileencoding=utf-8

+from __future__ import absolute_import, division, print_function, unicode_literals
+
 __author__ = "Chad Miller <smartypantspy@chad.org>, Kovid Goyal <kovid at kovidgoyal.net>"
 __description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"

@ -525,7 +527,7 @@ def smartyPants(text, attr='1'):
    return "".join(result)


-def educateQuotes(str):
+def educateQuotes(text):
    """
    Parameter:  String.

@ -539,32 +541,32 @@ def educateQuotes(str):

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break. Close the quotes by brute force:
-    str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""&#8217;""", str)
-    str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""&#8221;""", str)
+    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""&#8217;""", text)
+    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""&#8221;""", text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
-    str = re.sub(r""""'(?=\w)""", """&#8220;&#8216;""", str)
-    str = re.sub(r"""'"(?=\w)""", """&#8216;&#8220;""", str)
-    str = re.sub(r'''""(?=\w)''', """&#8220;&#8220;""", str)
-    str = re.sub(r"""''(?=\w)""", """&#8216;&#8216;""", str)
-    str = re.sub(r'''\"\'''',     """&#8221;&#8217;""", str)
-    str = re.sub(r'''\'\"''',     """&#8217;&#8221;""", str)
-    str = re.sub(r'''""''',       """&#8221;&#8221;""", str)
-    str = re.sub(r"""''""",       """&#8217;&#8217;""", str)
+    text = re.sub(r""""'(?=\w)""", """&#8220;&#8216;""", text)
+    text = re.sub(r"""'"(?=\w)""", """&#8216;&#8220;""", text)
+    text = re.sub(r'''""(?=\w)''', """&#8220;&#8220;""", text)
+    text = re.sub(r"""''(?=\w)""", """&#8216;&#8216;""", text)
+    text = re.sub(r'''\"\'''',     """&#8221;&#8217;""", text)
+    text = re.sub(r'''\'\"''',     """&#8217;&#8221;""", text)
+    text = re.sub(r'''""''',       """&#8221;&#8221;""", text)
+    text = re.sub(r"""''""",       """&#8217;&#8217;""", text)

    # Special case for decade abbreviations (the '80s --> ’80s):
    # See http://practicaltypography.com/apostrophes.html
-    str = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1&#8217;""", str)
+    text = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1&#8217;""", text)
    # Measurements in feet and inches or longitude/latitude: 19' 43.5" --> 19′ 43.5″
-    str = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2&#8242;\3&#8243;', str)
+    text = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2&#8242;\3&#8243;', text)

    # Special case for Quotes at inside of other entities, e.g.:
    #   <p>A double quote--"within dashes"--would be nice.</p>
-    str = re.sub(r"""(?<=\W)"(?=\w)""", r"""&#8220;""", str)
-    str = re.sub(r"""(?<=\W)'(?=\w)""", r"""&#8216;""", str)
-    str = re.sub(r"""(?<=\w)"(?=\W)""", r"""&#8221;""", str)
-    str = re.sub(r"""(?<=\w)'(?=\W)""", r"""&#8217;""", str)
+    text = re.sub(r"""(?<=\W)"(?=\w)""", r"""&#8220;""", text)
+    text = re.sub(r"""(?<=\W)'(?=\w)""", r"""&#8216;""", text)
+    text = re.sub(r"""(?<=\w)"(?=\W)""", r"""&#8221;""", text)
+    text = re.sub(r"""(?<=\w)'(?=\W)""", r"""&#8217;""", text)

    # The following are commented out as smartypants tokenizes text by
    # stripping out html tags. Therefore, there is no guarantee that the
@ -572,12 +574,12 @@ def educateQuotes(str):
    # meaningful

    # Special case for Quotes at end of line with a preceding space (may change just to end of line)
-    # str = re.sub(r"""(?<=\s)"$""", r"""&#8221;""", str)
-    # str = re.sub(r"""(?<=\s)'$""", r"""&#8217;""", str)
+    # text = re.sub(r"""(?<=\s)"$""", r"""&#8221;""", text)
+    # text = re.sub(r"""(?<=\s)'$""", r"""&#8217;""", text)

    # Special case for Quotes at beginning of line with a space - multiparagraph quoted text:
-    # str = re.sub(r"""^"(?=\s)""", r"""&#8220;""", str)
-    # str = re.sub(r"""^'(?=\s)""", r"""&#8216;""", str)
+    # text = re.sub(r"""^"(?=\s)""", r"""&#8220;""", text)
+    # text = re.sub(r"""^'(?=\s)""", r"""&#8216;""", text)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""&#8211;|&#8212;"""
@ -595,24 +597,24 @@ def educateQuotes(str):
            '                 # the quote
            (?=\w)            # followed by a word character
            """ % (dec_dashes,), re.VERBOSE)
-    str = opening_single_quotes_regex.sub(r"""\1&#8216;""", str)
+    text = opening_single_quotes_regex.sub(r"""\1&#8216;""", text)

    closing_single_quotes_regex = re.compile(r"""
            (%s)
            '
            (?!\s | s\b | \d)
            """ % (close_class,), re.VERBOSE)
-    str = closing_single_quotes_regex.sub(r"""\1&#8217;""", str)
+    text = closing_single_quotes_regex.sub(r"""\1&#8217;""", text)

    closing_single_quotes_regex = re.compile(r"""
            (%s)
            '
            (\s | s\b)
            """ % (close_class,), re.VERBOSE)
-    str = closing_single_quotes_regex.sub(r"""\1&#8217;\2""", str)
+    text = closing_single_quotes_regex.sub(r"""\1&#8217;\2""", text)

    # Any remaining single quotes should be opening ones:
-    str = re.sub(r"""'""", r"""&#8216;""", str)
+    text = re.sub(r"""'""", r"""&#8216;""", text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(r"""
@ -627,7 +629,7 @@ def educateQuotes(str):
            "                 # the quote
            (?=\w)            # followed by a word character
            """ % (dec_dashes,), re.VERBOSE)
-    str = opening_double_quotes_regex.sub(r"""\1&#8220;""", str)
+    text = opening_double_quotes_regex.sub(r"""\1&#8220;""", text)

    # Double closing quotes:
    closing_double_quotes_regex = re.compile(r"""
@ -635,25 +637,25 @@ def educateQuotes(str):
            "
            (?=\s)
            """ % (close_class,), re.VERBOSE)
-    str = closing_double_quotes_regex.sub(r"""&#8221;""", str)
+    text = closing_double_quotes_regex.sub(r"""&#8221;""", text)

    closing_double_quotes_regex = re.compile(r"""
            (%s)   # character that indicates the quote should be closing
            "
            """ % (close_class,), re.VERBOSE)
-    str = closing_double_quotes_regex.sub(r"""\1&#8221;""", str)
+    text = closing_double_quotes_regex.sub(r"""\1&#8221;""", text)

-    if str.endswith('-"'):
+    if text.endswith('-"'):
        # A string that endswith -" is sometimes used for dialogue
-        str = str[:-1] + '&#8221;'
+        text = text[:-1] + '&#8221;'

    # Any remaining quotes should be opening ones.
-    str = re.sub(r'"', r"""&#8220;""", str)
+    text = re.sub(r'"', r"""&#8220;""", text)

-    return str
+    return text


-def educateBackticks(str):
+def educateBackticks(text):
    """
    Parameter:  String.
    Returns:    The string, with ``backticks'' -style double quotes
@ -662,12 +664,12 @@ def educateBackticks(str):
    Example output: &#8220;Isn't this fun?&#8221;
    """

-    str = re.sub(r"""``""", r"""&#8220;""", str)
-    str = re.sub(r"""''""", r"""&#8221;""", str)
-    return str
+    text = re.sub(r"""``""", r"""&#8220;""", text)
+    text = re.sub(r"""''""", r"""&#8221;""", text)
+    return text


-def educateSingleBackticks(str):
+def educateSingleBackticks(text):
    """
    Parameter:  String.
    Returns:    The string, with `backticks' -style single quotes
@ -677,12 +679,12 @@ def educateSingleBackticks(str):
    Example output: &#8216;Isn&#8217;t this fun?&#8217;
    """

-    str = re.sub(r"""`""", r"""&#8216;""", str)
-    str = re.sub(r"""'""", r"""&#8217;""", str)
-    return str
+    text = re.sub(r"""`""", r"""&#8216;""", text)
+    text = re.sub(r"""'""", r"""&#8217;""", text)
+    return text


-def educateDashes(str):
+def educateDashes(text):
    """
    Parameter:  String.

@ -690,12 +692,12 @@ def educateDashes(str):
                an em-dash HTML entity.
    """

-    str = re.sub(r"""---""", r"""&#8211;""", str)  # en  (yes, backwards)
-    str = re.sub(r"""--""", r"""&#8212;""", str)  # em (yes, backwards)
-    return str
+    text = re.sub(r"""---""", r"""&#8211;""", text)  # en  (yes, backwards)
+    text = re.sub(r"""--""", r"""&#8212;""", text)  # em (yes, backwards)
+    return text


-def educateDashesOldSchool(str):
+def educateDashesOldSchool(text):
    """
    Parameter:  String.

@ -704,12 +706,12 @@ def educateDashesOldSchool(str):
                an em-dash HTML entity.
    """

-    str = re.sub(r"""---""", r"""&#8212;""", str)    # em (yes, backwards)
-    str = re.sub(r"""--""", r"""&#8211;""", str)    # en (yes, backwards)
-    return str
+    text = re.sub(r"""---""", r"""&#8212;""", text)    # em (yes, backwards)
+    text = re.sub(r"""--""", r"""&#8211;""", text)    # en (yes, backwards)
+    return text


-def educateDashesOldSchoolInverted(str):
+def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String.

@ -724,12 +726,12 @@ def educateDashesOldSchoolInverted(str):
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
-    str = re.sub(r"""---""", r"""&#8211;""", str)    # em
-    str = re.sub(r"""--""", r"""&#8212;""", str)    # en
-    return str
+    text = re.sub(r"""---""", r"""&#8211;""", text)    # em
+    text = re.sub(r"""--""", r"""&#8212;""", text)    # en
+    return text


-def educateEllipses(str):
+def educateEllipses(text):
    """
    Parameter:  String.
    Returns:    The string, with each instance of "..." translated to
@ -739,12 +741,12 @@ def educateEllipses(str):
    Example output: Huh&#8230;?
    """

-    str = re.sub(r"""\.\.\.""", r"""&#8230;""", str)
-    str = re.sub(r"""\. \. \.""", r"""&#8230;""", str)
-    return str
+    text = re.sub(r"""\.\.\.""", r"""&#8230;""", text)
+    text = re.sub(r"""\. \. \.""", r"""&#8230;""", text)
+    return text


-def stupefyEntities(str):
+def stupefyEntities(text):
    """
    Parameter:  String.
    Returns:    The string, with each SmartyPants HTML entity translated to
@ -754,21 +756,21 @@ def stupefyEntities(str):
    Example output: "Hello -- world."
    """

-    str = re.sub(r"""&#8211;""", r"""-""", str)  # en-dash
-    str = re.sub(r"""&#8212;""", r"""--""", str)  # em-dash
+    text = re.sub(r"""&#8211;""", r"""-""", text)  # en-dash
+    text = re.sub(r"""&#8212;""", r"""--""", text)  # em-dash

-    str = re.sub(r"""&#8216;""", r"""'""", str)  # open single quote
-    str = re.sub(r"""&#8217;""", r"""'""", str)  # close single quote
+    text = re.sub(r"""&#8216;""", r"""'""", text)  # open single quote
+    text = re.sub(r"""&#8217;""", r"""'""", text)  # close single quote

-    str = re.sub(r"""&#8220;""", r'''"''', str)  # open double quote
-    str = re.sub(r"""&#8221;""", r'''"''', str)  # close double quote
+    text = re.sub(r"""&#8220;""", r'''"''', text)  # open double quote
+    text = re.sub(r"""&#8221;""", r'''"''', text)  # close double quote

-    str = re.sub(r"""&#8230;""", r"""...""", str)  # ellipsis
+    text = re.sub(r"""&#8230;""", r"""...""", text)  # ellipsis

-    return str
+    return text


-def processEscapes(str):
+def processEscapes(text):
    r"""
    Parameter:  String.
    Returns:    The string, after processing the following backslash
@ -784,17 +786,17 @@ def processEscapes(str):
                \-      &#45;
                \`      &#96;
    """
-    str = re.sub(r"""\\\\""", r"""&#92;""", str)
-    str = re.sub(r'''\\"''', r"""&#34;""", str)
-    str = re.sub(r"""\\'""", r"""&#39;""", str)
-    str = re.sub(r"""\\\.""", r"""&#46;""", str)
-    str = re.sub(r"""\\-""", r"""&#45;""", str)
-    str = re.sub(r"""\\`""", r"""&#96;""", str)
+    text = re.sub(r"""\\\\""", r"""&#92;""", text)
+    text = re.sub(r'''\\"''', r"""&#34;""", text)
+    text = re.sub(r"""\\'""", r"""&#39;""", text)
+    text = re.sub(r"""\\\.""", r"""&#46;""", text)
+    text = re.sub(r"""\\-""", r"""&#45;""", text)
+    text = re.sub(r"""\\`""", r"""&#96;""", text)

-    return str
+    return text


-def _tokenize(str):
+def _tokenize(html):
    """
    Parameter:  String containing HTML markup.
    Returns:    Reference to an array of the tokens comprising the input
@ -817,7 +819,7 @@ def _tokenize(str):
    # %s  # nested tags       """ % (nested_tags,)
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

-    token_match = tag_soup.search(str)
+    token_match = tag_soup.search(html)

    previous_end = 0
    while token_match is not None:
@ -827,10 +829,10 @@ def _tokenize(str):
        tokens.append(['tag', token_match.group(2)])

        previous_end = token_match.end()
-        token_match = tag_soup.search(str, token_match.end())
+        token_match = tag_soup.search(html, token_match.end())

-    if previous_end < len(str):
-        tokens.append(['text', str[previous_end:]])
+    if previous_end < len(html):
+        tokens.append(['text', html[previous_end:]])

    return tokens

--- a/src/calibre/utils/text2int.py
+++ b/src/calibre/utils/text2int.py
@ -1,4 +1,7 @@
 #!/usr/bin/env python2
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 __author__ = "stackoverflow community"
 __docformat__ = 'restructuredtext en'
 """
--- a/src/calibre/utils/wordcount.py
+++ b/src/calibre/utils/wordcount.py
@ -1,5 +1,8 @@
 #!/usr/bin/python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 """
 Get word, character, and Asian character counts

@ -44,7 +47,7 @@ def filter_jchars(c):


 def nonj_len(word):
-    u"""Returns number of non-Asian words in {word}
+    """Returns number of non-Asian words in {word}
    - 日本語AアジアンB -> 2
    - hello -> 1
    @param word: A word, possibly containing Asian characters
@ -56,7 +59,7 @@ def nonj_len(word):
    # -> ['spam', 'eggs']
    # The length of which is 2!
    chars = [filter_jchars(c) for c in word]
-    return len(u''.join(chars).split())
+    return len(''.join(chars).split())


 def get_wordcount(text):
@ -66,8 +69,8 @@ def get_wordcount(text):
    """

    characters = len(text)
-    chars_no_spaces = sum([not x.isspace() for x in text])
-    asian_chars =  sum([is_asian(x) for x in text])
+    chars_no_spaces = sum(not x.isspace() for x in text)
+    asian_chars =  sum(is_asian(x) for x in text)
    non_asian_words = nonj_len(text)
    words = non_asian_words + asian_chars