From 4730fce41ba953b459a298b246f19d7a298c35f6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 29 May 2019 17:55:26 +0530 Subject: [PATCH] py3: More unicode porting --- src/calibre/utils/formatter.py | 5 +- src/calibre/utils/img.py | 10 +- src/calibre/utils/matcher.py | 6 +- src/calibre/utils/monotonic.py | 1 + src/calibre/utils/serve_coffee.py | 8 +- src/calibre/utils/smartypants.py | 158 +++++++++++++++--------------- src/calibre/utils/text2int.py | 3 + src/calibre/utils/wordcount.py | 11 ++- 8 files changed, 106 insertions(+), 96 deletions(-) diff --git a/src/calibre/utils/formatter.py b/src/calibre/utils/formatter.py index e0ccd216d5..a82e08c5a9 100644 --- a/src/calibre/utils/formatter.py +++ b/src/calibre/utils/formatter.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import, division, print_function, unicode_literals ''' Created on 23 Sep 2010 @@ -23,7 +24,7 @@ class _Parser(object): LEX_NUM = 4 LEX_EOF = 5 - LEX_CONSTANTS = frozenset([LEX_STR, LEX_NUM]) + LEX_CONSTANTS = frozenset((LEX_STR, LEX_NUM)) def __init__(self, val, prog, funcs, parent): self.lex_pos = 0 @@ -205,7 +206,7 @@ class TemplateFormatter(string.Formatter): elif 'bcdoxXn'.find(typ) >= 0: try: val = int(val) - except: + except Exception: raise ValueError( _('format: type {0} requires an integer value, got {1}').format(typ, val)) elif 'eEfFgGn%'.find(typ) >= 0: diff --git a/src/calibre/utils/img.py b/src/calibre/utils/img.py index af9a33b24a..e2e32f4c9c 100644 --- a/src/calibre/utils/img.py +++ b/src/calibre/utils/img.py @@ -18,12 +18,12 @@ from PyQt5.QtCore import QBuffer, QByteArray, Qt from PyQt5.QtGui import QColor, QImage, QImageReader, QImageWriter, QPixmap, QTransform from calibre import fit_image, force_unicode -from calibre.constants import iswindows, plugins +from calibre.constants import iswindows, plugins, ispy3 from calibre.ptempfile import TemporaryDirectory from calibre.utils.config_base import tweaks from calibre.utils.filenames import atomic_rename from calibre.utils.imghdr import what -from polyglot.builtins import string_or_bytes +from polyglot.builtins import string_or_bytes, unicode_type # Utilities {{{ imageops, imageops_err = plugins['imageops'] @@ -465,11 +465,11 @@ def run_optimizer(file_path, cmd, as_filter=False, input_data=None): cmd[cmd.index(q)] = r if not as_filter: repl(True, iname), repl(False, oname) - if iswindows: + if iswindows and not ispy3: # subprocess in python 2 cannot handle unicode strings that are not # encodeable in mbcs, so we fail here, where it is more explicit, # instead. - cmd = [x.encode('mbcs') if isinstance(x, type('')) else x for x in cmd] + cmd = [x.encode('mbcs') if isinstance(x, unicode_type) else x for x in cmd] if isinstance(cwd, type('')): cwd = cwd.encode('mbcs') stdin = subprocess.PIPE if as_filter else None @@ -534,7 +534,7 @@ def encode_jpeg(file_path, quality=80): from calibre.utils.speedups import ReadOnlyFileBuffer quality = max(0, min(100, int(quality))) exe = get_exe_path('cjpeg') - cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [str(quality)] + cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [unicode_type(quality)] img = QImage() if not img.load(file_path): raise ValueError('%s is not a valid image file' % file_path) diff --git a/src/calibre/utils/matcher.py b/src/calibre/utils/matcher.py index 5129ad93de..b06eb67ca2 100644 --- a/src/calibre/utils/matcher.py +++ b/src/calibre/utils/matcher.py @@ -16,7 +16,7 @@ from itertools import islice from calibre import detect_ncpus as cpu_count, as_unicode from calibre.constants import plugins, filesystem_encoding from calibre.utils.icu import primary_sort_key, primary_find, primary_collator -from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd +from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd, unicode_type from polyglot.queue import Queue DEFAULT_LEVEL1 = '/' @@ -294,12 +294,12 @@ def test(return_tests=False): start = memory() for i in range(10): - doit(str(i)) + doit(unicode_type(i)) gc.collect() used10 = memory() - start start = memory() for i in range(100): - doit(str(i)) + doit(unicode_type(i)) gc.collect() used100 = memory() - start if used100 > 0 and used10 > 0: diff --git a/src/calibre/utils/monotonic.py b/src/calibre/utils/monotonic.py index 6bf33e46d8..9d5840daff 100644 --- a/src/calibre/utils/monotonic.py +++ b/src/calibre/utils/monotonic.py @@ -1,4 +1,5 @@ # vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals try: from time import monotonic diff --git a/src/calibre/utils/serve_coffee.py b/src/calibre/utils/serve_coffee.py index 6d665b17e6..bb1da0f202 100644 --- a/src/calibre/utils/serve_coffee.py +++ b/src/calibre/utils/serve_coffee.py @@ -16,7 +16,7 @@ from threading import Lock, local from polyglot import socketserver from polyglot.http_server import HTTPServer, SimpleHTTPRequestHandler -from polyglot.builtins import error_message, getcwd +from polyglot.builtins import error_message, getcwd, unicode_type # Compiler {{{ @@ -107,9 +107,9 @@ class HTTPRequestHandler(SimpleHTTPRequestHandler): # {{{ self.send_response(rtype) self.send_header("Accept-Ranges", "bytes") self.send_header("Content-Range", 'bytes ' + - str(start_range) + '-' + str(end_range - 1) + '/' + str(size)) - self.send_header("Content-Type", str(mimetype)) - self.send_header("Content-Length", str(end_range - start_range)) + unicode_type(start_range) + '-' + unicode_type(end_range - 1) + '/' + unicode_type(size)) + self.send_header("Content-Type", unicode_type(mimetype)) + self.send_header("Content-Length", unicode_type(end_range - start_range)) self.send_header("Last-Modified", self.date_time_string(int(mtime))) self.end_headers() return f, start_range, end_range diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py index 24f7514766..7f44d06a7e 100644 --- a/src/calibre/utils/smartypants.py +++ b/src/calibre/utils/smartypants.py @@ -1,6 +1,8 @@ #!/usr/bin/python2 # vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + __author__ = "Chad Miller , Kovid Goyal " __description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom" @@ -525,7 +527,7 @@ def smartyPants(text, attr='1'): return "".join(result) -def educateQuotes(str): +def educateQuotes(text): """ Parameter: String. @@ -539,32 +541,32 @@ def educateQuotes(str): # Special case if the very first character is a quote # followed by punctuation at a non-word-break. Close the quotes by brute force: - str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""’""", str) - str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""”""", str) + text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""’""", text) + text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""”""", text) # Special case for double sets of quotes, e.g.: #

He said, "'Quoted' words in a larger quote."

- str = re.sub(r""""'(?=\w)""", """“‘""", str) - str = re.sub(r"""'"(?=\w)""", """‘“""", str) - str = re.sub(r'''""(?=\w)''', """““""", str) - str = re.sub(r"""''(?=\w)""", """‘‘""", str) - str = re.sub(r'''\"\'''', """”’""", str) - str = re.sub(r'''\'\"''', """’”""", str) - str = re.sub(r'''""''', """””""", str) - str = re.sub(r"""''""", """’’""", str) + text = re.sub(r""""'(?=\w)""", """“‘""", text) + text = re.sub(r"""'"(?=\w)""", """‘“""", text) + text = re.sub(r'''""(?=\w)''', """““""", text) + text = re.sub(r"""''(?=\w)""", """‘‘""", text) + text = re.sub(r'''\"\'''', """”’""", text) + text = re.sub(r'''\'\"''', """’”""", text) + text = re.sub(r'''""''', """””""", text) + text = re.sub(r"""''""", """’’""", text) # Special case for decade abbreviations (the '80s --> ’80s): # See http://practicaltypography.com/apostrophes.html - str = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1’""", str) + text = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1’""", text) # Measurements in feet and inches or longitude/latitude: 19' 43.5" --> 19′ 43.5″ - str = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2′\3″', str) + text = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2′\3″', text) # Special case for Quotes at inside of other entities, e.g.: #

A double quote--"within dashes"--would be nice.

- str = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", str) - str = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", str) - str = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", str) - str = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", str) + text = re.sub(r"""(?<=\W)"(?=\w)""", r"""“""", text) + text = re.sub(r"""(?<=\W)'(?=\w)""", r"""‘""", text) + text = re.sub(r"""(?<=\w)"(?=\W)""", r"""”""", text) + text = re.sub(r"""(?<=\w)'(?=\W)""", r"""’""", text) # The following are commented out as smartypants tokenizes text by # stripping out html tags. Therefore, there is no guarantee that the @@ -572,12 +574,12 @@ def educateQuotes(str): # meaningful # Special case for Quotes at end of line with a preceeding space (may change just to end of line) - # str = re.sub(r"""(?<=\s)"$""", r"""”""", str) - # str = re.sub(r"""(?<=\s)'$""", r"""’""", str) + # text = re.sub(r"""(?<=\s)"$""", r"""”""", text) + # text = re.sub(r"""(?<=\s)'$""", r"""’""", text) # Special case for Quotes at beginning of line with a space - multiparagraph quoted text: - # str = re.sub(r"""^"(?=\s)""", r"""“""", str) - # str = re.sub(r"""^'(?=\s)""", r"""‘""", str) + # text = re.sub(r"""^"(?=\s)""", r"""“""", text) + # text = re.sub(r"""^'(?=\s)""", r"""‘""", text) close_class = r"""[^\ \t\r\n\[\{\(\-]""" dec_dashes = r"""–|—""" @@ -595,24 +597,24 @@ def educateQuotes(str): ' # the quote (?=\w) # followed by a word character """ % (dec_dashes,), re.VERBOSE) - str = opening_single_quotes_regex.sub(r"""\1‘""", str) + text = opening_single_quotes_regex.sub(r"""\1‘""", text) closing_single_quotes_regex = re.compile(r""" (%s) ' (?!\s | s\b | \d) """ % (close_class,), re.VERBOSE) - str = closing_single_quotes_regex.sub(r"""\1’""", str) + text = closing_single_quotes_regex.sub(r"""\1’""", text) closing_single_quotes_regex = re.compile(r""" (%s) ' (\s | s\b) """ % (close_class,), re.VERBOSE) - str = closing_single_quotes_regex.sub(r"""\1’\2""", str) + text = closing_single_quotes_regex.sub(r"""\1’\2""", text) # Any remaining single quotes should be opening ones: - str = re.sub(r"""'""", r"""‘""", str) + text = re.sub(r"""'""", r"""‘""", text) # Get most opening double quotes: opening_double_quotes_regex = re.compile(r""" @@ -627,7 +629,7 @@ def educateQuotes(str): " # the quote (?=\w) # followed by a word character """ % (dec_dashes,), re.VERBOSE) - str = opening_double_quotes_regex.sub(r"""\1“""", str) + text = opening_double_quotes_regex.sub(r"""\1“""", text) # Double closing quotes: closing_double_quotes_regex = re.compile(r""" @@ -635,25 +637,25 @@ def educateQuotes(str): " (?=\s) """ % (close_class,), re.VERBOSE) - str = closing_double_quotes_regex.sub(r"""”""", str) + text = closing_double_quotes_regex.sub(r"""”""", text) closing_double_quotes_regex = re.compile(r""" (%s) # character that indicates the quote should be closing " """ % (close_class,), re.VERBOSE) - str = closing_double_quotes_regex.sub(r"""\1”""", str) + text = closing_double_quotes_regex.sub(r"""\1”""", text) - if str.endswith('-"'): + if text.endswith('-"'): # A string that endswith -" is sometimes used for dialogue - str = str[:-1] + '”' + text = text[:-1] + '”' # Any remaining quotes should be opening ones. - str = re.sub(r'"', r"""“""", str) + text = re.sub(r'"', r"""“""", text) - return str + return text -def educateBackticks(str): +def educateBackticks(text): """ Parameter: String. Returns: The string, with ``backticks'' -style double quotes @@ -662,12 +664,12 @@ def educateBackticks(str): Example output: “Isn't this fun?” """ - str = re.sub(r"""``""", r"""“""", str) - str = re.sub(r"""''""", r"""”""", str) - return str + text = re.sub(r"""``""", r"""“""", text) + text = re.sub(r"""''""", r"""”""", text) + return text -def educateSingleBackticks(str): +def educateSingleBackticks(text): """ Parameter: String. Returns: The string, with `backticks' -style single quotes @@ -677,12 +679,12 @@ def educateSingleBackticks(str): Example output: ‘Isn’t this fun?’ """ - str = re.sub(r"""`""", r"""‘""", str) - str = re.sub(r"""'""", r"""’""", str) - return str + text = re.sub(r"""`""", r"""‘""", text) + text = re.sub(r"""'""", r"""’""", text) + return text -def educateDashes(str): +def educateDashes(text): """ Parameter: String. @@ -690,12 +692,12 @@ def educateDashes(str): an em-dash HTML entity. """ - str = re.sub(r"""---""", r"""–""", str) # en (yes, backwards) - str = re.sub(r"""--""", r"""—""", str) # em (yes, backwards) - return str + text = re.sub(r"""---""", r"""–""", text) # en (yes, backwards) + text = re.sub(r"""--""", r"""—""", text) # em (yes, backwards) + return text -def educateDashesOldSchool(str): +def educateDashesOldSchool(text): """ Parameter: String. @@ -704,12 +706,12 @@ def educateDashesOldSchool(str): an em-dash HTML entity. """ - str = re.sub(r"""---""", r"""—""", str) # em (yes, backwards) - str = re.sub(r"""--""", r"""–""", str) # en (yes, backwards) - return str + text = re.sub(r"""---""", r"""—""", text) # em (yes, backwards) + text = re.sub(r"""--""", r"""–""", text) # en (yes, backwards) + return text -def educateDashesOldSchoolInverted(str): +def educateDashesOldSchoolInverted(text): """ Parameter: String. @@ -724,12 +726,12 @@ def educateDashesOldSchoolInverted(str): the shortcut should be shorter to type. (Thanks to Aaron Swartz for the idea.) """ - str = re.sub(r"""---""", r"""–""", str) # em - str = re.sub(r"""--""", r"""—""", str) # en - return str + text = re.sub(r"""---""", r"""–""", text) # em + text = re.sub(r"""--""", r"""—""", text) # en + return text -def educateEllipses(str): +def educateEllipses(text): """ Parameter: String. Returns: The string, with each instance of "..." translated to @@ -739,12 +741,12 @@ def educateEllipses(str): Example output: Huh…? """ - str = re.sub(r"""\.\.\.""", r"""…""", str) - str = re.sub(r"""\. \. \.""", r"""…""", str) - return str + text = re.sub(r"""\.\.\.""", r"""…""", text) + text = re.sub(r"""\. \. \.""", r"""…""", text) + return text -def stupefyEntities(str): +def stupefyEntities(text): """ Parameter: String. Returns: The string, with each SmartyPants HTML entity translated to @@ -754,21 +756,21 @@ def stupefyEntities(str): Example output: "Hello -- world." """ - str = re.sub(r"""–""", r"""-""", str) # en-dash - str = re.sub(r"""—""", r"""--""", str) # em-dash + text = re.sub(r"""–""", r"""-""", text) # en-dash + text = re.sub(r"""—""", r"""--""", text) # em-dash - str = re.sub(r"""‘""", r"""'""", str) # open single quote - str = re.sub(r"""’""", r"""'""", str) # close single quote + text = re.sub(r"""‘""", r"""'""", text) # open single quote + text = re.sub(r"""’""", r"""'""", text) # close single quote - str = re.sub(r"""“""", r'''"''', str) # open double quote - str = re.sub(r"""”""", r'''"''', str) # close double quote + text = re.sub(r"""“""", r'''"''', text) # open double quote + text = re.sub(r"""”""", r'''"''', text) # close double quote - str = re.sub(r"""…""", r"""...""", str) # ellipsis + text = re.sub(r"""…""", r"""...""", text) # ellipsis - return str + return text -def processEscapes(str): +def processEscapes(text): r""" Parameter: String. Returns: The string, with after processing the following backslash @@ -784,17 +786,17 @@ def processEscapes(str): \- - \` ` """ - str = re.sub(r"""\\\\""", r"""\""", str) - str = re.sub(r'''\\"''', r""""""", str) - str = re.sub(r"""\\'""", r"""'""", str) - str = re.sub(r"""\\\.""", r""".""", str) - str = re.sub(r"""\\-""", r"""-""", str) - str = re.sub(r"""\\`""", r"""`""", str) + text = re.sub(r"""\\\\""", r"""\""", text) + text = re.sub(r'''\\"''', r""""""", text) + text = re.sub(r"""\\'""", r"""'""", text) + text = re.sub(r"""\\\.""", r""".""", text) + text = re.sub(r"""\\-""", r"""-""", text) + text = re.sub(r"""\\`""", r"""`""", text) - return str + return text -def _tokenize(str): +def _tokenize(html): """ Parameter: String containing HTML markup. Returns: Reference to an array of the tokens comprising the input @@ -817,7 +819,7 @@ def _tokenize(str): # %s # nested tags """ % (nested_tags,) tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") - token_match = tag_soup.search(str) + token_match = tag_soup.search(html) previous_end = 0 while token_match is not None: @@ -827,10 +829,10 @@ def _tokenize(str): tokens.append(['tag', token_match.group(2)]) previous_end = token_match.end() - token_match = tag_soup.search(str, token_match.end()) + token_match = tag_soup.search(html, token_match.end()) - if previous_end < len(str): - tokens.append(['text', str[previous_end:]]) + if previous_end < len(html): + tokens.append(['text', html[previous_end:]]) return tokens diff --git a/src/calibre/utils/text2int.py b/src/calibre/utils/text2int.py index cf40f49c84..c9a88946c1 100644 --- a/src/calibre/utils/text2int.py +++ b/src/calibre/utils/text2int.py @@ -1,4 +1,7 @@ #!/usr/bin/env python2 + +from __future__ import absolute_import, division, print_function, unicode_literals + __author__ = "stackoverflow community" __docformat__ = 'restructuredtext en' """ diff --git a/src/calibre/utils/wordcount.py b/src/calibre/utils/wordcount.py index e711061b14..86ba2874a8 100644 --- a/src/calibre/utils/wordcount.py +++ b/src/calibre/utils/wordcount.py @@ -1,5 +1,8 @@ #!/usr/bin/python2 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +from __future__ import absolute_import, division, print_function, unicode_literals + """ Get word, character, and Asian character counts @@ -44,7 +47,7 @@ def filter_jchars(c): def nonj_len(word): - u"""Returns number of non-Asian words in {word} + """Returns number of non-Asian words in {word} - 日本語AアジアンB -> 2 - hello -> 1 @param word: A word, possibly containing Asian characters @@ -56,7 +59,7 @@ def nonj_len(word): # -> ['spam', 'eggs'] # The length of which is 2! chars = [filter_jchars(c) for c in word] - return len(u''.join(chars).split()) + return len(''.join(chars).split()) def get_wordcount(text): @@ -66,8 +69,8 @@ def get_wordcount(text): """ characters = len(text) - chars_no_spaces = sum([not x.isspace() for x in text]) - asian_chars = sum([is_asian(x) for x in text]) + chars_no_spaces = sum(not x.isspace() for x in text) + asian_chars = sum(is_asian(x) for x in text) non_asian_words = nonj_len(text) words = non_asian_words + asian_chars