py3: More unicode porting

This commit is contained in:
Kovid Goyal 2019-05-29 17:55:26 +05:30
parent 06dc7dd15b
commit 4730fce41b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 106 additions and 96 deletions

View File

@ -1,3 +1,4 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Created on 23 Sep 2010
@ -23,7 +24,7 @@ class _Parser(object):
LEX_NUM = 4
LEX_EOF = 5
LEX_CONSTANTS = frozenset([LEX_STR, LEX_NUM])
LEX_CONSTANTS = frozenset((LEX_STR, LEX_NUM))
def __init__(self, val, prog, funcs, parent):
self.lex_pos = 0
@ -205,7 +206,7 @@ class TemplateFormatter(string.Formatter):
elif 'bcdoxXn'.find(typ) >= 0:
try:
val = int(val)
except:
except Exception:
raise ValueError(
_('format: type {0} requires an integer value, got {1}').format(typ, val))
elif 'eEfFgGn%'.find(typ) >= 0:

View File

@ -18,12 +18,12 @@ from PyQt5.QtCore import QBuffer, QByteArray, Qt
from PyQt5.QtGui import QColor, QImage, QImageReader, QImageWriter, QPixmap, QTransform
from calibre import fit_image, force_unicode
from calibre.constants import iswindows, plugins
from calibre.constants import iswindows, plugins, ispy3
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.config_base import tweaks
from calibre.utils.filenames import atomic_rename
from calibre.utils.imghdr import what
from polyglot.builtins import string_or_bytes
from polyglot.builtins import string_or_bytes, unicode_type
# Utilities {{{
imageops, imageops_err = plugins['imageops']
@ -465,11 +465,11 @@ def run_optimizer(file_path, cmd, as_filter=False, input_data=None):
cmd[cmd.index(q)] = r
if not as_filter:
repl(True, iname), repl(False, oname)
if iswindows:
if iswindows and not ispy3:
# subprocess in python 2 cannot handle unicode strings that are not
# encodable in mbcs, so we fail here, where it is more explicit,
# instead.
cmd = [x.encode('mbcs') if isinstance(x, type('')) else x for x in cmd]
cmd = [x.encode('mbcs') if isinstance(x, unicode_type) else x for x in cmd]
if isinstance(cwd, type('')):
cwd = cwd.encode('mbcs')
stdin = subprocess.PIPE if as_filter else None
@ -534,7 +534,7 @@ def encode_jpeg(file_path, quality=80):
from calibre.utils.speedups import ReadOnlyFileBuffer
quality = max(0, min(100, int(quality)))
exe = get_exe_path('cjpeg')
cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [str(quality)]
cmd = [exe] + '-optimize -progressive -maxmemory 100M -quality'.split() + [unicode_type(quality)]
img = QImage()
if not img.load(file_path):
raise ValueError('%s is not a valid image file' % file_path)

View File

@ -16,7 +16,7 @@ from itertools import islice
from calibre import detect_ncpus as cpu_count, as_unicode
from calibre.constants import plugins, filesystem_encoding
from calibre.utils.icu import primary_sort_key, primary_find, primary_collator
from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd
from polyglot.builtins import iteritems, itervalues, map, unicode_type, range, zip, raw_input, filter, getcwd, unicode_type
from polyglot.queue import Queue
DEFAULT_LEVEL1 = '/'
@ -294,12 +294,12 @@ def test(return_tests=False):
start = memory()
for i in range(10):
doit(str(i))
doit(unicode_type(i))
gc.collect()
used10 = memory() - start
start = memory()
for i in range(100):
doit(str(i))
doit(unicode_type(i))
gc.collect()
used100 = memory() - start
if used100 > 0 and used10 > 0:

View File

@ -1,4 +1,5 @@
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from time import monotonic

View File

@ -16,7 +16,7 @@ from threading import Lock, local
from polyglot import socketserver
from polyglot.http_server import HTTPServer, SimpleHTTPRequestHandler
from polyglot.builtins import error_message, getcwd
from polyglot.builtins import error_message, getcwd, unicode_type
# Compiler {{{
@ -107,9 +107,9 @@ class HTTPRequestHandler(SimpleHTTPRequestHandler): # {{{
self.send_response(rtype)
self.send_header("Accept-Ranges", "bytes")
self.send_header("Content-Range", 'bytes ' +
str(start_range) + '-' + str(end_range - 1) + '/' + str(size))
self.send_header("Content-Type", str(mimetype))
self.send_header("Content-Length", str(end_range - start_range))
unicode_type(start_range) + '-' + unicode_type(end_range - 1) + '/' + unicode_type(size))
self.send_header("Content-Type", unicode_type(mimetype))
self.send_header("Content-Length", unicode_type(end_range - start_range))
self.send_header("Last-Modified", self.date_time_string(int(mtime)))
self.end_headers()
return f, start_range, end_range

View File

@ -1,6 +1,8 @@
#!/usr/bin/python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__author__ = "Chad Miller <smartypantspy@chad.org>, Kovid Goyal <kovid at kovidgoyal.net>"
__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"
@ -525,7 +527,7 @@ def smartyPants(text, attr='1'):
return "".join(result)
def educateQuotes(str):
def educateQuotes(text):
"""
Parameter: String.
@ -539,32 +541,32 @@ def educateQuotes(str):
# Special case if the very first character is a quote
# followed by punctuation at a non-word-break. Close the quotes by brute force:
str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""&#8217;""", str)
str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""&#8221;""", str)
text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""&#8217;""", text)
text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""&#8221;""", text)
# Special case for double sets of quotes, e.g.:
# <p>He said, "'Quoted' words in a larger quote."</p>
str = re.sub(r""""'(?=\w)""", """&#8220;&#8216;""", str)
str = re.sub(r"""'"(?=\w)""", """&#8216;&#8220;""", str)
str = re.sub(r'''""(?=\w)''', """&#8220;&#8220;""", str)
str = re.sub(r"""''(?=\w)""", """&#8216;&#8216;""", str)
str = re.sub(r'''\"\'''', """&#8221;&#8217;""", str)
str = re.sub(r'''\'\"''', """&#8217;&#8221;""", str)
str = re.sub(r'''""''', """&#8221;&#8221;""", str)
str = re.sub(r"""''""", """&#8217;&#8217;""", str)
text = re.sub(r""""'(?=\w)""", """&#8220;&#8216;""", text)
text = re.sub(r"""'"(?=\w)""", """&#8216;&#8220;""", text)
text = re.sub(r'''""(?=\w)''', """&#8220;&#8220;""", text)
text = re.sub(r"""''(?=\w)""", """&#8216;&#8216;""", text)
text = re.sub(r'''\"\'''', """&#8221;&#8217;""", text)
text = re.sub(r'''\'\"''', """&#8217;&#8221;""", text)
text = re.sub(r'''""''', """&#8221;&#8221;""", text)
text = re.sub(r"""''""", """&#8217;&#8217;""", text)
# Special case for decade abbreviations (the '80s --> 80s):
# See http://practicaltypography.com/apostrophes.html
str = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1&#8217;""", str)
text = re.sub(r"""(\W|^)'(?=\d{2}s)""", r"""\1&#8217;""", text)
# Measurements in feet and inches or longitude/latitude: 19' 43.5" --> 19′ 43.5″
str = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2&#8242;\3&#8243;', str)
text = re.sub(r'''(\W|^)([-0-9.]+\s*)'(\s*[-0-9.]+)"''', r'\1\2&#8242;\3&#8243;', text)
# Special case for Quotes at inside of other entities, e.g.:
# <p>A double quote--"within dashes"--would be nice.</p>
str = re.sub(r"""(?<=\W)"(?=\w)""", r"""&#8220;""", str)
str = re.sub(r"""(?<=\W)'(?=\w)""", r"""&#8216;""", str)
str = re.sub(r"""(?<=\w)"(?=\W)""", r"""&#8221;""", str)
str = re.sub(r"""(?<=\w)'(?=\W)""", r"""&#8217;""", str)
text = re.sub(r"""(?<=\W)"(?=\w)""", r"""&#8220;""", text)
text = re.sub(r"""(?<=\W)'(?=\w)""", r"""&#8216;""", text)
text = re.sub(r"""(?<=\w)"(?=\W)""", r"""&#8221;""", text)
text = re.sub(r"""(?<=\w)'(?=\W)""", r"""&#8217;""", text)
# The following are commented out as smartypants tokenizes text by
# stripping out html tags. Therefore, there is no guarantee that the
@ -572,12 +574,12 @@ def educateQuotes(str):
# meaningful
# Special case for Quotes at end of line with a preceding space (may change just to end of line)
# str = re.sub(r"""(?<=\s)"$""", r"""&#8221;""", str)
# str = re.sub(r"""(?<=\s)'$""", r"""&#8217;""", str)
# text = re.sub(r"""(?<=\s)"$""", r"""&#8221;""", text)
# text = re.sub(r"""(?<=\s)'$""", r"""&#8217;""", text)
# Special case for Quotes at beginning of line with a space - multiparagraph quoted text:
# str = re.sub(r"""^"(?=\s)""", r"""&#8220;""", str)
# str = re.sub(r"""^'(?=\s)""", r"""&#8216;""", str)
# text = re.sub(r"""^"(?=\s)""", r"""&#8220;""", text)
# text = re.sub(r"""^'(?=\s)""", r"""&#8216;""", text)
close_class = r"""[^\ \t\r\n\[\{\(\-]"""
dec_dashes = r"""&#8211;|&#8212;"""
@ -595,24 +597,24 @@ def educateQuotes(str):
' # the quote
(?=\w) # followed by a word character
""" % (dec_dashes,), re.VERBOSE)
str = opening_single_quotes_regex.sub(r"""\1&#8216;""", str)
text = opening_single_quotes_regex.sub(r"""\1&#8216;""", text)
closing_single_quotes_regex = re.compile(r"""
(%s)
'
(?!\s | s\b | \d)
""" % (close_class,), re.VERBOSE)
str = closing_single_quotes_regex.sub(r"""\1&#8217;""", str)
text = closing_single_quotes_regex.sub(r"""\1&#8217;""", text)
closing_single_quotes_regex = re.compile(r"""
(%s)
'
(\s | s\b)
""" % (close_class,), re.VERBOSE)
str = closing_single_quotes_regex.sub(r"""\1&#8217;\2""", str)
text = closing_single_quotes_regex.sub(r"""\1&#8217;\2""", text)
# Any remaining single quotes should be opening ones:
str = re.sub(r"""'""", r"""&#8216;""", str)
text = re.sub(r"""'""", r"""&#8216;""", text)
# Get most opening double quotes:
opening_double_quotes_regex = re.compile(r"""
@ -627,7 +629,7 @@ def educateQuotes(str):
" # the quote
(?=\w) # followed by a word character
""" % (dec_dashes,), re.VERBOSE)
str = opening_double_quotes_regex.sub(r"""\1&#8220;""", str)
text = opening_double_quotes_regex.sub(r"""\1&#8220;""", text)
# Double closing quotes:
closing_double_quotes_regex = re.compile(r"""
@ -635,25 +637,25 @@ def educateQuotes(str):
"
(?=\s)
""" % (close_class,), re.VERBOSE)
str = closing_double_quotes_regex.sub(r"""&#8221;""", str)
text = closing_double_quotes_regex.sub(r"""&#8221;""", text)
closing_double_quotes_regex = re.compile(r"""
(%s) # character that indicates the quote should be closing
"
""" % (close_class,), re.VERBOSE)
str = closing_double_quotes_regex.sub(r"""\1&#8221;""", str)
text = closing_double_quotes_regex.sub(r"""\1&#8221;""", text)
if str.endswith('-"'):
if text.endswith('-"'):
# A string that endswith -" is sometimes used for dialogue
str = str[:-1] + '&#8221;'
text = text[:-1] + '&#8221;'
# Any remaining quotes should be opening ones.
str = re.sub(r'"', r"""&#8220;""", str)
text = re.sub(r'"', r"""&#8220;""", text)
return str
return text
def educateBackticks(str):
def educateBackticks(text):
"""
Parameter: String.
Returns: The string, with ``backticks'' -style double quotes
@ -662,12 +664,12 @@ def educateBackticks(str):
Example output: &#8220;Isn't this fun?&#8221;
"""
str = re.sub(r"""``""", r"""&#8220;""", str)
str = re.sub(r"""''""", r"""&#8221;""", str)
return str
text = re.sub(r"""``""", r"""&#8220;""", text)
text = re.sub(r"""''""", r"""&#8221;""", text)
return text
def educateSingleBackticks(str):
def educateSingleBackticks(text):
"""
Parameter: String.
Returns: The string, with `backticks' -style single quotes
@ -677,12 +679,12 @@ def educateSingleBackticks(str):
Example output: &#8216;Isn&#8217;t this fun?&#8217;
"""
str = re.sub(r"""`""", r"""&#8216;""", str)
str = re.sub(r"""'""", r"""&#8217;""", str)
return str
text = re.sub(r"""`""", r"""&#8216;""", text)
text = re.sub(r"""'""", r"""&#8217;""", text)
return text
def educateDashes(str):
def educateDashes(text):
"""
Parameter: String.
@ -690,12 +692,12 @@ def educateDashes(str):
an em-dash HTML entity.
"""
str = re.sub(r"""---""", r"""&#8211;""", str) # en (yes, backwards)
str = re.sub(r"""--""", r"""&#8212;""", str) # em (yes, backwards)
return str
text = re.sub(r"""---""", r"""&#8211;""", text) # en (yes, backwards)
text = re.sub(r"""--""", r"""&#8212;""", text) # em (yes, backwards)
return text
def educateDashesOldSchool(str):
def educateDashesOldSchool(text):
"""
Parameter: String.
@ -704,12 +706,12 @@ def educateDashesOldSchool(str):
an em-dash HTML entity.
"""
str = re.sub(r"""---""", r"""&#8212;""", str) # em (yes, backwards)
str = re.sub(r"""--""", r"""&#8211;""", str) # en (yes, backwards)
return str
text = re.sub(r"""---""", r"""&#8212;""", text) # em (yes, backwards)
text = re.sub(r"""--""", r"""&#8211;""", text) # en (yes, backwards)
return text
def educateDashesOldSchoolInverted(str):
def educateDashesOldSchoolInverted(text):
"""
Parameter: String.
@ -724,12 +726,12 @@ def educateDashesOldSchoolInverted(str):
the shortcut should be shorter to type. (Thanks to Aaron
Swartz for the idea.)
"""
str = re.sub(r"""---""", r"""&#8211;""", str) # em
str = re.sub(r"""--""", r"""&#8212;""", str) # en
return str
text = re.sub(r"""---""", r"""&#8211;""", text) # em
text = re.sub(r"""--""", r"""&#8212;""", text) # en
return text
def educateEllipses(str):
def educateEllipses(text):
"""
Parameter: String.
Returns: The string, with each instance of "..." translated to
@ -739,12 +741,12 @@ def educateEllipses(str):
Example output: Huh&#8230;?
"""
str = re.sub(r"""\.\.\.""", r"""&#8230;""", str)
str = re.sub(r"""\. \. \.""", r"""&#8230;""", str)
return str
text = re.sub(r"""\.\.\.""", r"""&#8230;""", text)
text = re.sub(r"""\. \. \.""", r"""&#8230;""", text)
return text
def stupefyEntities(str):
def stupefyEntities(text):
"""
Parameter: String.
Returns: The string, with each SmartyPants HTML entity translated to
@ -754,21 +756,21 @@ def stupefyEntities(str):
Example output: "Hello -- world."
"""
str = re.sub(r"""&#8211;""", r"""-""", str) # en-dash
str = re.sub(r"""&#8212;""", r"""--""", str) # em-dash
text = re.sub(r"""&#8211;""", r"""-""", text) # en-dash
text = re.sub(r"""&#8212;""", r"""--""", text) # em-dash
str = re.sub(r"""&#8216;""", r"""'""", str) # open single quote
str = re.sub(r"""&#8217;""", r"""'""", str) # close single quote
text = re.sub(r"""&#8216;""", r"""'""", text) # open single quote
text = re.sub(r"""&#8217;""", r"""'""", text) # close single quote
str = re.sub(r"""&#8220;""", r'''"''', str) # open double quote
str = re.sub(r"""&#8221;""", r'''"''', str) # close double quote
text = re.sub(r"""&#8220;""", r'''"''', text) # open double quote
text = re.sub(r"""&#8221;""", r'''"''', text) # close double quote
str = re.sub(r"""&#8230;""", r"""...""", str) # ellipsis
text = re.sub(r"""&#8230;""", r"""...""", text) # ellipsis
return str
return text
def processEscapes(str):
def processEscapes(text):
r"""
Parameter: String.
Returns: The string, after processing the following backslash
@ -784,17 +786,17 @@ def processEscapes(str):
\- &#45;
\` &#96;
"""
str = re.sub(r"""\\\\""", r"""&#92;""", str)
str = re.sub(r'''\\"''', r"""&#34;""", str)
str = re.sub(r"""\\'""", r"""&#39;""", str)
str = re.sub(r"""\\\.""", r"""&#46;""", str)
str = re.sub(r"""\\-""", r"""&#45;""", str)
str = re.sub(r"""\\`""", r"""&#96;""", str)
text = re.sub(r"""\\\\""", r"""&#92;""", text)
text = re.sub(r'''\\"''', r"""&#34;""", text)
text = re.sub(r"""\\'""", r"""&#39;""", text)
text = re.sub(r"""\\\.""", r"""&#46;""", text)
text = re.sub(r"""\\-""", r"""&#45;""", text)
text = re.sub(r"""\\`""", r"""&#96;""", text)
return str
return text
def _tokenize(str):
def _tokenize(html):
"""
Parameter: String containing HTML markup.
Returns: Reference to an array of the tokens comprising the input
@ -817,7 +819,7 @@ def _tokenize(str):
# %s # nested tags """ % (nested_tags,)
tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")
token_match = tag_soup.search(str)
token_match = tag_soup.search(html)
previous_end = 0
while token_match is not None:
@ -827,10 +829,10 @@ def _tokenize(str):
tokens.append(['tag', token_match.group(2)])
previous_end = token_match.end()
token_match = tag_soup.search(str, token_match.end())
token_match = tag_soup.search(html, token_match.end())
if previous_end < len(str):
tokens.append(['text', str[previous_end:]])
if previous_end < len(html):
tokens.append(['text', html[previous_end:]])
return tokens

View File

@ -1,4 +1,7 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
__author__ = "stackoverflow community"
__docformat__ = 'restructuredtext en'
"""

View File

@ -1,5 +1,8 @@
#!/usr/bin/python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
"""
Get word, character, and Asian character counts
@ -44,7 +47,7 @@ def filter_jchars(c):
def nonj_len(word):
u"""Returns number of non-Asian words in {word}
"""Returns number of non-Asian words in {word}
- 日本語AアジアンB -> 2
- hello -> 1
@param word: A word, possibly containing Asian characters
@ -56,7 +59,7 @@ def nonj_len(word):
# -> ['spam', 'eggs']
# The length of which is 2!
chars = [filter_jchars(c) for c in word]
return len(u''.join(chars).split())
return len(''.join(chars).split())
def get_wordcount(text):
@ -66,8 +69,8 @@ def get_wordcount(text):
"""
characters = len(text)
chars_no_spaces = sum([not x.isspace() for x in text])
asian_chars = sum([is_asian(x) for x in text])
chars_no_spaces = sum(not x.isspace() for x in text)
asian_chars = sum(is_asian(x) for x in text)
non_asian_words = nonj_len(text)
words = non_asian_words + asian_chars