py3: more unicode porting

2025-07-09 03:04:10 -04:00 · 2019-06-01 14:47:05 +05:30 · 2019-06-01 14:47:05 +05:30 · 151e736538
commit 151e736538
parent 20b065fb49
4 changed files with 62 additions and 48 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -1,3 +1,4 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
 '''
 Basic support for manipulating OEB 1.x/2.0 content and metadata.
 '''
@ -22,7 +23,7 @@ from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
        namespace, XHTML, parse_html, NotHTML)
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.short_uuid import uuid4
-from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter
+from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
 from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
 from calibre.utils.icu import numeric_sort_key

@ -148,7 +149,7 @@ def close_self_closing_tags(raw):


 def uuid_id():
-    return u'u'+uuid4()
+    return 'u' + uuid4()


 def itercsslinks(raw):
@ -169,12 +170,12 @@ def iterlinks(root, find_links_in_css=True):
    '''
    assert etree.iselement(root)

-    for el in root.iter():
-        attribs = el.attrib
+    for el in root.iter('*'):
        try:
            tag = barename(el.tag).lower()
        except Exception:
            continue
+        attribs = el.attrib

        if tag == 'object':
            codebase = None
@ -323,7 +324,7 @@ PNG_MIME       = types_map['.png']
 SVG_MIME       = types_map['.svg']
 BINARY_MIME    = 'application/octet-stream'

-XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS
+XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

 OEB_STYLES        = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
 OEB_DOCS          = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
@ -394,7 +395,7 @@ def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
                          pretty_print=pretty_print, with_tail=with_tail)

    if strip_comments:
-        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
+        ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans)

    return ans

@ -432,12 +433,15 @@ def serialize(data, media_type, pretty_print=False):
    return bytes(data)


-ASCII_CHARS   = set(chr(x) for x in range(128))
-UNIBYTE_CHARS = set(chr(x) for x in range(256))
-URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ASCII_CHARS   = frozenset(codepoint_to_chr(x) for x in range(128))
+UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
+USAFE         = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789' '_.-/~')
-URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
+URL_SAFE      = frozenset(USAFE)
+URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
+URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES]
+del USAFE


 def urlquote(href):
@ -445,13 +449,16 @@ def urlquote(href):
    That is, this function returns valid IRIs not valid URIs. In particular,
    IRIs can contain non-ascii characters.  """
    result = []
-    unsafe = 0 if isinstance(href, unicode_type) else 1
-    unsafe = URL_UNSAFE[unsafe]
+    isbytes = isinstance(href, bytes)
+    unsafe = URL_UNSAFE[int(isbytes)]
+    esc, join = "%%%02x", ''
+    if isbytes:
+        esc, join = esc.encode('ascii'), b''
    for char in href:
        if char in unsafe:
-            char = "%%%02x" % ord(char)
+            char = esc % ord(char)
        result.append(char)
-    return ''.join(result)
+    return join.join(result)


 def urlnormalize(href):
@ -852,7 +859,7 @@ class Metadata(object):

    def to_opf1(self, parent=None):
        nsmap = self._opf1_nsmap
-        nsrmap = dict((value, key) for key, value in nsmap.items())
+        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, 'metadata', nsmap=nsmap)
        dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
@ -866,7 +873,7 @@ class Metadata(object):

    def to_opf2(self, parent=None):
        nsmap = self._opf2_nsmap
-        nsrmap = dict((value, key) for key, value in nsmap.items())
+        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, OPF('metadata'), nsmap=nsmap)
        for term in self.items:
            for item in self.items[term]:
@ -935,10 +942,10 @@ class Manifest(object):

        # Parsing {{{
        def _parse_xml(self, data):
+            if not data:
+                return
            data = xml_to_unicode(data, strip_encoding_pats=True,
                    assume_utf8=True, resolve_entities=True)[0]
-            if not data:
-                return None
            return etree.fromstring(data, parser=RECOVER_PARSER)

        def _parse_xhtml(self, data):
@ -956,7 +963,10 @@ class Manifest(object):
            return data

        def _parse_txt(self, data):
-            if '<html>' in data:
+            has_html = '<html>'
+            if isinstance(data, bytes):
+                has_html = has_html.encode('ascii')
+            if has_html in data:
                return self._parse_xhtml(data)

            self.oeb.log.debug('Converting', self.href, '...')
@ -1202,7 +1212,7 @@ class Manifest(object):
            base = id
            index = 1
            while id in self.ids:
-                id = base + str(index)
+                id = base + unicode_type(index)
                index += 1
        if href is not None:
            href = urlnormalize(href)
@ -1210,7 +1220,7 @@ class Manifest(object):
            index = 1
            lhrefs = {x.lower() for x in self.hrefs}
            while href.lower() in lhrefs:
-                href = base + str(index) + ext
+                href = base + unicode_type(index) + ext
                index += 1
        return id, unicode_type(href)

@ -1600,17 +1610,17 @@ class TOC(object):
            return 1

    def get_lines(self, lvl=0):
-        ans = [(u'\t'*lvl) + u'TOC: %s --> %s'%(self.title, self.href)]
+        ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    if ispy3:
        def __str__(self):
-            return u'\n'.join(self.get_lines())
+            return '\n'.join(self.get_lines())
    else:
        def __unicode__(self):
-            return u'\n'.join(self.get_lines())
+            return '\n'.join(self.get_lines())

        def __str__(self):
            return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])
@ -1734,11 +1744,11 @@ class PageList(object):

    def to_ncx(self, parent=None):
        plist = element(parent, NCX('pageList'), id=uuid_id())
-        values = dict((t, count(1)) for t in ('front', 'normal', 'special'))
+        values = {t: count(1) for t in ('front', 'normal', 'special')}
        for page in self.pages:
            id = page.id or uuid_id()
            type = page.type
-            value = str(next(values[type]))
+            value = unicode_type(next(values[type]))
            attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'}
            if page.klass:
                attrib['class'] = page.klass
@ -1831,7 +1841,7 @@ class OEBBook(object):

    def translate(self, text):
        """Translate :param:`text` into the book's primary language."""
-        lang = str(self.metadata.language[0])
+        lang = unicode_type(self.metadata.language[0])
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)

@ -1842,14 +1852,14 @@ class OEBBook(object):
        if isinstance(data, unicode_type):
            return fix_data(data)
        bom_enc = None
-        if data[:4] in ('\0\0\xfe\xff', '\xff\xfe\0\0'):
-            bom_enc = {'\0\0\xfe\xff':'utf-32-be',
-                    '\xff\xfe\0\0':'utf-32-le'}[data[:4]]
+        if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'):
+            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
+                    b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
            data = data[4:]
-        elif data[:2] in ('\xff\xfe', '\xfe\xff'):
-            bom_enc = {'\xff\xfe':'utf-16-le', '\xfe\xff':'utf-16-be'}[data[:2]]
+        elif data[:2] in (b'\xff\xfe', b'\xfe\xff'):
+            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
+        elif data[:3] == b'\xef\xbb\xbf':
            bom_enc = 'utf-8'
            data = data[3:]
        if bom_enc is not None:
--- a/src/calibre/ebooks/pdb/ereader/reader202.py
+++ b/src/calibre/ebooks/pdb/ereader/reader202.py
@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+from __future__ import absolute_import, division, print_function, unicode_literals
+
 '''
 Read content from ereader pdb file with a 116 and 202 byte header created by Makebook.
 '''
@ -57,15 +59,17 @@ class Reader202(FormatReader):

    def decompress_text(self, number):
        from calibre.ebooks.compression.palmdoc import decompress_doc
-        return decompress_doc(''.join([chr(ord(x) ^ 0xA5) for x in self.section_data(number)])).decode('cp1252' if self.encoding is None else self.encoding, 'replace')  # noqa
+        data = bytearray(self.section_data(number))
+        data = bytes(bytearray(x ^ 0xA5 for x in data))
+        return decompress_doc(data).decode(self.encoding or 'cp1252', 'replace')

    def get_image(self, number):
        name = None
        img = None

        data = self.section_data(number)
-        if data.startswith('PNG'):
-            name = data[4:4 + 32].strip('\x00')
+        if data.startswith(b'PNG'):
+            name = data[4:4 + 32].strip(b'\x00')
            img = data[62:]

        return name, img
@ -89,7 +93,7 @@ class Reader202(FormatReader):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

-        pml = u''
+        pml = ''
        for i in range(1, self.header_record.num_text_pages + 1):
            self.log.debug('Extracting text page %i' % i)
            pml += self.get_text_page(i)
@ -98,7 +102,7 @@ class Reader202(FormatReader):
        if not isinstance(title, unicode_type):
            title = title.decode('utf-8', 'replace')

-        html = u'<html><head><title>%s</title></head><body>%s</body></html>' % \
+        html = '<html><head><title>%s</title></head><body>%s</body></html>' % \
            (title, pml_to_html(pml))

        with CurrentDir(output_dir):
--- a/src/calibre/ebooks/pdf/render/fonts.py
+++ b/src/calibre/ebooks/pdf/render/fonts.py
@ -10,7 +10,7 @@ import re
 from itertools import groupby
 from operator import itemgetter
 from collections import Counter, OrderedDict
-from polyglot.builtins import iteritems, map, zip, unicode_type
+from polyglot.builtins import iteritems, map, zip, unicode_type, codepoint_to_chr

 from calibre import as_unicode
 from calibre.ebooks.pdf.render.common import (Array, String, Stream,
@ -123,7 +123,7 @@ class Font(object):
        self.metrics, self.compress = metrics, compress
        self.is_otf = self.metrics.is_otf
        self.subset_tag = unicode_type(
-            re.sub('.', lambda m: chr(int(m.group())+ord('A')), oct(num).replace('o', '')
+            re.sub('.', lambda m: codepoint_to_chr(int(m.group())+ord('A')), oct(num).replace('o', '')
        )).rjust(6, 'A')
        self.font_stream = FontStream(metrics.is_otf, compress=compress)
        try:
--- a/src/calibre/gui2/tweak_book/char_select.py
+++ b/src/calibre/gui2/tweak_book/char_select.py
@ -19,7 +19,7 @@ from calibre.constants import plugins
 from calibre.gui2.widgets2 import HistoryLineEdit2
 from calibre.gui2.tweak_book import tprefs
 from calibre.gui2.tweak_book.widgets import Dialog, BusyCursor
-from calibre.utils.icu import safe_chr as chr
+from calibre.utils.icu import safe_chr as codepoint_to_chr
 from calibre.utils.unicode_names import character_name_from_code, points_for_word
 from polyglot.builtins import unicode_type, range, map

@ -570,7 +570,7 @@ class CharDelegate(QStyledItemDelegate):
        f = option.font
        f.setPixelSize(option.rect.height() - 8)
        painter.setFont(f)
-        painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, chr(charcode))
+        painter.drawText(option.rect, Qt.AlignHCenter | Qt.AlignBottom | Qt.TextSingleLine, codepoint_to_chr(charcode))

    def paint_non_printing(self, painter, option, charcode):
        text = self.np_pat.sub(r'\n\1', non_printing[charcode])
@ -612,7 +612,7 @@ class CharView(QListView):
        except (TypeError, ValueError):
            pass
        else:
-            self.char_selected.emit(chr(char_code))
+            self.char_selected.emit(codepoint_to_chr(char_code))

    def set_allow_drag_and_drop(self, enabled):
        if not enabled:
@ -663,9 +663,9 @@ class CharView(QListView):
                pass
            else:
                m = QMenu(self)
-                m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % chr(char_code), partial(self.copy_to_clipboard, char_code))
+                m.addAction(QIcon(I('edit-copy.png')), _('Copy %s to clipboard') % codepoint_to_chr(char_code), partial(self.copy_to_clipboard, char_code))
                m.addAction(QIcon(I('rating.png')),
-                            (_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % chr(char_code),
+                            (_('Remove %s from favorites') if self.showing_favorites else _('Add %s to favorites')) % codepoint_to_chr(char_code),
                            partial(self.remove_from_favorites, char_code))
                if self.showing_favorites:
                    m.addAction(_('Restore favorites to defaults'), self.restore_defaults)
@ -679,7 +679,7 @@ class CharView(QListView):

    def copy_to_clipboard(self, char_code):
        c = QApplication.clipboard()
-        c.setText(chr(char_code))
+        c.setText(codepoint_to_chr(char_code))

    def remove_from_favorites(self, char_code):
        existing = tprefs['charmap_favorites']