py3: Port basic AZW3 output

2025-07-09 03:04:10 -04:00 · 2019-04-15 15:08:13 +05:30 · 2019-04-15 15:08:13 +05:30 · a07ad9633f
commit a07ad9633f
parent 2a2a08b660
3 changed files with 28 additions and 10 deletions
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -21,6 +21,24 @@ IMAGE_MAX_SIZE = 10 * 1024 * 1024
 RECORD_SIZE = 0x1000  # 4096 (Text record size (uncompressed))


+class PolyglotDict(dict):
+
+    def __setitem__(self, key, val):
+        if isinstance(key, unicode_type):
+            key = key.encode('utf-8')
+        dict.__setitem__(self, key, val)
+
+    def __getitem__(self, key):
+        if isinstance(key, unicode_type):
+            key = key.encode('utf-8')
+        return dict.__getitem__(self, key)
+
+    def __contains__(self, key):
+        if isinstance(key, unicode_type):
+            key = key.encode('utf-8')
+        return dict.__contains__(self, key)
+
+
 def decode_string(raw, codec='utf-8', ordt_map=None):
    length, = struct.unpack(b'>B', raw[0:1])
    raw = raw[1:1+length]
--- a/src/calibre/ebooks/mobi/writer8/exth.py
+++ b/src/calibre/ebooks/mobi/writer8/exth.py
@@ -145,7 +145,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
    if datestr is None:
        raise ValueError("missing date or timestamp")

-    datestr = bytes(datestr)
+    datestr = datestr.encode('utf-8')
    exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
    exth.write(datestr)
    nrecs += 1
@@ -179,7 +179,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
    if thumbnail_offset is not None:
        exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
            thumbnail_offset))
-        thumbnail_uri_str = bytes('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4)))
+        thumbnail_uri_str = ('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4))).encode('utf-8')
        exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'], len(thumbnail_uri_str) + 8))
        exth.write(thumbnail_uri_str)
        nrecs += 2
@@ -217,7 +217,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
        nrecs += 1

    if page_progression_direction in {'rtl', 'ltr', 'default'}:
-        ppd = bytes(page_progression_direction)
+        ppd = page_progression_direction.encode('ascii')
        exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8))
        exth.write(ppd)
        nrecs += 1
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@ -15,7 +15,7 @@ from xml.sax.saxutils import escape
 from lxml import etree

 from calibre.ebooks.oeb.base import XHTML_NS, extract
-from calibre.ebooks.mobi.utils import to_base
+from calibre.ebooks.mobi.utils import to_base, PolyglotDict
 from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr as mychr

 CHUNK_SIZE = 8192
@@ -34,7 +34,7 @@ aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
 'video'}

 _self_closing_pat = re.compile(
-    r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags|{'script',
+    br'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags|{'script',
        'style', 'title', 'head'})).encode('ascii'),
    re.IGNORECASE)

@@ -116,7 +116,7 @@ class Skeleton(object):

    def render(self, root):
        raw = tostring(root, xml_declaration=True)
-        raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
+        raw = raw.replace(b'<html', ('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1)
        raw = close_self_closing_tags(raw)
        return raw

@@ -180,7 +180,7 @@ class Chunker(object):
                    with_tail=True))
                orig_dumps[-1] = close_self_closing_tags(
                        orig_dumps[-1].replace(b'<html',
-                        bytes('<html xmlns="%s"'%XHTML_NS), 1))
+                        ('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1))

            # First pass: break up document into rendered strings of length no
            # more than CHUNK_SIZE
@@ -366,7 +366,7 @@ class Chunker(object):
        # The first number is an index into the chunk table and the second is
        # an offset from the start of the chunk to the start of the tag pointed
        # to by the link.
-        aid_map = {}  # Map of aid to (fid, offset_from_start_of_chunk, offset_from_start_of_text)
+        aid_map = PolyglotDict()  # Map of aid to (fid, offset_from_start_of_chunk, offset_from_start_of_text)
        for match in re.finditer(br'<[^>]+? [ac]id=[\'"]([cA-Z0-9]+)[\'"]', rebuilt_text):
            offset = match.start()
            pos_fid = None
@@ -395,9 +395,9 @@ class Chunker(object):
        def to_placeholder(aid):
            pos, fid, _ = aid_map[aid]
            pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
-            return bytes(':off:'.join((pos, fid)))
+            return ':off:'.join((pos, fid)).encode('utf-8')

-        placeholder_map = {bytes(k):to_placeholder(v) for k, v in
+        placeholder_map = {k:to_placeholder(v) for k, v in
                iteritems(self.placeholder_map)}

        # Now update the links