py3: Port basic AZW3 output

This commit is contained in:
Kovid Goyal 2019-04-15 15:08:13 +05:30
parent 2a2a08b660
commit a07ad9633f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 28 additions and 10 deletions

View File

@ -21,6 +21,24 @@ IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
class PolyglotDict(dict):
    """A dict that stores text keys as UTF-8 encoded bytes.

    Any key that is an instance of ``unicode_type`` is encoded to UTF-8
    before it touches the underlying dict, so a text key and its encoded
    bytes form address the same entry. Keys of any other type are used
    unchanged.

    The built-in ``dict`` methods do not call overridden ``__getitem__`` /
    ``__setitem__``, so every key-taking entry point is overridden here to
    keep lookups and stored keys consistent.
    """

    @staticmethod
    def _as_bytes_key(key):
        """Return *key* encoded as UTF-8 if it is text, else *key* itself."""
        if isinstance(key, unicode_type):
            return key.encode('utf-8')
        return key

    def __init__(self, *args, **kwargs):
        # Go through update() so that initial items get normalised keys too.
        dict.__init__(self)
        self.update(*args, **kwargs)

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._as_bytes_key(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._as_bytes_key(key))

    def __delitem__(self, key):
        dict.__delitem__(self, self._as_bytes_key(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._as_bytes_key(key))

    def get(self, key, default=None):
        """Return the value for *key* (text or bytes), or *default*."""
        return dict.get(self, self._as_bytes_key(key), default)

    def pop(self, key, *default):
        """Remove *key* (text or bytes) and return its value.

        Accepts the same optional single *default* as ``dict.pop`` and
        raises ``KeyError`` when the key is missing and no default is given.
        """
        return dict.pop(self, self._as_bytes_key(key), *default)

    def setdefault(self, key, default=None):
        """Return the value for *key*, inserting *default* if it is absent."""
        return dict.setdefault(self, self._as_bytes_key(key), default)

    def update(self, *args, **kwargs):
        """Merge items like ``dict.update``, normalising every key."""
        # Build a plain dict first so all argument forms dict() accepts
        # (mapping, iterable of pairs, keyword arguments) are supported,
        # then insert through __setitem__ to encode text keys.
        for key, val in dict(*args, **kwargs).items():
            self[key] = val
def decode_string(raw, codec='utf-8', ordt_map=None):
length, = struct.unpack(b'>B', raw[0:1])
raw = raw[1:1+length]

View File

@ -145,7 +145,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
if datestr is None:
raise ValueError("missing date or timestamp")
datestr = bytes(datestr)
datestr = datestr.encode('utf-8')
exth.write(pack(b'>II', EXTH_CODES['pubdate'], len(datestr) + 8))
exth.write(datestr)
nrecs += 1
@ -179,7 +179,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
if thumbnail_offset is not None:
exth.write(pack(b'>III', EXTH_CODES['thumboffset'], 12,
thumbnail_offset))
thumbnail_uri_str = bytes('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4)))
thumbnail_uri_str = ('kindle:embed:%s' %(to_base(thumbnail_offset, base=32, min_num_digits=4))).encode('utf-8')
exth.write(pack(b'>II', EXTH_CODES['kf8_thumbnail_uri'], len(thumbnail_uri_str) + 8))
exth.write(thumbnail_uri_str)
nrecs += 2
@ -217,7 +217,7 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False,
nrecs += 1
if page_progression_direction in {'rtl', 'ltr', 'default'}:
ppd = bytes(page_progression_direction)
ppd = page_progression_direction.encode('ascii')
exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8))
exth.write(ppd)
nrecs += 1

View File

@ -15,7 +15,7 @@ from xml.sax.saxutils import escape
from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS, extract
from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.mobi.utils import to_base, PolyglotDict
from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr as mychr
CHUNK_SIZE = 8192
@ -34,7 +34,7 @@ aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'video'}
_self_closing_pat = re.compile(
r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags|{'script',
br'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags|{'script',
'style', 'title', 'head'})).encode('ascii'),
re.IGNORECASE)
@ -116,7 +116,7 @@ class Skeleton(object):
def render(self, root):
raw = tostring(root, xml_declaration=True)
raw = raw.replace(b'<html', bytes('<html xmlns="%s"'%XHTML_NS), 1)
raw = raw.replace(b'<html', ('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1)
raw = close_self_closing_tags(raw)
return raw
@ -180,7 +180,7 @@ class Chunker(object):
with_tail=True))
orig_dumps[-1] = close_self_closing_tags(
orig_dumps[-1].replace(b'<html',
bytes('<html xmlns="%s"'%XHTML_NS), 1))
('<html xmlns="%s"'%XHTML_NS).encode('ascii'), 1))
# First pass: break up document into rendered strings of length no
# more than CHUNK_SIZE
@ -366,7 +366,7 @@ class Chunker(object):
# The first number is an index into the chunk table and the second is
# an offset from the start of the chunk to the start of the tag pointed
# to by the link.
aid_map = {} # Map of aid to (fid, offset_from_start_of_chunk, offset_from_start_of_text)
aid_map = PolyglotDict() # Map of aid to (fid, offset_from_start_of_chunk, offset_from_start_of_text)
for match in re.finditer(br'<[^>]+? [ac]id=[\'"]([cA-Z0-9]+)[\'"]', rebuilt_text):
offset = match.start()
pos_fid = None
@ -395,9 +395,9 @@ class Chunker(object):
def to_placeholder(aid):
pos, fid, _ = aid_map[aid]
pos, fid = to_base(pos, min_num_digits=4), to_href(fid)
return bytes(':off:'.join((pos, fid)))
return ':off:'.join((pos, fid)).encode('utf-8')
placeholder_map = {bytes(k):to_placeholder(v) for k, v in
placeholder_map = {k:to_placeholder(v) for k, v in
iteritems(self.placeholder_map)}
# Now update the links