class PolyglotDict(dict):
    '''
    A dict whose text (unicode) keys are transparently stored and looked up
    as their UTF-8 encoded bytes, so that ``d[u'x']`` and ``d[b'x']`` always
    refer to the same entry on both Python 2 and Python 3.

    Keys that are not text are used unchanged. Every entry point that takes
    a key (item access, ``in``, ``del``, ``get``, ``pop``, ``setdefault``,
    ``update`` and the constructor) applies the same normalization; the
    plain ``dict`` versions of those methods do not call the overridden
    ``__getitem__``/``__setitem__`` and would otherwise bypass it.
    '''

    @staticmethod
    def _normalize(key):
        # Only text keys are converted; bytes and any other hashable pass
        # through untouched.
        if isinstance(key, unicode_type):
            key = key.encode('utf-8')
        return key

    def __init__(self, *args, **kwargs):
        # dict.__init__ would insert the initial items without calling
        # __setitem__, so populate through update() instead.
        dict.__init__(self)
        self.update(*args, **kwargs)

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._normalize(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._normalize(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._normalize(key))

    def __delitem__(self, key):
        dict.__delitem__(self, self._normalize(key))

    def get(self, key, default=None):
        'Return the value for key (normalized), or default if it is absent.'
        return dict.get(self, self._normalize(key), default)

    def pop(self, key, *default):
        '''
        Remove key (normalized) and return its value. Raises KeyError when
        the key is absent and no default was supplied, like dict.pop.
        '''
        return dict.pop(self, self._normalize(key), *default)

    def setdefault(self, key, default=None):
        'Return the value for key (normalized), inserting default if absent.'
        return dict.setdefault(self, self._normalize(key), default)

    def update(self, *args, **kwargs):
        '''
        Merge items exactly as dict.update accepts them (a mapping or an
        iterable of pairs, plus keyword arguments), normalizing every key.
        '''
        # Let dict() validate and interpret the arguments, then insert one
        # item at a time so each key goes through __setitem__.
        for key, val in dict(*args, **kwargs).items():
            self[key] = val
prefer_author_sort=False, is_periodical=False, nrecs += 1 if page_progression_direction in {'rtl', 'ltr', 'default'}: - ppd = bytes(page_progression_direction) + ppd = page_progression_direction.encode('ascii') exth.write(pack(b'>II', EXTH_CODES['page_progression_direction'], len(ppd) + 8)) exth.write(ppd) nrecs += 1 diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index be36f5e54a..b5901778fe 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -15,7 +15,7 @@ from xml.sax.saxutils import escape from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS, extract -from calibre.ebooks.mobi.utils import to_base +from calibre.ebooks.mobi.utils import to_base, PolyglotDict from polyglot.builtins import iteritems, unicode_type, codepoint_to_chr as mychr CHUNK_SIZE = 8192 @@ -34,7 +34,7 @@ aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'video'} _self_closing_pat = re.compile( - r'<(?P%s)(?=[\s/])(?P[^>]*)/>'%('|'.join(aid_able_tags|{'script', + br'<(?P%s)(?=[\s/])(?P[^>]*)/>'%('|'.join(aid_able_tags|{'script', 'style', 'title', 'head'})).encode('ascii'), re.IGNORECASE) @@ -116,7 +116,7 @@ class Skeleton(object): def render(self, root): raw = tostring(root, xml_declaration=True) - raw = raw.replace(b']+? [ac]id=[\'"]([cA-Z0-9]+)[\'"]', rebuilt_text): offset = match.start() pos_fid = None @@ -395,9 +395,9 @@ class Chunker(object): def to_placeholder(aid): pos, fid, _ = aid_map[aid] pos, fid = to_base(pos, min_num_digits=4), to_href(fid) - return bytes(':off:'.join((pos, fid))) + return ':off:'.join((pos, fid)).encode('utf-8') - placeholder_map = {bytes(k):to_placeholder(v) for k, v in + placeholder_map = {k:to_placeholder(v) for k, v in iteritems(self.placeholder_map)} # Now update the links