diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 1b921860e0..a7306ccfa2 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -9,6 +9,9 @@ __docformat__ = 'restructuredtext en' import struct, datetime, sys, os, shutil from collections import OrderedDict, defaultdict + +from lxml import html + from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, @@ -1208,6 +1211,19 @@ def inspect_mobi(path_or_stream, prefix='decompiled'): for rec in getattr(f, attr): rec.dump(tdir) + alltext = os.path.join(ddir, 'text.html') + with open(alltext, 'wb') as of: + alltext = b'' + for rec in f.text_records: + of.write(rec.raw) + alltext += rec.raw + of.seek(0) + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) + + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index 881937ce73..2f80fd1715 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -53,6 +53,35 @@ class Serializer(object): # become uncrossable breaks in the MOBI self.breaks = [] + self.find_blocks() + + def find_blocks(self): + ''' + Mark every item in the spine if it is the start/end of a + section/article, so that it can be wrapped in divs appropriately. 
+ ''' + for item in self.oeb.spine: + item.is_section_start = item.is_section_end = False + item.is_article_start = item.is_article_end = False + + def spine_item(tocitem): + href = urldefrag(tocitem.href)[0] + for item in self.oeb.spine: + if item.href == href: + return item + + for item in self.oeb.toc.iterdescendants(): + if item.klass == 'section': + articles = list(item) + if not articles: continue + spine_item(item).is_section_start = True + for i, article in enumerate(articles): + si = spine_item(article) + si.is_article_start = True + si.is_article_end = True + if i == len(articles) - 1: + si.is_section_end = True + def __call__(self): ''' Return the document serialized as a single UTF-8 encoded bytestring. @@ -155,6 +184,8 @@ class Serializer(object): if not item.linear: self.breaks.append(buf.tell() - 1) self.id_offsets[urlnormalize(item.href)] = buf.tell() + if item.is_section_start: + buf.write(b'