From e2ba917116bb0b3a703b9f407c612cb6c609d06b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 29 Jul 2011 14:35:54 -0600 Subject: [PATCH] New mobi output: Wrap sections in <div>
tags --- src/calibre/ebooks/mobi/debug.py | 16 +++++++++ src/calibre/ebooks/mobi/writer2/serializer.py | 33 +++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py index 1b921860e0..a7306ccfa2 100644 --- a/src/calibre/ebooks/mobi/debug.py +++ b/src/calibre/ebooks/mobi/debug.py @@ -9,6 +9,9 @@ __docformat__ = 'restructuredtext en' import struct, datetime, sys, os, shutil from collections import OrderedDict, defaultdict + +from lxml import html + from calibre.utils.date import utc_tz from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.utils import (decode_hex_number, decint, @@ -1208,6 +1211,19 @@ def inspect_mobi(path_or_stream, prefix='decompiled'): for rec in getattr(f, attr): rec.dump(tdir) + alltext = os.path.join(ddir, 'text.html') + with open(alltext, 'wb') as of: + alltext = b'' + for rec in f.text_records: + of.write(rec.raw) + alltext += rec.raw + of.seek(0) + root = html.fromstring(alltext.decode('utf-8')) + with open(os.path.join(ddir, 'pretty.html'), 'wb') as of: + of.write(html.tostring(root, pretty_print=True, encoding='utf-8', + include_meta_content_type=True)) + + print ('Debug data saved to:', ddir) def main(): diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py index 881937ce73..2f80fd1715 100644 --- a/src/calibre/ebooks/mobi/writer2/serializer.py +++ b/src/calibre/ebooks/mobi/writer2/serializer.py @@ -53,6 +53,35 @@ class Serializer(object): # become uncrossable breaks in the MOBI self.breaks = [] + self.find_blocks() + + def find_blocks(self): + ''' + Mark every item in the spine if it is the start/end of a + section/article, so that it can be wrapped in divs appropriately. 
+ ''' + for item in self.oeb.spine: + item.is_section_start = item.is_section_end = False + item.is_article_start = item.is_article_end = False + + def spine_item(tocitem): + href = urldefrag(tocitem.href)[0] + for item in self.oeb.spine: + if item.href == href: + return item + + for item in self.oeb.toc.iterdescendants(): + if item.klass == 'section': + articles = list(item) + if not articles: continue + spine_item(item).is_section_start = True + for i, article in enumerate(articles): + si = spine_item(article) + si.is_article_start = True + si.is_article_end = True + if i == len(articles) - 1: + si.is_section_end = True + def __call__(self): ''' Return the document serialized as a single UTF-8 encoded bytestring. @@ -155,6 +184,8 @@ class Serializer(object): if not item.linear: self.breaks.append(buf.tell() - 1) self.id_offsets[urlnormalize(item.href)] = buf.tell() + if item.is_section_start: + buf.write(b'<div>') # Kindle periodical articles are contained in a <div> tag buf.write(b'<div>') for elem in item.data.find(XHTML('body')): @@ -164,6 +195,8 @@ class Serializer(object): if self.write_page_breaks_after_item: buf.write(b'<mbp:pagebreak/>') buf.write(b'</div>') + if item.is_section_end: + buf.write(b'</div>') self.anchor_offset = None def serialize_elem(self, elem, item, nsrmap=NSRMAP):