From e2ba917116bb0b3a703b9f407c612cb6c609d06b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 29 Jul 2011 14:35:54 -0600
Subject: [PATCH] New mobi output: Wrap sections in <div> tags

---
 src/calibre/ebooks/mobi/debug.py              | 16 +++++++++
 src/calibre/ebooks/mobi/writer2/serializer.py | 33 +++++++++++++++++++
 2 files changed, 49 insertions(+)
diff --git a/src/calibre/ebooks/mobi/debug.py b/src/calibre/ebooks/mobi/debug.py
index 1b921860e0..a7306ccfa2 100644
--- a/src/calibre/ebooks/mobi/debug.py
+++ b/src/calibre/ebooks/mobi/debug.py
@@ -9,6 +9,9 @@ __docformat__ = 'restructuredtext en'
 
 import struct, datetime, sys, os, shutil
 from collections import OrderedDict, defaultdict
+
+from lxml import html
+
 from calibre.utils.date import utc_tz
 from calibre.ebooks.mobi.langcodes import main_language, sub_language
 from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
@@ -1208,6 +1211,19 @@ def inspect_mobi(path_or_stream, prefix='decompiled'):
         for rec in getattr(f, attr):
             rec.dump(tdir)
 
+    alltext = os.path.join(ddir, 'text.html')
+    with open(alltext, 'wb') as of:
+        alltext = b''
+        for rec in f.text_records:
+            of.write(rec.raw)
+            alltext += rec.raw
+        of.seek(0)
+    root = html.fromstring(alltext.decode('utf-8'))
+    with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
+        of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
+            include_meta_content_type=True))
+
+
     print ('Debug data saved to:', ddir)
 
 def main():
diff --git a/src/calibre/ebooks/mobi/writer2/serializer.py b/src/calibre/ebooks/mobi/writer2/serializer.py
index 881937ce73..2f80fd1715 100644
--- a/src/calibre/ebooks/mobi/writer2/serializer.py
+++ b/src/calibre/ebooks/mobi/writer2/serializer.py
@@ -53,6 +53,35 @@ class Serializer(object):
         # become uncrossable breaks in the MOBI
         self.breaks = []
 
+        self.find_blocks()
+
+    def find_blocks(self):
+        '''
+        Flag spine items that start/end a section/article so they can be
+        wrapped in divs. TOC entries matching no spine item are skipped.
+        '''
+        for item in self.oeb.spine:
+            item.is_section_start = item.is_section_end = False
+            item.is_article_start = item.is_article_end = False
+
+        def spine_item(tocitem):  # None if href is not in the spine
+            href = urldefrag(tocitem.href)[0]
+            for item in self.oeb.spine:
+                if item.href == href:
+                    return item
+
+        for section in self.oeb.toc.iterdescendants():
+            if section.klass != 'section':
+                continue
+            first = spine_item(section)
+            found = [spine_item(article) for article in section]
+            articles = [si for si in found if si is not None]
+            for si in articles:
+                si.is_article_start = si.is_article_end = True
+            if first is not None and articles:  # both or neither: divs pair up
+                first.is_section_start = True
+                articles[-1].is_section_end = True
+
     def __call__(self):
         '''
         Return the document serialized as a single UTF-8 encoded bytestring.
@@ -155,6 +184,8 @@ class Serializer(object):
         if not item.linear:
             self.breaks.append(buf.tell() - 1)
         self.id_offsets[urlnormalize(item.href)] = buf.tell()
+        if item.is_section_start:
+            buf.write(b'<div>')
         # Kindle periodical articles are contained in a <div> tag
         buf.write(b'<div>')
         for elem in item.data.find(XHTML('body')):
@@ -164,6 +195,8 @@ class Serializer(object):
         if self.write_page_breaks_after_item:
             buf.write(b'<mbp:pagebreak/>')
         buf.write(b'</div>')
+        if item.is_section_end:
+            buf.write(b'</div>')
         self.anchor_offset = None
 
     def serialize_elem(self, elem, item, nsrmap=NSRMAP):