New mobi output: Wrap sections in <div> tags

This commit is contained in:
Kovid Goyal 2011-07-29 14:35:54 -06:00
parent c313bdf380
commit e2ba917116
2 changed files with 49 additions and 0 deletions

View File

@ -9,6 +9,9 @@ __docformat__ = 'restructuredtext en'
import struct, datetime, sys, os, shutil import struct, datetime, sys, os, shutil
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from lxml import html
from calibre.utils.date import utc_tz from calibre.utils.date import utc_tz
from calibre.ebooks.mobi.langcodes import main_language, sub_language from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.mobi.utils import (decode_hex_number, decint, from calibre.ebooks.mobi.utils import (decode_hex_number, decint,
@ -1208,6 +1211,19 @@ def inspect_mobi(path_or_stream, prefix='decompiled'):
for rec in getattr(f, attr): for rec in getattr(f, attr):
rec.dump(tdir) rec.dump(tdir)
alltext = os.path.join(ddir, 'text.html')
with open(alltext, 'wb') as of:
alltext = b''
for rec in f.text_records:
of.write(rec.raw)
alltext += rec.raw
of.seek(0)
root = html.fromstring(alltext.decode('utf-8'))
with open(os.path.join(ddir, 'pretty.html'), 'wb') as of:
of.write(html.tostring(root, pretty_print=True, encoding='utf-8',
include_meta_content_type=True))
print ('Debug data saved to:', ddir) print ('Debug data saved to:', ddir)
def main(): def main():

View File

@ -53,6 +53,35 @@ class Serializer(object):
# become uncrossable breaks in the MOBI # become uncrossable breaks in the MOBI
self.breaks = [] self.breaks = []
self.find_blocks()
def find_blocks(self):
    '''
    Mark every item in the spine if it is the start/end of a
    section/article, so that it can be wrapped in divs appropriately.

    Every spine item gets four boolean attributes: ``is_section_start``,
    ``is_section_end``, ``is_article_start`` and ``is_article_end``. They
    all default to False and are switched on based on the TOC entries
    whose ``klass`` is ``'section'`` and the children of those entries.

    TOC entries whose href (fragment removed) does not match any spine
    item are ignored instead of raising AttributeError.
    '''
    # Reset the flags and index the spine by href in a single pass. For a
    # duplicated href the first spine item wins, which is what a linear
    # scan of the spine would return.
    by_href = {}
    for item in self.oeb.spine:
        item.is_section_start = item.is_section_end = False
        item.is_article_start = item.is_article_end = False
        by_href.setdefault(item.href, item)

    def spine_item(tocitem):
        # Returns None when the TOC entry points outside the spine.
        return by_href.get(urldefrag(tocitem.href)[0])

    for item in self.oeb.toc.iterdescendants():
        if item.klass != 'section':
            continue
        articles = list(item)
        if not articles:
            continue

        resolved = []
        for article in articles:
            si = spine_item(article)
            if si is not None:
                si.is_article_start = True
                si.is_article_end = True
                resolved.append(si)

        # The start and end flags are only set as a pair: the serializer
        # writes an opening <div> for one and a closing </div> for the
        # other, so marking just one would unbalance the markup.
        section_start = spine_item(item)
        if section_start is not None and resolved:
            section_start.is_section_start = True
            resolved[-1].is_section_end = True
def __call__(self): def __call__(self):
''' '''
Return the document serialized as a single UTF-8 encoded bytestring. Return the document serialized as a single UTF-8 encoded bytestring.
@ -155,6 +184,8 @@ class Serializer(object):
if not item.linear: if not item.linear:
self.breaks.append(buf.tell() - 1) self.breaks.append(buf.tell() - 1)
self.id_offsets[urlnormalize(item.href)] = buf.tell() self.id_offsets[urlnormalize(item.href)] = buf.tell()
if item.is_section_start:
buf.write(b'<div>')
# Kindle periodical articles are contained in a <div> tag # Kindle periodical articles are contained in a <div> tag
buf.write(b'<div>') buf.write(b'<div>')
for elem in item.data.find(XHTML('body')): for elem in item.data.find(XHTML('body')):
@ -164,6 +195,8 @@ class Serializer(object):
if self.write_page_breaks_after_item: if self.write_page_breaks_after_item:
buf.write(b'<mbp:pagebreak/>') buf.write(b'<mbp:pagebreak/>')
buf.write(b'</div>') buf.write(b'</div>')
if item.is_section_end:
buf.write(b'</div>')
self.anchor_offset = None self.anchor_offset = None
def serialize_elem(self, elem, item, nsrmap=NSRMAP): def serialize_elem(self, elem, item, nsrmap=NSRMAP):