From 888c95d0cf6d9e7c49c8131792ee6c3a254833d9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 30 Apr 2012 10:41:26 +0530 Subject: [PATCH] KF8 Output: Handle documents that have XML comments and/or processing instructions --- src/calibre/ebooks/mobi/debug/mobi8.py | 7 +++-- src/calibre/ebooks/mobi/writer8/skeleton.py | 33 +++++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index a03205edd7..788ca3ed0a 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -141,9 +141,10 @@ class MOBIFile(object): self.files.append(File(skel, skeleton, ftext, first_aid, sections)) def dump_flows(self, ddir): - if self.fdst is None: - raise ValueError('This MOBI file has no FDST record') - for i, x in enumerate(self.fdst.sections): + boundaries = [(0, len(self.raw_text))] + if self.fdst is not None: + boundaries = self.fdst.sections + for i, x in enumerate(boundaries): start, end = x raw = self.raw_text[start:end] with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f: diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index c2cd9b4283..8fd4714e1c 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -13,7 +13,7 @@ from functools import partial from lxml import etree -from calibre.ebooks.oeb.base import XHTML_NS +from calibre.ebooks.oeb.base import XHTML_NS, extract from calibre.constants import ispy3 from calibre.ebooks.mobi.utils import to_base @@ -224,14 +224,24 @@ class Chunker(object): nroot.text = root.text nroot.tail = '\n' - for tag in root.iterdescendants(etree.Element): - # We are ignoring all non tag entities in the tree - # like comments and processing instructions, as they make the - # chunking code even harder, for minimal gain. - elem = nroot.makeelement(tag.tag.rpartition('}')[-1], - attrib={k.rpartition('}')[-1]:v for k, v in - tag.attrib.iteritems()}) - elem.text, elem.tail = tag.text, tag.tail + # Remove Comments and ProcessingInstructions as kindlegen seems to + # remove them as well + for tag in root.iterdescendants(): + if tag.tag in {etree.Comment, etree.ProcessingInstruction}: + extract(tag) + + for tag in root.iterdescendants(): + if tag.tag == etree.Entity: + elem = etree.Entity(tag.name) + else: + tn = tag.tag + if tn is not None: + tn = tn.rpartition('}')[-1] + elem = nroot.makeelement(tn, + attrib={k.rpartition('}')[-1]:v for k, v in + tag.attrib.iteritems()}) + elem.text = tag.text + elem.tail = tag.tail parent = node_from_path(nroot, path_to_node(tag.getparent())) parent.append(elem) @@ -251,6 +261,11 @@ class Chunker(object): # Now loop over children for child in list(tag): raw = tostring(child, with_tail=False) + if child.tag == etree.Entity: + chunks.append(raw) + if child.tail: + chunks.extend(self.chunk_up_text(child.tail, aid)) + continue raw = close_self_closing_tags(raw) if len(raw) > CHUNK_SIZE and child.get('aid', None): self.step_into_tag(child, chunks)