KF8 Output: Handle documents that have XML comments and/or processing instructions

This commit is contained in:
Kovid Goyal 2012-04-30 10:41:26 +05:30
parent 12d0e754db
commit 888c95d0cf
2 changed files with 28 additions and 12 deletions

View File

@@ -141,9 +141,10 @@ class MOBIFile(object):
self.files.append(File(skel, skeleton, ftext, first_aid, sections)) self.files.append(File(skel, skeleton, ftext, first_aid, sections))
def dump_flows(self, ddir): def dump_flows(self, ddir):
if self.fdst is None: boundaries = [(0, len(self.raw_text))]
raise ValueError('This MOBI file has no FDST record') if self.fdst is not None:
for i, x in enumerate(self.fdst.sections): boundaries = self.fdst.sections
for i, x in enumerate(boundaries):
start, end = x start, end = x
raw = self.raw_text[start:end] raw = self.raw_text[start:end]
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f: with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:

View File

@@ -13,7 +13,7 @@ from functools import partial
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS from calibre.ebooks.oeb.base import XHTML_NS, extract
from calibre.constants import ispy3 from calibre.constants import ispy3
from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.utils import to_base
@@ -224,14 +224,24 @@ class Chunker(object):
nroot.text = root.text nroot.text = root.text
nroot.tail = '\n' nroot.tail = '\n'
for tag in root.iterdescendants(etree.Element): # Remove Comments and ProcessingInstructions as kindlegen seems to
# We are ignoring all non tag entities in the tree # remove them as well
# like comments and processing instructions, as they make the for tag in root.iterdescendants():
# chunking code even harder, for minimal gain. if tag.tag in {etree.Comment, etree.ProcessingInstruction}:
elem = nroot.makeelement(tag.tag.rpartition('}')[-1], extract(tag)
for tag in root.iterdescendants():
if tag.tag == etree.Entity:
elem = etree.Entity(tag.name)
else:
tn = tag.tag
if tn is not None:
tn = tn.rpartition('}')[-1]
elem = nroot.makeelement(tn,
attrib={k.rpartition('}')[-1]:v for k, v in attrib={k.rpartition('}')[-1]:v for k, v in
tag.attrib.iteritems()}) tag.attrib.iteritems()})
elem.text, elem.tail = tag.text, tag.tail elem.text = tag.text
elem.tail = tag.tail
parent = node_from_path(nroot, path_to_node(tag.getparent())) parent = node_from_path(nroot, path_to_node(tag.getparent()))
parent.append(elem) parent.append(elem)
@@ -251,6 +261,11 @@ class Chunker(object):
# Now loop over children # Now loop over children
for child in list(tag): for child in list(tag):
raw = tostring(child, with_tail=False) raw = tostring(child, with_tail=False)
if child.tag == etree.Entity:
chunks.append(raw)
if child.tail:
chunks.extend(self.chunk_up_text(child.tail, aid))
continue
raw = close_self_closing_tags(raw) raw = close_self_closing_tags(raw)
if len(raw) > CHUNK_SIZE and child.get('aid', None): if len(raw) > CHUNK_SIZE and child.get('aid', None):
self.step_into_tag(child, chunks) self.step_into_tag(child, chunks)