mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
KF8 Output: Handle documents that have XML comments and/or processing instructions
This commit is contained in:
parent
12d0e754db
commit
888c95d0cf
@ -141,9 +141,10 @@ class MOBIFile(object):
|
|||||||
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
|
self.files.append(File(skel, skeleton, ftext, first_aid, sections))
|
||||||
|
|
||||||
def dump_flows(self, ddir):
|
def dump_flows(self, ddir):
|
||||||
if self.fdst is None:
|
boundaries = [(0, len(self.raw_text))]
|
||||||
raise ValueError('This MOBI file has no FDST record')
|
if self.fdst is not None:
|
||||||
for i, x in enumerate(self.fdst.sections):
|
boundaries = self.fdst.sections
|
||||||
|
for i, x in enumerate(boundaries):
|
||||||
start, end = x
|
start, end = x
|
||||||
raw = self.raw_text[start:end]
|
raw = self.raw_text[start:end]
|
||||||
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
|
with open(os.path.join(ddir, 'flow%04d.txt'%i), 'wb') as f:
|
||||||
|
@ -13,7 +13,7 @@ from functools import partial
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import XHTML_NS
|
from calibre.ebooks.oeb.base import XHTML_NS, extract
|
||||||
from calibre.constants import ispy3
|
from calibre.constants import ispy3
|
||||||
from calibre.ebooks.mobi.utils import to_base
|
from calibre.ebooks.mobi.utils import to_base
|
||||||
|
|
||||||
@ -224,14 +224,24 @@ class Chunker(object):
|
|||||||
nroot.text = root.text
|
nroot.text = root.text
|
||||||
nroot.tail = '\n'
|
nroot.tail = '\n'
|
||||||
|
|
||||||
for tag in root.iterdescendants(etree.Element):
|
# Remove Comments and ProcessingInstructions as kindlegen seems to
|
||||||
# We are ignoring all non tag entities in the tree
|
# remove them as well
|
||||||
# like comments and processing instructions, as they make the
|
for tag in root.iterdescendants():
|
||||||
# chunking code even harder, for minimal gain.
|
if tag.tag in {etree.Comment, etree.ProcessingInstruction}:
|
||||||
elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
|
extract(tag)
|
||||||
attrib={k.rpartition('}')[-1]:v for k, v in
|
|
||||||
tag.attrib.iteritems()})
|
for tag in root.iterdescendants():
|
||||||
elem.text, elem.tail = tag.text, tag.tail
|
if tag.tag == etree.Entity:
|
||||||
|
elem = etree.Entity(tag.name)
|
||||||
|
else:
|
||||||
|
tn = tag.tag
|
||||||
|
if tn is not None:
|
||||||
|
tn = tn.rpartition('}')[-1]
|
||||||
|
elem = nroot.makeelement(tn,
|
||||||
|
attrib={k.rpartition('}')[-1]:v for k, v in
|
||||||
|
tag.attrib.iteritems()})
|
||||||
|
elem.text = tag.text
|
||||||
|
elem.tail = tag.tail
|
||||||
parent = node_from_path(nroot, path_to_node(tag.getparent()))
|
parent = node_from_path(nroot, path_to_node(tag.getparent()))
|
||||||
parent.append(elem)
|
parent.append(elem)
|
||||||
|
|
||||||
@ -251,6 +261,11 @@ class Chunker(object):
|
|||||||
# Now loop over children
|
# Now loop over children
|
||||||
for child in list(tag):
|
for child in list(tag):
|
||||||
raw = tostring(child, with_tail=False)
|
raw = tostring(child, with_tail=False)
|
||||||
|
if child.tag == etree.Entity:
|
||||||
|
chunks.append(raw)
|
||||||
|
if child.tail:
|
||||||
|
chunks.extend(self.chunk_up_text(child.tail, aid))
|
||||||
|
continue
|
||||||
raw = close_self_closing_tags(raw)
|
raw = close_self_closing_tags(raw)
|
||||||
if len(raw) > CHUNK_SIZE and child.get('aid', None):
|
if len(raw) > CHUNK_SIZE and child.get('aid', None):
|
||||||
self.step_into_tag(child, chunks)
|
self.step_into_tag(child, chunks)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user