mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Refactor tree splitting code to make it reusable
This commit is contained in:
parent
2b1053c05d
commit
a52407b67a
149
src/calibre/ebooks/oeb/polish/split.py
Normal file
149
src/calibre/ebooks/oeb/polish/split.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
|
import copy
|
||||||
|
from future_builtins import map
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath
|
||||||
|
from calibre.ebooks.oeb.polish.toc import node_from_loc
|
||||||
|
|
||||||
|
def in_table(node):
    '''
    Check whether ``node`` is, or is nested inside, a table element.

    A table element is recognised purely by its tag ending in ``}table``
    (i.e. a namespace-qualified ``table`` tag).

    :param node: The element to start from, may be None
    :return: True if node or one of its ancestors is a table, False otherwise
    '''
    def lineage(start):
        # Yield start followed by each of its ancestors, walking up lazily
        # so that no parent is looked up once a table has been found.
        while start is not None:
            yield start
            start = start.getparent()

    return any(item.tag.endswith('}table') for item in lineage(node))
|
||||||
|
|
||||||
|
def adjust_split_point(split_point, log):
    '''
    Hoist the split point to the highest ancestor that has nothing in front
    of the split point. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2.

    Climbing stops as soon as the parent is missing, is a body or html
    element, has non-whitespace leading text, or holds the current node at
    any position other than the first.

    :param split_point: The Element originally chosen as the split point
    :param log: Logger, its debug() method is called if the point was moved
    :return: The (possibly unchanged) Element to split at
    '''
    candidate = split_point
    parent = candidate.getparent()
    while parent is not None:
        if barename(parent.tag) in {'body', 'html'}:
            # Never climb onto the document skeleton
            break
        leading_text = parent.text
        if leading_text and leading_text.strip():
            # Real text sits between the parent's start and the candidate
            break
        if parent.index(candidate) > 0:
            # Something precedes the candidate inside the parent
            break
        candidate = parent
        parent = candidate.getparent()

    if candidate is not split_point:
        log.debug('Adjusted split point to ancestor')

    return candidate
|
||||||
|
|
||||||
|
def get_body(root):
    '''
    Look up the ``h:body`` element under ``root``.

    The ``h`` prefix is resolved through XPNSMAP (presumably the XHTML
    namespace, confirm against calibre.ebooks.oeb.base).

    :param root: The root element to search in
    :return: Whatever ``root.find`` yields for ``h:body``
    '''
    body = root.find('h:body', namespaces=XPNSMAP)
    return body
|
||||||
|
|
||||||
|
def do_split(split_point, log, before=True):
    '''
    Split the tree that ``split_point`` belongs to into a *before* and an
    *after* tree.

    Two deep copies of the tree are made. The first copy is pruned to the
    content up to the split, the second copy to the content following it.
    All pruning is done on the copies.

    :param split_point: The Element at which to split
    :param log: Logger, handed to adjust_split_point() when before is True
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # Only a "before" split can be hoisted: moving an "after" split
        # point up to a parent would break if that parent contains any
        # content after the original split point
        split_point = adjust_split_point(split_point, log)
    source_tree = split_point.getroottree()
    path = source_tree.getpath(split_point)

    first_tree = copy.deepcopy(source_tree)
    second_tree = copy.deepcopy(source_tree)
    first_root = first_tree.getroot()
    second_root = second_tree.getroot()
    first_body = get_body(first_root)
    second_body = get_body(second_root)
    # Locate the counterpart of the split point inside each copy
    first_sp = first_root.xpath(path)[0]
    second_sp = second_root.xpath(path)[0]

    def discard(node, keep_children=False):
        # Drop node from its parent; with keep_children the node's children
        # take its place instead
        container = node.getparent()
        if keep_children:
            pos = container.index(node)
            container[pos:pos+1] = list(node.iterchildren())
        else:
            container.remove(node)

    # Tree 1: keep everything up to the split, drop the rest
    inner_nodes = frozenset(first_sp.iterdescendants())
    past_split = False
    inside_split = False
    for node in tuple(first_body.iterdescendants()):
        if node is first_sp:
            past_split = True
            if before:
                discard(node)
            else:
                # The split point and its descendants stay in Tree 1
                inside_split = True
                # We want the split point element, but not its tail
                node.tail = '\n'
        elif past_split:
            if inside_split and node in inner_nodes:
                # Still inside the split point, keep this descendant
                continue
            # We have left the split point, so stop doing set lookups
            inside_split = False
            discard(node)

    # Tree 2: drop everything up to the split, keep the rest
    preserved = frozenset(XPath('ancestor::*')(second_sp))
    for node in tuple(second_body.iterdescendants()):
        if node is second_sp:
            if not before:
                # Keep the split point element's tail, if it contains
                # non-whitespace text
                trailing = node.tail
                if trailing and not trailing.isspace():
                    container = node.getparent()
                    pos = container.index(node)
                    if pos == 0:
                        container.text = (container.text or '') + trailing
                    else:
                        previous = container[pos-1]
                        previous.tail = (previous.tail or '') + trailing
                # Remove the element itself
                discard(node)
            break
        if node in preserved:
            # Ancestors must survive as they could carry CSS styles that
            # are inherited/applicable, like font or width. Only their
            # leading text is cleared.
            node.text = '\n'
        else:
            discard(node, keep_children=True)

    second_body.text = '\n'

    return first_tree, second_tree
|
||||||
|
|
||||||
|
|
||||||
|
def split(container, name, loc):
    '''
    Validate the split location ``loc`` inside the file ``name``.

    As visible here, the function only resolves the location to a node and
    rejects unsupported split points; it returns nothing.

    :param container: Object providing parsed(name) to get the parsed root
    :param name: Name of the file in the container to split
    :param loc: Location, resolved to a node via node_from_loc()
    :raises ValueError: If the node is inside a table or is the <body> tag
    '''
    node = node_from_loc(container.parsed(name), loc)
    if in_table(node):
        raise ValueError('Cannot split inside tables')
    if node.tag.endswith('}body'):
        raise ValueError('Cannot split on the <body> tag')
|
||||||
|
|
||||||
|
|
@ -16,9 +16,10 @@ from lxml.etree import XPath as _XPath
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
from cssselect import HTMLTranslator
|
from cssselect import HTMLTranslator
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
|
|
||||||
urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize)
|
|
||||||
from calibre.ebooks.epub import rules
|
from calibre.ebooks.epub import rules
|
||||||
|
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
|
||||||
|
urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
|
||||||
|
from calibre.ebooks.oeb.polish.split import do_split
|
||||||
|
|
||||||
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
|
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
|
||||||
|
|
||||||
@ -271,31 +272,6 @@ class FlowSplitter(object):
|
|||||||
return None
|
return None
|
||||||
return body[0]
|
return body[0]
|
||||||
|
|
||||||
def adjust_split_point(self, root, path):
|
|
||||||
'''
|
|
||||||
Move the split point up its ancestor chain if it has no textual content
|
|
||||||
before it. This handles the common case:
|
|
||||||
<div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
|
|
||||||
h2.
|
|
||||||
'''
|
|
||||||
sp = root.xpath(path)[0]
|
|
||||||
while True:
|
|
||||||
parent = sp.getparent()
|
|
||||||
if barename(parent.tag) in ('body', 'html'):
|
|
||||||
break
|
|
||||||
if parent.text and parent.text.strip():
|
|
||||||
break
|
|
||||||
if parent.index(sp) > 0:
|
|
||||||
break
|
|
||||||
sp = parent
|
|
||||||
|
|
||||||
npath = sp.getroottree().getpath(sp)
|
|
||||||
|
|
||||||
if self.opts.verbose > 3 and npath != path:
|
|
||||||
self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
|
|
||||||
|
|
||||||
return npath
|
|
||||||
|
|
||||||
def do_split(self, tree, split_point, before):
|
def do_split(self, tree, split_point, before):
|
||||||
'''
|
'''
|
||||||
Split ``tree`` into a *before* and *after* tree at ``split_point``.
|
Split ``tree`` into a *before* and *after* tree at ``split_point``.
|
||||||
@ -303,87 +279,7 @@ class FlowSplitter(object):
|
|||||||
:param before: If True tree is split before split_point, otherwise after split_point
|
:param before: If True tree is split before split_point, otherwise after split_point
|
||||||
:return: before_tree, after_tree
|
:return: before_tree, after_tree
|
||||||
'''
|
'''
|
||||||
path = tree.getpath(split_point)
|
return do_split(split_point, self.log, before=before)
|
||||||
tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
|
|
||||||
root = tree.getroot()
|
|
||||||
root2 = tree2.getroot()
|
|
||||||
body, body2 = map(self.get_body, (root, root2))
|
|
||||||
if before:
|
|
||||||
# We cannot adjust for after since moving an after split point to a
|
|
||||||
# parent will cause breakage if the parent contains any content
|
|
||||||
# after the original split point
|
|
||||||
path = self.adjust_split_point(root, path)
|
|
||||||
split_point = root.xpath(path)[0]
|
|
||||||
split_point2 = root2.xpath(path)[0]
|
|
||||||
|
|
||||||
def nix_element(elem, top=True):
|
|
||||||
# Remove elem unless top is False in which case replace elem by its
|
|
||||||
# children
|
|
||||||
parent = elem.getparent()
|
|
||||||
if top:
|
|
||||||
parent.remove(elem)
|
|
||||||
else:
|
|
||||||
index = parent.index(elem)
|
|
||||||
parent[index:index+1] = list(elem.iterchildren())
|
|
||||||
|
|
||||||
# Tree 1
|
|
||||||
hit_split_point = False
|
|
||||||
keep_descendants = False
|
|
||||||
split_point_descendants = frozenset(split_point.iterdescendants())
|
|
||||||
for elem in tuple(body.iterdescendants()):
|
|
||||||
if elem is split_point:
|
|
||||||
hit_split_point = True
|
|
||||||
if before:
|
|
||||||
nix_element(elem)
|
|
||||||
else:
|
|
||||||
# We want to keep the descendants of the split point in
|
|
||||||
# Tree 1
|
|
||||||
keep_descendants = True
|
|
||||||
# We want the split point element, but not its tail
|
|
||||||
elem.tail = '\n'
|
|
||||||
|
|
||||||
continue
|
|
||||||
if hit_split_point:
|
|
||||||
if keep_descendants:
|
|
||||||
if elem in split_point_descendants:
|
|
||||||
# elem is a descendant keep it
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
# We are out of split_point, so prevent further set
|
|
||||||
# lookups of split_point_descendants
|
|
||||||
keep_descendants = False
|
|
||||||
nix_element(elem)
|
|
||||||
|
|
||||||
# Tree 2
|
|
||||||
ancestors = frozenset(XPath('ancestor::*')(split_point2))
|
|
||||||
for elem in tuple(body2.iterdescendants()):
|
|
||||||
if elem is split_point2:
|
|
||||||
if not before:
|
|
||||||
# Keep the split point element's tail, if it contains non-whitespace
|
|
||||||
# text
|
|
||||||
tail = elem.tail
|
|
||||||
if tail and not tail.isspace():
|
|
||||||
parent = elem.getparent()
|
|
||||||
idx = parent.index(elem)
|
|
||||||
if idx == 0:
|
|
||||||
parent.text = (parent.text or '') + tail
|
|
||||||
else:
|
|
||||||
sib = parent[idx-1]
|
|
||||||
sib.tail = (sib.tail or '') + tail
|
|
||||||
# Remove the element itself
|
|
||||||
nix_element(elem)
|
|
||||||
break
|
|
||||||
if elem in ancestors:
|
|
||||||
# We have to preserve the ancestors as they could have CSS
|
|
||||||
# styles that are inherited/applicable, like font or
|
|
||||||
# width. So we only remove the text, if any.
|
|
||||||
elem.text = '\n'
|
|
||||||
else:
|
|
||||||
nix_element(elem, top=False)
|
|
||||||
|
|
||||||
body2.text = '\n'
|
|
||||||
|
|
||||||
return tree, tree2
|
|
||||||
|
|
||||||
def is_page_empty(self, root):
|
def is_page_empty(self, root):
|
||||||
body = self.get_body(root)
|
body = self.get_body(root)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user