From a52407b67a31ec75cf7be915032b69d360ee2923 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 Nov 2013 11:42:34 +0530 Subject: [PATCH] Refactor tree splitting code to make it re-useable --- src/calibre/ebooks/oeb/polish/split.py | 149 +++++++++++++++++++++ src/calibre/ebooks/oeb/transforms/split.py | 112 +--------------- 2 files changed, 153 insertions(+), 108 deletions(-) create mode 100644 src/calibre/ebooks/oeb/polish/split.py diff --git a/src/calibre/ebooks/oeb/polish/split.py b/src/calibre/ebooks/oeb/polish/split.py new file mode 100644 index 0000000000..0775f64489 --- /dev/null +++ b/src/calibre/ebooks/oeb/polish/split.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import copy +from future_builtins import map + +from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath +from calibre.ebooks.oeb.polish.toc import node_from_loc + +def in_table(node): + while node is not None: + if node.tag.endswith('}table'): + return True + node = node.getparent() + return False + +def adjust_split_point(split_point, log): + ''' + Move the split point up its ancestor chain if it has no content + before it. This handles the common case: +

<div><h2>Chapter 1</h2>...</div>
with a page break on the + h2. + ''' + sp = split_point + while True: + parent = sp.getparent() + if ( + parent is None or + barename(parent.tag) in {'body', 'html'} or + (parent.text and parent.text.strip()) or + parent.index(sp) > 0 + ): + break + sp = parent + + if sp is not split_point: + log.debug('Adjusted split point to ancestor') + + return sp + +def get_body(root): + return root.find('h:body', namespaces=XPNSMAP) + +def do_split(split_point, log, before=True): + ''' + Split tree into a *before* and an *after* tree at ``split_point``. + + :param split_point: The Element at which to split + :param before: If True tree is split before split_point, otherwise after split_point + :return: before_tree, after_tree + ''' + if before: + # We cannot adjust for after since moving an after split point to a + # parent will cause breakage if the parent contains any content + # after the original split point + split_point = adjust_split_point(split_point, log) + tree = split_point.getroottree() + path = tree.getpath(split_point) + + tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree) + root, root2 = tree.getroot(), tree2.getroot() + body, body2 = map(get_body, (root, root2)) + split_point = root.xpath(path)[0] + split_point2 = root2.xpath(path)[0] + + def nix_element(elem, top=True): + # Remove elem unless top is False in which case replace elem by its + # children + parent = elem.getparent() + if top: + parent.remove(elem) + else: + index = parent.index(elem) + parent[index:index+1] = list(elem.iterchildren()) + + # Tree 1 + hit_split_point = False + keep_descendants = False + split_point_descendants = frozenset(split_point.iterdescendants()) + for elem in tuple(body.iterdescendants()): + if elem is split_point: + hit_split_point = True + if before: + nix_element(elem) + else: + # We want to keep the descendants of the split point in + # Tree 1 + keep_descendants = True + # We want the split point element, but not its tail + elem.tail = '\n' + + continue + if 
hit_split_point: + if keep_descendants: + if elem in split_point_descendants: + # elem is a descendant keep it + continue + else: + # We are out of split_point, so prevent further set + # lookups of split_point_descendants + keep_descendants = False + nix_element(elem) + + # Tree 2 + ancestors = frozenset(XPath('ancestor::*')(split_point2)) + for elem in tuple(body2.iterdescendants()): + if elem is split_point2: + if not before: + # Keep the split point element's tail, if it contains non-whitespace + # text + tail = elem.tail + if tail and not tail.isspace(): + parent = elem.getparent() + idx = parent.index(elem) + if idx == 0: + parent.text = (parent.text or '') + tail + else: + sib = parent[idx-1] + sib.tail = (sib.tail or '') + tail + # Remove the element itself + nix_element(elem) + break + if elem in ancestors: + # We have to preserve the ancestors as they could have CSS + # styles that are inherited/applicable, like font or + # width. So we only remove the text, if any. + elem.text = '\n' + else: + nix_element(elem, top=False) + + body2.text = '\n' + + return tree, tree2 + + +def split(container, name, loc): + root = container.parsed(name) + split_point = node_from_loc(root, loc) + if in_table(split_point): + raise ValueError('Cannot split inside tables') + if split_point.tag.endswith('}body'): + raise ValueError('Cannot split on the tag') + + diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 36fe6b3167..01e4348b34 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -16,9 +16,10 @@ from lxml.etree import XPath as _XPath from lxml import etree from cssselect import HTMLTranslator -from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, - urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize) from calibre.ebooks.epub import rules +from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, + urldefrag, 
rewrite_links, urlunquote, XHTML, urlnormalize) +from calibre.ebooks.oeb.polish.split import do_split XPath = functools.partial(_XPath, namespaces=NAMESPACES) @@ -271,31 +272,6 @@ class FlowSplitter(object): return None return body[0] - def adjust_split_point(self, root, path): - ''' - Move the split point up its ancestor chain if it has no textual content - before it. This handles the common case: -

<div><h2>Chapter 1</h2>...</div>
with a page break on the - h2. - ''' - sp = root.xpath(path)[0] - while True: - parent = sp.getparent() - if barename(parent.tag) in ('body', 'html'): - break - if parent.text and parent.text.strip(): - break - if parent.index(sp) > 0: - break - sp = parent - - npath = sp.getroottree().getpath(sp) - - if self.opts.verbose > 3 and npath != path: - self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath)) - - return npath - def do_split(self, tree, split_point, before): ''' Split ``tree`` into a *before* and *after* tree at ``split_point``. @@ -303,87 +279,7 @@ class FlowSplitter(object): :param before: If True tree is split before split_point, otherwise after split_point :return: before_tree, after_tree ''' - path = tree.getpath(split_point) - tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree) - root = tree.getroot() - root2 = tree2.getroot() - body, body2 = map(self.get_body, (root, root2)) - if before: - # We cannot adjust for after since moving an after split point to a - # parent will cause breakage if the parent contains any content - # after the original split point - path = self.adjust_split_point(root, path) - split_point = root.xpath(path)[0] - split_point2 = root2.xpath(path)[0] - - def nix_element(elem, top=True): - # Remove elem unless top is False in which case replace elem by its - # children - parent = elem.getparent() - if top: - parent.remove(elem) - else: - index = parent.index(elem) - parent[index:index+1] = list(elem.iterchildren()) - - # Tree 1 - hit_split_point = False - keep_descendants = False - split_point_descendants = frozenset(split_point.iterdescendants()) - for elem in tuple(body.iterdescendants()): - if elem is split_point: - hit_split_point = True - if before: - nix_element(elem) - else: - # We want to keep the descendants of the split point in - # Tree 1 - keep_descendants = True - # We want the split point element, but not its tail - elem.tail = '\n' - - continue - if hit_split_point: - if keep_descendants: - if elem in 
split_point_descendants: - # elem is a descendant keep it - continue - else: - # We are out of split_point, so prevent further set - # lookups of split_point_descendants - keep_descendants = False - nix_element(elem) - - # Tree 2 - ancestors = frozenset(XPath('ancestor::*')(split_point2)) - for elem in tuple(body2.iterdescendants()): - if elem is split_point2: - if not before: - # Keep the split point element's tail, if it contains non-whitespace - # text - tail = elem.tail - if tail and not tail.isspace(): - parent = elem.getparent() - idx = parent.index(elem) - if idx == 0: - parent.text = (parent.text or '') + tail - else: - sib = parent[idx-1] - sib.tail = (sib.tail or '') + tail - # Remove the element itself - nix_element(elem) - break - if elem in ancestors: - # We have to preserve the ancestors as they could have CSS - # styles that are inherited/applicable, like font or - # width. So we only remove the text, if any. - elem.text = '\n' - else: - nix_element(elem, top=False) - - body2.text = '\n' - - return tree, tree2 + return do_split(split_point, self.log, before=before) def is_page_empty(self, root): body = self.get_body(root)