Refactor tree splitting code to make it re-useable

2025-11-21 14:03:03 -05:00 · 2013-11-21 11:42:34 +05:30 · 2013-11-21 11:42:34 +05:30 · a52407b67a
commit a52407b67a
parent 2b1053c05d
2 changed files with 153 additions and 108 deletions
--- a/src/calibre/ebooks/oeb/polish/split.py
+++ b/src/calibre/ebooks/oeb/polish/split.py
@ -0,0 +1,149 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import copy
 from future_builtins import map
 from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath
 from calibre.ebooks.oeb.polish.toc import node_from_loc
 def in_table(node):
    while node is not None:
        if node.tag.endswith('}table'):
            return True
        node = node.getparent()
    return False
 def adjust_split_point(split_point, log):
    '''
    Move the split point up its ancestor chain if it has no content
    before it. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2.
    '''
    sp = split_point
    while True:
        parent = sp.getparent()
        if (
            parent is None or
            barename(parent.tag) in {'body', 'html'} or
            (parent.text and parent.text.strip()) or
            parent.index(sp) > 0
        ):
            break
        sp = parent
    if sp is not split_point:
        log.debug('Adjusted split point to ancestor')
    return sp
 def get_body(root):
    return root.find('h:body', namespaces=XPNSMAP)
 def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.
    :param split_point: The Element at which to split
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree         = split_point.getroottree()
    path         = tree.getpath(split_point)
    tree, tree2  = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2  = tree.getroot(), tree2.getroot()
    body, body2  = map(get_body, (root, root2))
    split_point  = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]
    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index+1] = list(elem.iterchildren())
    # Tree 1
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'
            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)
    # Tree 2
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx-1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)
    body2.text = '\n'
    return tree, tree2
 def split(container, name, loc):
    root = container.parsed(name)
    split_point = node_from_loc(root, loc)
    if in_table(split_point):
        raise ValueError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
        raise ValueError('Cannot split on the <body> tag')
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@ -16,9 +16,10 @@ from lxml.etree import XPath as _XPath
 from lxml import etree
 from cssselect import HTMLTranslator
 from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
        urldefrag, rewrite_links, urlunquote, barename, XHTML, urlnormalize)
 from calibre.ebooks.epub import rules
 from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
        urldefrag, rewrite_links, urlunquote, XHTML, urlnormalize)
 from calibre.ebooks.oeb.polish.split import do_split
 XPath = functools.partial(_XPath, namespaces=NAMESPACES)
@ -271,31 +272,6 @@ class FlowSplitter(object):
            return None
        return body[0]
    def adjust_split_point(self, root, path):
        '''
        Move the split point up its ancestor chain if it has no textual content
        before it. This handles the common case:
        <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
        h2.
        '''
        sp = root.xpath(path)[0]
        while True:
            parent = sp.getparent()
            if barename(parent.tag) in ('body', 'html'):
                break
            if parent.text and parent.text.strip():
                break
            if parent.index(sp) > 0:
                break
            sp = parent
        npath = sp.getroottree().getpath(sp)
        if self.opts.verbose > 3 and npath != path:
            self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
        return npath
    def do_split(self, tree, split_point, before):
        '''
        Split ``tree`` into a *before* and *after* tree at ``split_point``.
@ -303,87 +279,7 @@ class FlowSplitter(object):
        :param before: If True tree is split before split_point, otherwise after split_point
        :return: before_tree, after_tree
        '''
-        path         = tree.getpath(split_point)
+        return do_split(split_point, self.log, before=before)
        tree, tree2  = copy.deepcopy(tree), copy.deepcopy(tree)
        root         = tree.getroot()
        root2        = tree2.getroot()
        body, body2  = map(self.get_body, (root, root2))
        if before:
            # We cannot adjust for after since moving an after split point to a
            # parent will cause breakage if the parent contains any content
            # after the original split point
            path = self.adjust_split_point(root, path)
        split_point  = root.xpath(path)[0]
        split_point2 = root2.xpath(path)[0]
        def nix_element(elem, top=True):
            # Remove elem unless top is False in which case replace elem by its
            # children
            parent = elem.getparent()
            if top:
                parent.remove(elem)
            else:
                index = parent.index(elem)
                parent[index:index+1] = list(elem.iterchildren())
        # Tree 1
        hit_split_point = False
        keep_descendants = False
        split_point_descendants = frozenset(split_point.iterdescendants())
        for elem in tuple(body.iterdescendants()):
            if elem is split_point:
                hit_split_point = True
                if before:
                    nix_element(elem)
                else:
                    # We want to keep the descendants of the split point in
                    # Tree 1
                    keep_descendants = True
                    # We want the split point element, but not its tail
                    elem.tail = '\n'
                continue
            if hit_split_point:
                if keep_descendants:
                    if elem in split_point_descendants:
                        # elem is a descendant keep it
                        continue
                    else:
                        # We are out of split_point, so prevent further set
                        # lookups of split_point_descendants
                        keep_descendants = False
                nix_element(elem)
        # Tree 2
        ancestors = frozenset(XPath('ancestor::*')(split_point2))
        for elem in tuple(body2.iterdescendants()):
            if elem is split_point2:
                if not before:
                    # Keep the split point element's tail, if it contains non-whitespace
                    # text
                    tail = elem.tail
                    if tail and not tail.isspace():
                        parent = elem.getparent()
                        idx = parent.index(elem)
                        if idx == 0:
                            parent.text = (parent.text or '') + tail
                        else:
                            sib = parent[idx-1]
                            sib.tail = (sib.tail or '') + tail
                    # Remove the element itself
                    nix_element(elem)
                break
            if elem in ancestors:
                # We have to preserve the ancestors as they could have CSS
                # styles that are inherited/applicable, like font or
                # width. So we only remove the text, if any.
                elem.text = '\n'
            else:
                nix_element(elem, top=False)
        body2.text = '\n'
        return tree, tree2
    def is_page_empty(self, root):
        body = self.get_body(root)