Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p> tags

2025-07-09 03:04:10 -04:00 · 2014-02-18 14:25:30 +05:30 · 2014-02-18 14:25:30 +05:30 · 371aa6ef77
commit 371aa6ef77
parent 06a40d9d2b
9 changed files with 48 additions and 32 deletions
--- a/resources/compiled_coffeescript.zip
+++ b/resources/compiled_coffeescript.zip
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -403,9 +403,9 @@ class Container(object):  # {{{
            data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
        return etree.fromstring(data, parser=RECOVER_PARSER)

-    def parse_xhtml(self, data, fname='<string>'):
+    def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
        if self.tweak_mode:
-            return parse_html_tweak(data, log=self.log, decoder=self.decode)
+            return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
        else:
            try:
                return parse_html(
--- a/src/calibre/ebooks/oeb/polish/errors.py
+++ b/src/calibre/ebooks/oeb/polish/errors.py
@ -16,3 +16,5 @@ class DRMError(_DRMError):
    def __init__(self):
        super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))

+class MalformedMarkup(ValueError):
+    ''' Raised by node_from_loc() when the per-level element counts supplied
+    by the caller do not match the tree being walked. '''
+    pass
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
        raw = prefix + suffix
    return raw

-def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
+def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
        break

    raw = strip_encoding_declarations(raw)
+    if force_html5_parse:
+        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
--- a/src/calibre/ebooks/oeb/polish/preview.coffee
+++ b/src/calibre/ebooks/oeb/polish/preview.coffee
@ -17,12 +17,6 @@ is_hidden = (elem) ->
        elem = elem.parentNode
    return false

-previous_sibling = (node) ->
-    node = node.previousSibling
-    while node and node.nodeType != Node.ELEMENT_NODE
-        node = node.previousSibling
-    return node
-
 is_block = (elem) ->
    style = window.getComputedStyle(elem)
    return style.display in ['block', 'flex-box', 'box']
@ -88,17 +82,20 @@ class PreviewIntegration

    report_split: (node) =>
+        # Starting at the element returned by find_containing_block(node),
+        # climb towards <body>. For every level record the index of the
+        # current element among its element siblings (loc) and the number of
+        # element children of its parent (totals). Both lists are reversed so
+        # that they run from the child of <body> downwards, and are handed to
+        # window.py_bridge.request_split() as JSON strings.
        loc = []
+        totals = []
        parent = find_containing_block(node)
        while parent and parent.tagName.toLowerCase() != 'body'
+            totals.push(parent.parentNode.children.length)
            num = 0
-            sibling = previous_sibling(parent)
+            sibling = parent.previousElementSibling
            while sibling
                num += 1
-                sibling = previous_sibling(sibling)
+                sibling = sibling.previousElementSibling
            loc.push(num)
            parent = parent.parentNode
        loc.reverse()
-        window.py_bridge.request_split(JSON.stringify(loc))
+        totals.reverse()
+        window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))

    onload: () =>
        window.document.body.addEventListener('click', this.onclick, true)
--- a/src/calibre/ebooks/oeb/polish/split.py
+++ b/src/calibre/ebooks/oeb/polish/split.py
@ -11,6 +11,7 @@ from future_builtins import map
 from urlparse import urlparse

 from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
+from calibre.ebooks.oeb.polish.errors import MalformedMarkup
 from calibre.ebooks.oeb.polish.toc import node_from_loc
 from calibre.ebooks.oeb.polish.replace import LinkRebaser

@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
            self.replaced = True
        return url

-def split(container, name, loc_or_xpath, before=True):
+def split(container, name, loc_or_xpath, before=True, totals=None):
    ''' Split the file specified by name at the position specified by loc_or_xpath. '''

    root = container.parsed(name)
    if isinstance(loc_or_xpath, type('')):
        split_point = root.xpath(loc_or_xpath)[0]
    else:
-        split_point = node_from_loc(root, loc_or_xpath)
+        try:
+            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+        except MalformedMarkup:
+            # The webkit HTML parser and the container parser have yielded
+            # different node counts, this can happen if the file is valid XML
+            # but contains constructs like nested <p> tags. So force parse it
+            # with the HTML 5 parser and try again.
+            raw = container.raw_data(name)
+            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
+            try:
+                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+            except MalformedMarkup:
+                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
+                                        ' before splitting') % name)
+            container.replace(name, root)
    if in_table(split_point):
        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'

 import re
 from urlparse import urlparse
-from collections import deque, Counter, OrderedDict
+from collections import Counter, OrderedDict
 from functools import partial
 from operator import itemgetter

@ -18,6 +18,7 @@ from lxml.builder import ElementMaker

 from calibre import __version__
 from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
+from calibre.ebooks.oeb.polish.errors import MalformedMarkup
 from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
 from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
@ -349,14 +350,13 @@ def from_files(container):
        toc.add(text, name)
    return toc

-def node_from_loc(root, loc):
-    body = root.xpath('//*[local-name()="body"]')[0]
-    locs = deque(loc)
-    node = body
-    while locs:
+def node_from_loc(root, locs, totals=None):
+    ''' Return the descendant of <body> addressed by locs.
+
+    locs is a sequence of child indices, one per level, counting only
+    element children and starting with the children of <body>. If totals is
+    not None, totals[i] must equal the number of element children found at
+    level i, otherwise MalformedMarkup is raised. '''
+    node = root.xpath('//*[local-name()="body"]')[0]
+    for i, loc in enumerate(locs):
        children = tuple(node.iterchildren(etree.Element))
+        if totals is not None and totals[i] != len(children):
+            raise MalformedMarkup()
-        node = children[locs[0]]
+        # Use the index for this level. locs is no longer consumed with
+        # popleft(), so locs[0] would pick the first index at every level.
+        node = children[loc]
-        locs.popleft()
    return node

 def add_id(container, name, loc):
--- a/src/calibre/gui2/tweak_book/boss.py
+++ b/src/calibre/gui2/tweak_book/boss.py
@ -902,10 +902,10 @@ class Boss(QObject):
        self.gui.preview.do_start_split()

    @in_thread_job
-    def split_requested(self, name, loc):
+    def split_requested(self, name, loc, totals):
        self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
        try:
-            bottom_name = split(current_container(), name, loc)
+            bottom_name = split(current_container(), name, loc, totals=totals)
        except AbortError:
            self.rewind_savepoint()
            raise
--- a/src/calibre/gui2/tweak_book/preview.py
+++ b/src/calibre/gui2/tweak_book/preview.py
@ -281,7 +281,7 @@ def find_le(a, x):
 class WebPage(QWebPage):

    sync_requested = pyqtSignal(object, object, object)
-    split_requested = pyqtSignal(object)
+    split_requested = pyqtSignal(object, object)

    def __init__(self, parent):
        QWebPage.__init__(self, parent)
@ -330,14 +330,14 @@ class WebPage(QWebPage):
        self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
            json.dumps(anchor), json.dumps(str(lnum))))

-    @pyqtSlot(str)
-    def request_split(self, loc):
+    @pyqtSlot(str, str)
+    def request_split(self, loc, totals):
+        # loc and totals arrive as JSON encoded strings (presumably from
+        # report_split() in preview.coffee -- confirm). Decode both, show an
+        # error if either list is empty, otherwise forward them through the
+        # split_requested signal.
        actions['split-in-preview'].setChecked(False)
-        loc = json.loads(unicode(loc))
-        if not loc:
+        loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
+        if not loc or not totals:
            return error_dialog(self.view(), _('Invalid location'),
                                _('Cannot split on the body tag'), show=True)
-        self.split_requested.emit(loc)
+        self.split_requested.emit(loc, totals)

    @property
    def line_numbers(self):
@ -423,7 +423,7 @@ class WebView(QWebView):
 class Preview(QWidget):

    sync_requested = pyqtSignal(object, object)
-    split_requested = pyqtSignal(object, object)
+    split_requested = pyqtSignal(object, object, object)
    split_start_requested = pyqtSignal()
    link_clicked = pyqtSignal(object, object)

@ -508,9 +508,9 @@ class Preview(QWidget):
                    return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
            self.sync_requested.emit(self.current_name, lnum)

-    def request_split(self, loc):
+    def request_split(self, loc, totals):
+        # Re-emit the request with self.current_name prepended; nothing is
+        # emitted when current_name is empty.
        if self.current_name:
-            self.split_requested.emit(self.current_name, loc)
+            self.split_requested.emit(self.current_name, loc, totals)

    def sync_to_editor(self, name, lnum):
        self.current_sync_request = (name, lnum)