From 371aa6ef7795ae22d4b096a75f7c87ed082083ef Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 18 Feb 2014 14:25:30 +0530
Subject: [PATCH] Edit book: Fix splitting of HTML file occurring at the wrong
 location if the HTML contains invalid constructs like nested <p> tags

---
 resources/compiled_coffeescript.zip          | Bin 79831 -> 79761 bytes
 src/calibre/ebooks/oeb/polish/container.py   |   4 ++--
 src/calibre/ebooks/oeb/polish/errors.py      |   2 ++
 src/calibre/ebooks/oeb/polish/parsing.py     |   4 +++-
 src/calibre/ebooks/oeb/polish/preview.coffee |  15 ++++++---------
 src/calibre/ebooks/oeb/polish/split.py       |  19 +++++++++++++++++--
 src/calibre/ebooks/oeb/polish/toc.py         |  14 +++++++-------
 src/calibre/gui2/tweak_book/boss.py          |   4 ++--
 src/calibre/gui2/tweak_book/preview.py       |  18 +++++++++---------
 9 files changed, 48 insertions(+), 32 deletions(-)
diff --git a/resources/compiled_coffeescript.zip b/resources/compiled_coffeescript.zip
index f1eecfc58dd9bc4cb595ae6fc2d7bd39fa6887d4..257baf3a67e503fde840513b0d7f19cb7c6c4f92 100644
GIT binary patch
delta 331
zcmccqo@L^DmWC~ip;Cf#a)VsBbC3Sx6K7xmVVUWHOpKz_1I!t@rngEl9+@t{%y@sg
zl?>yX?ayTyZ}4#FD3s)vB<2)P->=0e!3JfY(qgRDhp_buN{cfz3KEM_^Gfs}lwW>I
zs$OzNW=;xFP%kGnFTEr~(|Wp{HlwNxR9#VOS!RA|v1?9hE>K@^W>QXOUi$PZZAKTd
zVhBgC07cE@iz+hHUu!dl!JMG4!{{xpqu>?n@26K>QUuhVnO3O*antrn9Y!7wUKGDh
zpJc-5$UfPSk>MXx^>jfqMv?9BOc)E8G>k3N3=ET#ERxbpP0cLLlPygQjMB_3jSLKv
XjEsy^O-v1p%+r!gji$GmGX??xU~X=C

delta 419
zcmbREp5^*`mWC~ip;Cec2|g|tr(HG*5ocfkVVUWHOpKz_tECvv2`Cg4rIuypmlnqt
zXC~!j=1u2gW_&nZLx%B<0GdqtWP>z``n>#<R0UfF5TyrG5DeCtZq20t1mzi-IjIU7
zV0CJ05QQL4NM%8)f}*Xhf*(k$tB<R%t6xaGpTCQ%rb0DXrvg|pUc+no!77VVOG=CK
zK+3ms$}&FY;f7ePppcWFJY7hOv3B}#Ek+q}RG+156eJd<=9Oqp|F6a9A^~$e)ZghE
z5dWu7UU*V;dZ{*J*kqOxiRr(!85Nl{t*6&(GHPv4&|$pKK0U#hk%bc;oYQ?w7#$gJ
zZtpW;>|)YLNl7v`PBk(yF)>TFNK8pdGcz$tO)@t~O|djIv$QlwF;6ovH?}aJ?qJRs
F2mlx0hpqqs

diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 9a58f0ef6b..5910b85a0b 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -403,9 +403,9 @@ class Container(object):  # {{{
             data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
         return etree.fromstring(data, parser=RECOVER_PARSER)
 
-    def parse_xhtml(self, data, fname='<string>'):
+    def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
         if self.tweak_mode:
-            return parse_html_tweak(data, log=self.log, decoder=self.decode)
+            return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
         else:
             try:
                 return parse_html(
diff --git a/src/calibre/ebooks/oeb/polish/errors.py b/src/calibre/ebooks/oeb/polish/errors.py
index 0aa0f4993c..6165524075 100644
--- a/src/calibre/ebooks/oeb/polish/errors.py
+++ b/src/calibre/ebooks/oeb/polish/errors.py
@@ -16,3 +16,5 @@ class DRMError(_DRMError):
     def __init__(self):
         super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
 
+class MalformedMarkup(ValueError):
+    pass
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index 602be85929..2dcf1fb1dc 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
         raw = prefix + suffix
     return raw
 
-def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
+def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
     if replace_entities:
@@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
         break
 
     raw = strip_encoding_declarations(raw)
+    if force_html5_parse:
+        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
     try:
         parser = XMLParser(no_network=True)
         ans = fromstring(raw, parser=parser)
diff --git a/src/calibre/ebooks/oeb/polish/preview.coffee b/src/calibre/ebooks/oeb/polish/preview.coffee
index d0f155e079..c4abe96db9 100644
--- a/src/calibre/ebooks/oeb/polish/preview.coffee
+++ b/src/calibre/ebooks/oeb/polish/preview.coffee
@@ -17,12 +17,6 @@ is_hidden = (elem) ->
         elem = elem.parentNode
     return false
 
-previous_sibling = (node) ->
-    node = node.previousSibling
-    while node and node.nodeType != Node.ELEMENT_NODE
-        node = node.previousSibling
-    return node
-
 is_block = (elem) ->
     style = window.getComputedStyle(elem)
     return style.display in ['block', 'flex-box', 'box']
@@ -88,17 +82,20 @@ class PreviewIntegration
 
     report_split: (node) =>
         loc = []
+        totals = []
         parent = find_containing_block(node)
         while parent and parent.tagName.toLowerCase() != 'body'
+            totals.push(parent.parentNode.children.length)
             num = 0
-            sibling = previous_sibling(parent)
+            sibling = parent.previousElementSibling
             while sibling
                 num += 1
-                sibling = previous_sibling(sibling)
+                sibling = sibling.previousElementSibling
             loc.push(num)
             parent = parent.parentNode
         loc.reverse()
-        window.py_bridge.request_split(JSON.stringify(loc))
+        totals.reverse()
+        window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))
 
     onload: () =>
         window.document.body.addEventListener('click', this.onclick, true)
diff --git a/src/calibre/ebooks/oeb/polish/split.py b/src/calibre/ebooks/oeb/polish/split.py
index a6d4124498..0c7effabb4 100644
--- a/src/calibre/ebooks/oeb/polish/split.py
+++ b/src/calibre/ebooks/oeb/polish/split.py
@@ -11,6 +11,7 @@ from future_builtins import map
 from urlparse import urlparse
 
 from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
+from calibre.ebooks.oeb.polish.errors import MalformedMarkup
 from calibre.ebooks.oeb.polish.toc import node_from_loc
 from calibre.ebooks.oeb.polish.replace import LinkRebaser
 
@@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
             self.replaced = True
         return url
 
-def split(container, name, loc_or_xpath, before=True):
+def split(container, name, loc_or_xpath, before=True, totals=None):
     ''' Split the file specified by name at the position specified by loc_or_xpath. '''
 
     root = container.parsed(name)
     if isinstance(loc_or_xpath, type('')):
         split_point = root.xpath(loc_or_xpath)[0]
     else:
-        split_point = node_from_loc(root, loc_or_xpath)
+        try:
+            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+        except MalformedMarkup:
+            # The webkit HTML parser and the container parser have yielded
+            # different node counts, this can happen if the file is valid XML
+            # but contains constructs like nested <p> tags. So force parse it
+            # with the HTML 5 parser and try again.
+            raw = container.raw_data(name)
+            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
+            try:
+                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
+            except MalformedMarkup:
+                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
+                                        ' before splitting') % name)
+            container.replace(name, root)
     if in_table(split_point):
         raise AbortError('Cannot split inside tables')
     if split_point.tag.endswith('}body'):
diff --git a/src/calibre/ebooks/oeb/polish/toc.py b/src/calibre/ebooks/oeb/polish/toc.py
index 72ac577a66..7a1b94b2ef 100644
--- a/src/calibre/ebooks/oeb/polish/toc.py
+++ b/src/calibre/ebooks/oeb/polish/toc.py
@@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from urlparse import urlparse
-from collections import deque, Counter, OrderedDict
+from collections import Counter, OrderedDict
 from functools import partial
 from operator import itemgetter
 
@@ -18,6 +18,7 @@ from lxml.builder import ElementMaker
 
 from calibre import __version__
 from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
+from calibre.ebooks.oeb.polish.errors import MalformedMarkup
 from calibre.ebooks.oeb.polish.utils import guess_type
 from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
 from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
@@ -349,14 +350,13 @@ def from_files(container):
         toc.add(text, name)
     return toc
 
-def node_from_loc(root, loc):
-    body = root.xpath('//*[local-name()="body"]')[0]
-    locs = deque(loc)
-    node = body
-    while locs:
+def node_from_loc(root, locs, totals=None):
+    node = root.xpath('//*[local-name()="body"]')[0]
+    for i, loc in enumerate(locs):
         children = tuple(node.iterchildren(etree.Element))
+        if totals is not None and totals[i] != len(children):
+            raise MalformedMarkup()
-        node = children[locs[0]]
-        locs.popleft()
+        node = children[loc]
     return node
 
 def add_id(container, name, loc):
diff --git a/src/calibre/gui2/tweak_book/boss.py b/src/calibre/gui2/tweak_book/boss.py
index 7325d19aec..28a091158a 100644
--- a/src/calibre/gui2/tweak_book/boss.py
+++ b/src/calibre/gui2/tweak_book/boss.py
@@ -902,10 +902,10 @@ class Boss(QObject):
         self.gui.preview.do_start_split()
 
     @in_thread_job
-    def split_requested(self, name, loc):
+    def split_requested(self, name, loc, totals):
         self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
         try:
-            bottom_name = split(current_container(), name, loc)
+            bottom_name = split(current_container(), name, loc, totals=totals)
         except AbortError:
             self.rewind_savepoint()
             raise
diff --git a/src/calibre/gui2/tweak_book/preview.py b/src/calibre/gui2/tweak_book/preview.py
index e2632aa69a..7c12c35292 100644
--- a/src/calibre/gui2/tweak_book/preview.py
+++ b/src/calibre/gui2/tweak_book/preview.py
@@ -281,7 +281,7 @@ def find_le(a, x):
 class WebPage(QWebPage):
 
     sync_requested = pyqtSignal(object, object, object)
-    split_requested = pyqtSignal(object)
+    split_requested = pyqtSignal(object, object)
 
     def __init__(self, parent):
         QWebPage.__init__(self, parent)
@@ -330,14 +330,14 @@ class WebPage(QWebPage):
         self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
             json.dumps(anchor), json.dumps(str(lnum))))
 
-    @pyqtSlot(str)
-    def request_split(self, loc):
+    @pyqtSlot(str, str)
+    def request_split(self, loc, totals):
         actions['split-in-preview'].setChecked(False)
-        loc = json.loads(unicode(loc))
-        if not loc:
+        loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
+        if not loc or not totals:
             return error_dialog(self.view(), _('Invalid location'),
                                 _('Cannot split on the body tag'), show=True)
-        self.split_requested.emit(loc)
+        self.split_requested.emit(loc, totals)
 
     @property
     def line_numbers(self):
@@ -423,7 +423,7 @@ class WebView(QWebView):
 class Preview(QWidget):
 
     sync_requested = pyqtSignal(object, object)
-    split_requested = pyqtSignal(object, object)
+    split_requested = pyqtSignal(object, object, object)
     split_start_requested = pyqtSignal()
     link_clicked = pyqtSignal(object, object)
 
@@ -508,9 +508,9 @@ class Preview(QWidget):
                     return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
             self.sync_requested.emit(self.current_name, lnum)
 
-    def request_split(self, loc):
+    def request_split(self, loc, totals):
         if self.current_name:
-            self.split_requested.emit(self.current_name, loc)
+            self.split_requested.emit(self.current_name, loc, totals)
 
     def sync_to_editor(self, name, lnum):
         self.current_sync_request = (name, lnum)