From 371aa6ef7795ae22d4b096a75f7c87ed082083ef Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 18 Feb 2014 14:25:30 +0530 Subject: [PATCH] Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p>
tags --- resources/compiled_coffeescript.zip | Bin 79831 -> 79761 bytes src/calibre/ebooks/oeb/polish/container.py | 4 ++-- src/calibre/ebooks/oeb/polish/errors.py | 2 ++ src/calibre/ebooks/oeb/polish/parsing.py | 4 +++- src/calibre/ebooks/oeb/polish/preview.coffee | 15 ++++++--------- src/calibre/ebooks/oeb/polish/split.py | 19 +++++++++++++++++-- src/calibre/ebooks/oeb/polish/toc.py | 14 +++++++------- src/calibre/gui2/tweak_book/boss.py | 4 ++-- src/calibre/gui2/tweak_book/preview.py | 18 +++++++++--------- 9 files changed, 48 insertions(+), 32 deletions(-) diff --git a/resources/compiled_coffeescript.zip b/resources/compiled_coffeescript.zip index f1eecfc58dd9bc4cb595ae6fc2d7bd39fa6887d4..257baf3a67e503fde840513b0d7f19cb7c6c4f92 100644 GIT binary patch delta 331 zcmccqo@L^DmWC~ip;Cf#a)VsBbC3Sx6K7xmVVUWHOpKz_1I!t@rngEl9+@t{%y@sg zl?>yX?ayTyZ}4#FD3s)vB<2)P->=0e!3JfY(qgRDhp_buN{cfz3KEM_^Gfs}lwW>I zs$OzNW=;xFP%kGnFTEr~(|Wp{HlwNxR9#VOS!RA|v1?9hE>K@^W>QXOUi$PZZAKTd zVhBgC07cE@iz+hHUu!dl!JMG4!{{xpqu>?n@26K>QUuhVnO3O*antrn9Y!7wUKGDh zpJc-5$UfPSk>MXx^>jfqMv?9BOc)E8G>k3N3=ET#ERxbpP0cLLlPygQjMB_3jSLKv XjEsy^O-v1p%+r!gji$GmGX??xU~X=C delta 419 zcmbREp5^*`mWC~ip;Cec2|g|tr(HG*5ocfkVVUWHOpKz_tECvv2`Cg4rIuypmlnqt zXC~!j=1u2gW_&nZLx%B<0GdqtWP>z``n>#|)YLNl7v`PBk(yF)>TFNK8pdGcz$tO)@t~O|djIv$QlwF;6ovH?}aJ?qJRs F2mlx0hpqqs diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py index 9a58f0ef6b..5910b85a0b 100644 --- a/src/calibre/ebooks/oeb/polish/container.py +++ b/src/calibre/ebooks/oeb/polish/container.py @@ -403,9 +403,9 @@ class Container(object): # {{{ data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True) return etree.fromstring(data, parser=RECOVER_PARSER) - def parse_xhtml(self, data, fname=''): + def parse_xhtml(self, data, fname='', force_html5_parse=False): if self.tweak_mode: - return parse_html_tweak(data, log=self.log, decoder=self.decode) + return parse_html_tweak(data, log=self.log, 
decoder=self.decode, force_html5_parse=force_html5_parse) else: try: return parse_html( diff --git a/src/calibre/ebooks/oeb/polish/errors.py b/src/calibre/ebooks/oeb/polish/errors.py index 0aa0f4993c..6165524075 100644 --- a/src/calibre/ebooks/oeb/polish/errors.py +++ b/src/calibre/ebooks/oeb/polish/errors.py @@ -16,3 +16,5 @@ class DRMError(_DRMError): def __init__(self): super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.')) +class MalformedMarkup(ValueError): + pass diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index 602be85929..2dcf1fb1dc 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -636,7 +636,7 @@ def strip_encoding_declarations(raw): raw = prefix + suffix return raw -def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True): +def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: @@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N break raw = strip_encoding_declarations(raw) + if force_html5_parse: + return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) try: parser = XMLParser(no_network=True) ans = fromstring(raw, parser=parser) diff --git a/src/calibre/ebooks/oeb/polish/preview.coffee b/src/calibre/ebooks/oeb/polish/preview.coffee index d0f155e079..c4abe96db9 100644 --- a/src/calibre/ebooks/oeb/polish/preview.coffee +++ b/src/calibre/ebooks/oeb/polish/preview.coffee @@ -17,12 +17,6 @@ is_hidden = (elem) -> elem = elem.parentNode return false -previous_sibling = (node) -> - node = node.previousSibling - while node and node.nodeType != 
Node.ELEMENT_NODE - node = node.previousSibling - return node - is_block = (elem) -> style = window.getComputedStyle(elem) return style.display in ['block', 'flex-box', 'box'] @@ -88,17 +82,20 @@ class PreviewIntegration report_split: (node) => loc = [] + totals = [] parent = find_containing_block(node) while parent and parent.tagName.toLowerCase() != 'body' + totals.push(parent.parentNode.children.length) num = 0 - sibling = previous_sibling(parent) + sibling = parent.previousElementSibling while sibling num += 1 - sibling = previous_sibling(sibling) + sibling = sibling.previousElementSibling loc.push(num) parent = parent.parentNode loc.reverse() - window.py_bridge.request_split(JSON.stringify(loc)) + totals.reverse() + window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals)) onload: () => window.document.body.addEventListener('click', this.onclick, true) diff --git a/src/calibre/ebooks/oeb/polish/split.py b/src/calibre/ebooks/oeb/polish/split.py index a6d4124498..0c7effabb4 100644 --- a/src/calibre/ebooks/oeb/polish/split.py +++ b/src/calibre/ebooks/oeb/polish/split.py @@ -11,6 +11,7 @@ from future_builtins import map from urlparse import urlparse from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS +from calibre.ebooks.oeb.polish.errors import MalformedMarkup from calibre.ebooks.oeb.polish.toc import node_from_loc from calibre.ebooks.oeb.polish.replace import LinkRebaser @@ -162,14 +163,28 @@ class SplitLinkReplacer(object): self.replaced = True return url -def split(container, name, loc_or_xpath, before=True): +def split(container, name, loc_or_xpath, before=True, totals=None): ''' Split the file specified by name at the position specified by loc_or_xpath. 
''' root = container.parsed(name) if isinstance(loc_or_xpath, type('')): split_point = root.xpath(loc_or_xpath)[0] else: - split_point = node_from_loc(root, loc_or_xpath) + try: + split_point = node_from_loc(root, loc_or_xpath, totals=totals) + except MalformedMarkup: + # The webkit HTML parser and the container parser have yielded + # different node counts, this can happen if the file is valid XML + # but contains constructs like nested <p>
tags. So force parse it + # with the HTML 5 parser and try again. + raw = container.raw_data(name) + root = container.parse_xhtml(raw, fname=name, force_html5_parse=True) + try: + split_point = node_from_loc(root, loc_or_xpath, totals=totals) + except MalformedMarkup: + raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool' + ' before splitting') % name) + container.replace(name, root) if in_table(split_point): raise AbortError('Cannot split inside tables') if split_point.tag.endswith('}body'): diff --git a/src/calibre/ebooks/oeb/polish/toc.py b/src/calibre/ebooks/oeb/polish/toc.py index 72ac577a66..7a1b94b2ef 100644 --- a/src/calibre/ebooks/oeb/polish/toc.py +++ b/src/calibre/ebooks/oeb/polish/toc.py @@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en' import re from urlparse import urlparse -from collections import deque, Counter, OrderedDict +from collections import Counter, OrderedDict from functools import partial from operator import itemgetter @@ -18,6 +18,7 @@ from lxml.builder import ElementMaker from calibre import __version__ from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize +from calibre.ebooks.oeb.polish.errors import MalformedMarkup from calibre.ebooks.oeb.polish.utils import guess_type from calibre.ebooks.oeb.polish.pretty import pretty_html_tree from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1 @@ -349,14 +350,14 @@ def from_files(container): toc.add(text, name) return toc -def node_from_loc(root, loc): - body = root.xpath('//*[local-name()="body"]')[0] - locs = deque(loc) - node = body - while locs: +def node_from_loc(root, locs, totals=None): + ''' Return the descendant of the body element reached by following the child-element indices in locs, one index per level. If totals is given, totals[i] must equal the number of element children at level i, otherwise MalformedMarkup is raised. ''' + node = root.xpath('//*[local-name()="body"]')[0] + for i, loc in enumerate(locs): children = tuple(node.iterchildren(etree.Element)) + if totals is not None and totals[i] != len(children): + raise MalformedMarkup() - node = children[locs[0]] - locs.popleft() + node = children[loc] return node def 
add_id(container, name, loc): diff --git a/src/calibre/gui2/tweak_book/boss.py b/src/calibre/gui2/tweak_book/boss.py index 7325d19aec..28a091158a 100644 --- a/src/calibre/gui2/tweak_book/boss.py +++ b/src/calibre/gui2/tweak_book/boss.py @@ -902,10 +902,10 @@ class Boss(QObject): self.gui.preview.do_start_split() @in_thread_job - def split_requested(self, name, loc): + def split_requested(self, name, loc, totals): self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name)) try: - bottom_name = split(current_container(), name, loc) + bottom_name = split(current_container(), name, loc, totals=totals) except AbortError: self.rewind_savepoint() raise diff --git a/src/calibre/gui2/tweak_book/preview.py b/src/calibre/gui2/tweak_book/preview.py index e2632aa69a..7c12c35292 100644 --- a/src/calibre/gui2/tweak_book/preview.py +++ b/src/calibre/gui2/tweak_book/preview.py @@ -281,7 +281,7 @@ def find_le(a, x): class WebPage(QWebPage): sync_requested = pyqtSignal(object, object, object) - split_requested = pyqtSignal(object) + split_requested = pyqtSignal(object, object) def __init__(self, parent): QWebPage.__init__(self, parent) @@ -330,14 +330,14 @@ class WebPage(QWebPage): self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % ( json.dumps(anchor), json.dumps(str(lnum)))) - @pyqtSlot(str) - def request_split(self, loc): + @pyqtSlot(str, str) + def request_split(self, loc, totals): actions['split-in-preview'].setChecked(False) - loc = json.loads(unicode(loc)) - if not loc: + loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals)) + if not loc or not totals: return error_dialog(self.view(), _('Invalid location'), _('Cannot split on the body tag'), show=True) - self.split_requested.emit(loc) + self.split_requested.emit(loc, totals) @property def line_numbers(self): @@ -423,7 +423,7 @@ class WebView(QWebView): class Preview(QWidget): sync_requested = pyqtSignal(object, object) - split_requested = pyqtSignal(object, 
object) + split_requested = pyqtSignal(object, object, object) split_start_requested = pyqtSignal() link_clicked = pyqtSignal(object, object) @@ -508,9 +508,9 @@ class Preview(QWidget): return self.link_clicked.emit(name, urlparse(href).fragment or TOP) self.sync_requested.emit(self.current_name, lnum) - def request_split(self, loc): + def request_split(self, loc, totals): if self.current_name: - self.split_requested.emit(self.current_name, loc) + self.split_requested.emit(self.current_name, loc, totals) def sync_to_editor(self, name, lnum): self.current_sync_request = (name, lnum)