Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p> tags

This commit is contained in:
Kovid Goyal 2014-02-18 14:25:30 +05:30
parent 06a40d9d2b
commit 371aa6ef77
9 changed files with 48 additions and 32 deletions

Binary file not shown.

View File

@ -403,9 +403,9 @@ class Container(object): # {{{
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True) data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
return etree.fromstring(data, parser=RECOVER_PARSER) return etree.fromstring(data, parser=RECOVER_PARSER)
def parse_xhtml(self, data, fname='<string>'): def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
if self.tweak_mode: if self.tweak_mode:
return parse_html_tweak(data, log=self.log, decoder=self.decode) return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
else: else:
try: try:
return parse_html( return parse_html(

View File

@ -16,3 +16,5 @@ class DRMError(_DRMError):
def __init__(self): def __init__(self):
super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.')) super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
class MalformedMarkup(ValueError):
    '''
    Raised when a document's markup cannot be addressed reliably.

    It is raised when the number of element children found at some level of
    the parsed tree differs from the count the caller expected, so a
    positional location inside the document cannot be trusted.
    '''

View File

@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
raw = prefix + suffix raw = prefix + suffix
return raw return raw
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True): def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
if isinstance(raw, bytes): if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities: if replace_entities:
@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
break break
raw = strip_encoding_declarations(raw) raw = strip_encoding_declarations(raw)
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try: try:
parser = XMLParser(no_network=True) parser = XMLParser(no_network=True)
ans = fromstring(raw, parser=parser) ans = fromstring(raw, parser=parser)

View File

@ -17,12 +17,6 @@ is_hidden = (elem) ->
elem = elem.parentNode elem = elem.parentNode
return false return false
previous_sibling = (node) ->
node = node.previousSibling
while node and node.nodeType != Node.ELEMENT_NODE
node = node.previousSibling
return node
# Return true when the computed CSS display value of elem is one of the
# block-like values 'block', 'flex-box' or 'box'.
is_block = (elem) ->
    style = window.getComputedStyle(elem)
    return style.display in ['block', 'flex-box', 'box']
@ -88,17 +82,20 @@ class PreviewIntegration
report_split: (node) =>
    # Describe the block containing node as a path of element indices,
    # walking upwards until the <body> tag is reached, and hand it to the
    # Python side as JSON.
    #   loc    - for each level, the number of element siblings that precede
    #            the ancestor at that level (i.e. its index among elements)
    #   totals - for each level, the total number of element children of the
    #            ancestor's parent, so the receiver can check that its own
    #            parse of the document has the same shape
    # Both lists are built innermost-first and reversed so that they are
    # ordered from <body> downwards.
    loc = []
    totals = []
    parent = find_containing_block(node)
    while parent and parent.tagName.toLowerCase() != 'body'
        totals.push(parent.parentNode.children.length)
        num = 0
        sibling = parent.previousElementSibling
        while sibling
            num += 1
            sibling = sibling.previousElementSibling
        loc.push(num)
        parent = parent.parentNode
    loc.reverse()
    totals.reverse()
    window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))
onload: () => onload: () =>
window.document.body.addEventListener('click', this.onclick, true) window.document.body.addEventListener('click', this.onclick, true)

View File

@ -11,6 +11,7 @@ from future_builtins import map
from urlparse import urlparse from urlparse import urlparse
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.toc import node_from_loc from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser from calibre.ebooks.oeb.polish.replace import LinkRebaser
@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
self.replaced = True self.replaced = True
return url return url
def split(container, name, loc_or_xpath, before=True): def split(container, name, loc_or_xpath, before=True, totals=None):
''' Split the file specified by name at the position specified by loc_or_xpath. ''' ''' Split the file specified by name at the position specified by loc_or_xpath. '''
root = container.parsed(name) root = container.parsed(name)
if isinstance(loc_or_xpath, type('')): if isinstance(loc_or_xpath, type('')):
split_point = root.xpath(loc_or_xpath)[0] split_point = root.xpath(loc_or_xpath)[0]
else: else:
split_point = node_from_loc(root, loc_or_xpath) try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
# The webkit HTML parser and the container parser have yielded
# different node counts, this can happen if the file is valid XML
# but contains constructs like nested <p> tags. So force parse it
# with the HTML 5 parser and try again.
raw = container.raw_data(name)
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
' before splitting') % name)
container.replace(name, root)
if in_table(split_point): if in_table(split_point):
raise AbortError('Cannot split inside tables') raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'): if split_point.tag.endswith('}body'):

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
import re import re
from urlparse import urlparse from urlparse import urlparse
from collections import deque, Counter, OrderedDict from collections import Counter, OrderedDict
from functools import partial from functools import partial
from operator import itemgetter from operator import itemgetter
@ -18,6 +18,7 @@ from lxml.builder import ElementMaker
from calibre import __version__ from calibre import __version__
from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.utils import guess_type from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1 from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
@ -349,14 +350,13 @@ def from_files(container):
toc.add(text, name) toc.add(text, name)
return toc return toc
def node_from_loc(root, locs, totals=None):
    '''
    Return the element below <body> that is addressed by locs.

    :param root: Parsed document; the first element whose local name is
        ``body`` is used as the starting node.
    :param locs: Sequence of integers. ``locs[i]`` is the index, among the
        element children at depth ``i`` below <body>, of the child to descend
        into. An empty sequence returns the <body> element itself.
    :param totals: Optional sequence parallel to ``locs``. When given,
        ``totals[i]`` must equal the number of element children found at
        depth ``i``.
    :raises MalformedMarkup: If ``totals`` is given and a child count does not
        match, meaning the tree differs from the one locs was computed on.
    :raises IndexError: If an index in ``locs`` is out of range for the
        children found at that level.
    '''
    node = root.xpath('//*[local-name()="body"]')[0]
    for i, loc in enumerate(locs):
        children = tuple(node.iterchildren(etree.Element))
        if totals is not None and totals[i] != len(children):
            raise MalformedMarkup()
        # Descend using the index for *this* level. Indexing with locs[0]
        # is only correct when consumed entries are popped from the front
        # of locs; with enumerate() it would reuse the first index at
        # every depth and land on the wrong node.
        node = children[loc]
    return node
def add_id(container, name, loc): def add_id(container, name, loc):

View File

@ -902,10 +902,10 @@ class Boss(QObject):
self.gui.preview.do_start_split() self.gui.preview.do_start_split()
@in_thread_job @in_thread_job
def split_requested(self, name, loc): def split_requested(self, name, loc, totals):
self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name)) self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
try: try:
bottom_name = split(current_container(), name, loc) bottom_name = split(current_container(), name, loc, totals=totals)
except AbortError: except AbortError:
self.rewind_savepoint() self.rewind_savepoint()
raise raise

View File

@ -281,7 +281,7 @@ def find_le(a, x):
class WebPage(QWebPage): class WebPage(QWebPage):
sync_requested = pyqtSignal(object, object, object) sync_requested = pyqtSignal(object, object, object)
split_requested = pyqtSignal(object) split_requested = pyqtSignal(object, object)
def __init__(self, parent): def __init__(self, parent):
QWebPage.__init__(self, parent) QWebPage.__init__(self, parent)
@ -330,14 +330,14 @@ class WebPage(QWebPage):
self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % ( self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
json.dumps(anchor), json.dumps(str(lnum)))) json.dumps(anchor), json.dumps(str(lnum))))
@pyqtSlot(str, str)
def request_split(self, loc, totals):
    '''
    Slot taking a split request as two JSON encoded strings.

    :param loc: JSON list giving the location of the split point.
    :param totals: JSON list sent alongside ``loc``.

    Unchecks the ``split-in-preview`` action, decodes both arguments and
    emits ``split_requested`` with the decoded values. If either decoded
    list is empty an error dialog is shown instead and nothing is emitted.
    '''
    actions['split-in-preview'].setChecked(False)
    loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
    if not loc or not totals:
        return error_dialog(self.view(), _('Invalid location'),
            _('Cannot split on the body tag'), show=True)
    self.split_requested.emit(loc, totals)
@property @property
def line_numbers(self): def line_numbers(self):
@ -423,7 +423,7 @@ class WebView(QWebView):
class Preview(QWidget): class Preview(QWidget):
sync_requested = pyqtSignal(object, object) sync_requested = pyqtSignal(object, object)
split_requested = pyqtSignal(object, object) split_requested = pyqtSignal(object, object, object)
split_start_requested = pyqtSignal() split_start_requested = pyqtSignal()
link_clicked = pyqtSignal(object, object) link_clicked = pyqtSignal(object, object)
@ -508,9 +508,9 @@ class Preview(QWidget):
return self.link_clicked.emit(name, urlparse(href).fragment or TOP) return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
self.sync_requested.emit(self.current_name, lnum) self.sync_requested.emit(self.current_name, lnum)
def request_split(self, loc, totals):
    '''
    Forward a split request for the currently displayed file.

    Emits ``split_requested`` with ``(self.current_name, loc, totals)``.
    Does nothing when ``self.current_name`` is falsy.
    '''
    if not self.current_name:
        return
    self.split_requested.emit(self.current_name, loc, totals)
def sync_to_editor(self, name, lnum): def sync_to_editor(self, name, lnum):
self.current_sync_request = (name, lnum) self.current_sync_request = (name, lnum)