Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p> tags

This commit is contained in:
Kovid Goyal 2014-02-18 14:25:30 +05:30
parent 06a40d9d2b
commit 371aa6ef77
9 changed files with 48 additions and 32 deletions

Binary file not shown.

View File

@ -403,9 +403,9 @@ class Container(object): # {{{
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
return etree.fromstring(data, parser=RECOVER_PARSER)
def parse_xhtml(self, data, fname='<string>'):
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
if self.tweak_mode:
return parse_html_tweak(data, log=self.log, decoder=self.decode)
return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
else:
try:
return parse_html(

View File

@ -16,3 +16,5 @@ class DRMError(_DRMError):
def __init__(self):
    # Build the translated, user-facing message first, then hand it to the
    # parent exception class so it becomes the exception's message.
    message = _('This file is locked with DRM. It cannot be edited.')
    super(DRMError, self).__init__(message)
class MalformedMarkup(ValueError):
    '''
    Raised when a location computed against one parse of a document cannot
    be matched against another parse of the same document, for example
    because the number of child elements at some level differs.
    '''

View File

@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
raw = prefix + suffix
return raw
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities:
@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
break
raw = strip_encoding_declarations(raw)
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try:
parser = XMLParser(no_network=True)
ans = fromstring(raw, parser=parser)

View File

@ -17,12 +17,6 @@ is_hidden = (elem) ->
elem = elem.parentNode
return false
# Return the closest preceding sibling of node that is an element node, or a
# falsy value when there is none. Non-element siblings are stepped over.
previous_sibling = (node) ->
    candidate = node.previousSibling
    until not candidate or candidate.nodeType == Node.ELEMENT_NODE
        candidate = candidate.previousSibling
    return candidate
# True when the computed CSS display of elem is one of the block-like values
# listed below.
is_block = (elem) ->
    display = window.getComputedStyle(elem).display
    return display in ['block', 'flex-box', 'box']
@ -88,17 +82,20 @@ class PreviewIntegration
report_split: (node) =>
    # Report a split location to the Python side. Starting from the element
    # returned by find_containing_block(node), walk up to (but not including)
    # <body>, recording at each level:
    #   loc    - the zero-based index of the element among its element siblings
    #   totals - how many element children its parent has
    # Both lists are reversed so they read from <body> downwards, then sent
    # JSON-encoded. The totals allow the receiver to check that its own parse
    # of the document has the same number of elements at every level.
    loc = []
    totals = []
    parent = find_containing_block(node)
    while parent and parent.tagName.toLowerCase() != 'body'
        totals.push(parent.parentNode.children.length)
        num = 0
        # previousElementSibling already skips text and comment nodes, so each
        # step back counts exactly one element.
        sibling = parent.previousElementSibling
        while sibling
            num += 1
            sibling = sibling.previousElementSibling
        loc.push(num)
        parent = parent.parentNode
    loc.reverse()
    totals.reverse()
    window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))
onload: () =>
window.document.body.addEventListener('click', this.onclick, true)

View File

@ -11,6 +11,7 @@ from future_builtins import map
from urlparse import urlparse
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
self.replaced = True
return url
def split(container, name, loc_or_xpath, before=True):
def split(container, name, loc_or_xpath, before=True, totals=None):
''' Split the file specified by name at the position specified by loc_or_xpath. '''
root = container.parsed(name)
if isinstance(loc_or_xpath, type('')):
split_point = root.xpath(loc_or_xpath)[0]
else:
split_point = node_from_loc(root, loc_or_xpath)
try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
# The webkit HTML parser and the container parser have yielded
# different node counts, this can happen if the file is valid XML
# but contains constructs like nested <p> tags. So force parse it
# with the HTML 5 parser and try again.
raw = container.raw_data(name)
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
try:
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
except MalformedMarkup:
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
' before splitting') % name)
container.replace(name, root)
if in_table(split_point):
raise AbortError('Cannot split inside tables')
if split_point.tag.endswith('}body'):

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
import re
from urlparse import urlparse
from collections import deque, Counter, OrderedDict
from collections import Counter, OrderedDict
from functools import partial
from operator import itemgetter
@ -18,6 +18,7 @@ from lxml.builder import ElementMaker
from calibre import __version__
from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
@ -349,14 +350,13 @@ def from_files(container):
toc.add(text, name)
return toc
def node_from_loc(root, locs, totals=None):
    '''
    Return the element below <body> that is addressed by ``locs``.

    :param root: Parsed document tree. The first element whose local name is
        ``body`` is used as the starting node.
    :param locs: Sequence of zero-based indices, one per nesting level. Each
        index selects one of the element children of the node reached so far.
        An empty sequence returns the body element itself.
    :param totals: Optional sequence parallel to ``locs``. ``totals[i]`` is the
        number of element children the caller expects at level ``i``.
    :raises MalformedMarkup: If ``totals`` is given and the number of element
        children at some level differs from the expected count.
    :raises IndexError: If there is no body element or an index in ``locs``
        is out of range for the children at that level.
    '''
    node = root.xpath('//*[local-name()="body"]')[0]
    for i, loc in enumerate(locs):
        # Filtering on etree.Element restricts the children to real elements,
        # so the indices line up with an element-only child count.
        children = tuple(node.iterchildren(etree.Element))
        if totals is not None and totals[i] != len(children):
            raise MalformedMarkup()
        # Descend using the index for *this* level. Indexing with locs[0]
        # would reuse the first index at every depth.
        node = children[loc]
    return node
def add_id(container, name, loc):

View File

@ -902,10 +902,10 @@ class Boss(QObject):
self.gui.preview.do_start_split()
@in_thread_job
def split_requested(self, name, loc):
def split_requested(self, name, loc, totals):
self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
try:
bottom_name = split(current_container(), name, loc)
bottom_name = split(current_container(), name, loc, totals=totals)
except AbortError:
self.rewind_savepoint()
raise

View File

@ -281,7 +281,7 @@ def find_le(a, x):
class WebPage(QWebPage):
sync_requested = pyqtSignal(object, object, object)
split_requested = pyqtSignal(object)
split_requested = pyqtSignal(object, object)
def __init__(self, parent):
QWebPage.__init__(self, parent)
@ -330,14 +330,14 @@ class WebPage(QWebPage):
self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
json.dumps(anchor), json.dumps(str(lnum))))
@pyqtSlot(str, str)
def request_split(self, loc, totals):
    '''
    Slot that receives a split request as two JSON-encoded strings and
    re-emits it as decoded Python objects via ``split_requested``.

    :param loc: JSON text for the list of child indices describing the
        split location.
    :param totals: JSON text for the list of per-level child counts that
        accompanies ``loc``.

    If either decoded list is empty, an error dialog is shown and nothing is
    emitted. NOTE(review): presumably called from the preview page's script
    through ``window.py_bridge`` -- confirm where the bridge object is set.
    '''
    # The split-in-preview action is a one-shot mode: uncheck it as soon as a
    # request arrives, whether or not the request turns out to be usable.
    actions['split-in-preview'].setChecked(False)
    loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
    if not loc or not totals:
        return error_dialog(self.view(), _('Invalid location'),
            _('Cannot split on the body tag'), show=True)
    self.split_requested.emit(loc, totals)
@property
def line_numbers(self):
@ -423,7 +423,7 @@ class WebView(QWebView):
class Preview(QWidget):
sync_requested = pyqtSignal(object, object)
split_requested = pyqtSignal(object, object)
split_requested = pyqtSignal(object, object, object)
split_start_requested = pyqtSignal()
link_clicked = pyqtSignal(object, object)
@ -508,9 +508,9 @@ class Preview(QWidget):
return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
self.sync_requested.emit(self.current_name, lnum)
def request_split(self, loc, totals):
    '''
    Forward a split request together with the name of the file currently
    shown in the preview.

    :param loc: Split location, passed through unchanged.
    :param totals: Per-level child counts accompanying ``loc``, passed
        through unchanged.

    Nothing is emitted when ``self.current_name`` is falsy.
    '''
    if self.current_name:
        # split_requested carries three values: file name, location, totals.
        self.split_requested.emit(self.current_name, loc, totals)
def sync_to_editor(self, name, lnum):
self.current_sync_request = (name, lnum)