mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p> tags
This commit is contained in:
parent
06a40d9d2b
commit
371aa6ef77
Binary file not shown.
@ -403,9 +403,9 @@ class Container(object): # {{{
|
||||
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
|
||||
def parse_xhtml(self, data, fname='<string>'):
|
||||
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
||||
if self.tweak_mode:
|
||||
return parse_html_tweak(data, log=self.log, decoder=self.decode)
|
||||
return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
|
||||
else:
|
||||
try:
|
||||
return parse_html(
|
||||
|
@ -16,3 +16,5 @@ class DRMError(_DRMError):
|
||||
def __init__(self):
|
||||
super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
|
||||
|
||||
class MalformedMarkup(ValueError):
    '''
    Signals that a document's element structure is not what was expected.

    Raised by node_from_loc() when the number of element children found in
    the parsed tree differs from the count supplied by the caller. Being a
    ValueError subclass, it is also caught by handlers for ValueError.
    '''
|
||||
|
@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
|
||||
raw = prefix + suffix
|
||||
return raw
|
||||
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
if replace_entities:
|
||||
@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
||||
break
|
||||
|
||||
raw = strip_encoding_declarations(raw)
|
||||
if force_html5_parse:
|
||||
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||
try:
|
||||
parser = XMLParser(no_network=True)
|
||||
ans = fromstring(raw, parser=parser)
|
||||
|
@ -17,12 +17,6 @@ is_hidden = (elem) ->
|
||||
elem = elem.parentNode
|
||||
return false
|
||||
|
||||
previous_sibling = (node) ->
|
||||
node = node.previousSibling
|
||||
while node and node.nodeType != Node.ELEMENT_NODE
|
||||
node = node.previousSibling
|
||||
return node
|
||||
|
||||
is_block = (elem) ->
|
||||
style = window.getComputedStyle(elem)
|
||||
return style.display in ['block', 'flex-box', 'box']
|
||||
@ -88,17 +82,20 @@ class PreviewIntegration
|
||||
|
||||
report_split: (node) =>
    # Describe the position of the block containing `node` as a path of
    # element indices from <body> down to that block, and send it to the
    # Python side together with the number of element children seen at each
    # level, so that the receiver can compare its own child counts with
    # the ones seen here.
    loc = []
    totals = []
    parent = find_containing_block(node)
    while parent and parent.tagName.toLowerCase() != 'body'
        # Number of element siblings (including parent itself) at this level
        totals.push(parent.parentNode.children.length)
        # Index of parent among its element siblings == number of element
        # siblings that precede it
        num = 0
        sibling = parent.previousElementSibling
        while sibling
            num += 1
            sibling = sibling.previousElementSibling
        loc.push(num)
        parent = parent.parentNode
    # Both lists were built innermost-first; the receiver walks from <body>
    # downwards, so flip them
    loc.reverse()
    totals.reverse()
    window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))
|
||||
|
||||
onload: () =>
|
||||
window.document.body.addEventListener('click', this.onclick, true)
|
||||
|
@ -11,6 +11,7 @@ from future_builtins import map
|
||||
from urlparse import urlparse
|
||||
|
||||
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
|
||||
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||
from calibre.ebooks.oeb.polish.toc import node_from_loc
|
||||
from calibre.ebooks.oeb.polish.replace import LinkRebaser
|
||||
|
||||
@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
|
||||
self.replaced = True
|
||||
return url
|
||||
|
||||
def split(container, name, loc_or_xpath, before=True):
|
||||
def split(container, name, loc_or_xpath, before=True, totals=None):
|
||||
''' Split the file specified by name at the position specified by loc_or_xpath. '''
|
||||
|
||||
root = container.parsed(name)
|
||||
if isinstance(loc_or_xpath, type('')):
|
||||
split_point = root.xpath(loc_or_xpath)[0]
|
||||
else:
|
||||
split_point = node_from_loc(root, loc_or_xpath)
|
||||
try:
|
||||
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
||||
except MalformedMarkup:
|
||||
# The webkit HTML parser and the container parser have yielded
|
||||
# different node counts, this can happen if the file is valid XML
|
||||
# but contains constructs like nested <p> tags. So force parse it
|
||||
# with the HTML 5 parser and try again.
|
||||
raw = container.raw_data(name)
|
||||
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
|
||||
try:
|
||||
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
||||
except MalformedMarkup:
|
||||
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
|
||||
' before splitting') % name)
|
||||
container.replace(name, root)
|
||||
if in_table(split_point):
|
||||
raise AbortError('Cannot split inside tables')
|
||||
if split_point.tag.endswith('}body'):
|
||||
|
@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from urlparse import urlparse
|
||||
from collections import deque, Counter, OrderedDict
|
||||
from collections import Counter, OrderedDict
|
||||
from functools import partial
|
||||
from operator import itemgetter
|
||||
|
||||
@ -18,6 +18,7 @@ from lxml.builder import ElementMaker
|
||||
|
||||
from calibre import __version__
|
||||
from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
|
||||
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
|
||||
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
||||
@ -349,14 +350,13 @@ def from_files(container):
|
||||
toc.add(text, name)
|
||||
return toc
|
||||
|
||||
def node_from_loc(root, locs, totals=None):
    '''
    Return the element below <body> that is addressed by a path of child
    indices.

    :param root: Parsed tree. It must contain at least one element whose
        local name is ``body``; the first such element is the starting point.
    :param locs: Sequence of zero-based indices. ``locs[i]`` selects one of
        the element children of the node reached at depth ``i``.
    :param totals: Optional sequence parallel to ``locs``. ``totals[i]`` is
        the number of element children the caller expects at depth ``i``.
    :return: The node reached after following every index in ``locs`` (the
        body element itself when ``locs`` is empty).
    :raises MalformedMarkup: If ``totals`` is given and an expected child
        count differs from the count found in ``root``.
    '''
    node = root.xpath('//*[local-name()="body"]')[0]
    for i, loc in enumerate(locs):
        # Only children matching etree.Element are counted and indexed
        children = tuple(node.iterchildren(etree.Element))
        if totals is not None and totals[i] != len(children):
            raise MalformedMarkup()
        # Descend using the index for *this* depth. Using locs[0] here would
        # re-apply the first index at every level and land on the wrong node.
        node = children[loc]
    return node
|
||||
|
||||
def add_id(container, name, loc):
|
||||
|
@ -902,10 +902,10 @@ class Boss(QObject):
|
||||
self.gui.preview.do_start_split()
|
||||
|
||||
@in_thread_job
|
||||
def split_requested(self, name, loc):
|
||||
def split_requested(self, name, loc, totals):
|
||||
self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
|
||||
try:
|
||||
bottom_name = split(current_container(), name, loc)
|
||||
bottom_name = split(current_container(), name, loc, totals=totals)
|
||||
except AbortError:
|
||||
self.rewind_savepoint()
|
||||
raise
|
||||
|
@ -281,7 +281,7 @@ def find_le(a, x):
|
||||
class WebPage(QWebPage):
|
||||
|
||||
sync_requested = pyqtSignal(object, object, object)
|
||||
split_requested = pyqtSignal(object)
|
||||
split_requested = pyqtSignal(object, object)
|
||||
|
||||
def __init__(self, parent):
|
||||
QWebPage.__init__(self, parent)
|
||||
@ -330,14 +330,14 @@ class WebPage(QWebPage):
|
||||
self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
|
||||
json.dumps(anchor), json.dumps(str(lnum))))
|
||||
|
||||
@pyqtSlot(str, str)
def request_split(self, loc, totals):
    '''
    Slot that receives a split request as two JSON encoded strings and
    re-emits it, decoded, via the split_requested signal.

    :param loc: JSON encoded list of child indices locating the split point.
    :param totals: JSON encoded list of element child counts, one per entry
        in ``loc``.
    :return: The result of error_dialog() when either decoded list is empty,
        otherwise None.
    '''
    # The split action is one-shot: uncheck it as soon as a request arrives
    actions['split-in-preview'].setChecked(False)
    loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
    if not loc or not totals:
        # An empty path is reported to the user as an attempt to split on
        # the body tag and nothing is emitted
        return error_dialog(self.view(), _('Invalid location'),
                            _('Cannot split on the body tag'), show=True)
    self.split_requested.emit(loc, totals)
|
||||
|
||||
@property
|
||||
def line_numbers(self):
|
||||
@ -423,7 +423,7 @@ class WebView(QWebView):
|
||||
class Preview(QWidget):
|
||||
|
||||
sync_requested = pyqtSignal(object, object)
|
||||
split_requested = pyqtSignal(object, object)
|
||||
split_requested = pyqtSignal(object, object, object)
|
||||
split_start_requested = pyqtSignal()
|
||||
link_clicked = pyqtSignal(object, object)
|
||||
|
||||
@ -508,9 +508,9 @@ class Preview(QWidget):
|
||||
return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
|
||||
self.sync_requested.emit(self.current_name, lnum)
|
||||
|
||||
def request_split(self, loc, totals):
    '''
    Forward a split request for the currently previewed file.

    Emits split_requested with (current_name, loc, totals). Does nothing
    when no file is currently being previewed (``current_name`` is falsy).

    :param loc: Decoded list of child indices locating the split point.
    :param totals: Decoded list of element child counts matching ``loc``.
    '''
    if self.current_name:
        self.split_requested.emit(self.current_name, loc, totals)
|
||||
|
||||
def sync_to_editor(self, name, lnum):
|
||||
self.current_sync_request = (name, lnum)
|
||||
|
Loading…
x
Reference in New Issue
Block a user