mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Edit book: Fix splitting of HTML file occurring at the wrong location if the HTML contains invalid constructs like nested <p> tags
This commit is contained in:
parent
06a40d9d2b
commit
371aa6ef77
Binary file not shown.
@ -403,9 +403,9 @@ class Container(object): # {{{
|
|||||||
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
||||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
return etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
|
|
||||||
def parse_xhtml(self, data, fname='<string>'):
|
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
||||||
if self.tweak_mode:
|
if self.tweak_mode:
|
||||||
return parse_html_tweak(data, log=self.log, decoder=self.decode)
|
return parse_html_tweak(data, log=self.log, decoder=self.decode, force_html5_parse=force_html5_parse)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
return parse_html(
|
return parse_html(
|
||||||
|
@ -16,3 +16,5 @@ class DRMError(_DRMError):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
|
super(DRMError, self).__init__(_('This file is locked with DRM. It cannot be edited.'))
|
||||||
|
|
||||||
|
class MalformedMarkup(ValueError):
|
||||||
|
pass
|
||||||
|
@ -636,7 +636,7 @@ def strip_encoding_declarations(raw):
|
|||||||
raw = prefix + suffix
|
raw = prefix + suffix
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
|
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
if replace_entities:
|
if replace_entities:
|
||||||
@ -653,6 +653,8 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
|||||||
break
|
break
|
||||||
|
|
||||||
raw = strip_encoding_declarations(raw)
|
raw = strip_encoding_declarations(raw)
|
||||||
|
if force_html5_parse:
|
||||||
|
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||||
try:
|
try:
|
||||||
parser = XMLParser(no_network=True)
|
parser = XMLParser(no_network=True)
|
||||||
ans = fromstring(raw, parser=parser)
|
ans = fromstring(raw, parser=parser)
|
||||||
|
@ -17,12 +17,6 @@ is_hidden = (elem) ->
|
|||||||
elem = elem.parentNode
|
elem = elem.parentNode
|
||||||
return false
|
return false
|
||||||
|
|
||||||
previous_sibling = (node) ->
|
|
||||||
node = node.previousSibling
|
|
||||||
while node and node.nodeType != Node.ELEMENT_NODE
|
|
||||||
node = node.previousSibling
|
|
||||||
return node
|
|
||||||
|
|
||||||
is_block = (elem) ->
|
is_block = (elem) ->
|
||||||
style = window.getComputedStyle(elem)
|
style = window.getComputedStyle(elem)
|
||||||
return style.display in ['block', 'flex-box', 'box']
|
return style.display in ['block', 'flex-box', 'box']
|
||||||
@ -88,17 +82,20 @@ class PreviewIntegration
|
|||||||
|
|
||||||
report_split: (node) =>
|
report_split: (node) =>
|
||||||
loc = []
|
loc = []
|
||||||
|
totals = []
|
||||||
parent = find_containing_block(node)
|
parent = find_containing_block(node)
|
||||||
while parent and parent.tagName.toLowerCase() != 'body'
|
while parent and parent.tagName.toLowerCase() != 'body'
|
||||||
|
totals.push(parent.parentNode.children.length)
|
||||||
num = 0
|
num = 0
|
||||||
sibling = previous_sibling(parent)
|
sibling = parent.previousElementSibling
|
||||||
while sibling
|
while sibling
|
||||||
num += 1
|
num += 1
|
||||||
sibling = previous_sibling(sibling)
|
sibling = sibling.previousElementSibling
|
||||||
loc.push(num)
|
loc.push(num)
|
||||||
parent = parent.parentNode
|
parent = parent.parentNode
|
||||||
loc.reverse()
|
loc.reverse()
|
||||||
window.py_bridge.request_split(JSON.stringify(loc))
|
totals.reverse()
|
||||||
|
window.py_bridge.request_split(JSON.stringify(loc), JSON.stringify(totals))
|
||||||
|
|
||||||
onload: () =>
|
onload: () =>
|
||||||
window.document.body.addEventListener('click', this.onclick, true)
|
window.document.body.addEventListener('click', this.onclick, true)
|
||||||
|
@ -11,6 +11,7 @@ from future_builtins import map
|
|||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
|
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
|
||||||
|
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||||
from calibre.ebooks.oeb.polish.toc import node_from_loc
|
from calibre.ebooks.oeb.polish.toc import node_from_loc
|
||||||
from calibre.ebooks.oeb.polish.replace import LinkRebaser
|
from calibre.ebooks.oeb.polish.replace import LinkRebaser
|
||||||
|
|
||||||
@ -162,14 +163,28 @@ class SplitLinkReplacer(object):
|
|||||||
self.replaced = True
|
self.replaced = True
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def split(container, name, loc_or_xpath, before=True):
|
def split(container, name, loc_or_xpath, before=True, totals=None):
|
||||||
''' Split the file specified by name at the position specified by loc_or_xpath. '''
|
''' Split the file specified by name at the position specified by loc_or_xpath. '''
|
||||||
|
|
||||||
root = container.parsed(name)
|
root = container.parsed(name)
|
||||||
if isinstance(loc_or_xpath, type('')):
|
if isinstance(loc_or_xpath, type('')):
|
||||||
split_point = root.xpath(loc_or_xpath)[0]
|
split_point = root.xpath(loc_or_xpath)[0]
|
||||||
else:
|
else:
|
||||||
split_point = node_from_loc(root, loc_or_xpath)
|
try:
|
||||||
|
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
||||||
|
except MalformedMarkup:
|
||||||
|
# The webkit HTML parser and the container parser have yielded
|
||||||
|
# different node counts, this can happen if the file is valid XML
|
||||||
|
# but contains constructs like nested <p> tags. So force parse it
|
||||||
|
# with the HTML 5 parser and try again.
|
||||||
|
raw = container.raw_data(name)
|
||||||
|
root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
|
||||||
|
try:
|
||||||
|
split_point = node_from_loc(root, loc_or_xpath, totals=totals)
|
||||||
|
except MalformedMarkup:
|
||||||
|
raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
|
||||||
|
' before splitting') % name)
|
||||||
|
container.replace(name, root)
|
||||||
if in_table(split_point):
|
if in_table(split_point):
|
||||||
raise AbortError('Cannot split inside tables')
|
raise AbortError('Cannot split inside tables')
|
||||||
if split_point.tag.endswith('}body'):
|
if split_point.tag.endswith('}body'):
|
||||||
|
@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from collections import deque, Counter, OrderedDict
|
from collections import Counter, OrderedDict
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
@ -18,6 +18,7 @@ from lxml.builder import ElementMaker
|
|||||||
|
|
||||||
from calibre import __version__
|
from calibre import __version__
|
||||||
from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
|
from calibre.ebooks.oeb.base import XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize
|
||||||
|
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
|
||||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||||
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
|
from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
|
||||||
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
from calibre.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1
|
||||||
@ -349,14 +350,13 @@ def from_files(container):
|
|||||||
toc.add(text, name)
|
toc.add(text, name)
|
||||||
return toc
|
return toc
|
||||||
|
|
||||||
def node_from_loc(root, loc):
|
def node_from_loc(root, locs, totals=None):
|
||||||
body = root.xpath('//*[local-name()="body"]')[0]
|
node = root.xpath('//*[local-name()="body"]')[0]
|
||||||
locs = deque(loc)
|
for i, loc in enumerate(locs):
|
||||||
node = body
|
|
||||||
while locs:
|
|
||||||
children = tuple(node.iterchildren(etree.Element))
|
children = tuple(node.iterchildren(etree.Element))
|
||||||
|
if totals is not None and totals[i] != len(children):
|
||||||
|
raise MalformedMarkup()
|
||||||
node = children[locs[0]]
|
node = children[locs[0]]
|
||||||
locs.popleft()
|
|
||||||
return node
|
return node
|
||||||
|
|
||||||
def add_id(container, name, loc):
|
def add_id(container, name, loc):
|
||||||
|
@ -902,10 +902,10 @@ class Boss(QObject):
|
|||||||
self.gui.preview.do_start_split()
|
self.gui.preview.do_start_split()
|
||||||
|
|
||||||
@in_thread_job
|
@in_thread_job
|
||||||
def split_requested(self, name, loc):
|
def split_requested(self, name, loc, totals):
|
||||||
self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
|
self.add_savepoint(_('Before: Split %s') % self.gui.elided_text(name))
|
||||||
try:
|
try:
|
||||||
bottom_name = split(current_container(), name, loc)
|
bottom_name = split(current_container(), name, loc, totals=totals)
|
||||||
except AbortError:
|
except AbortError:
|
||||||
self.rewind_savepoint()
|
self.rewind_savepoint()
|
||||||
raise
|
raise
|
||||||
|
@ -281,7 +281,7 @@ def find_le(a, x):
|
|||||||
class WebPage(QWebPage):
|
class WebPage(QWebPage):
|
||||||
|
|
||||||
sync_requested = pyqtSignal(object, object, object)
|
sync_requested = pyqtSignal(object, object, object)
|
||||||
split_requested = pyqtSignal(object)
|
split_requested = pyqtSignal(object, object)
|
||||||
|
|
||||||
def __init__(self, parent):
|
def __init__(self, parent):
|
||||||
QWebPage.__init__(self, parent)
|
QWebPage.__init__(self, parent)
|
||||||
@ -330,14 +330,14 @@ class WebPage(QWebPage):
|
|||||||
self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
|
self.mainFrame().evaluateJavaScript('window.calibre_preview_integration.go_to_anchor(%s, %s)' % (
|
||||||
json.dumps(anchor), json.dumps(str(lnum))))
|
json.dumps(anchor), json.dumps(str(lnum))))
|
||||||
|
|
||||||
@pyqtSlot(str)
|
@pyqtSlot(str, str)
|
||||||
def request_split(self, loc):
|
def request_split(self, loc, totals):
|
||||||
actions['split-in-preview'].setChecked(False)
|
actions['split-in-preview'].setChecked(False)
|
||||||
loc = json.loads(unicode(loc))
|
loc, totals = json.loads(unicode(loc)), json.loads(unicode(totals))
|
||||||
if not loc:
|
if not loc or not totals:
|
||||||
return error_dialog(self.view(), _('Invalid location'),
|
return error_dialog(self.view(), _('Invalid location'),
|
||||||
_('Cannot split on the body tag'), show=True)
|
_('Cannot split on the body tag'), show=True)
|
||||||
self.split_requested.emit(loc)
|
self.split_requested.emit(loc, totals)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def line_numbers(self):
|
def line_numbers(self):
|
||||||
@ -423,7 +423,7 @@ class WebView(QWebView):
|
|||||||
class Preview(QWidget):
|
class Preview(QWidget):
|
||||||
|
|
||||||
sync_requested = pyqtSignal(object, object)
|
sync_requested = pyqtSignal(object, object)
|
||||||
split_requested = pyqtSignal(object, object)
|
split_requested = pyqtSignal(object, object, object)
|
||||||
split_start_requested = pyqtSignal()
|
split_start_requested = pyqtSignal()
|
||||||
link_clicked = pyqtSignal(object, object)
|
link_clicked = pyqtSignal(object, object)
|
||||||
|
|
||||||
@ -508,9 +508,9 @@ class Preview(QWidget):
|
|||||||
return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
|
return self.link_clicked.emit(name, urlparse(href).fragment or TOP)
|
||||||
self.sync_requested.emit(self.current_name, lnum)
|
self.sync_requested.emit(self.current_name, lnum)
|
||||||
|
|
||||||
def request_split(self, loc):
|
def request_split(self, loc, totals):
|
||||||
if self.current_name:
|
if self.current_name:
|
||||||
self.split_requested.emit(self.current_name, loc)
|
self.split_requested.emit(self.current_name, loc, totals)
|
||||||
|
|
||||||
def sync_to_editor(self, name, lnum):
|
def sync_to_editor(self, name, lnum):
|
||||||
self.current_sync_request = (name, lnum)
|
self.current_sync_request = (name, lnum)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user