From fbfebda03fee013bc8f6716fc22f60a7744f94c2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 1 Jan 2020 14:00:36 +0530 Subject: [PATCH] Dont use a recovering XML parser in a few places where we are expected to fail on invalid XML --- src/calibre/ebooks/conversion/plugins/rtf_input.py | 2 +- src/calibre/ebooks/oeb/parse_utils.py | 8 ++++---- src/calibre/ebooks/oeb/polish/parsing.py | 2 +- src/calibre/srv/opds.py | 4 ++-- src/calibre/web/feeds/recipes/collection.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py index d18c18320b..952898d1d0 100644 --- a/src/calibre/ebooks/conversion/plugins/rtf_input.py +++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin): self.log('Converting XML to HTML...') inline_class = InlineClass(self.log) - styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True)) + styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False) extensions = {('calibre', 'inline-class') : inline_class} transform = etree.XSLT(styledoc, extensions=extensions) result = transform(doc) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 427b47f168..0db0bc7b20 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -208,14 +208,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, # Try with more & more drastic measures to parse try: - data = safe_xml_fromstring(data) + data = safe_xml_fromstring(data, recover=False) check_for_html5(pre, data) except (HTML5Doc, etree.XMLSyntaxError): log.debug('Initial parse failed, using more' ' forgiving parsers') raw = data = xml_replace_entities(raw) try: - data = safe_xml_fromstring(data) + data = safe_xml_fromstring(data, recover=False) check_for_html5(pre, data) except (HTML5Doc, etree.XMLSyntaxError): log.debug('Parsing %s as HTML' % filename) @@ -269,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, data = etree.tostring(data, encoding='unicode') try: - data = safe_xml_fromstring(data) + data = safe_xml_fromstring(data, recover=False) except: data = data.replace(':=', '=').replace(':>', '>') data = data.replace('', '') try: - data = safe_xml_fromstring(data) + data = safe_xml_fromstring(data, recover=False) except etree.XMLSyntaxError: log.warn('Stripping comments from %s'% filename) diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index cac8e307a3..2d0ee7dde3 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -78,7 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N if force_html5_parse: return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) try: - ans = safe_xml_fromstring(raw) + ans = safe_xml_fromstring(raw, recover=False) if ans.tag != '{%s}html' % XHTML_NS: raise ValueError('Root tag is not in the XHTML namespace') if linenumber_attribute: diff --git a/src/calibre/srv/opds.py b/src/calibre/srv/opds.py index 24f38d8c75..dd46b888f9 100644 --- a/src/calibre/srv/opds.py +++ b/src/calibre/srv/opds.py @@ -124,7 +124,7 @@ def html_to_lxml(raw): root.set('xmlns', "http://www.w3.org/1999/xhtml") raw = etree.tostring(root, encoding=None) try: - return safe_xml_fromstring(raw) + return safe_xml_fromstring(raw, recover=False) except: for x in root.iterdescendants(): remove = [] @@ -135,7 +135,7 @@ def html_to_lxml(raw): del x.attrib[a] raw = etree.tostring(root, encoding=None) try: - return safe_xml_fromstring(raw) + return safe_xml_fromstring(raw, recover=False) except: from calibre.ebooks.oeb.parse_utils import _html4_parse return _html4_parse(raw) diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py index 5c81fae1c1..0129954baf 100644 --- a/src/calibre/web/feeds/recipes/collection.py +++ b/src/calibre/web/feeds/recipes/collection.py @@ -125,7 +125,7 @@ def get_custom_recipe_collection(*args): import traceback traceback.print_exc() continue - return safe_xml_fromstring(serialize_collection(rmap)) + return safe_xml_fromstring(serialize_collection(rmap), recover=False) def update_custom_recipe(id_, title, script): @@ -288,7 +288,7 @@ class SchedulerConfig(object): if os.access(self.conf_path, os.R_OK): with ExclusiveFile(self.conf_path) as f: try: - self.root = safe_xml_fromstring(f.read()) + self.root = safe_xml_fromstring(f.read(), recover=False) except: print('Failed to read recipe scheduler config') import traceback