From fbfebda03fee013bc8f6716fc22f60a7744f94c2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 1 Jan 2020 14:00:36 +0530
Subject: [PATCH] Don't use a recovering XML parser in a few places where we are
 expected to fail on invalid XML

---
 src/calibre/ebooks/conversion/plugins/rtf_input.py | 2 +-
 src/calibre/ebooks/oeb/parse_utils.py              | 8 ++++----
 src/calibre/ebooks/oeb/polish/parsing.py           | 2 +-
 src/calibre/srv/opds.py                            | 4 ++--
 src/calibre/web/feeds/recipes/collection.py        | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py
index d18c18320b..952898d1d0 100644
--- a/src/calibre/ebooks/conversion/plugins/rtf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py
@@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):
 
         self.log('Converting XML to HTML...')
         inline_class = InlineClass(self.log)
-        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
+        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
         extensions = {('calibre', 'inline-class') : inline_class}
         transform = etree.XSLT(styledoc, extensions=extensions)
         result = transform(doc)
diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py
index 427b47f168..0db0bc7b20 100644
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -208,14 +208,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
 
     # Try with more & more drastic measures to parse
     try:
-        data = safe_xml_fromstring(data)
+        data = safe_xml_fromstring(data, recover=False)
         check_for_html5(pre, data)
     except (HTML5Doc, etree.XMLSyntaxError):
         log.debug('Initial parse failed, using more'
                 ' forgiving parsers')
         raw = data = xml_replace_entities(raw)
         try:
-            data = safe_xml_fromstring(data)
+            data = safe_xml_fromstring(data, recover=False)
             check_for_html5(pre, data)
         except (HTML5Doc, etree.XMLSyntaxError):
             log.debug('Parsing %s as HTML' % filename)
@@ -269,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
         data = etree.tostring(data, encoding='unicode')
 
         try:
-            data = safe_xml_fromstring(data)
+            data = safe_xml_fromstring(data, recover=False)
         except:
             data = data.replace(':=', '=').replace(':>', '>')
             data = data.replace('<http:/>', '')
             try:
-                data = safe_xml_fromstring(data)
+                data = safe_xml_fromstring(data, recover=False)
             except etree.XMLSyntaxError:
                 log.warn('Stripping comments from %s'%
                         filename)
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index cac8e307a3..2d0ee7dde3 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -78,7 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
     if force_html5_parse:
         return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
     try:
-        ans = safe_xml_fromstring(raw)
+        ans = safe_xml_fromstring(raw, recover=False)
         if ans.tag != '{%s}html' % XHTML_NS:
             raise ValueError('Root tag is not <html> in the XHTML namespace')
         if linenumber_attribute:
diff --git a/src/calibre/srv/opds.py b/src/calibre/srv/opds.py
index 24f38d8c75..dd46b888f9 100644
--- a/src/calibre/srv/opds.py
+++ b/src/calibre/srv/opds.py
@@ -124,7 +124,7 @@ def html_to_lxml(raw):
     root.set('xmlns', "http://www.w3.org/1999/xhtml")
     raw = etree.tostring(root, encoding=None)
     try:
-        return safe_xml_fromstring(raw)
+        return safe_xml_fromstring(raw, recover=False)
     except:
         for x in root.iterdescendants():
             remove = []
@@ -135,7 +135,7 @@ def html_to_lxml(raw):
                 del x.attrib[a]
         raw = etree.tostring(root, encoding=None)
         try:
-            return safe_xml_fromstring(raw)
+            return safe_xml_fromstring(raw, recover=False)
         except:
             from calibre.ebooks.oeb.parse_utils import _html4_parse
             return _html4_parse(raw)
diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py
index 5c81fae1c1..0129954baf 100644
--- a/src/calibre/web/feeds/recipes/collection.py
+++ b/src/calibre/web/feeds/recipes/collection.py
@@ -125,7 +125,7 @@ def get_custom_recipe_collection(*args):
             import traceback
             traceback.print_exc()
             continue
-    return safe_xml_fromstring(serialize_collection(rmap))
+    return safe_xml_fromstring(serialize_collection(rmap), recover=False)
 
 
 def update_custom_recipe(id_, title, script):
@@ -288,7 +288,7 @@ class SchedulerConfig(object):
         if os.access(self.conf_path, os.R_OK):
             with ExclusiveFile(self.conf_path) as f:
                 try:
-                    self.root = safe_xml_fromstring(f.read())
+                    self.root = safe_xml_fromstring(f.read(), recover=False)
                 except:
                     print('Failed to read recipe scheduler config')
                     import traceback