Don't use a recovering XML parser in a few places where we are expected to fail on invalid XML

2025-07-07 10:14:46 -04:00 · 2020-01-01 14:00:36 +05:30 · 2020-01-01 14:00:36 +05:30 · fbfebda03f
commit fbfebda03f
parent 3eb28b395e
5 changed files with 10 additions and 10 deletions
--- a/src/calibre/ebooks/conversion/plugins/rtf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py
@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
-        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
+        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
        extensions = {('calibre', 'inline-class') : inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -208,14 +208,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,

    # Try with more & more drastic measures to parse
    try:
-        data = safe_xml_fromstring(data)
+        data = safe_xml_fromstring(data, recover=False)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more'
                ' forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
-            data = safe_xml_fromstring(data)
+            data = safe_xml_fromstring(data, recover=False)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
@ -269,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
        data = etree.tostring(data, encoding='unicode')

        try:
-            data = safe_xml_fromstring(data)
+            data = safe_xml_fromstring(data, recover=False)
        except:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
-                data = safe_xml_fromstring(data)
+                data = safe_xml_fromstring(data, recover=False)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s'%
                        filename)
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@ -78,7 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
-        ans = safe_xml_fromstring(raw)
+        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
--- a/src/calibre/srv/opds.py
+++ b/src/calibre/srv/opds.py
@ -124,7 +124,7 @@ def html_to_lxml(raw):
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
-        return safe_xml_fromstring(raw)
+        return safe_xml_fromstring(raw, recover=False)
    except:
        for x in root.iterdescendants():
            remove = []
@ -135,7 +135,7 @@ def html_to_lxml(raw):
                del x.attrib[a]
        raw = etree.tostring(root, encoding=None)
        try:
-            return safe_xml_fromstring(raw)
+            return safe_xml_fromstring(raw, recover=False)
        except:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
--- a/src/calibre/web/feeds/recipes/collection.py
+++ b/src/calibre/web/feeds/recipes/collection.py
@ -125,7 +125,7 @@ def get_custom_recipe_collection(*args):
            import traceback
            traceback.print_exc()
            continue
-    return safe_xml_fromstring(serialize_collection(rmap))
+    return safe_xml_fromstring(serialize_collection(rmap), recover=False)


 def update_custom_recipe(id_, title, script):
@ -288,7 +288,7 @@ class SchedulerConfig(object):
        if os.access(self.conf_path, os.R_OK):
            with ExclusiveFile(self.conf_path) as f:
                try:
-                    self.root = safe_xml_fromstring(f.read())
+                    self.root = safe_xml_fromstring(f.read(), recover=False)
                except:
                    print('Failed to read recipe scheduler config')
                    import traceback