Don't use a recovering XML parser in a few places where we are expected to fail on invalid XML

This commit is contained in:
Kovid Goyal 2020-01-01 14:00:36 +05:30
parent 3eb28b395e
commit fbfebda03f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 10 additions and 10 deletions

View File

@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):
self.log('Converting XML to HTML...')
inline_class = InlineClass(self.log)
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
extensions = {('calibre', 'inline-class') : inline_class}
transform = etree.XSLT(styledoc, extensions=extensions)
result = transform(doc)

View File

@ -208,14 +208,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Try with more & more drastic measures to parse
try:
data = safe_xml_fromstring(data)
data = safe_xml_fromstring(data, recover=False)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
raw = data = xml_replace_entities(raw)
try:
data = safe_xml_fromstring(data)
data = safe_xml_fromstring(data, recover=False)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename)
@ -269,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = etree.tostring(data, encoding='unicode')
try:
data = safe_xml_fromstring(data)
data = safe_xml_fromstring(data, recover=False)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = safe_xml_fromstring(data)
data = safe_xml_fromstring(data, recover=False)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)

View File

@ -78,7 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try:
ans = safe_xml_fromstring(raw)
ans = safe_xml_fromstring(raw, recover=False)
if ans.tag != '{%s}html' % XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute:

View File

@ -124,7 +124,7 @@ def html_to_lxml(raw):
root.set('xmlns', "http://www.w3.org/1999/xhtml")
raw = etree.tostring(root, encoding=None)
try:
return safe_xml_fromstring(raw)
return safe_xml_fromstring(raw, recover=False)
except:
for x in root.iterdescendants():
remove = []
@ -135,7 +135,7 @@ def html_to_lxml(raw):
del x.attrib[a]
raw = etree.tostring(root, encoding=None)
try:
return safe_xml_fromstring(raw)
return safe_xml_fromstring(raw, recover=False)
except:
from calibre.ebooks.oeb.parse_utils import _html4_parse
return _html4_parse(raw)

View File

@ -125,7 +125,7 @@ def get_custom_recipe_collection(*args):
import traceback
traceback.print_exc()
continue
return safe_xml_fromstring(serialize_collection(rmap))
return safe_xml_fromstring(serialize_collection(rmap), recover=False)
def update_custom_recipe(id_, title, script):
@ -288,7 +288,7 @@ class SchedulerConfig(object):
if os.access(self.conf_path, os.R_OK):
with ExclusiveFile(self.conf_path) as f:
try:
self.root = safe_xml_fromstring(f.read())
self.root = safe_xml_fromstring(f.read(), recover=False)
except:
print('Failed to read recipe scheduler config')
import traceback