mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Conversion: Fix parsing of HTML 5 documents that are also valid XML but use non-namespaced inline svg
This commit is contained in:
parent
46383bf264
commit
ab983165a4
@ -216,6 +216,14 @@ def clean_word_doc(data, log):
|
||||
data = pat.sub('', data)
|
||||
return data
|
||||
|
||||
class HTML5Doc(ValueError):
|
||||
pass
|
||||
|
||||
def check_for_html5(prefix, root):
|
||||
if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None:
|
||||
if root.xpath('//svg'):
|
||||
raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')
|
||||
|
||||
def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
filename='<string>', non_html_file_tags=frozenset()):
|
||||
if log is None:
|
||||
@ -257,7 +265,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
data = clean_word_doc(data, log)
|
||||
data = raw = clean_word_doc(data, log)
|
||||
|
||||
# Setting huge_tree=True causes crashes in windows with large files
|
||||
parser = etree.XMLParser(no_network=True)
|
||||
@ -265,14 +273,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||
# Try with more & more drastic measures to parse
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
check_for_html5(pre, data)
|
||||
except (HTML5Doc, etree.XMLSyntaxError):
|
||||
log.debug('Initial parse failed, using more'
|
||||
' forgiving parsers')
|
||||
data = xml_replace_entities(data)
|
||||
raw = data = xml_replace_entities(raw)
|
||||
try:
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
check_for_html5(pre, data)
|
||||
except (HTML5Doc, etree.XMLSyntaxError):
|
||||
log.debug('Parsing %s as HTML' % filename)
|
||||
data = raw
|
||||
try:
|
||||
data = html5_parse(data)
|
||||
except:
|
||||
|
Loading…
x
Reference in New Issue
Block a user