Conversion: Fix parsing of HTML 5 documents that are also valid XML but use non-namespaced inline svg

This commit is contained in:
Kovid Goyal 2014-02-20 09:34:47 +05:30
parent 46383bf264
commit ab983165a4

View File

@ -216,6 +216,14 @@ def clean_word_doc(data, log):
data = pat.sub('', data)
return data
class HTML5Doc(ValueError):
pass
def check_for_html5(prefix, root):
if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None:
if root.xpath('//svg'):
raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')
def parse_html(data, log=None, decoder=None, preprocessor=None,
filename='<string>', non_html_file_tags=frozenset()):
if log is None:
@ -257,7 +265,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
data = clean_word_doc(data, log)
data = raw = clean_word_doc(data, log)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
@ -265,14 +273,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Try with more & more drastic measures to parse
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
data = xml_replace_entities(data)
raw = data = xml_replace_entities(raw)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename)
data = raw
try:
data = html5_parse(data)
except: