mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-11 09:13:57 -04:00
Conversion: Fix parsing of HTML 5 documents that are also valid XML but use non-namespaced inline svg
This commit is contained in:
parent
46383bf264
commit
ab983165a4
@ -216,6 +216,14 @@ def clean_word_doc(data, log):
|
|||||||
data = pat.sub('', data)
|
data = pat.sub('', data)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
class HTML5Doc(ValueError):
    """Signal that a document parsed as XML should go to the HTML 5 parser.

    Raised by ``check_for_html5`` and caught next to ``etree.XMLSyntaxError``
    in ``parse_html``, so a successful XML parse of such a document is
    treated like a parse failure and falls through to the more forgiving
    parsers.
    """
|
||||||
|
|
||||||
|
def check_for_html5(prefix, root):
    """Reject XML-parsed documents that are really un-namespaced HTML 5.

    :param prefix: Text searched for an HTML 5 doctype (``<!DOCTYPE html>``,
        matched case-insensitively). The visible callers pass ``pre``;
        presumably that is the text preceding the root tag -- confirm
        against ``parse_html``.
    :param root: Parsed tree exposing ``xpath()``; the visible callers pass
        the result of ``etree.fromstring``.
    :raises HTML5Doc: If ``prefix`` contains the HTML 5 doctype and the tree
        has at least one ``svg`` element that is in no namespace.
    """
    if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is None:
        # No HTML 5 doctype: nothing to object to.
        return
    # '//svg' only matches elements in no namespace, i.e. inline SVG that
    # was written HTML 5 style, without an xmlns declaration.
    if root.xpath('//svg'):
        raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')
|
||||||
|
|
||||||
def parse_html(data, log=None, decoder=None, preprocessor=None,
|
def parse_html(data, log=None, decoder=None, preprocessor=None,
|
||||||
filename='<string>', non_html_file_tags=frozenset()):
|
filename='<string>', non_html_file_tags=frozenset()):
|
||||||
if log is None:
|
if log is None:
|
||||||
@ -257,7 +265,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||||
|
|
||||||
data = clean_word_doc(data, log)
|
data = raw = clean_word_doc(data, log)
|
||||||
|
|
||||||
# Setting huge_tree=True causes crashes in windows with large files
|
# Setting huge_tree=True causes crashes in windows with large files
|
||||||
parser = etree.XMLParser(no_network=True)
|
parser = etree.XMLParser(no_network=True)
|
||||||
@ -265,14 +273,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = etree.fromstring(data, parser=parser)
|
||||||
except etree.XMLSyntaxError:
|
check_for_html5(pre, data)
|
||||||
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Initial parse failed, using more'
|
log.debug('Initial parse failed, using more'
|
||||||
' forgiving parsers')
|
' forgiving parsers')
|
||||||
data = xml_replace_entities(data)
|
raw = data = xml_replace_entities(raw)
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = etree.fromstring(data, parser=parser)
|
||||||
except etree.XMLSyntaxError:
|
check_for_html5(pre, data)
|
||||||
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Parsing %s as HTML' % filename)
|
log.debug('Parsing %s as HTML' % filename)
|
||||||
|
data = raw
|
||||||
try:
|
try:
|
||||||
data = html5_parse(data)
|
data = html5_parse(data)
|
||||||
except:
|
except:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user