From ab983165a497ff788be86806a6b532ef6c8ff60a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 20 Feb 2014 09:34:47 +0530 Subject: [PATCH] Conversion: Fix parsing of HTML 5 documents that are also valid XML but use non-namespaced inline svg --- src/calibre/ebooks/oeb/parse_utils.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index 1bb13d21a2..52f7976998 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -216,6 +216,14 @@ def clean_word_doc(data, log): data = pat.sub('', data) return data +class HTML5Doc(ValueError): + pass + +def check_for_html5(prefix, root): + if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None: + if root.xpath('//svg'): + raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser') + def parse_html(data, log=None, decoder=None, preprocessor=None, filename='', non_html_file_tags=frozenset()): if log is None: @@ -257,7 +265,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) data = pat.sub(lambda m:user_entities[m.group(1)], data) - data = clean_word_doc(data, log) + data = raw = clean_word_doc(data, log) # Setting huge_tree=True causes crashes in windows with large files parser = etree.XMLParser(no_network=True) @@ -265,14 +273,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, # Try with more & more drastic measures to parse try: data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError: + check_for_html5(pre, data) + except (HTML5Doc, etree.XMLSyntaxError): log.debug('Initial parse failed, using more' ' forgiving parsers') - data = xml_replace_entities(data) + raw = data = xml_replace_entities(raw) try: data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError: + check_for_html5(pre, data) + except 
(HTML5Doc, etree.XMLSyntaxError): log.debug('Parsing %s as HTML' % filename) + data = raw try: data = html5_parse(data) except: