From fb2296beb24861fb93ebe0cf106ec154e4bb39bd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 18 Jun 2009 16:38:16 -0700 Subject: [PATCH] Fix #2652 (NotHTML: File 'foo.htm' does not appear to be (X)HTML) --- src/calibre/ebooks/oeb/base.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 3f890dedf4..98f1be1068 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -759,28 +759,33 @@ class Manifest(object): # Convert to Unicode and normalize line endings data = self.oeb.decode(data) data = self.oeb.html_preprocessor(data) + orig_data = data # Try with more & more drastic measures to parse - try: - data = etree.fromstring(data) - except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) + def first_pass(data): try: data = etree.fromstring(data) except etree.XMLSyntaxError: - # TODO: Factor out HTML->XML coercion - self.oeb.logger.warn('Parsing file %r as HTML' % self.href) - data = html.fromstring(data) - data.attrib.pop('xmlns', None) - for elem in data.iter(tag=etree.Comment): - if elem.text: - elem.text = elem.text.strip('-') - data = etree.tostring(data, encoding=unicode) + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + data = ENTITY_RE.sub(repl, data) try: data = etree.fromstring(data) except etree.XMLSyntaxError: - data = etree.fromstring(data, parser=RECOVER_PARSER) + self.oeb.logger.warn('Parsing file %r as HTML' % self.href) + data = html.fromstring(data) + data.attrib.pop('xmlns', None) + for elem in data.iter(tag=etree.Comment): + if elem.text: + elem.text = elem.text.strip('-') + data = etree.tostring(data, encoding=unicode) + try: + data = etree.fromstring(data) + except etree.XMLSyntaxError: + data = etree.fromstring(data, parser=RECOVER_PARSER) + return data + data = first_pass(data) # Force into the XHTML namespace + if barename(data.tag) != 'html': + data = first_pass(''+data+'') if barename(data.tag) != 'html': raise NotHTML( 'File %r does not appear to be (X)HTML' % self.href)