Fix #2652 (NotHTML: File 'foo.htm' does not appear to be (X)HTML)

This commit is contained in:
Kovid Goyal 2009-06-18 16:38:16 -07:00
parent d1dadc3402
commit fb2296beb2

View File

@@ -759,28 +759,33 @@ class Manifest(object):
# Convert to Unicode and normalize line endings # Convert to Unicode and normalize line endings
data = self.oeb.decode(data) data = self.oeb.decode(data)
data = self.oeb.html_preprocessor(data) data = self.oeb.html_preprocessor(data)
orig_data = data
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
try: def first_pass(data):
data = etree.fromstring(data)
except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = ENTITY_RE.sub(repl, data)
try: try:
data = etree.fromstring(data) data = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
# TODO: Factor out HTML->XML coercion repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
self.oeb.logger.warn('Parsing file %r as HTML' % self.href) data = ENTITY_RE.sub(repl, data)
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
for elem in data.iter(tag=etree.Comment):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode)
try: try:
data = etree.fromstring(data) data = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER) self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
data = html.fromstring(data)
data.attrib.pop('xmlns', None)
for elem in data.iter(tag=etree.Comment):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode)
try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data
data = first_pass(data)
# Force into the XHTML namespace # Force into the XHTML namespace
if barename(data.tag) != 'html':
data = first_pass('<html>'+data+'</html>')
if barename(data.tag) != 'html': if barename(data.tag) != 'html':
raise NotHTML( raise NotHTML(
'File %r does not appear to be (X)HTML' % self.href) 'File %r does not appear to be (X)HTML' % self.href)