Make handling of malformed/fragmentary HTML more robust

This commit is contained in:
Kovid Goyal 2009-07-27 09:31:45 -06:00
parent 2330c2a88e
commit 74875fff97

View File

@@ -815,15 +815,28 @@ class Manifest(object):
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data
data = first_pass(data)
# Force into the XHTML namespace
# Handle weird (non-HTML/fragment) files
if barename(data.tag) != 'html':
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
nroot = etree.fromstring('<html></html>')
has_body = False
for child in list(data):
if barename(child.tag) == 'body':
has_body = True
break
parent = nroot
if not has_body:
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data):
child.getparent().remove(child)
nroot.append(child)
parent.append(child)
data = nroot
elif not namespace(data.tag):
# Force into the XHTML namespace
if not namespace(data.tag):
data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data, encoding=unicode)
try: