Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Make handling of malformed/fragmentary HTML more robust
This commit is contained in:
parent 2330c2a88e
commit 74875fff97
@@ -815,15 +815,28 @@ class Manifest(object):
|
|||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
return data
|
return data
|
||||||
data = first_pass(data)
|
data = first_pass(data)
|
||||||
# Force into the XHTML namespace
|
|
||||||
|
# Handle weird (non-HTML/fragment) files
|
||||||
if barename(data.tag) != 'html':
|
if barename(data.tag) != 'html':
|
||||||
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
|
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
|
||||||
nroot = etree.fromstring('<html></html>')
|
nroot = etree.fromstring('<html></html>')
|
||||||
|
has_body = False
|
||||||
|
for child in list(data):
|
||||||
|
if barename(child.tag) == 'body':
|
||||||
|
has_body = True
|
||||||
|
break
|
||||||
|
parent = nroot
|
||||||
|
if not has_body:
|
||||||
|
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
|
||||||
|
nroot = etree.fromstring('<html><body/></html>')
|
||||||
|
parent = nroot[0]
|
||||||
for child in list(data):
|
for child in list(data):
|
||||||
child.getparent().remove(child)
|
child.getparent().remove(child)
|
||||||
nroot.append(child)
|
parent.append(child)
|
||||||
data = nroot
|
data = nroot
|
||||||
elif not namespace(data.tag):
|
|
||||||
|
# Force into the XHTML namespace
|
||||||
|
if not namespace(data.tag):
|
||||||
data.attrib['xmlns'] = XHTML_NS
|
data.attrib['xmlns'] = XHTML_NS
|
||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user