mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #2652 (NotHTML: File 'foo.htm' does not appear to be (X)HTML)
This commit is contained in:
parent
d1dadc3402
commit
fb2296beb2
@ -759,28 +759,33 @@ class Manifest(object):
|
|||||||
# Convert to Unicode and normalize line endings
|
# Convert to Unicode and normalize line endings
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = self.oeb.html_preprocessor(data)
|
data = self.oeb.html_preprocessor(data)
|
||||||
|
orig_data = data
|
||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
try:
|
def first_pass(data):
|
||||||
data = etree.fromstring(data)
|
|
||||||
except etree.XMLSyntaxError:
|
|
||||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
|
||||||
data = ENTITY_RE.sub(repl, data)
|
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
# TODO: Factor out HTML->XML coercion
|
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
||||||
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
data = ENTITY_RE.sub(repl, data)
|
||||||
data = html.fromstring(data)
|
|
||||||
data.attrib.pop('xmlns', None)
|
|
||||||
for elem in data.iter(tag=etree.Comment):
|
|
||||||
if elem.text:
|
|
||||||
elem.text = elem.text.strip('-')
|
|
||||||
data = etree.tostring(data, encoding=unicode)
|
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
||||||
|
data = html.fromstring(data)
|
||||||
|
data.attrib.pop('xmlns', None)
|
||||||
|
for elem in data.iter(tag=etree.Comment):
|
||||||
|
if elem.text:
|
||||||
|
elem.text = elem.text.strip('-')
|
||||||
|
data = etree.tostring(data, encoding=unicode)
|
||||||
|
try:
|
||||||
|
data = etree.fromstring(data)
|
||||||
|
except etree.XMLSyntaxError:
|
||||||
|
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
|
return data
|
||||||
|
data = first_pass(data)
|
||||||
# Force into the XHTML namespace
|
# Force into the XHTML namespace
|
||||||
|
if barename(data.tag) != 'html':
|
||||||
|
data = first_pass('<html>'+data+'</html>')
|
||||||
if barename(data.tag) != 'html':
|
if barename(data.tag) != 'html':
|
||||||
raise NotHTML(
|
raise NotHTML(
|
||||||
'File %r does not appear to be (X)HTML' % self.href)
|
'File %r does not appear to be (X)HTML' % self.href)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user