mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
MOBI Input: Fix regression that broke reading of some documents
Apparently lxml.html is super fragile on Windows with Python 3, so fall back to html5-parser when it barfs.
This commit is contained in:
parent
02095fcf81
commit
56a9b9529a
@@ -192,7 +192,15 @@ class MobiReader(object):
|
|||||||
except Exception:
|
except Exception:
|
||||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||||
self.processed_html = self.remove_random_bytes(self.processed_html)
|
self.processed_html = self.remove_random_bytes(self.processed_html)
|
||||||
root = html.fromstring(self.processed_html)
|
try:
|
||||||
|
root = html.fromstring(self.processed_html)
|
||||||
|
except Exception:
|
||||||
|
self.log.warning('MOBI markup could not be parsed by lxml using html5-parser')
|
||||||
|
# Happens on windows with python 3 where lxml causes libxml to die with an
|
||||||
|
# error about using UCS-4 little endian encoding if certain
|
||||||
|
# characters are present in the input
|
||||||
|
from html5_parser import parse
|
||||||
|
root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
|
||||||
if root.xpath('descendant::p/descendant::p'):
|
if root.xpath('descendant::p/descendant::p'):
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
self.log.warning('Malformed markup, parsing using html5-parser')
|
self.log.warning('Malformed markup, parsing using html5-parser')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user