mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
MOBI Input: Fix regression that broke reading of some documents
Apparently lxml.html is super fragile on Windows with Python 3, so fall back to html5-parser when it barfs.
This commit is contained in:
parent
02095fcf81
commit
56a9b9529a
@ -192,7 +192,15 @@ class MobiReader(object):
|
||||
except Exception:
|
||||
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
|
||||
self.processed_html = self.remove_random_bytes(self.processed_html)
|
||||
try:
|
||||
root = html.fromstring(self.processed_html)
|
||||
except Exception:
|
||||
self.log.warning('MOBI markup could not be parsed by lxml using html5-parser')
|
||||
# Happens on Windows with Python 3 where lxml causes libxml to die with an
|
||||
# error about using UCS-4 little endian encoding if certain
|
||||
# characters are present in the input
|
||||
from html5_parser import parse
|
||||
root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
|
||||
if root.xpath('descendant::p/descendant::p'):
|
||||
from html5_parser import parse
|
||||
self.log.warning('Malformed markup, parsing using html5-parser')
|
||||
|
Loading…
x
Reference in New Issue
Block a user