From 56a9b9529a479d1624765fc94938eaed866287f8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 5 Oct 2020 16:00:54 +0530 Subject: [PATCH] MOBI Input: Fix regression that broke reading of some documents Apparently lxml.html is super fragile on Windows with Python 3, so fall back to html5-parser when it barfs. --- src/calibre/ebooks/mobi/reader/mobi6.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 5498d1714f..08d2072371 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -192,7 +192,15 @@ class MobiReader(object): except Exception: self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) - root = html.fromstring(self.processed_html) + try: + root = html.fromstring(self.processed_html) + except Exception: + self.log.warning('MOBI markup could not be parsed by lxml using html5-parser') + # Happens on windows with python 3 where lxml causes libxml to die with an + # error about using UCS-4 little endian encoding if certain + # characters are present in the input + from html5_parser import parse + root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True) if root.xpath('descendant::p/descendant::p'): from html5_parser import parse self.log.warning('Malformed markup, parsing using html5-parser')