From 56a9b9529a479d1624765fc94938eaed866287f8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 5 Oct 2020 16:00:54 +0530
Subject: [PATCH] MOBI Input: Fix regression that broke reading of some
 documents

Apparently lxml.html is super fragile on Windows with Python 3, so
fall back to html5-parser when it barfs.
---
 src/calibre/ebooks/mobi/reader/mobi6.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py
index 5498d1714f..08d2072371 100644
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@@ -192,7 +192,15 @@ class MobiReader(object):
         except Exception:
             self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
             self.processed_html = self.remove_random_bytes(self.processed_html)
-            root = html.fromstring(self.processed_html)
+            try:
+                root = html.fromstring(self.processed_html)
+            except Exception:
+                self.log.warning('MOBI markup could not be parsed by lxml using html5-parser')
+                # Happens on Windows with Python 3, where lxml causes libxml to die with an
+                # error about using UCS-4 little-endian encoding if certain
+                # characters are present in the input.
+                from html5_parser import parse
+                root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
         if root.xpath('descendant::p/descendant::p'):
             from html5_parser import parse
             self.log.warning('Malformed markup, parsing using html5-parser')