From 1a5fa973ea79fa4ec962d949af269a97f1663d3c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 17 Apr 2021 09:28:38 +0530
Subject: [PATCH] CHM Input: Fix handling of some CHM files that use non-ascii
 internal filenames and dont specify a character encoding in their metadata.
 Fixes #1924703 [E-book viewer: another error opening a *.chm
 file](https://bugs.launchpad.net/calibre/+bug/1924703)

---
 src/calibre/ebooks/chm/metadata.py |  2 +-
 src/calibre/ebooks/chm/reader.py   | 90 +++++++++++++++++-------------
 2 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/src/calibre/ebooks/chm/metadata.py b/src/calibre/ebooks/chm/metadata.py
index 9278f758bd..712edb9228 100644
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -137,7 +137,7 @@ def _get_cover(soup, rdr):
 
 
 def get_metadata_from_reader(rdr):
-    raw = rdr.GetFile(rdr.home)
+    raw = rdr.get_home()
     home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
         resolve_entities=True)[0])
 
diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py
index b101aed047..2ede9fe140 100644
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -56,40 +56,62 @@ class CHMReader(CHMFile):
             raise CHMError("Unable to open CHM file '%s'"%(input,))
         self.log = log
         self.input_encoding = input_encoding
-        self.chm_encoding = self.get_encoding() or 'cp1252'
         self._sourcechm = input
         self._contents = None
         self._playorder = 0
         self._metadata = False
         self._extracted = False
         self.re_encoded_files = set()
+        self.get_encodings()
         if self.home:
-            self.home = as_unicode(self.home, self.chm_encoding)
+            self.home = self.decode_hhp_filename(self.home)
         if self.topics:
-            self.topics = as_unicode(self.topics, self.chm_encoding)
+            self.topics = self.decode_hhp_filename(self.topics)
 
         # location of '.hhc' file, which is the CHM TOC.
-        if self.topics is None:
-            self.root, ext = os.path.splitext(self.home.lstrip('/'))
-            self.hhc_path = self.root + ".hhc"
-        else:
-            self.root, ext = os.path.splitext(self.topics.lstrip('/'))
-            self.hhc_path = self.root + ".hhc"
+        base = self.topics or self.home
+        self.root = os.path.splitext(base.lstrip('/'))[0]
+        self.hhc_path = self.root + ".hhc"
+
+    def decode_hhp_filename(self, path):
+        if isinstance(path, str):
+            return path
+        for enc in (self.encoding_from_system_file, self.encoding_from_lcid, 'cp1252', 'cp1251', 'latin1', 'utf-8'):
+            if enc:
+                try:
+                    q = path.decode(enc)
+                except UnicodeDecodeError:
+                    continue
+                res, ui = self.ResolveObject(q)
+                if res == chmlib.CHM_RESOLVE_SUCCESS:
+                    return q
+
+    def get_encodings(self):
+        self.encoding_from_system_file = self.encoding_from_lcid = None
+        q = self.GetEncoding()
+        if q:
+            try:
+                if isinstance(q, bytes):
+                    q = q.decode('ascii')
+                    codecs.lookup(q)
+                    self.encoding_from_system_file = q
+            except Exception:
+                pass
+
+        lcid = self.GetLCID()
+        if lcid is not None:
+            q = lcid[0]
+            if q:
+                try:
+                    if isinstance(q, bytes):
+                        q = q.decode('ascii')
+                        codecs.lookup(q)
+                        self.encoding_from_lcid = q
+                except Exception:
+                    pass
 
     def get_encoding(self):
-        ans = self.GetEncoding()
-        if ans is None:
-            lcid = self.GetLCID()
-            if lcid is not None:
-                ans = lcid[0]
-        if ans:
-            try:
-                if isinstance(ans, bytes):
-                    ans = ans.decode('ascii')
-                codecs.lookup(ans)
-            except Exception:
-                ans = None
-        return ans
+        return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'
 
     def _parse_toc(self, ul, basedir=getcwd()):
         toc = TOC(play_order=self._playorder, base_path=basedir, text='')
@@ -112,14 +134,9 @@ class CHMReader(CHMFile):
         return toc
 
     def ResolveObject(self, path):
-        opath = path
         if not isinstance(path, bytes):
-            path = path.encode(self.chm_encoding)
-        ans = CHMFile.ResolveObject(self, path)
-        if ans[0] != chmlib.CHM_RESOLVE_SUCCESS and not isinstance(opath, bytes):
-            path = opath.encode('utf-8')
-            ans = CHMFile.ResolveObject(self, path)
-        return ans
+            path = path.encode('utf-8')
+        return CHMFile.ResolveObject(self, path)
 
     def GetFile(self, path):
         # have to have abs paths for ResolveObject, but Contents() deliberately
@@ -132,16 +149,16 @@ class CHMReader(CHMFile):
             raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
         size, data = self.RetrieveObject(ui)
         if size == 0:
-            raise CHMError("'%s' is zero bytes in length!"%(path,))
+            raise CHMError(f"{path!r} is zero bytes in length!")
         return data
 
+    def get_home(self):
+        return self.GetFile(self.home)
+
     def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
         html_files = set()
-        enc = self.chm_encoding
         for path in self.Contents():
             fpath = path
-            if not isinstance(path, unicode_type):
-                fpath = path.decode(enc)
             lpath = os.path.join(output_dir, fpath)
             self._ensure_dir(lpath)
             try:
@@ -300,11 +317,8 @@ class CHMReader(CHMFile):
         paths = []
 
         def get_paths(chm, ui, ctx):
-            try:
-                path = as_unicode(ui.path, self.chm_encoding)
-            except UnicodeDecodeError:
-                path = as_unicode(ui.path, 'utf-8')
-
+            # these are supposed to be UTF-8 in CHM as best as I can determine
+            path = as_unicode(ui.path, 'utf-8')
             # skip directories
             # note this path refers to the internal CHM structure
             if path[-1] != '/':