CHM Input: Fix handling of some CHM files that use non-ascii internal filenames and don't specify a character encoding in their metadata. Fixes #1924703 [E-book viewer: another error opening a *.chm file](https://bugs.launchpad.net/calibre/+bug/1924703)

2025-07-09 03:04:10 -04:00 · 2021-04-17 09:28:38 +05:30 · 2021-04-17 09:28:38 +05:30 · 1a5fa973ea
commit 1a5fa973ea
parent 59d39cca4c
2 changed files with 53 additions and 39 deletions
--- a/src/calibre/ebooks/chm/metadata.py
+++ b/src/calibre/ebooks/chm/metadata.py
@@ -137,7 +137,7 @@ def _get_cover(soup, rdr):
 def get_metadata_from_reader(rdr):
-    raw = rdr.GetFile(rdr.home)
+    raw = rdr.get_home()
    home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
        resolve_entities=True)[0])
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -56,40 +56,62 @@ class CHMReader(CHMFile):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self.input_encoding = input_encoding
        self.chm_encoding = self.get_encoding() or 'cp1252'
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        self.re_encoded_files = set()
        self.get_encodings()
        if self.home:
-            self.home = as_unicode(self.home, self.chm_encoding)
+            self.home = self.decode_hhp_filename(self.home)
        if self.topics:
-            self.topics = as_unicode(self.topics, self.chm_encoding)
+            self.topics = self.decode_hhp_filename(self.topics)
        # location of '.hhc' file, which is the CHM TOC.
-        if self.topics is None:
+        base = self.topics or self.home
-            self.root, ext = os.path.splitext(self.home.lstrip('/'))
+        self.root = os.path.splitext(base.lstrip('/'))[0]
-            self.hhc_path = self.root + ".hhc"
+        self.hhc_path = self.root + ".hhc"
-        else:
+
-            self.root, ext = os.path.splitext(self.topics.lstrip('/'))
+    def decode_hhp_filename(self, path):
-            self.hhc_path = self.root + ".hhc"
+        if isinstance(path, str):
            return path
        for enc in (self.encoding_from_system_file, self.encoding_from_lcid, 'cp1252', 'cp1251', 'latin1', 'utf-8'):
            if enc:
                try:
                    q = path.decode(enc)
                except UnicodeDecodeError:
                    continue
                res, ui = self.ResolveObject(q)
                if res == chmlib.CHM_RESOLVE_SUCCESS:
                    return q
    def get_encodings(self):
        self.encoding_from_system_file = self.encoding_from_lcid = None
        q = self.GetEncoding()
        if q:
            try:
                if isinstance(q, bytes):
                    q = q.decode('ascii')
                    codecs.lookup(q)
                    self.encoding_from_system_file = q
            except Exception:
                pass
        lcid = self.GetLCID()
        if lcid is not None:
            q = lcid[0]
            if q:
                try:
                    if isinstance(q, bytes):
                        q = q.decode('ascii')
                        codecs.lookup(q)
                        self.encoding_from_lcid = q
                except Exception:
                    pass
    def get_encoding(self):
-        ans = self.GetEncoding()
+        return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'
        if ans is None:
            lcid = self.GetLCID()
            if lcid is not None:
                ans = lcid[0]
        if ans:
            try:
                if isinstance(ans, bytes):
                    ans = ans.decode('ascii')
                codecs.lookup(ans)
            except Exception:
                ans = None
        return ans
    def _parse_toc(self, ul, basedir=getcwd()):
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
@@ -112,14 +134,9 @@ class CHMReader(CHMFile):
        return toc
    def ResolveObject(self, path):
        opath = path
        if not isinstance(path, bytes):
-            path = path.encode(self.chm_encoding)
+            path = path.encode('utf-8')
-        ans = CHMFile.ResolveObject(self, path)
+        return CHMFile.ResolveObject(self, path)
        if ans[0] != chmlib.CHM_RESOLVE_SUCCESS and not isinstance(opath, bytes):
            path = opath.encode('utf-8')
            ans = CHMFile.ResolveObject(self, path)
        return ans
    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
@@ -132,16 +149,16 @@ class CHMReader(CHMFile):
            raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
        size, data = self.RetrieveObject(ui)
        if size == 0:
-            raise CHMError("'%s' is zero bytes in length!"%(path,))
+            raise CHMError(f"{path!r} is zero bytes in length!")
        return data
    def get_home(self):
        return self.GetFile(self.home)
    def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
        html_files = set()
        enc = self.chm_encoding
        for path in self.Contents():
            fpath = path
            if not isinstance(path, unicode_type):
                fpath = path.decode(enc)
            lpath = os.path.join(output_dir, fpath)
            self._ensure_dir(lpath)
            try:
@@ -300,11 +317,8 @@ class CHMReader(CHMFile):
        paths = []
        def get_paths(chm, ui, ctx):
-            try:
+            # these are supposed to be UTF-8 in CHM as best as I can determine
-                path = as_unicode(ui.path, self.chm_encoding)
+            path = as_unicode(ui.path, 'utf-8')
            except UnicodeDecodeError:
                path = as_unicode(ui.path, 'utf-8')
            # skip directories
            # note this path refers to the internal CHM structure
            if path[-1] != '/':