CHM Input: Fix incorrect decoding for CHM files whose hhc file is also a content file. Fixes #1151721 (Private bug)

2025-07-09 03:04:10 -04:00 · 2013-03-07 22:58:52 +05:30 · 2013-03-07 22:58:52 +05:30 · d46af974bc
commit d46af974bc
parent 55030cc0b0
2 changed files with 10 additions and 5 deletions
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@@ -53,6 +53,7 @@ class CHMReader(CHMFile):
        self._playorder = 0
        self._metadata = False
        self._extracted = False
+        self.re_encoded_files = set()

        # location of '.hhc' file, which is the CHM TOC.
        if self.topics is None:
@@ -147,8 +148,8 @@ class CHMReader(CHMFile):
                f.write(data)

        self._extracted = True
-        files = [x for x in os.listdir(output_dir) if
-                os.path.isfile(os.path.join(output_dir, x))]
+        files = [y for y in os.listdir(output_dir) if
+                os.path.isfile(os.path.join(output_dir, y))]
        if self.hhc_path not in files:
            for f in files:
                if f.lower() == self.hhc_path.lower():
@@ -249,7 +250,9 @@ class CHMReader(CHMFile):
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
-            return str(soup)
+            ans = str(soup)
+            self.re_encoded_files.add(os.path.abspath(htmlpath))
+            return ans
        except RuntimeError:
            return data

--- a/src/calibre/ebooks/conversion/plugins/chm_input.py
+++ b/src/calibre/ebooks/conversion/plugins/chm_input.py
@@ -25,7 +25,6 @@ class CHMInput(InputFormatPlugin):
        self._chm_reader = rdr
        return rdr.hhc_path

-
    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
@@ -63,7 +62,10 @@ class CHMInput(InputFormatPlugin):

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
-            htmlpath, toc = self._create_html_root(mainpath, log, encoding)
+            uenc = encoding
+            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
+                uenc = 'utf-8'
+            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            if toc.count() > 1: