CHM Input: Handle CHM files that contain files with URL-unsafe filenames. Fixes #1100610 (Error in "Table of contents" when convert CHM to epub)

2025-08-30 23:00:21 -04:00 · 2013-01-17 17:46:32 +05:30 · 2013-01-17 17:46:32 +05:30 · 79823fbf61
commit 79823fbf61
parent b9581e5905
1 changed files with 26 additions and 6 deletions
--- a/src/calibre/ebooks/conversion/plugins/chm_input.py
+++ b/src/calibre/ebooks/conversion/plugins/chm_input.py
@ -58,9 +58,9 @@ class CHMInput(InputFormatPlugin):
            metadata = get_metadata_from_reader(self._chm_reader)
            self._chm_reader.CloseCHM()
-            #print tdir
+            # print tdir, mainpath
-            #from calibre import ipython
+            # from calibre import ipython
-            #ipython()
+            # ipython()
            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
@ -143,6 +143,8 @@ class CHMInput(InputFormatPlugin):
    def _create_html_root(self, hhcpath, log):
        from lxml import html
        from urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        hhcdata = self._read_file(hhcpath)
        hhcroot = html.fromstring(hhcdata)
        chapters = self._process_nodes(hhcroot)
@ -152,23 +154,41 @@ class CHMInput(InputFormatPlugin):
        #print "============================="
        log.debug('Found %d section nodes' % len(chapters))
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))
        def unquote(x):
            """Percent-decode x and return the result as unicode text.

            Unicode input is first encoded to UTF-8 bytes; the decoded bytes
            are then interpreted as UTF-8.
            """
            raw = x.encode('utf-8') if isinstance(x, unicode) else x
            decoded = _unquote(raw)
            return decoded.decode('utf-8')
        def unquote_path(x):
            """Choose between x and its percent-decoded form.

            The decoded form is returned only when x does not exist relative
            to ``base`` but the decoded form does; in every other case x is
            returned unchanged.
            """
            decoded = unquote(x)
            # The path as given wins whenever it is present on disk.
            if os.path.exists(os.path.join(base, x)):
                return x
            # Otherwise use the decoded name, but only if that one exists.
            if os.path.exists(os.path.join(base, decoded)):
                return decoded
            return x
        with open(htmlpath, 'wb') as f:
            if chapters:
                f.write('<html><head><meta http-equiv="Content-type"'
                    ' content="text/html;charset=UTF-8" /></head><body>\n')
                path0 = chapters[0][1]
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                for chapter in chapters:
                    title = chapter[0]
-                    rsrcname = os.path.basename(chapter[1])
+                    raw = unquote_path(chapter[1])
                    rsrcname = os.path.basename(raw)
                    rsrcpath = os.path.join(subpath, rsrcname)
                    if (not os.path.exists(os.path.join(base, rsrcpath)) and
-                            os.path.exists(os.path.join(base, chapter[1]))):
+                            os.path.exists(os.path.join(base, raw))):
-                        rsrcpath = chapter[1]
+                        rsrcpath = raw
                    # title should already be url encoded
                    if '%' not in rsrcpath:
                        rsrcpath = urlquote(rsrcpath)
                    url = "<br /><a href=" + rsrcpath + ">" + title + " </a>\n"
                    if isinstance(url, unicode):
                        url = url.encode('utf-8')