CHM Input: Handle CHM files with no HHC ToC. Fixes #2056470 [Ability to render CHM when there is no TOC/.hhc file present:](https://bugs.launchpad.net/calibre/+bug/2056470)

2025-07-09 03:04:10 -04:00 · 2024-03-12 19:05:02 +05:30 · 2024-03-12 19:05:02 +05:30 · 91120754a3
commit 91120754a3
parent 4b88d010ef
5 changed files with 107 additions and 19 deletions
--- a/src/calibre/ebooks/chm/reader.py
+++ b/src/calibre/ebooks/chm/reader.py
@ -6,13 +6,14 @@ __copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
 import codecs
 import os
 import re
+import struct
+from chm.chm import CHMFile, chmlib

 from calibre import guess_type as guess_mimetype
 from calibre.constants import filesystem_encoding, iswindows
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.toc import TOC
-from chm.chm import CHMFile, chmlib
 from polyglot.builtins import as_unicode


@ -73,6 +74,22 @@ class CHMReader(CHMFile):
        self.root = os.path.splitext(base.lstrip('/'))[0]
        self.hhc_path = self.root + ".hhc"

+    def relpath_to_first_html_file(self):
+        # See https://www.nongnu.org/chmspec/latest/Internal.html#SYSTEM
+        data = self.GetFile('/#SYSTEM')
+        pos = 4
+        while pos < len(data):
+            code, length_of_data = struct.unpack_from('<HH', data, pos)
+            pos += 4
+            if code == 2:
+                default_topic = data[pos:pos+length_of_data].rstrip(b'\0')
+                break
+            pos += length_of_data
+        else:
+            raise CHMError('No default topic found in CHM file that has no HHC ToC either')
+        default_topic = self.decode_hhp_filename(b'/' + default_topic)
+        return default_topic[1:]
+
    def decode_hhp_filename(self, path):
        if isinstance(path, str):
            return path
@ -140,6 +157,10 @@ class CHMReader(CHMFile):
            path = path.encode('utf-8')
        return CHMFile.ResolveObject(self, path)

+    def file_exists(self, path):
+        res, ui = self.ResolveObject(path)
+        return res == chmlib.CHM_RESOLVE_SUCCESS
+
    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
--- a/src/calibre/ebooks/conversion/plugins/chm_input.py
+++ b/src/calibre/ebooks/conversion/plugins/chm_input.py
@ -63,7 +63,6 @@ class CHMInput(InputFormatPlugin):
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))
            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
-            self._chm_reader.CloseCHM()
            # print((tdir, mainpath))
            # from calibre import ipython
            # ipython()
@ -74,6 +73,7 @@ class CHMInput(InputFormatPlugin):
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'
            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
+            self._chm_reader.CloseCHM()
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            if toc.count() > 1:
@ -102,6 +102,8 @@ class CHMInput(InputFormatPlugin):
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
        opts.breadth_first = True
+        opts.max_levels = 30
+        opts.correct_case_mismatches = True
        htmlinput = HTMLInput(None)
        htmlinput.set_root_dir_of_input(basedir)
        htmlinput.root_dir_for_absolute_links = basedir
@ -113,7 +115,12 @@ class CHMInput(InputFormatPlugin):
        from polyglot.urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode
-        hhcdata = self._read_file(hhcpath)
+        try:
+            hhcdata = self._read_file(hhcpath)
+        except FileNotFoundError:
+            log.warn('No HHC file found in CHM, using the default topic as the first HTML file')
+            from calibre.ebooks.oeb.base import TOC
+            return os.path.join(os.path.dirname(hhcpath), self._chm_reader.relpath_to_first_html_file()), TOC()
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                            strip_encoding_pats=True, resolve_entities=True)[0]
--- a/src/calibre/ebooks/conversion/plugins/html_input.py
+++ b/src/calibre/ebooks/conversion/plugins/html_input.py
@ -8,12 +8,15 @@ __docformat__ = 'restructuredtext en'
 import os
 import re
 import tempfile
+from contextlib import suppress
 from functools import partial
 from urllib.parse import quote

-from calibre.constants import isbsd, islinux, filesystem_encoding
+from calibre.constants import filesystem_encoding, isbsd, islinux
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
-from calibre.utils.filenames import ascii_filename, get_long_path_name
+from calibre.utils.filenames import (
+    ascii_filename, case_ignoring_open_file, get_long_path_name,
+)
 from calibre.utils.imghdr import what
 from calibre.utils.localization import __, get_lang
 from polyglot.builtins import as_unicode
@ -293,7 +296,13 @@ class HTMLInput(InputFormatPlugin):
        except:
            return link_
        if not os.access(link, os.R_OK):
-            return link_
+            corrected = False
+            if getattr(self.opts, 'correct_case_mismatches', False):
+                with suppress(OSError), case_ignoring_open_file(link) as f:
+                    link = f.name
+                    corrected = True
+            if not corrected:
+                return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -10,12 +10,16 @@ __docformat__ = 'restructuredtext en'
 Input plugin for HTML or OPF ebooks.
 '''

-import os, re, sys,  errno as gerrno
+import errno as gerrno
+import os
+import re
+import sys

-from calibre.ebooks.oeb.base import urlunquote
-from calibre.ebooks.chardet import detect_xml_encoding
+from calibre import replace_entities, unicode_path
 from calibre.constants import iswindows
-from calibre import unicode_path, replace_entities
+from calibre.ebooks.chardet import detect_xml_encoding
+from calibre.ebooks.oeb.base import urlunquote
+from calibre.utils.filenames import case_ignoring_open_file
 from polyglot.urllib import urlparse, urlunparse


@ -92,21 +96,23 @@ class HTMLFile:
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

-    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
+    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None, correct_case_mismatches=False):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
-        self.title    = os.path.splitext(os.path.basename(self.path))[0]
-        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []

        try:
-            with open(self.path, 'rb') as f:
+            f = case_ignoring_open_file(self.path, 'rb') if correct_case_mismatches else open(self.path, 'rb')
+            self.path = f.name
+            self.base = os.path.dirname(self.path)
+            self.title = os.path.splitext(os.path.basename(self.path))[0]
+            with f:
                src = header = f.read(4096)
                encoding = detect_xml_encoding(src)[1]
                if encoding:
@ -238,7 +244,7 @@ def find_tests():
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestHTMLInput)


-def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
+def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None, correct_case_mismatches=False):
    '''
    Recursively traverse all links in the HTML file.

@ -251,7 +257,8 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
    '''
    assert max_levels >= 0
    level = 0
-    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
+    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose, correct_case_mismatches=correct_case_mismatches)]
+    seen = {flat[0].path}
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
@ -259,10 +266,13 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
        for hf in next_level:
            rejects = []
            for link in hf.links:
-                if link.path is None or link.path in flat:
+                if link.path is None or link.path in seen:
                    continue
                try:
-                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
+                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf, correct_case_mismatches=correct_case_mismatches)
+                    if nf.path in seen:
+                        continue
+                    seen.add(nf.path)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
@ -285,7 +295,7 @@ def get_filelist(htmlfile, dir, opts, log):
    '''
    log.info('Building file list...')
    filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
-                        verbose=opts.verbose,
+                        verbose=opts.verbose, correct_case_mismatches=getattr(opts, 'correct_case_mismatches', False),
                        encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
--- a/src/calibre/utils/filenames.py
+++ b/src/calibre/utils/filenames.py
@ -133,6 +133,47 @@ def is_case_sensitive(path):
    return is_case_sensitive


+def case_ignoring_open_file(path, mode='r'):
+    '''
+    Open an existing file case insensitively, even on case sensitive file systems
+    '''
+    try:
+        return open(path, mode)
+    except FileNotFoundError as err:
+        original_err = err
+
+    def next_component(final_path, components):
+        if not components:
+            return final_path
+        component = components.pop()
+        cl = component.lower()
+        try:
+            matches = {x for x in os.listdir(final_path) if x.lower() == cl}
+        except OSError:
+            raise original_err from None
+        for x in matches:
+            current = os.path.join(final_path, x)
+            try:
+                return next_component(current, list(components))
+            except Exception:
+                continue
+        raise original_err
+
+    if isbytestring(path):
+        path = path.decode(filesystem_encoding)
+    if path.endswith(os.sep):
+        path = path[:-1]
+    if not path:
+        raise ValueError('Path must not point to root')
+
+    components = path.split(os.sep)
+    if len(components) <= 1:
+        raise ValueError(f'Invalid path: {path}')
+    final_path = (components[0].upper() + os.sep) if iswindows else '/'
+    components = list(reversed(components))[:-1]
+    return open(next_component(final_path, components), mode)
+
+
 def case_preserving_open_file(path, mode='wb', mkdir_mode=0o777):
    '''
    Open the file pointed to by path with the specified mode. If any