diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 102f439857..7da8e4efe6 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -6,13 +6,14 @@ __copyright__ = '2008, Kovid Goyal ,' \ import codecs import os import re +import struct +from chm.chm import CHMFile, chmlib from calibre import guess_type as guess_mimetype from calibre.constants import filesystem_encoding, iswindows from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata.toc import TOC -from chm.chm import CHMFile, chmlib from polyglot.builtins import as_unicode @@ -73,6 +74,22 @@ class CHMReader(CHMFile): self.root = os.path.splitext(base.lstrip('/'))[0] self.hhc_path = self.root + ".hhc" + def relpath_to_first_html_file(self): + # See https://www.nongnu.org/chmspec/latest/Internal.html#SYSTEM + data = self.GetFile('/#SYSTEM') + pos = 4 + while pos < len(data): + code, length_of_data = struct.unpack_from(' 1: @@ -102,6 +102,8 @@ class CHMInput(InputFormatPlugin): # use HTMLInput plugin to generate book from calibre.customize.builtins import HTMLInput opts.breadth_first = True + opts.max_levels = 30 + opts.correct_case_mismatches = True htmlinput = HTMLInput(None) htmlinput.set_root_dir_of_input(basedir) htmlinput.root_dir_for_absolute_links = basedir @@ -113,7 +115,12 @@ class CHMInput(InputFormatPlugin): from polyglot.urllib import unquote as _unquote from calibre.ebooks.oeb.base import urlquote from calibre.ebooks.chardet import xml_to_unicode - hhcdata = self._read_file(hhcpath) + try: + hhcdata = self._read_file(hhcpath) + except FileNotFoundError: + log.warn('No HHC file found in CHM, using the default topic as the first HTML file') + from calibre.ebooks.oeb.base import TOC + return os.path.join(os.path.dirname(hhcpath), self._chm_reader.relpath_to_first_html_file()), TOC() hhcdata = hhcdata.decode(encoding) hhcdata = 
xml_to_unicode(hhcdata, verbose=True, strip_encoding_pats=True, resolve_entities=True)[0] diff --git a/src/calibre/ebooks/conversion/plugins/html_input.py b/src/calibre/ebooks/conversion/plugins/html_input.py index 56bb02c3e0..3ab35fea0d 100644 --- a/src/calibre/ebooks/conversion/plugins/html_input.py +++ b/src/calibre/ebooks/conversion/plugins/html_input.py @@ -8,12 +8,15 @@ __docformat__ = 'restructuredtext en' import os import re import tempfile +from contextlib import suppress from functools import partial from urllib.parse import quote -from calibre.constants import isbsd, islinux, filesystem_encoding +from calibre.constants import filesystem_encoding, isbsd, islinux from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation -from calibre.utils.filenames import ascii_filename, get_long_path_name +from calibre.utils.filenames import ( + ascii_filename, case_ignoring_open_file, get_long_path_name, +) from calibre.utils.imghdr import what from calibre.utils.localization import __, get_lang from polyglot.builtins import as_unicode @@ -293,7 +296,13 @@ class HTMLInput(InputFormatPlugin): except: return link_ if not os.access(link, os.R_OK): - return link_ + corrected = False + if getattr(self.opts, 'correct_case_mismatches', False): + with suppress(OSError), case_ignoring_open_file(link) as f: + link = f.name + corrected = True + if not corrected: + return link_ if os.path.isdir(link): self.log.warn(link_, 'is a link to a directory. Ignoring.') return link_ diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index baf7794d0f..915dd042a4 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -10,12 +10,16 @@ __docformat__ = 'restructuredtext en' Input plugin for HTML or OPF ebooks. 
''' -import os, re, sys, errno as gerrno +import errno as gerrno +import os +import re +import sys -from calibre.ebooks.oeb.base import urlunquote -from calibre.ebooks.chardet import detect_xml_encoding +from calibre import replace_entities, unicode_path from calibre.constants import iswindows -from calibre import unicode_path, replace_entities +from calibre.ebooks.chardet import detect_xml_encoding +from calibre.ebooks.oeb.base import urlunquote +from calibre.utils.filenames import case_ignoring_open_file from polyglot.urllib import urlparse, urlunparse @@ -92,21 +96,23 @@ class HTMLFile: r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P[^"]+)")|(?:\'(?P[^\']+)\')|(?P[^\s>]+))', re.DOTALL|re.IGNORECASE) - def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None): + def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None, correct_case_mismatches=False): ''' :param level: The level of this file. Should be 0 for the root file. :param encoding: Use `encoding` to decode HTML. :param referrer: The :class:`HTMLFile` that first refers to this file. 
''' self.path = unicode_path(path_to_html_file, abs=True) - self.title = os.path.splitext(os.path.basename(self.path))[0] - self.base = os.path.dirname(self.path) self.level = level self.referrer = referrer self.links = [] try: - with open(self.path, 'rb') as f: + f = case_ignoring_open_file(self.path, 'rb') if correct_case_mismatches else open(self.path, 'rb') + self.path = f.name + self.base = os.path.dirname(self.path) + self.title = os.path.splitext(os.path.basename(self.path))[0] + with f: src = header = f.read(4096) encoding = detect_xml_encoding(src)[1] if encoding: @@ -238,7 +244,7 @@ def find_tests(): return unittest.defaultTestLoader.loadTestsFromTestCase(TestHTMLInput) -def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None): +def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None, correct_case_mismatches=False): ''' Recursively traverse all links in the HTML file. @@ -251,7 +257,8 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None ''' assert max_levels >= 0 level = 0 - flat = [HTMLFile(path_to_html_file, level, encoding, verbose)] + flat = [HTMLFile(path_to_html_file, level, encoding, verbose, correct_case_mismatches=correct_case_mismatches)] + seen = {flat[0].path} next_level = list(flat) while level < max_levels and len(next_level) > 0: level += 1 @@ -259,10 +266,13 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None for hf in next_level: rejects = [] for link in hf.links: - if link.path is None or link.path in flat: + if link.path is None or link.path in seen: continue try: - nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf) + nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf, correct_case_mismatches=correct_case_mismatches) + if nf.path in seen: + continue + seen.add(nf.path) if nf.is_binary: raise IgnoreFile('%s is a binary file'%nf.path, -1) nl.append(nf) @@ -285,7 +295,7 @@ def get_filelist(htmlfile, dir, 
def case_ignoring_open_file(path, mode='r'):
    '''
    Open an existing file case insensitively, even on case sensitive file systems.

    The path is first opened exactly as given. Only when that fails with
    :class:`FileNotFoundError` is the path resolved component by component,
    starting at the filesystem root (or drive), matching every component
    against the directory listing while ignoring case.

    :param path: Path of the file to open, as ``str`` or ``bytes``. Relative
        paths are resolved against the current working directory.
    :param mode: Mode string passed unchanged to :func:`open`.
    :return: The open file object. When a case correction was needed, its
        ``name`` attribute is the path that was actually found on disk.
    :raises FileNotFoundError: The error from the initial :func:`open`, if no
        case-insensitive match exists either.
    :raises ValueError: If ``path`` is empty or has no components below the root.
    '''
    try:
        return open(path, mode)
    except FileNotFoundError as err:
        original_err = err

    def next_component(final_path, components):
        # components holds the not yet resolved path components, innermost
        # first, so pop() yields the next one below final_path.
        if not components:
            return final_path
        component = components.pop()
        cl = component.lower()
        try:
            matches = [x for x in os.listdir(final_path) if x.lower() == cl]
        except OSError:
            raise original_err from None
        # Try an exact-case match first and the rest in sorted order, so the
        # result is deterministic when several names differ only by case.
        matches.sort(key=lambda x: (x != component, x))
        for x in matches:
            try:
                # Pass a copy: a failed branch must not consume components
                # that the next candidate still needs.
                return next_component(os.path.join(final_path, x), list(components))
            except OSError:
                continue
        raise original_err

    if isinstance(path, bytes):
        # fsdecode never fails on undecodable bytes and produces the same
        # str form that os.listdir() returns for such names.
        path = os.fsdecode(path)
    if not path:
        raise ValueError('Path must not point to root')

    # abspath anchors relative paths at the working directory and collapses
    # duplicate/trailing separators, which would otherwise yield empty
    # components that can never match a directory entry.
    drive, rest = os.path.splitdrive(os.path.abspath(path))
    components = [c for c in rest.split(os.sep) if c]
    if not components:
        raise ValueError('Path must not point to root')
    components.reverse()
    # drive is '' on POSIX, so the walk starts at '/'; on Windows it is the
    # drive letter or UNC share, both of which are case insensitive.
    return open(next_component(drive.upper() + os.sep, components), mode)