CHM Input: Handle CHM files with no HHC ToC. Fixes #2056470 [Ability to render CHM when there is no TOC/.hhc file present:](https://bugs.launchpad.net/calibre/+bug/2056470)

This commit is contained in:
Kovid Goyal 2024-03-12 19:05:02 +05:30
parent 4b88d010ef
commit 91120754a3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 107 additions and 19 deletions

View File

@ -6,13 +6,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
import codecs
import os
import re
import struct
from chm.chm import CHMFile, chmlib
from calibre import guess_type as guess_mimetype
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.toc import TOC
from chm.chm import CHMFile, chmlib
from polyglot.builtins import as_unicode
@ -73,6 +74,22 @@ class CHMReader(CHMFile):
self.root = os.path.splitext(base.lstrip('/'))[0]
self.hhc_path = self.root + ".hhc"
def relpath_to_first_html_file(self):
    '''
    Return the path of the default topic of this CHM file, without its
    leading slash.

    The default topic is read from the ``/#SYSTEM`` internal file, which is
    scanned as a sequence of entries, each made of a two byte code, a two
    byte payload length (both little endian) and the payload itself. The
    payload of the first entry with code 2 is used as the default topic.

    :return: The default topic, decoded with :meth:`decode_hhp_filename`.
    :raises CHMError: If no entry with code 2 is present.
    '''
    # See https://www.nongnu.org/chmspec/latest/Internal.html#SYSTEM
    data = self.GetFile('/#SYSTEM')
    header_size = struct.calcsize('<HH')
    # Entries start after the first 4 bytes of the file (presumably the
    # version field described in the spec linked above)
    pos = 4
    # Require a complete entry header before unpacking, so that trailing
    # garbage or a truncated file is reported as a missing default topic
    # rather than as a struct.error
    while pos + header_size <= len(data):
        code, length_of_data = struct.unpack_from('<HH', data, pos)
        pos += header_size
        if code == 2:
            # The payload is NUL terminated/padded, drop the padding
            default_topic = data[pos:pos+length_of_data].rstrip(b'\0')
            break
        pos += length_of_data
    else:
        raise CHMError('No default topic found in CHM file that has no HHC ToC either')
    # Decode as an absolute internal path, then strip the slash again so the
    # result is relative
    default_topic = self.decode_hhp_filename(b'/' + default_topic)
    return default_topic[1:]
def decode_hhp_filename(self, path):
if isinstance(path, str):
return path
@ -140,6 +157,10 @@ class CHMReader(CHMFile):
path = path.encode('utf-8')
return CHMFile.ResolveObject(self, path)
def file_exists(self, path):
    '''
    Return True if ``path`` can be resolved to an object inside this CHM
    file, False otherwise.
    '''
    # ResolveObject() returns a (status, unit info) pair, only the status
    # is of interest here
    status, _unit_info = self.ResolveObject(path)
    return status == chmlib.CHM_RESOLVE_SUCCESS
def GetFile(self, path):
# have to have abs paths for ResolveObject, but Contents() deliberately
# makes them relative. So we don't have to worry, re-add the leading /.

View File

@ -63,7 +63,6 @@ class CHMInput(InputFormatPlugin):
from calibre.ebooks.metadata.book.base import Metadata
metadata = Metadata(os.path.basename(chm_name))
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
self._chm_reader.CloseCHM()
# print((tdir, mainpath))
# from calibre import ipython
# ipython()
@ -74,6 +73,7 @@ class CHMInput(InputFormatPlugin):
if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
uenc = 'utf-8'
htmlpath, toc = self._create_html_root(mainpath, log, uenc)
self._chm_reader.CloseCHM()
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
options.debug_pipeline = odi
if toc.count() > 1:
@ -102,6 +102,8 @@ class CHMInput(InputFormatPlugin):
# use HTMLInput plugin to generate book
from calibre.customize.builtins import HTMLInput
opts.breadth_first = True
opts.max_levels = 30
opts.correct_case_mismatches = True
htmlinput = HTMLInput(None)
htmlinput.set_root_dir_of_input(basedir)
htmlinput.root_dir_for_absolute_links = basedir
@ -113,7 +115,12 @@ class CHMInput(InputFormatPlugin):
from polyglot.urllib import unquote as _unquote
from calibre.ebooks.oeb.base import urlquote
from calibre.ebooks.chardet import xml_to_unicode
hhcdata = self._read_file(hhcpath)
try:
hhcdata = self._read_file(hhcpath)
except FileNotFoundError:
log.warn('No HHC file found in CHM, using the default topic as the first HTML file')
from calibre.ebooks.oeb.base import TOC
return os.path.join(os.path.dirname(hhcpath), self._chm_reader.relpath_to_first_html_file()), TOC()
hhcdata = hhcdata.decode(encoding)
hhcdata = xml_to_unicode(hhcdata, verbose=True,
strip_encoding_pats=True, resolve_entities=True)[0]

View File

@ -8,12 +8,15 @@ __docformat__ = 'restructuredtext en'
import os
import re
import tempfile
from contextlib import suppress
from functools import partial
from urllib.parse import quote
from calibre.constants import isbsd, islinux, filesystem_encoding
from calibre.constants import filesystem_encoding, isbsd, islinux
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.utils.filenames import ascii_filename, get_long_path_name
from calibre.utils.filenames import (
ascii_filename, case_ignoring_open_file, get_long_path_name,
)
from calibre.utils.imghdr import what
from calibre.utils.localization import __, get_lang
from polyglot.builtins import as_unicode
@ -293,7 +296,13 @@ class HTMLInput(InputFormatPlugin):
except:
return link_
if not os.access(link, os.R_OK):
return link_
corrected = False
if getattr(self.opts, 'correct_case_mismatches', False):
with suppress(OSError), case_ignoring_open_file(link) as f:
link = f.name
corrected = True
if not corrected:
return link_
if os.path.isdir(link):
self.log.warn(link_, 'is a link to a directory. Ignoring.')
return link_

View File

@ -10,12 +10,16 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks.
'''
import os, re, sys, errno as gerrno
import errno as gerrno
import os
import re
import sys
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.chardet import detect_xml_encoding
from calibre import replace_entities, unicode_path
from calibre.constants import iswindows
from calibre import unicode_path, replace_entities
from calibre.ebooks.chardet import detect_xml_encoding
from calibre.ebooks.oeb.base import urlunquote
from calibre.utils.filenames import case_ignoring_open_file
from polyglot.urllib import urlparse, urlunparse
@ -92,21 +96,23 @@ class HTMLFile:
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
re.DOTALL|re.IGNORECASE)
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None, correct_case_mismatches=False):
'''
:param level: The level of this file. Should be 0 for the root file.
:param encoding: Use `encoding` to decode HTML.
:param referrer: The :class:`HTMLFile` that first refers to this file.
'''
self.path = unicode_path(path_to_html_file, abs=True)
self.title = os.path.splitext(os.path.basename(self.path))[0]
self.base = os.path.dirname(self.path)
self.level = level
self.referrer = referrer
self.links = []
try:
with open(self.path, 'rb') as f:
f = case_ignoring_open_file(self.path, 'rb') if correct_case_mismatches else open(self.path, 'rb')
self.path = f.name
self.base = os.path.dirname(self.path)
self.title = os.path.splitext(os.path.basename(self.path))[0]
with f:
src = header = f.read(4096)
encoding = detect_xml_encoding(src)[1]
if encoding:
@ -238,7 +244,7 @@ def find_tests():
return unittest.defaultTestLoader.loadTestsFromTestCase(TestHTMLInput)
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None, correct_case_mismatches=False):
'''
Recursively traverse all links in the HTML file.
@ -251,7 +257,8 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
'''
assert max_levels >= 0
level = 0
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
flat = [HTMLFile(path_to_html_file, level, encoding, verbose, correct_case_mismatches=correct_case_mismatches)]
seen = {flat[0].path}
next_level = list(flat)
while level < max_levels and len(next_level) > 0:
level += 1
@ -259,10 +266,13 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
for hf in next_level:
rejects = []
for link in hf.links:
if link.path is None or link.path in flat:
if link.path is None or link.path in seen:
continue
try:
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf, correct_case_mismatches=correct_case_mismatches)
if nf.path in seen:
continue
seen.add(nf.path)
if nf.is_binary:
raise IgnoreFile('%s is a binary file'%nf.path, -1)
nl.append(nf)
@ -285,7 +295,7 @@ def get_filelist(htmlfile, dir, opts, log):
'''
log.info('Building file list...')
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
verbose=opts.verbose,
verbose=opts.verbose, correct_case_mismatches=getattr(opts, 'correct_case_mismatches', False),
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
if opts.verbose:
log.debug('\tFound files...')

View File

@ -133,6 +133,47 @@ def is_case_sensitive(path):
return is_case_sensitive
def case_ignoring_open_file(path, mode='r'):
    '''
    Open an existing file case insensitively, even on case sensitive file systems

    The path is first opened exactly as given. Only if that fails with
    :class:`FileNotFoundError` is the directory tree walked from the root,
    matching every path component without regard to case.

    :param path: Path to the file, as text or bytes. Relative paths are
        resolved against the current working directory.
    :param mode: The mode, passed unchanged to :func:`open`.
    :return: The opened file object. Its ``name`` attribute is the path that
        was actually opened, which may differ in case from ``path``.
    :raises FileNotFoundError: The error from the initial open attempt, if no
        case insensitive match exists.
    :raises ValueError: If the path is empty, points to the root or cannot be
        split into more than one component.
    '''
    try:
        return open(path, mode)
    except FileNotFoundError as err:
        # Kept so that a failed search reports the path the caller asked for
        original_err = err

    def next_component(final_path, components):
        # components is a stack, the next name to match is its last item
        if not components:
            return final_path
        component = components.pop()
        cl = component.lower()
        try:
            names = os.listdir(final_path)
        except OSError:
            raise original_err from None
        # Try the exact spelling first and the rest in sorted order, so that
        # the result is deterministic when several case variants exist
        matches = sorted((x for x in names if x.lower() == cl), key=lambda x: (x != component, x))
        for x in matches:
            current = os.path.join(final_path, x)
            try:
                # Pass a copy so a failed branch does not consume the stack
                return next_component(current, list(components))
            except OSError:
                # Dead end, backtrack and try the next case variant
                continue
        raise original_err

    if isbytestring(path):
        path = path.decode(filesystem_encoding)
    if path.endswith(os.sep):
        path = path[:-1]
    if not path:
        raise ValueError('Path must not point to root')
    if not os.path.isabs(path):
        # The search below always starts at the root, so a relative path has
        # to be anchored first or its leading component would be dropped
        path = os.path.abspath(path)

    components = path.split(os.sep)
    if len(components) <= 1:
        raise ValueError(f'Invalid path: {path}')
    # The first component is the drive on Windows and empty elsewhere
    final_path = (components[0].upper() + os.sep) if iswindows else '/'
    components = list(reversed(components))[:-1]
    return open(next_component(final_path, components), mode)
def case_preserving_open_file(path, mode='wb', mkdir_mode=0o777):
'''
Open the file pointed to by path with the specified mode. If any