mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
CHM Input: Handle CHM files with no HHC ToC. Fixes #2056470 [Ability to render CHM when there is no TOC/.hhc file present](https://bugs.launchpad.net/calibre/+bug/2056470)
This commit is contained in:
parent
4b88d010ef
commit
91120754a3
@ -6,13 +6,14 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||
import codecs
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
from chm.chm import CHMFile, chmlib
|
||||
|
||||
from calibre import guess_type as guess_mimetype
|
||||
from calibre.constants import filesystem_encoding, iswindows
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from chm.chm import CHMFile, chmlib
|
||||
from polyglot.builtins import as_unicode
|
||||
|
||||
|
||||
@ -73,6 +74,22 @@ class CHMReader(CHMFile):
|
||||
self.root = os.path.splitext(base.lstrip('/'))[0]
|
||||
self.hhc_path = self.root + ".hhc"
|
||||
|
||||
def relpath_to_first_html_file(self):
    '''
    Return the default topic recorded in the ``/#SYSTEM`` internal file.

    The ``/#SYSTEM`` data is scanned as a sequence of entries, each made of
    a little-endian ``(code, length)`` pair of unsigned 16-bit integers
    followed by ``length`` bytes of data. The data of the first entry with
    code 2 is taken as the default topic.

    :return: The default topic, decoded with :meth:`decode_hhp_filename`,
             without a leading slash.
    :raises CHMError: If there is no code 2 entry, if its value is empty,
                      or if the entry list is truncated before one is found.
    '''
    # See https://www.nongnu.org/chmspec/latest/Internal.html#SYSTEM
    data = self.GetFile('/#SYSTEM')
    entry_header = struct.Struct('<HH')
    pos = 4  # the entry list starts after the first four bytes
    default_topic = b''
    # Only read a header when all of its bytes are present, so that trailing
    # or truncated data ends the scan instead of raising struct.error.
    while pos + entry_header.size <= len(data):
        code, length_of_data = entry_header.unpack_from(data, pos)
        pos += entry_header.size
        if code == 2:
            default_topic = data[pos:pos+length_of_data].rstrip(b'\0')
            break
        pos += length_of_data
    if not default_topic:
        raise CHMError('No default topic found in CHM file that has no HHC ToC either')
    # decode_hhp_filename() is given the path with a leading slash, which is
    # stripped again from the decoded result.
    default_topic = self.decode_hhp_filename(b'/' + default_topic)
    return default_topic[1:]
|
||||
|
||||
def decode_hhp_filename(self, path):
|
||||
if isinstance(path, str):
|
||||
return path
|
||||
@ -140,6 +157,10 @@ class CHMReader(CHMFile):
|
||||
path = path.encode('utf-8')
|
||||
return CHMFile.ResolveObject(self, path)
|
||||
|
||||
def file_exists(self, path):
    '''
    Report whether ``path`` resolves to an object inside this CHM file.

    :param path: Path handed unchanged to :meth:`ResolveObject`.
    :return: True only when the status returned by :meth:`ResolveObject`
             equals ``chmlib.CHM_RESOLVE_SUCCESS``.
    '''
    status, _unit_info = self.ResolveObject(path)
    found = status == chmlib.CHM_RESOLVE_SUCCESS
    return found
|
||||
|
||||
def GetFile(self, path):
|
||||
# have to have abs paths for ResolveObject, but Contents() deliberately
|
||||
# makes them relative. So we don't have to worry, re-add the leading /.
|
||||
|
@ -63,7 +63,6 @@ class CHMInput(InputFormatPlugin):
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
metadata = Metadata(os.path.basename(chm_name))
|
||||
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
|
||||
self._chm_reader.CloseCHM()
|
||||
# print((tdir, mainpath))
|
||||
# from calibre import ipython
|
||||
# ipython()
|
||||
@ -74,6 +73,7 @@ class CHMInput(InputFormatPlugin):
|
||||
if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
|
||||
uenc = 'utf-8'
|
||||
htmlpath, toc = self._create_html_root(mainpath, log, uenc)
|
||||
self._chm_reader.CloseCHM()
|
||||
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
|
||||
options.debug_pipeline = odi
|
||||
if toc.count() > 1:
|
||||
@ -102,6 +102,8 @@ class CHMInput(InputFormatPlugin):
|
||||
# use HTMLInput plugin to generate book
|
||||
from calibre.customize.builtins import HTMLInput
|
||||
opts.breadth_first = True
|
||||
opts.max_levels = 30
|
||||
opts.correct_case_mismatches = True
|
||||
htmlinput = HTMLInput(None)
|
||||
htmlinput.set_root_dir_of_input(basedir)
|
||||
htmlinput.root_dir_for_absolute_links = basedir
|
||||
@ -113,7 +115,12 @@ class CHMInput(InputFormatPlugin):
|
||||
from polyglot.urllib import unquote as _unquote
|
||||
from calibre.ebooks.oeb.base import urlquote
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
hhcdata = self._read_file(hhcpath)
|
||||
try:
|
||||
hhcdata = self._read_file(hhcpath)
|
||||
except FileNotFoundError:
|
||||
log.warn('No HHC file found in CHM, using the default topic as the first HTML file')
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
return os.path.join(os.path.dirname(hhcpath), self._chm_reader.relpath_to_first_html_file()), TOC()
|
||||
hhcdata = hhcdata.decode(encoding)
|
||||
hhcdata = xml_to_unicode(hhcdata, verbose=True,
|
||||
strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
|
@ -8,12 +8,15 @@ __docformat__ = 'restructuredtext en'
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from contextlib import suppress
|
||||
from functools import partial
|
||||
from urllib.parse import quote
|
||||
|
||||
from calibre.constants import isbsd, islinux, filesystem_encoding
|
||||
from calibre.constants import filesystem_encoding, isbsd, islinux
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.utils.filenames import ascii_filename, get_long_path_name
|
||||
from calibre.utils.filenames import (
|
||||
ascii_filename, case_ignoring_open_file, get_long_path_name,
|
||||
)
|
||||
from calibre.utils.imghdr import what
|
||||
from calibre.utils.localization import __, get_lang
|
||||
from polyglot.builtins import as_unicode
|
||||
@ -293,7 +296,13 @@ class HTMLInput(InputFormatPlugin):
|
||||
except:
|
||||
return link_
|
||||
if not os.access(link, os.R_OK):
|
||||
return link_
|
||||
corrected = False
|
||||
if getattr(self.opts, 'correct_case_mismatches', False):
|
||||
with suppress(OSError), case_ignoring_open_file(link) as f:
|
||||
link = f.name
|
||||
corrected = True
|
||||
if not corrected:
|
||||
return link_
|
||||
if os.path.isdir(link):
|
||||
self.log.warn(link_, 'is a link to a directory. Ignoring.')
|
||||
return link_
|
||||
|
@ -10,12 +10,16 @@ __docformat__ = 'restructuredtext en'
|
||||
Input plugin for HTML or OPF ebooks.
|
||||
'''
|
||||
|
||||
import os, re, sys, errno as gerrno
|
||||
import errno as gerrno
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
from calibre.ebooks.oeb.base import urlunquote
|
||||
from calibre.ebooks.chardet import detect_xml_encoding
|
||||
from calibre import replace_entities, unicode_path
|
||||
from calibre.constants import iswindows
|
||||
from calibre import unicode_path, replace_entities
|
||||
from calibre.ebooks.chardet import detect_xml_encoding
|
||||
from calibre.ebooks.oeb.base import urlunquote
|
||||
from calibre.utils.filenames import case_ignoring_open_file
|
||||
from polyglot.urllib import urlparse, urlunparse
|
||||
|
||||
|
||||
@ -92,21 +96,23 @@ class HTMLFile:
|
||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
|
||||
re.DOTALL|re.IGNORECASE)
|
||||
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None, correct_case_mismatches=False):
|
||||
'''
|
||||
:param level: The level of this file. Should be 0 for the root file.
|
||||
:param encoding: Use `encoding` to decode HTML.
|
||||
:param referrer: The :class:`HTMLFile` that first refers to this file.
|
||||
'''
|
||||
self.path = unicode_path(path_to_html_file, abs=True)
|
||||
self.title = os.path.splitext(os.path.basename(self.path))[0]
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.level = level
|
||||
self.referrer = referrer
|
||||
self.links = []
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
f = case_ignoring_open_file(self.path, 'rb') if correct_case_mismatches else open(self.path, 'rb')
|
||||
self.path = f.name
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.title = os.path.splitext(os.path.basename(self.path))[0]
|
||||
with f:
|
||||
src = header = f.read(4096)
|
||||
encoding = detect_xml_encoding(src)[1]
|
||||
if encoding:
|
||||
@ -238,7 +244,7 @@ def find_tests():
|
||||
return unittest.defaultTestLoader.loadTestsFromTestCase(TestHTMLInput)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None, correct_case_mismatches=False):
|
||||
'''
|
||||
Recursively traverse all links in the HTML file.
|
||||
|
||||
@ -251,7 +257,8 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
|
||||
'''
|
||||
assert max_levels >= 0
|
||||
level = 0
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose, correct_case_mismatches=correct_case_mismatches)]
|
||||
seen = {flat[0].path}
|
||||
next_level = list(flat)
|
||||
while level < max_levels and len(next_level) > 0:
|
||||
level += 1
|
||||
@ -259,10 +266,13 @@ def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None
|
||||
for hf in next_level:
|
||||
rejects = []
|
||||
for link in hf.links:
|
||||
if link.path is None or link.path in flat:
|
||||
if link.path is None or link.path in seen:
|
||||
continue
|
||||
try:
|
||||
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
|
||||
nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf, correct_case_mismatches=correct_case_mismatches)
|
||||
if nf.path in seen:
|
||||
continue
|
||||
seen.add(nf.path)
|
||||
if nf.is_binary:
|
||||
raise IgnoreFile('%s is a binary file'%nf.path, -1)
|
||||
nl.append(nf)
|
||||
@ -285,7 +295,7 @@ def get_filelist(htmlfile, dir, opts, log):
|
||||
'''
|
||||
log.info('Building file list...')
|
||||
filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
|
||||
verbose=opts.verbose,
|
||||
verbose=opts.verbose, correct_case_mismatches=getattr(opts, 'correct_case_mismatches', False),
|
||||
encoding=opts.input_encoding)[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
log.debug('\tFound files...')
|
||||
|
@ -133,6 +133,47 @@ def is_case_sensitive(path):
|
||||
return is_case_sensitive
|
||||
|
||||
|
||||
def case_ignoring_open_file(path, mode='r'):
    '''
    Open an existing file case insensitively, even on case sensitive file systems.

    The path is first opened exactly as given. Only when that fails with
    FileNotFoundError is the directory tree walked from the root, matching
    every path component against the directory listing while ignoring case.

    :param path: Path to an existing file. Byte strings are decoded using
                 ``filesystem_encoding``. Relative paths are resolved
                 against the current working directory.
    :param mode: The mode passed to :func:`open`.
    :return: The open file object. Its ``name`` attribute is the path that
             was actually opened.
    :raises FileNotFoundError: The error from the initial attempt, if no
                               case insensitive match exists either.
    :raises ValueError: If the path is empty (or only a separator) or has
                        no components to match.
    '''
    try:
        return open(path, mode)
    except FileNotFoundError as err:
        # Keep a reference: ``err`` is unbound when the except clause ends
        original_err = err

    def next_component(final_path, components):
        # ``components`` holds the remaining path components in reverse
        # order, so the next one to match is at the end of the list.
        if not components:
            return final_path
        component = components.pop()
        cl = component.lower()
        try:
            # Exact match first, then the rest in sorted order, so that the
            # result does not depend on set/listing order.
            matches = sorted(
                (x for x in os.listdir(final_path) if x.lower() == cl),
                key=lambda x: (x != component, x))
        except OSError:
            raise original_err from None
        for x in matches:
            current = os.path.join(final_path, x)
            try:
                # Pass a copy so a failed branch does not consume the
                # components needed by the next candidate.
                return next_component(current, list(components))
            except OSError:
                continue
        raise original_err

    if isbytestring(path):
        path = path.decode(filesystem_encoding)
    if path.endswith(os.sep):
        path = path[:-1]
    if not path:
        raise ValueError('Path must not point to root')
    # The walk below starts at the filesystem root (or drive), so the path
    # must be absolute. Without this a relative path would have its first
    # component dropped and the remainder looked up under the root.
    path = os.path.abspath(path)

    components = path.split(os.sep)
    if len(components) <= 1:
        raise ValueError(f'Invalid path: {path}')
    # First component is the drive on Windows and empty elsewhere
    final_path = (components[0].upper() + os.sep) if iswindows else '/'
    components = list(reversed(components))[:-1]
    return open(next_component(final_path, components), mode)
|
||||
|
||||
|
||||
def case_preserving_open_file(path, mode='wb', mkdir_mode=0o777):
|
||||
'''
|
||||
Open the file pointed to by path with the specified mode. If any
|
||||
|
Loading…
x
Reference in New Issue
Block a user