mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
CHM Input: Fix handling of some CHM files that use non-ASCII internal filenames and don't specify a character encoding in their metadata. Fixes #1924703 [E-book viewer: another error opening a *.chm file](https://bugs.launchpad.net/calibre/+bug/1924703)
This commit is contained in:
parent
59d39cca4c
commit
1a5fa973ea
@ -137,7 +137,7 @@ def _get_cover(soup, rdr):
|
|||||||
|
|
||||||
|
|
||||||
def get_metadata_from_reader(rdr):
|
def get_metadata_from_reader(rdr):
|
||||||
raw = rdr.GetFile(rdr.home)
|
raw = rdr.get_home()
|
||||||
home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
|
home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0])
|
resolve_entities=True)[0])
|
||||||
|
|
||||||
|
@ -56,40 +56,62 @@ class CHMReader(CHMFile):
|
|||||||
raise CHMError("Unable to open CHM file '%s'"%(input,))
|
raise CHMError("Unable to open CHM file '%s'"%(input,))
|
||||||
self.log = log
|
self.log = log
|
||||||
self.input_encoding = input_encoding
|
self.input_encoding = input_encoding
|
||||||
self.chm_encoding = self.get_encoding() or 'cp1252'
|
|
||||||
self._sourcechm = input
|
self._sourcechm = input
|
||||||
self._contents = None
|
self._contents = None
|
||||||
self._playorder = 0
|
self._playorder = 0
|
||||||
self._metadata = False
|
self._metadata = False
|
||||||
self._extracted = False
|
self._extracted = False
|
||||||
self.re_encoded_files = set()
|
self.re_encoded_files = set()
|
||||||
|
self.get_encodings()
|
||||||
if self.home:
|
if self.home:
|
||||||
self.home = as_unicode(self.home, self.chm_encoding)
|
self.home = self.decode_hhp_filename(self.home)
|
||||||
if self.topics:
|
if self.topics:
|
||||||
self.topics = as_unicode(self.topics, self.chm_encoding)
|
self.topics = self.decode_hhp_filename(self.topics)
|
||||||
|
|
||||||
# location of '.hhc' file, which is the CHM TOC.
|
# location of '.hhc' file, which is the CHM TOC.
|
||||||
if self.topics is None:
|
base = self.topics or self.home
|
||||||
self.root, ext = os.path.splitext(self.home.lstrip('/'))
|
self.root = os.path.splitext(base.lstrip('/'))[0]
|
||||||
self.hhc_path = self.root + ".hhc"
|
self.hhc_path = self.root + ".hhc"
|
||||||
else:
|
|
||||||
self.root, ext = os.path.splitext(self.topics.lstrip('/'))
|
def decode_hhp_filename(self, path):
|
||||||
self.hhc_path = self.root + ".hhc"
|
if isinstance(path, str):
|
||||||
|
return path
|
||||||
|
for enc in (self.encoding_from_system_file, self.encoding_from_lcid, 'cp1252', 'cp1251', 'latin1', 'utf-8'):
|
||||||
|
if enc:
|
||||||
|
try:
|
||||||
|
q = path.decode(enc)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
continue
|
||||||
|
res, ui = self.ResolveObject(q)
|
||||||
|
if res == chmlib.CHM_RESOLVE_SUCCESS:
|
||||||
|
return q
|
||||||
|
|
||||||
|
def get_encodings(self):
|
||||||
|
self.encoding_from_system_file = self.encoding_from_lcid = None
|
||||||
|
q = self.GetEncoding()
|
||||||
|
if q:
|
||||||
|
try:
|
||||||
|
if isinstance(q, bytes):
|
||||||
|
q = q.decode('ascii')
|
||||||
|
codecs.lookup(q)
|
||||||
|
self.encoding_from_system_file = q
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
lcid = self.GetLCID()
|
||||||
|
if lcid is not None:
|
||||||
|
q = lcid[0]
|
||||||
|
if q:
|
||||||
|
try:
|
||||||
|
if isinstance(q, bytes):
|
||||||
|
q = q.decode('ascii')
|
||||||
|
codecs.lookup(q)
|
||||||
|
self.encoding_from_lcid = q
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
def get_encoding(self):
|
def get_encoding(self):
|
||||||
ans = self.GetEncoding()
|
return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'
|
||||||
if ans is None:
|
|
||||||
lcid = self.GetLCID()
|
|
||||||
if lcid is not None:
|
|
||||||
ans = lcid[0]
|
|
||||||
if ans:
|
|
||||||
try:
|
|
||||||
if isinstance(ans, bytes):
|
|
||||||
ans = ans.decode('ascii')
|
|
||||||
codecs.lookup(ans)
|
|
||||||
except Exception:
|
|
||||||
ans = None
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def _parse_toc(self, ul, basedir=getcwd()):
|
def _parse_toc(self, ul, basedir=getcwd()):
|
||||||
toc = TOC(play_order=self._playorder, base_path=basedir, text='')
|
toc = TOC(play_order=self._playorder, base_path=basedir, text='')
|
||||||
@ -112,14 +134,9 @@ class CHMReader(CHMFile):
|
|||||||
return toc
|
return toc
|
||||||
|
|
||||||
def ResolveObject(self, path):
|
def ResolveObject(self, path):
|
||||||
opath = path
|
|
||||||
if not isinstance(path, bytes):
|
if not isinstance(path, bytes):
|
||||||
path = path.encode(self.chm_encoding)
|
path = path.encode('utf-8')
|
||||||
ans = CHMFile.ResolveObject(self, path)
|
return CHMFile.ResolveObject(self, path)
|
||||||
if ans[0] != chmlib.CHM_RESOLVE_SUCCESS and not isinstance(opath, bytes):
|
|
||||||
path = opath.encode('utf-8')
|
|
||||||
ans = CHMFile.ResolveObject(self, path)
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def GetFile(self, path):
|
def GetFile(self, path):
|
||||||
# have to have abs paths for ResolveObject, but Contents() deliberately
|
# have to have abs paths for ResolveObject, but Contents() deliberately
|
||||||
@ -132,16 +149,16 @@ class CHMReader(CHMFile):
|
|||||||
raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
|
raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
|
||||||
size, data = self.RetrieveObject(ui)
|
size, data = self.RetrieveObject(ui)
|
||||||
if size == 0:
|
if size == 0:
|
||||||
raise CHMError("'%s' is zero bytes in length!"%(path,))
|
raise CHMError(f"{path!r} is zero bytes in length!")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def get_home(self):
|
||||||
|
return self.GetFile(self.home)
|
||||||
|
|
||||||
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
|
def ExtractFiles(self, output_dir=getcwd(), debug_dump=False):
|
||||||
html_files = set()
|
html_files = set()
|
||||||
enc = self.chm_encoding
|
|
||||||
for path in self.Contents():
|
for path in self.Contents():
|
||||||
fpath = path
|
fpath = path
|
||||||
if not isinstance(path, unicode_type):
|
|
||||||
fpath = path.decode(enc)
|
|
||||||
lpath = os.path.join(output_dir, fpath)
|
lpath = os.path.join(output_dir, fpath)
|
||||||
self._ensure_dir(lpath)
|
self._ensure_dir(lpath)
|
||||||
try:
|
try:
|
||||||
@ -300,11 +317,8 @@ class CHMReader(CHMFile):
|
|||||||
paths = []
|
paths = []
|
||||||
|
|
||||||
def get_paths(chm, ui, ctx):
|
def get_paths(chm, ui, ctx):
|
||||||
try:
|
# these are supposed to be UTF-8 in CHM as best as I can determine
|
||||||
path = as_unicode(ui.path, self.chm_encoding)
|
path = as_unicode(ui.path, 'utf-8')
|
||||||
except UnicodeDecodeError:
|
|
||||||
path = as_unicode(ui.path, 'utf-8')
|
|
||||||
|
|
||||||
# skip directories
|
# skip directories
|
||||||
# note this path refers to the internal CHM structure
|
# note this path refers to the internal CHM structure
|
||||||
if path[-1] != '/':
|
if path[-1] != '/':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user