Add reference for claim that filenames in CHM PMGL blocks are UTF-8

This commit is contained in:
Kovid Goyal 2021-04-17 09:37:28 +05:30
parent 1a5fa973ea
commit 52adf96211
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -134,6 +134,8 @@ class CHMReader(CHMFile):
return toc return toc
def ResolveObject(self, path): def ResolveObject(self, path):
# filenames are utf-8 encoded in the chm index as far as I can
# determine, see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
if not isinstance(path, bytes): if not isinstance(path, bytes):
path = path.encode('utf-8') path = path.encode('utf-8')
return CHMFile.ResolveObject(self, path) return CHMFile.ResolveObject(self, path)
@@ -318,6 +320,7 @@ class CHMReader(CHMFile):
def get_paths(chm, ui, ctx): def get_paths(chm, ui, ctx):
# these are supposed to be UTF-8 in CHM as best as I can determine # these are supposed to be UTF-8 in CHM as best as I can determine
# see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
path = as_unicode(ui.path, 'utf-8') path = as_unicode(ui.path, 'utf-8')
# skip directories # skip directories
# note this path refers to the internal CHM structure # note this path refers to the internal CHM structure