From 65c3a9c2a37e83576fab160cbad66fa5cbc19881 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 6 Mar 2010 11:06:40 -0700 Subject: [PATCH] When creating ZIP/EPUB files encode all file names in UTF-8 and set the UTF-8 bit marker. When extracting from a ZIP/EPUB file, if the UTF-8 bit marker is not set, still assume that the file name is encoded in UTF-8. If UTF-8 decoding fails try to detect character encoding using chardet. This is because most ZIP/EPUB files calibre comes across seem to be utf-8 encoded. --- src/calibre/utils/zipfile.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/calibre/utils/zipfile.py b/src/calibre/utils/zipfile.py index 9943119968..71eccc7543 100644 --- a/src/calibre/utils/zipfile.py +++ b/src/calibre/utils/zipfile.py @@ -138,12 +138,16 @@ _CD64_OFFSET_START_CENTDIR = 9 def decode_arcname(name): if not isinstance(name, unicode): - encoding = detect(name)['encoding'] try: - name = name.decode(encoding) + name = name.decode('utf-8') except: - name = name.decode('utf-8', 'replace') - return name.encode(filesystem_encoding, 'replace') + res = detect(name) + encoding = res['encoding'] + try: + name = name.decode(encoding) + except: + name = name.decode('utf-8', 'replace') + return name def is_zipfile(filename): @@ -352,10 +356,7 @@ class ZipInfo (object): def _encodeFilenameFlags(self): if isinstance(self.filename, unicode): - try: - return self.filename.encode('ascii'), self.flag_bits - except: - return self.filename.encode('utf-8'), self.flag_bits | 0x800 + return self.filename.encode('utf-8'), self.flag_bits | 0x800 else: return self.filename, self.flag_bits @@ -363,7 +364,7 @@ class ZipInfo (object): if self.flag_bits & 0x800: return self.filename.decode('utf-8') else: - return self.filename + return decode_arcname(self.filename) def _decodeExtra(self): # Try to decode the extra field. @@ -1059,7 +1060,9 @@ class ZipFile: targetpath = targetpath[:-1] # don't include leading "/" from file name if present - fname = decode_arcname(member.filename) + fname = member.filename + if isinstance(fname, unicode): + fname = fname.encode(filesystem_encoding, 'replace') if fname.startswith('/'): fname = fname[1:] targetpath = os.path.join(targetpath, fname) @@ -1111,8 +1114,6 @@ class ZipFile: def write(self, filename, arcname=None, compress_type=None): """Put the bytes from filename into the archive under the name arcname.""" - if isinstance(filename, unicode): - filename = filename.encode('utf-8') if not self.fp: raise RuntimeError( "Attempt to write to ZIP archive that was already closed") @@ -1126,6 +1127,8 @@ class ZipFile: arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) while arcname[0] in (os.sep, os.altsep): arcname = arcname[1:] + if not isinstance(arcname, unicode): + arcname = arcname.decode(filesystem_encoding) zinfo = ZipInfo(arcname, date_time) zinfo.external_attr = (st[0] & 0xFFFF) << 16L # Unix attributes if compress_type is None: @@ -1187,8 +1190,8 @@ class ZipFile: assert not raw_bytes or (raw_bytes and isinstance(zinfo_or_arcname, ZipInfo)) if not isinstance(zinfo_or_arcname, ZipInfo): - if isinstance(zinfo_or_arcname, unicode): - zinfo_or_arcname = zinfo_or_arcname.encode('utf-8') + if not isinstance(zinfo_or_arcname, unicode): + zinfo_or_arcname = zinfo_or_arcname.decode(filesystem_encoding) zinfo = ZipInfo(filename=zinfo_or_arcname, date_time=time.localtime(time.time())[:6]) zinfo.compress_type = compression