From 6dbd826c513628e11d974428ca9b87fd2365d7da Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 22 Apr 2013 18:23:36 +0530 Subject: [PATCH] EPUB Input: Fix handling of EPUB files that contain images with non-ascii filenames. Fixes #1171186 (Private bug) --- src/calibre/ebooks/oeb/base.py | 18 +++++++++++++++--- src/calibre/ebooks/oeb/reader.py | 2 ++ .../ebooks/oeb/transforms/trimmanifest.py | 2 ++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 29a809e190..21c0c60a55 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -1,7 +1,6 @@ ''' Basic support for manipulating OEB 1.x/2.0 content and metadata. ''' -from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' @@ -11,7 +10,7 @@ import os, re, uuid, logging from collections import defaultdict from itertools import count from urlparse import urldefrag, urlparse, urlunparse, urljoin -from urllib import unquote as urlunquote +from urllib import unquote from lxml import etree, html from calibre.constants import filesystem_encoding, __version__ @@ -372,6 +371,19 @@ def urlquote(href): result.append(char) return ''.join(result) +def urlunquote(href): + # unquote must run on a bytestring and will return a bytestring + # If it runs on a unicode object, it returns a double encoded unicode + # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8') + # and the latter is correct + want_unicode = isinstance(href, unicode) + if want_unicode: + href = href.encode('utf-8') + href = unquote(href) + if want_unicode: + href = href.decode('utf-8') + return href + def urlnormalize(href): """Convert a URL into normalized form, with all and only URL-unsafe characters URL quoted. @@ -468,7 +480,7 @@ class DirContainer(object): return def _unquote(self, path): - # urlunquote must run on a bytestring and will return a bytestring + # unquote must run on a bytestring and will return a bytestring # If it runs on a unicode object, it returns a double encoded unicode # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8') # and the latter is correct diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 8d63f30526..6a3747d2d3 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -196,6 +196,8 @@ class OEBReader(object): item.media_type[-4:] in ('/xml', '+xml')): hrefs = [r[2] for r in iterlinks(data)] for href in hrefs: + if isinstance(href, bytes): + href = href.decode('utf-8') href, _ = urldefrag(href) if not href: continue diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index 3d56f0ef3d..67d55a581e 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -47,6 +47,8 @@ class ManifestTrimmer(object): item.data is not None: hrefs = [r[2] for r in iterlinks(item.data)] for href in hrefs: + if isinstance(href, bytes): + href = href.decode('utf-8') try: href = item.abshref(urlnormalize(href)) except: