From 6dbd826c513628e11d974428ca9b87fd2365d7da Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 22 Apr 2013 18:23:36 +0530
Subject: [PATCH] EPUB Input: Fix handling of EPUB files that contain images
 with non-ascii filenames. Fixes #1171186 (Private bug)

---
 src/calibre/ebooks/oeb/base.py                 | 18 +++++++++++++++---
 src/calibre/ebooks/oeb/reader.py               |  2 ++
 .../ebooks/oeb/transforms/trimmanifest.py      |  2 ++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 29a809e190..21c0c60a55 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1,7 +1,6 @@
 '''
 Basic support for manipulating OEB 1.x/2.0 content and metadata.
 '''
-from __future__ import with_statement
 
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
@@ -11,7 +10,7 @@ import os, re, uuid, logging
 from collections import defaultdict
 from itertools import count
 from urlparse import urldefrag, urlparse, urlunparse, urljoin
-from urllib import unquote as urlunquote
+from urllib import unquote
 
 from lxml import etree, html
 from calibre.constants import filesystem_encoding, __version__
@@ -372,6 +371,19 @@ def urlquote(href):
         result.append(char)
     return ''.join(result)
 
+def urlunquote(href):
+    # urllib's unquote() must be fed a bytestring and returns a bytestring.
+    # Fed a unicode object it returns a double-encoded unicode string:
+    # unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8'), and the latter
+    # is correct. So unicode input is round-tripped through UTF-8 bytes.
+    want_unicode = isinstance(href, unicode)
+    if want_unicode:
+        href = href.encode('utf-8')
+    href = unquote(href)
+    if want_unicode:
+        href = href.decode('utf-8')
+    return href
+
 def urlnormalize(href):
     """Convert a URL into normalized form, with all and only URL-unsafe
     characters URL quoted.
@@ -468,7 +480,7 @@ class DirContainer(object):
                     return
 
     def _unquote(self, path):
-        # urlunquote must run on a bytestring and will return a bytestring
+        # unquote must run on a bytestring and will return a bytestring
         # If it runs on a unicode object, it returns a double encoded unicode
         # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
         # and the latter is correct
diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index 8d63f30526..6a3747d2d3 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -196,6 +196,8 @@ class OEBReader(object):
                         item.media_type[-4:] in ('/xml', '+xml')):
                     hrefs = [r[2] for r in iterlinks(data)]
                     for href in hrefs:
+                        if isinstance(href, bytes):
+                            href = href.decode('utf-8')
                         href, _ = urldefrag(href)
                         if not href:
                             continue
diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
index 3d56f0ef3d..67d55a581e 100644
--- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py
+++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
@@ -47,6 +47,8 @@ class ManifestTrimmer(object):
                    item.data is not None:
                     hrefs = [r[2] for r in iterlinks(item.data)]
                     for href in hrefs:
+                        if isinstance(href, bytes):
+                            href = href.decode('utf-8')
                         try:
                             href = item.abshref(urlnormalize(href))
                         except: