EPUB Input: Fix handling of EPUB files that contain images with non-ASCII filenames. Fixes #1171186 (Private bug)

2025-07-07 10:14:46 -04:00 · 2013-04-22 18:23:36 +05:30 · 2013-04-22 18:23:36 +05:30 · 6dbd826c51
commit 6dbd826c51
parent 763c921108
3 changed files with 19 additions and 3 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -1,7 +1,6 @@
 '''
 Basic support for manipulating OEB 1.x/2.0 content and metadata.
 '''
-from __future__ import with_statement

 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
@ -11,7 +10,7 @@ import os, re, uuid, logging
 from collections import defaultdict
 from itertools import count
 from urlparse import urldefrag, urlparse, urlunparse, urljoin
-from urllib import unquote as urlunquote
+from urllib import unquote

 from lxml import etree, html
 from calibre.constants import filesystem_encoding, __version__
@ -372,6 +371,19 @@ def urlquote(href):
        result.append(char)
    return ''.join(result)

+def urlunquote(href):
+    # unquote() must be given a bytestring and returns a bytestring. Given a
+    # unicode object it returns a double-encoded unicode string, i.e.
+    # unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8'), and the
+    # latter is correct, so unicode input is round-tripped through UTF-8.
+    want_unicode = isinstance(href, unicode)
+    if want_unicode:
+        href = href.encode('utf-8')
+    href = unquote(href)
+    if want_unicode:
+        href = href.decode('utf-8')
+    return href
+
 def urlnormalize(href):
    """Convert a URL into normalized form, with all and only URL-unsafe
    characters URL quoted.
@ -468,7 +480,7 @@ class DirContainer(object):
                    return

    def _unquote(self, path):
-        # urlunquote must run on a bytestring and will return a bytestring
+        # unquote must run on a bytestring and will return a bytestring
        # If it runs on a unicode object, it returns a double encoded unicode
        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
        # and the latter is correct
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -196,6 +196,8 @@ class OEBReader(object):
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
+                        if isinstance(href, bytes):
+                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
--- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py
+++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
@ -47,6 +47,8 @@ class ManifestTrimmer(object):
                   item.data is not None:
                    hrefs = [r[2] for r in iterlinks(item.data)]
                    for href in hrefs:
+                        if isinstance(href, bytes):
+                            href = href.decode('utf-8')
                        try:
                            href = item.abshref(urlnormalize(href))
                        except: