EPUB Input: Fix handling of EPUB files that contain images with non-ASCII filenames. Fixes #1171186 (Private bug)

This commit is contained in:
Kovid Goyal 2013-04-22 18:23:36 +05:30
parent 763c921108
commit 6dbd826c51
3 changed files with 19 additions and 3 deletions

View File

@ -1,7 +1,6 @@
''' '''
Basic support for manipulating OEB 1.x/2.0 content and metadata. Basic support for manipulating OEB 1.x/2.0 content and metadata.
''' '''
from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
@ -11,7 +10,7 @@ import os, re, uuid, logging
from collections import defaultdict from collections import defaultdict
from itertools import count from itertools import count
from urlparse import urldefrag, urlparse, urlunparse, urljoin from urlparse import urldefrag, urlparse, urlunparse, urljoin
from urllib import unquote as urlunquote from urllib import unquote
from lxml import etree, html from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__ from calibre.constants import filesystem_encoding, __version__
@ -372,6 +371,19 @@ def urlquote(href):
result.append(char) result.append(char)
return ''.join(result) return ''.join(result)
def urlunquote(href, error_handling='strict'):
    """Undo URL percent-encoding in ``href``, preserving its string type.

    A bytestring argument is unquoted and returned as a bytestring. A
    unicode argument is encoded to UTF-8, unquoted, decoded back from
    UTF-8 and returned as unicode.

    :param href: The percent-encoded URL or path (bytestring or unicode).
    :param error_handling: Codec error handler used when decoding the
        unquoted bytes back to unicode (e.g. ``'strict'``, ``'replace'``,
        ``'ignore'``). Only consulted for unicode input. The default,
        ``'strict'``, raises ``UnicodeDecodeError`` when the escapes do not
        form valid UTF-8.
    :return: The unquoted value, of the same type as ``href``.
    """
    # unquote must run on a bytestring and will return a bytestring
    # If it runs on a unicode object, it returns a double encoded unicode
    # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
    # and the latter is correct
    want_unicode = isinstance(href, unicode)
    if want_unicode:
        href = href.encode('utf-8')
    href = unquote(href)
    if want_unicode:
        # Escapes such as %E4 (a bare Latin-1 byte) are not valid UTF-8;
        # the caller decides whether that is fatal via error_handling.
        href = href.decode('utf-8', error_handling)
    return href
def urlnormalize(href): def urlnormalize(href):
"""Convert a URL into normalized form, with all and only URL-unsafe """Convert a URL into normalized form, with all and only URL-unsafe
characters URL quoted. characters URL quoted.
@ -468,7 +480,7 @@ class DirContainer(object):
return return
def _unquote(self, path): def _unquote(self, path):
# urlunquote must run on a bytestring and will return a bytestring # unquote must run on a bytestring and will return a bytestring
# If it runs on a unicode object, it returns a double encoded unicode # If it runs on a unicode object, it returns a double encoded unicode
# string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8') # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
# and the latter is correct # and the latter is correct

View File

@ -196,6 +196,8 @@ class OEBReader(object):
item.media_type[-4:] in ('/xml', '+xml')): item.media_type[-4:] in ('/xml', '+xml')):
hrefs = [r[2] for r in iterlinks(data)] hrefs = [r[2] for r in iterlinks(data)]
for href in hrefs: for href in hrefs:
if isinstance(href, bytes):
href = href.decode('utf-8')
href, _ = urldefrag(href) href, _ = urldefrag(href)
if not href: if not href:
continue continue

View File

@ -47,6 +47,8 @@ class ManifestTrimmer(object):
item.data is not None: item.data is not None:
hrefs = [r[2] for r in iterlinks(item.data)] hrefs = [r[2] for r in iterlinks(item.data)]
for href in hrefs: for href in hrefs:
if isinstance(href, bytes):
href = href.decode('utf-8')
try: try:
href = item.abshref(urlnormalize(href)) href = item.abshref(urlnormalize(href))
except: except: