EPUB/AZW3 Output: Fix regression that caused errors when trying to convert documents that have URLs with invalid (non-UTF-8) quoting. Fixes #1181049 (in version 0.9.30 I can't download WSJ but it works fine in earlier versions)

Kovid Goyal 2013-05-18 08:27:07 +05:30
parent 55808e5f68
commit 16c5f8b1c1
2 changed files with 10 additions and 3 deletions
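
For context, a minimal sketch of the failure mode this commit works around, written against Python 3's standard urllib.parse rather than calibre's own helpers (the codebase at the time was Python 2, where unquote behaved analogously): percent-escapes that decode to non-UTF-8 bytes raise UnicodeDecodeError under strict decoding, while a lenient error handler degrades gracefully.

    from urllib.parse import unquote_to_bytes

    # '%E4' is 'ä' percent-encoded as Latin-1 (byte 0xE4); the UTF-8
    # encoding would be '%C3%A4'. Old/broken web servers emit the former.
    raw = unquote_to_bytes('a%E4b.html')   # b'a\xe4b.html'

    try:
        raw.decode('utf-8')                # strict decoding raises
    except UnicodeDecodeError as err:
        print('strict decode fails:', err)

    # A lenient error handler substitutes U+FFFD instead of aborting:
    print(raw.decode('utf-8', 'replace'))  # 'a\ufffdb.html'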

src/calibre/ebooks/oeb/base.py

@@ -373,7 +373,7 @@ def urlquote(href):
         result.append(char)
     return ''.join(result)
 
-def urlunquote(href):
+def urlunquote(href, error_handling='strict'):
     # unquote must run on a bytestring and will return a bytestring
     # If it runs on a unicode object, it returns a double encoded unicode
     # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
@@ -383,7 +383,10 @@ def urlunquote(href):
         href = href.encode('utf-8')
     href = unquote(href)
     if want_unicode:
-        href = href.decode('utf-8')
+        # The quoted characters could have been in some encoding other than
+        # UTF-8, this often happens with old/broken web servers. There is no
+        # way to know what that encoding should be in this context.
+        href = href.decode('utf-8', error_handling)
     return href
 
 def urlnormalize(href):
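
Note how the two hunks connect: UnicodeDecodeError is a subclass of ValueError, so as long as urlunquote is left at its default strict handling, a caller such as the Split transform below can trap the failed decode as a plain ValueError. A rough stand-in for the chain, with a hypothetical simplified urlunquote (calibre's real one also round-trips between unicode and bytestrings):

    from urllib.parse import unquote_to_bytes

    def urlunquote(href, error_handling='strict'):
        # Hypothetical, simplified stand-in for calibre's urlunquote.
        return unquote_to_bytes(href).decode('utf-8', error_handling)

    # The `except ValueError` in the split transform below works because:
    assert issubclass(UnicodeDecodeError, ValueError)

    try:
        urlunquote('bad%E4link.html')            # strict: raises
    except ValueError:
        print('caller keeps the original URL')

    print(urlunquote('bad%E4link.html', 'replace'))  # 'bad\ufffdlink.html'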

src/calibre/ebooks/oeb/transforms/split.py

@@ -159,7 +159,11 @@ class Split(object):
         except ValueError:
             # Unparseable URL
             return url
-        href = urlnormalize(href)
+        try:
+            href = urlnormalize(href)
+        except ValueError:
+            # href has non utf-8 quoting
+            return url
         if href in self.map:
             anchor_map = self.map[href]
             nhref = anchor_map[frag if frag else None]
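
The design choice mirrors the comment added in base.py: since there is no way to know which encoding a mis-quoted href was meant to use, the split transform now leaves such links unchanged rather than aborting the whole EPUB/AZW3 conversion, which is the regression reported in #1181049.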