From 16c5f8b1c1c99f180e3ec00d8a5c22cc69886e6c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 18 May 2013 08:27:07 +0530
Subject: [PATCH] EPUB/AZW3 Output: Fix regression that caused errors when
 trying to convert documents that have URLs with invalid (non-utf-8) quoting.
 Fixes #1181049 (in vers .9.30 I can't download WSJ but works fine in earlier
 versions)

---
 src/calibre/ebooks/oeb/base.py             | 7 +++++--
 src/calibre/ebooks/oeb/transforms/split.py | 6 +++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 671caf49fc..d4b3a2b7ab 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -373,7 +373,7 @@ def urlquote(href):
         result.append(char)
     return ''.join(result)
 
-def urlunquote(href):
+def urlunquote(href, error_handling='strict'):
     # unquote must run on a bytestring and will return a bytestring
     # If it runs on a unicode object, it returns a double encoded unicode
     # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
@@ -383,7 +383,10 @@ def urlunquote(href):
         href = href.encode('utf-8')
     href = unquote(href)
     if want_unicode:
-        href = href.decode('utf-8')
+        # The quoted characters could have been in some encoding other than
+        # UTF-8, this often happens with old/broken web servers. There is no
+        # way to know what that encoding should be in this context.
+        href = href.decode('utf-8', error_handling)
     return href
 
 def urlnormalize(href):
diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py
index 0ca3888c17..123f61b047 100644
--- a/src/calibre/ebooks/oeb/transforms/split.py
+++ b/src/calibre/ebooks/oeb/transforms/split.py
@@ -159,7 +159,11 @@ class Split(object):
         except ValueError:
             # Unparseable URL
             return url
-        href = urlnormalize(href)
+        try:
+            href = urlnormalize(href)
+        except ValueError:
+            # href has non utf-8 quoting
+            return url
         if href in self.map:
             anchor_map = self.map[href]
             nhref = anchor_map[frag if frag else None]
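
Note: for context, a minimal sketch of the failure mode the base.py change addresses, written against Python 3's urllib.parse rather than calibre's own helpers; the function name and values below are illustrative, not part of this patch.

    # Sketch only: percent-unquoting must happen on raw bytes, because the
    # escapes may encode a legacy charset (e.g. latin-1) rather than UTF-8.
    from urllib.parse import unquote_to_bytes

    def urlunquote_sketch(href, error_handling='strict'):
        raw = unquote_to_bytes(href)
        # 'strict' raises UnicodeDecodeError on non-UTF-8 escapes; 'replace'
        # substitutes U+FFFD so conversion can continue.
        return raw.decode('utf-8', error_handling)

    print(urlunquote_sketch('caf%C3%A9'))           # UTF-8 quoting -> 'café'
    print(urlunquote_sketch('caf%E9', 'replace'))   # latin-1 quoting -> 'caf\ufffd'
    try:
        urlunquote_sketch('caf%E9')                 # strict decoding fails
    except UnicodeDecodeError as err:
        print('invalid (non-utf-8) quoting:', err)

Since UnicodeDecodeError is a subclass of ValueError, the "except ValueError" added in split.py also catches the decode failure raised through urlnormalize() when the URL has such invalid quoting.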