diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 671caf49fc..d4b3a2b7ab 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -373,7 +373,7 @@ def urlquote(href): result.append(char) return ''.join(result) -def urlunquote(href): +def urlunquote(href, error_handling='strict'): # unquote must run on a bytestring and will return a bytestring # If it runs on a unicode object, it returns a double encoded unicode # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8') @@ -383,7 +383,10 @@ def urlunquote(href): href = href.encode('utf-8') href = unquote(href) if want_unicode: - href = href.decode('utf-8') + # The quoted characters could have been in some encoding other than + # UTF-8; this often happens with old/broken web servers. There is no + # way to know what that encoding should be in this context. + href = href.decode('utf-8', error_handling) return href def urlnormalize(href): diff --git a/src/calibre/ebooks/oeb/transforms/split.py b/src/calibre/ebooks/oeb/transforms/split.py index 0ca3888c17..123f61b047 100644 --- a/src/calibre/ebooks/oeb/transforms/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -159,7 +159,11 @@ class Split(object): except ValueError: # Unparseable URL return url - href = urlnormalize(href) + try: + href = urlnormalize(href) + except ValueError: + # href has non-UTF-8 quoting + return url if href in self.map: anchor_map = self.map[href] nhref = anchor_map[frag if frag else None]