EPUB/AZW3 Output: Fix regression that caused errors when trying to convert documents that have URLs with invalid (non-UTF-8) quoting. Fixes #1181049 (in version 0.9.30 I can't download WSJ but it works fine in earlier versions)

Kovid Goyal 2013-05-18 08:27:07 +05:30
parent 55808e5f68
commit 16c5f8b1c1
2 changed files with 10 additions and 3 deletions
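
For context, a minimal sketch of the failure mode this commit works around, written against Python 3's standard urllib.parse rather than calibre's own helpers (the codebase at the time was Python 2, where unquote behaved analogously): percent-escapes that decode to non-UTF-8 bytes raise UnicodeDecodeError under strict decoding, while a lenient error handler degrades gracefully.

    from urllib.parse import unquote_to_bytes

    # '%E4' is 'ä' percent-encoded as Latin-1 (byte 0xE4); the UTF-8
    # encoding would be '%C3%A4'. Old/broken web servers emit the former.
    raw = unquote_to_bytes('a%E4b.html')   # b'a\xe4b.html'

    try:
        raw.decode('utf-8')                # strict decoding raises
    except UnicodeDecodeError as err:
        print('strict decode fails:', err)

    # A lenient error handler substitutes U+FFFD instead of aborting:
    print(raw.decode('utf-8', 'replace'))  # 'a\ufffdb.html'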

src/calibre/ebooks/oeb/base.py

@@ -373,7 +373,7 @@ def urlquote(href):
         result.append(char)
     return ''.join(result)
 
-def urlunquote(href):
+def urlunquote(href, error_handling='strict'):
     # unquote must run on a bytestring and will return a bytestring
     # If it runs on a unicode object, it returns a double encoded unicode
     # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
@@ -383,7 +383,10 @@ def urlunquote(href):
         href = href.encode('utf-8')
     href = unquote(href)
     if want_unicode:
-        href = href.decode('utf-8')
+        # The quoted characters could have been in some encoding other than
+        # UTF-8, this often happens with old/broken web servers. There is no
+        # way to know what that encoding should be in this context.
+        href = href.decode('utf-8', error_handling)
     return href
 
 def urlnormalize(href):
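
Note how the two hunks connect: UnicodeDecodeError is a subclass of ValueError, so as long as urlunquote is left at its default strict handling, a caller such as the Split transform below can trap the failed decode as a plain ValueError. A rough stand-in for the chain, with a hypothetical simplified urlunquote (calibre's real one also round-trips between unicode and bytestrings):

    from urllib.parse import unquote_to_bytes

    def urlunquote(href, error_handling='strict'):
        # Hypothetical, simplified stand-in for calibre's urlunquote.
        return unquote_to_bytes(href).decode('utf-8', error_handling)

    # The `except ValueError` in the split transform below works because:
    assert issubclass(UnicodeDecodeError, ValueError)

    try:
        urlunquote('bad%E4link.html')            # strict: raises
    except ValueError:
        print('caller keeps the original URL')

    print(urlunquote('bad%E4link.html', 'replace'))  # 'bad\ufffdlink.html'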

src/calibre/ebooks/oeb/transforms/split.py

@@ -159,7 +159,11 @@ class Split(object):
         except ValueError:
             # Unparseable URL
             return url
-        href = urlnormalize(href)
+        try:
+            href = urlnormalize(href)
+        except ValueError:
+            # href has non utf-8 quoting
+            return url
         if href in self.map:
             anchor_map = self.map[href]
             nhref = anchor_map[frag if frag else None]
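
The design choice mirrors the comment added in base.py: since there is no way to know which encoding a mis-quoted href was meant to use, the split transform now leaves such links unchanged rather than aborting the whole EPUB/AZW3 conversion, which is the regression reported in #1181049.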