From 1a602a7873550c2a1c500dcb62759e91ff647552 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 10 Apr 2019 20:53:01 +0530
Subject: [PATCH] py3: Port urlunquote

Also take the opportunity to make unquote correct on python 2 by moving
urlunquote to the polyglot module
---
 src/calibre/ebooks/oeb/base.py | 19 +------------------
 src/polyglot/urllib.py         | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 4cfcc07c08..cf2deb47f2 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -23,7 +23,7 @@ from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter
-from polyglot.urllib import unquote, urldefrag, urljoin, urlparse, urlunparse
+from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
 from calibre.utils.icu import numeric_sort_key
 
 XML_NS = 'http://www.w3.org/XML/1998/namespace'
@@ -455,23 +455,6 @@ def urlquote(href):
     return ''.join(result)
 
 
-def urlunquote(href, error_handling='strict'):
-    # unquote must run on a bytestring and will return a bytestring
-    # If it runs on a unicode object, it returns a double encoded unicode
-    # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
-    # and the latter is correct
-    want_unicode = isinstance(href, unicode_type)
-    if want_unicode:
-        href = href.encode('utf-8')
-    href = unquote(href)
-    if want_unicode:
-        # The quoted characters could have been in some encoding other than
-        # UTF-8, this often happens with old/broken web servers. There is no
-        # way to know what that encoding should be in this context.
-        href = href.decode('utf-8', error_handling)
-    return href
-
-
 def urlnormalize(href):
     """Convert a URL into normalized form, with all and only URL-unsafe
     characters URL quoted.
diff --git a/src/polyglot/urllib.py b/src/polyglot/urllib.py
index 9a706fcdf5..8f9e008f8c 100644
--- a/src/polyglot/urllib.py
+++ b/src/polyglot/urllib.py
@@ -10,14 +10,36 @@ if is_py3:
     from urllib.request import (build_opener, getproxies, install_opener,  # noqa
             HTTPBasicAuthHandler, HTTPCookieProcessor, HTTPDigestAuthHandler,  # noqa
             url2pathname, urlopen, Request)  # noqa
-    from urllib.parse import (parse_qs, quote, unquote, quote_plus, urldefrag,  # noqa
+    from urllib.parse import (parse_qs, quote, unquote as uq, quote_plus, urldefrag,  # noqa
             urlencode, urljoin, urlparse, urlunparse, urlsplit, urlunsplit)  # noqa
     from urllib.error import HTTPError, URLError  # noqa
+
+    def unquote(x, encoding='utf-8', errors='replace'):
+        binary = isinstance(x, bytes)
+        if binary:
+            x = x.decode(encoding, errors)
+        ans = uq(x, encoding, errors)
+        if binary:
+            ans = ans.encode(encoding, errors)
+        return ans
 else:
-    from urllib import (getproxies, quote, unquote, quote_plus, url2pathname,  # noqa
+    from urllib import (getproxies, quote, unquote as uq, quote_plus, url2pathname,  # noqa
             urlencode)  # noqa
     from urllib2 import (build_opener, install_opener, HTTPBasicAuthHandler,  # noqa
             HTTPCookieProcessor, HTTPDigestAuthHandler, HTTPError, URLError,  # noqa
             urlopen, Request)  # noqa
     from urlparse import (parse_qs, urldefrag, urljoin, urlparse, urlunparse,  # noqa
             urlsplit, urlunsplit)  # noqa
+
+    def unquote(x, encoding='utf-8', errors='replace'):
+        # unquote must run on a bytestring and will return a bytestring
+        # If it runs on a unicode object, it returns a double encoded unicode
+        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
+        # and the latter is correct
+        binary = isinstance(x, bytes)
+        if not binary:
+            x = x.encode(encoding, errors)
+        ans = uq(x)
+        if not binary:
+            ans = ans.decode(encoding, errors)
+        return ans
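
The following is a minimal usage sketch, not part of the patch, illustrating the behaviour the new polyglot unquote wrapper gives on both Python 2 and Python 3; it assumes it is run from within the calibre source tree so that polyglot.urllib is importable. The wrapper preserves the input type (text in, text out; bytes in, bytes out) and decodes percent-escapes as UTF-8 with the 'replace' error handler by default.

# Minimal usage sketch (not part of the patch); assumes the calibre source
# tree is on sys.path so that polyglot.urllib is importable.
from polyglot.urllib import unquote

# Text in, text out: percent-escapes are decoded as UTF-8 on both Python 2
# and Python 3, matching unquote(b'%C3%A4').decode('utf-8').
assert unquote(u'%C3%A4') == u'\xe4'

# Bytes in, bytes out: the wrapper preserves the input type.
assert unquote(b'%C3%A4') == b'\xc3\xa4'

# Escapes that are not valid UTF-8 fall back to the 'replace' error handler
# by default, yielding a replacement character instead of raising.
print(unquote(u'%E4'))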