From 1a602a7873550c2a1c500dcb62759e91ff647552 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 10 Apr 2019 20:53:01 +0530
Subject: [PATCH] py3: Port urlunquote

Also take the opportunity to make unquote correct on python 2 by moving
urlunquote to the polyglot module
---
 src/calibre/ebooks/oeb/base.py | 19 +------------------
 src/polyglot/urllib.py         | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 4cfcc07c08..cf2deb47f2 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -23,7 +23,7 @@ from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
 from calibre.utils.cleantext import clean_xml_chars
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter
-from polyglot.urllib import unquote, urldefrag, urljoin, urlparse, urlunparse
+from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
 from calibre.utils.icu import numeric_sort_key
 
 XML_NS = 'http://www.w3.org/XML/1998/namespace'
@@ -455,23 +455,6 @@ def urlquote(href):
     return ''.join(result)
 
 
-def urlunquote(href, error_handling='strict'):
-    # unquote must run on a bytestring and will return a bytestring
-    # If it runs on a unicode object, it returns a double encoded unicode
-    # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
-    # and the latter is correct
-    want_unicode = isinstance(href, unicode_type)
-    if want_unicode:
-        href = href.encode('utf-8')
-    href = unquote(href)
-    if want_unicode:
-        # The quoted characters could have been in some encoding other than
-        # UTF-8, this often happens with old/broken web servers. There is no
-        # way to know what that encoding should be in this context.
-        href = href.decode('utf-8', error_handling)
-    return href
-
-
 def urlnormalize(href):
     """Convert a URL into normalized form, with all and only URL-unsafe
     characters URL quoted.
diff --git a/src/polyglot/urllib.py b/src/polyglot/urllib.py
index 9a706fcdf5..8f9e008f8c 100644
--- a/src/polyglot/urllib.py
+++ b/src/polyglot/urllib.py
@@ -10,14 +10,36 @@ if is_py3:
     from urllib.request import (build_opener, getproxies, install_opener,  # noqa
             HTTPBasicAuthHandler, HTTPCookieProcessor, HTTPDigestAuthHandler,  # noqa
             url2pathname, urlopen, Request)  # noqa
-    from urllib.parse import (parse_qs, quote, unquote, quote_plus, urldefrag,  # noqa
+    from urllib.parse import (parse_qs, quote, unquote as uq, quote_plus, urldefrag,  # noqa
             urlencode, urljoin, urlparse, urlunparse, urlsplit, urlunsplit)  # noqa
     from urllib.error import HTTPError, URLError  # noqa
+
+    def unquote(x, encoding='utf-8', errors='replace'):
+        binary = isinstance(x, bytes)
+        if binary:
+            x = x.decode(encoding, errors)
+        ans = uq(x, encoding, errors)
+        if binary:
+            ans = ans.encode(encoding, errors)
+        return ans
 else:
-    from urllib import (getproxies, quote, unquote, quote_plus, url2pathname,  # noqa
+    from urllib import (getproxies, quote, unquote as uq, quote_plus, url2pathname,  # noqa
             urlencode)  # noqa
     from urllib2 import (build_opener, install_opener, HTTPBasicAuthHandler,  # noqa
             HTTPCookieProcessor, HTTPDigestAuthHandler, HTTPError, URLError,  # noqa
             urlopen, Request)  # noqa
     from urlparse import (parse_qs, urldefrag, urljoin, urlparse, urlunparse,  # noqa
             urlsplit, urlunsplit)  # noqa
+
+    def unquote(x, encoding='utf-8', errors='replace'):
+        # unquote must run on a bytestring and will return a bytestring
+        # If it runs on a unicode object, it returns a double encoded unicode
+        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
+        # and the latter is correct
+        binary = isinstance(x, bytes)
+        if not binary:
+            x = x.encode(encoding, errors)
+        ans = uq(x)
+        if not binary:
+            ans = ans.decode(encoding, errors)
+        return ans
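
The following is a minimal usage sketch, not part of the patch, illustrating the behaviour the new polyglot unquote wrapper gives on both Python 2 and Python 3; it assumes it is run from within the calibre source tree so that polyglot.urllib is importable. The wrapper preserves the input type (text in, text out; bytes in, bytes out) and decodes percent-escapes as UTF-8 with the 'replace' error handler by default.

# Minimal usage sketch (not part of the patch); assumes the calibre source
# tree is on sys.path so that polyglot.urllib is importable.
from polyglot.urllib import unquote

# Text in, text out: percent-escapes are decoded as UTF-8 on both Python 2
# and Python 3, matching unquote(b'%C3%A4').decode('utf-8').
assert unquote(u'%C3%A4') == u'\xe4'

# Bytes in, bytes out: the wrapper preserves the input type.
assert unquote(b'%C3%A4') == b'\xc3\xa4'

# Escapes that are not valid UTF-8 fall back to the 'replace' error handler
# by default, yielding a replacement character instead of raising.
print(unquote(u'%E4'))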