News download: Fix urls with spaces in them not being downloaded since calibre 5.0

py3: Yet another stupid API
This commit is contained in:
Kovid Goyal 2021-07-04 15:30:33 +05:30
parent 1b240e8882
commit c621941789
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -28,13 +28,13 @@ from calibre.utils.img import image_from_data, image_to_data
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.web.fetch.utils import rescale_image from calibre.web.fetch.utils import rescale_image
from polyglot.builtins import unicode_type from polyglot.binary import from_base64_bytes
from polyglot.builtins import as_bytes, unicode_type
from polyglot.http_client import responses from polyglot.http_client import responses
from polyglot.urllib import ( from polyglot.urllib import (
URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse, URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
urlunsplit urlunsplit
) )
from polyglot.binary import from_base64_bytes
class AbortArticle(Exception): class AbortArticle(Exception):
@ -62,6 +62,18 @@ class closing(object):
pass pass
def canonicalize_url(url):
    """Percent-encode whitespace-containing URLs before handing them to mechanize.

    mechanize does not quote URLs automatically, so a URL such as
    ``http://host/a b`` would otherwise fail to download (the bug this code
    fixes for calibre >= 5.0).  URLs that contain no whitespace are returned
    unchanged.

    :param url: the URL as a ``str``
    :return: the (possibly quoted) URL as a ``str``
    """
    # Only rewrite URLs that actually contain whitespace: quoting an
    # already-quoted URL would double-encode its '%' characters.
    if re.search(r'\s+', url) is not None:
        # NOTE: quote() on a str percent-encodes using UTF-8 by default, so
        # the old py2-era encode('utf-8')/as_bytes()/decode('utf-8') round
        # trip is unnecessary on Python 3 and has been dropped; the output is
        # identical.  (The old bytes branch was unreachable anyway:
        # re.search with a str pattern raises TypeError on bytes input.)
        purl = list(urlparse(url))
        # Quote path, params, query and fragment (indices 2-5); the scheme
        # and netloc components must not be percent-encoded.
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)
    return url
bad_url_counter = 0 bad_url_counter = 0
@ -261,14 +273,7 @@ class RecursiveFetcher(object):
delta = time.monotonic() - self.last_fetch_at delta = time.monotonic() - self.last_fetch_at
if delta < self.delay: if delta < self.delay:
time.sleep(self.delay - delta) time.sleep(self.delay - delta)
# mechanize does not handle quoting automatically url = canonicalize_url(url)
if re.search(r'\s+', url) is not None:
if isinstance(url, unicode_type):
url = url.encode('utf-8')
purl = list(urlparse(url))
for i in range(2, 6):
purl[i] = quote(purl[i])
url = urlunparse(purl).decode('utf-8')
open_func = getattr(self.browser, 'open_novisit', self.browser.open) open_func = getattr(self.browser, 'open_novisit', self.browser.open)
try: try:
with closing(open_func(url, timeout=self.timeout)) as f: with closing(open_func(url, timeout=self.timeout)) as f: