News download: Fix urls with spaces in them not being downloaded since calibre 5.0

py3: Yet another stupid API
This commit is contained in:
Kovid Goyal 2021-07-04 15:30:33 +05:30
parent 1b240e8882
commit c621941789
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -28,13 +28,13 @@ from calibre.utils.img import image_from_data, image_to_data
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.web.fetch.utils import rescale_image from calibre.web.fetch.utils import rescale_image
from polyglot.builtins import unicode_type from polyglot.binary import from_base64_bytes
from polyglot.builtins import as_bytes, unicode_type
from polyglot.http_client import responses from polyglot.http_client import responses
from polyglot.urllib import ( from polyglot.urllib import (
URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse, URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
urlunsplit urlunsplit
) )
from polyglot.binary import from_base64_bytes
class AbortArticle(Exception): class AbortArticle(Exception):
@ -62,6 +62,18 @@ class closing(object):
pass pass
def canonicalize_url(url):
    """Percent-encode whitespace-containing URLs before handing them to mechanize.

    mechanize does not quote URLs automatically, so a URL such as
    ``http://host/a b`` would otherwise fail to download (the bug this code
    fixes for calibre >= 5.0).  URLs that contain no whitespace are returned
    unchanged.

    :param url: the URL as a ``str``
    :return: the (possibly quoted) URL as a ``str``
    """
    # Only rewrite URLs that actually contain whitespace: quoting an
    # already-quoted URL would double-encode its '%' characters.
    if re.search(r'\s+', url) is not None:
        # NOTE: quote() on a str percent-encodes using UTF-8 by default, so
        # the old py2-era encode('utf-8')/as_bytes()/decode('utf-8') round
        # trip is unnecessary on Python 3 and has been dropped; the output is
        # identical.  (The old bytes branch was unreachable anyway:
        # re.search with a str pattern raises TypeError on bytes input.)
        purl = list(urlparse(url))
        # Quote path, params, query and fragment (indices 2-5); the scheme
        # and netloc components must not be percent-encoded.
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)
    return url
bad_url_counter = 0 bad_url_counter = 0
@ -261,14 +273,7 @@ class RecursiveFetcher(object):
delta = time.monotonic() - self.last_fetch_at delta = time.monotonic() - self.last_fetch_at
if delta < self.delay: if delta < self.delay:
time.sleep(self.delay - delta) time.sleep(self.delay - delta)
# mechanize does not handle quoting automatically url = canonicalize_url(url)
if re.search(r'\s+', url) is not None:
if isinstance(url, unicode_type):
url = url.encode('utf-8')
purl = list(urlparse(url))
for i in range(2, 6):
purl[i] = quote(purl[i])
url = urlunparse(purl).decode('utf-8')
open_func = getattr(self.browser, 'open_novisit', self.browser.open) open_func = getattr(self.browser, 'open_novisit', self.browser.open)
try: try:
with closing(open_func(url, timeout=self.timeout)) as f: with closing(open_func(url, timeout=self.timeout)) as f: