py3: basic news fetching works

Just some more juggling of bytes types. Note in particular that URLs are
encoded and quoted, then decoded before being given to mechanize, since
mechanize expects unicode. Also, ascii_filename is already there to
sanitize filenames.
parent bc661a812d
commit b9224f17c4
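The round-trip the message describes — percent-quote the unsafe URL components, then hand mechanize a unicode string — can be sketched in isolation. A minimal sketch using the Python 3 stdlib names (calibre routes these imports through its polyglot layer); the URL is hypothetical, and the intermediate bytes step from the hunk below is skipped here since the py3 quote() accepts str directly:

    from urllib.parse import quote, urlparse, urlunparse

    url = 'http://example.com/some path/page name.html'

    # mechanize does not quote for us, so percent-encode the path, params,
    # query and fragment components; quote() encodes non-ASCII as UTF-8.
    if ' ' in url:
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)

    print(url)  # http://example.com/some%20path/page%20name.html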
@@ -103,7 +103,7 @@ def save_soup(soup, target):
         f.write(html.encode('utf-8'))
 
 
-class response(str):
+class response(bytes):
 
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
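The class overrides __new__, the hook where an immutable type's value is fixed, to bolt extra state onto the fetched payload. A minimal sketch of the pattern; the newurl attribute is inferred from the dsrc.newurl use later in this diff, and the values are made up:

    class response(bytes):

        def __new__(cls, *args):
            obj = super(response, cls).__new__(cls, *args)
            obj.newurl = None  # set once redirects are resolved
            return obj

    r = response(b'<html>...</html>')
    r.newurl = 'http://example.com/final'  # hypothetical post-redirect URL
    assert isinstance(r, bytes) and r.newurl is not None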
@@ -261,16 +261,13 @@ class RecursiveFetcher(object):
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(self.delay - delta)
-        if isinstance(url, unicode_type):
-            url = url.encode('utf-8')
-        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it
-        # in case it breaks something
+        # mechanize does not handle quoting automatically
         if re.search(r'\s+', url) is not None:
+            url = url.encode('utf-8')
             purl = list(urlparse(url))
             for i in range(2, 6):
                 purl[i] = quote(purl[i])
-            url = urlunparse(purl)
+            url = urlunparse(purl).decode('utf-8')
         open_func = getattr(self.browser, 'open_novisit', self.browser.open)
         try:
             with closing(open_func(url, timeout=self.timeout)) as f:
@@ -414,8 +411,6 @@ class RecursiveFetcher(object):
                 continue
             c += 1
             fname = ascii_filename('img'+str(c))
-            if isinstance(fname, unicode_type):
-                fname = fname.encode('ascii', 'replace')
             data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
             if data is None:
                 continue
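The deleted branch re-encoded fname to ASCII bytes, but as the commit message notes, ascii_filename already yields a safe name. A rough stand-in for calibre.utils.filenames.ascii_filename, for illustration only (the real helper is more thorough, transliterating rather than just substituting):

    def ascii_filename(orig, substitute='_'):
        # Keep printable ASCII; replace everything else.
        return ''.join(ch if 31 < ord(ch) < 127 else substitute for ch in orig)

    assert ascii_filename('img' + str(3)) == 'img3'  # already ASCII-safe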
@@ -520,7 +515,7 @@ class RecursiveFetcher(object):
                     dsrc = self.fetch_url(iurl)
                     newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
-                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
+                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %r'%iurl)
                     if callable(self.encoding):
                         dsrc = self.encoding(dsrc)
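The pattern and the replacement move to bytes literals because dsrc is now raw bytes, and on Python 3 a str pattern applied to a bytes subject raises TypeError. A quick check of the behavior; the sample payload is made up:

    import re

    dsrc = b'<!-- tracking comment -->\n  '  # hypothetical fetched body

    # Both the pattern and the replacement must be bytes to operate on bytes;
    # re.compile('<!--.*?-->') would raise TypeError when applied to dsrc.
    stripped = re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()
    assert stripped == b''  # only a comment left: 'No content at URL' fires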
@@ -544,7 +539,7 @@ class RecursiveFetcher(object):
             _fname = basename(iurl)
             if not isinstance(_fname, unicode_type):
                 _fname.decode('latin1', 'replace')
-            _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
+            _fname = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
             _fname = ascii_filename(_fname)
             _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
             res = os.path.join(linkdiskpath, _fname)
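Reordering the calls matters on Python 3: once _fname has been encoded to ASCII bytes, .replace('%', '') with str arguments raises TypeError, so the str-level replacements now run before the encode. A small demonstration with a made-up basename:

    import os

    _fname = 'na%me' + os.sep + 'page'  # hypothetical basename

    try:
        # Old order: bytes.replace() rejects the str arguments.
        _fname.encode('ascii', 'replace').replace('%', '')
    except TypeError:
        pass  # "a bytes-like object is required, not 'str'"

    # New order: do the replacements on the str first, then encode.
    cleaned = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
    assert cleaned == b'namepage'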