py3: basic news fetching works

Just some more juggling around of bytes types. Note in particular that
URLs are encoded and quoted, then decoded before being handed to
mechanize, since mechanize expects to see unicode. Furthermore,
ascii_filename is already there to sanitize filenames, so the extra
encode pass after it can go.

Eli Schwartz  2019-04-23 03:28:59 -04:00
commit b9224f17c4
parent bc661a812d


@@ -103,7 +103,7 @@ def save_soup(soup, target):
         f.write(html.encode('utf-8'))
 
 
-class response(str):
+class response(bytes):
 
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
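
The str-to-bytes switch works because on py3 the fetched payload is raw
bytes. A minimal standalone sketch of the pattern (hypothetical Response
class, not calibre's, which carries more state):

    # Subclass bytes so the payload can carry extra metadata (here the
    # post-redirect URL) while remaining usable as plain bytes.
    class Response(bytes):

        def __new__(cls, *args):
            obj = super(Response, cls).__new__(cls, *args)
            obj.newurl = None  # set after the fetch to the final URL
            return obj

    r = Response(b'<html>...</html>')
    r.newurl = 'http://example.com/final'
    assert isinstance(r, bytes) and r.startswith(b'<html>')
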
@@ -261,16 +261,13 @@ class RecursiveFetcher(object):
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(self.delay - delta)
-        if isinstance(url, unicode_type):
-            url = url.encode('utf-8')
-        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it
-        # in case it breaks something
+        # mechanize does not handle quoting automatically
         if re.search(r'\s+', url) is not None:
+            url = url.encode('utf-8')
             purl = list(urlparse(url))
             for i in range(2, 6):
                 purl[i] = quote(purl[i])
-            url = urlunparse(purl)
+            url = urlunparse(purl).decode('utf-8')
         open_func = getattr(self.browser, 'open_novisit', self.browser.open)
         try:
             with closing(open_func(url, timeout=self.timeout)) as f:
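
For reference, a py3-only sketch of the same quoting using the stdlib
urllib directly; the diff above keeps the URL as UTF-8 bytes while
quoting and decodes at the end, since mechanize expects unicode. The
example URL is made up:

    import re
    from urllib.parse import urlparse, urlunparse, quote

    def prepare_url(url):
        # Percent-quote whitespace component by component.
        if re.search(r'\s+', url) is not None:
            purl = list(urlparse(url))
            for i in range(2, 6):  # path, params, query, fragment
                purl[i] = quote(purl[i])
            url = urlunparse(purl)
        return url

    print(prepare_url('http://example.com/some page'))
    # -> http://example.com/some%20page
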
@@ -414,8 +411,6 @@ class RecursiveFetcher(object):
                 continue
             c += 1
             fname = ascii_filename('img'+str(c))
-            if isinstance(fname, unicode_type):
-                fname = fname.encode('ascii', 'replace')
             data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
             if data is None:
                 continue
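
The two removed lines were redundant: ascii_filename already returns a
sanitized native string. A rough, hypothetical stand-in for what such a
helper does (the real one lives in calibre.utils.filenames and handles
more cases):

    import unicodedata

    def ascii_filename_sketch(name, substitute='_'):
        # Normalize to ASCII and neutralize filesystem-unsafe characters.
        name = unicodedata.normalize('NFKD', name)
        name = name.encode('ascii', 'ignore').decode('ascii')
        for bad in '\\|?*<>:"/':
            name = name.replace(bad, substitute)
        return name

    print(ascii_filename_sketch('img1'))       # -> img1
    print(ascii_filename_sketch('caf\xe9/x'))  # -> cafe_x
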
@@ -520,7 +515,7 @@ class RecursiveFetcher(object):
                 dsrc = self.fetch_url(iurl)
                 newbaseurl = dsrc.newurl
                 if len(dsrc) == 0 or \
-                   len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
+                   len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                     raise ValueError('No content at URL %r'%iurl)
                 if callable(self.encoding):
                     dsrc = self.encoding(dsrc)
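
The b prefixes are the whole fix here: on py3 a str pattern cannot be
applied to a bytes subject, so pattern and replacement must both be
bytes to match dsrc. A quick demonstration:

    import re

    dsrc = b'  <!-- only a comment -->  '
    stripped = re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()
    assert stripped == b''  # page is empty once comments are removed

    # Mixing types fails at match time on py3:
    # re.compile('<!--.*?-->').sub('', dsrc)  -> TypeError
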
@@ -544,7 +539,7 @@ class RecursiveFetcher(object):
                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode_type):
                         _fname.decode('latin1', 'replace')
-                    _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
+                    _fname = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
                     _fname = ascii_filename(_fname)
                     _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                     res = os.path.join(linkdiskpath, _fname)
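
The reordering matters on py3: once _fname has been encoded, replace()
would need bytes arguments, so the str-level replacements must run
before the encode. A quick demonstration with a made-up name:

    import os

    _fname = 'some%name.html'
    cleaned = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
    assert cleaned == b'somename.html'

    # Old order fails on py3:
    # b'some%name.html'.replace('%', '')  -> TypeError (str args on bytes)
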