This commit is contained in:
Kovid Goyal 2019-04-23 15:55:08 +05:30
commit bfbc31fa9f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -103,7 +103,7 @@ def save_soup(soup, target):
         f.write(html.encode('utf-8'))
 
-class response(str):
+class response(bytes):
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
@@ -261,16 +261,14 @@ class RecursiveFetcher(object):
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(self.delay - delta)
-        if isinstance(url, unicode_type):
-            url = url.encode('utf-8')
-        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it
-        # in case it breaks something
-        if re.search(r'\s+', url) is not None:
+        # mechanize does not handle quoting automatically
+        if re.search(r'\s+', url) is not None:
+            if isinstance(url, unicode_type):
+                url = url.encode('utf-8')
             purl = list(urlparse(url))
             for i in range(2, 6):
                 purl[i] = quote(purl[i])
-            url = urlunparse(purl)
+            url = urlunparse(purl).decode('utf-8')
         open_func = getattr(self.browser, 'open_novisit', self.browser.open)
         try:
             with closing(open_func(url, timeout=self.timeout)) as f:
@@ -414,8 +412,6 @@ class RecursiveFetcher(object):
                 continue
             c += 1
             fname = ascii_filename('img'+str(c))
-            if isinstance(fname, unicode_type):
-                fname = fname.encode('ascii', 'replace')
             data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
             if data is None:
                 continue
@@ -520,7 +516,7 @@ class RecursiveFetcher(object):
                     dsrc = self.fetch_url(iurl)
                     newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
-                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
+                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %r'%iurl)
                     if callable(self.encoding):
                         dsrc = self.encoding(dsrc)
@@ -544,7 +540,7 @@ class RecursiveFetcher(object):
                         _fname = basename(iurl)
                         if not isinstance(_fname, unicode_type):
                             _fname.decode('latin1', 'replace')
-                        _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
+                        _fname = _fname.replace('%', '').replace(os.sep, '')
                         _fname = ascii_filename(_fname)
                         _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                         res = os.path.join(linkdiskpath, _fname)