This commit is contained in:
Kovid Goyal 2019-04-23 15:55:08 +05:30
commit bfbc31fa9f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -103,7 +103,7 @@ def save_soup(soup, target):
f.write(html.encode('utf-8'))
class response(str):
class response(bytes):
def __new__(cls, *args):
obj = super(response, cls).__new__(cls, *args)
@ -261,16 +261,14 @@ class RecursiveFetcher(object):
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(self.delay - delta)
# mechanize does not handle quoting automatically
if re.search(r'\s+', url) is not None:
if isinstance(url, unicode_type):
url = url.encode('utf-8')
# Not sure if this is really needed as I think mechanize
# handles quoting automatically, but leaving it
# in case it breaks something
if re.search(r'\s+', url) is not None:
purl = list(urlparse(url))
for i in range(2, 6):
purl[i] = quote(purl[i])
url = urlunparse(purl)
url = urlunparse(purl).decode('utf-8')
open_func = getattr(self.browser, 'open_novisit', self.browser.open)
try:
with closing(open_func(url, timeout=self.timeout)) as f:
@ -414,8 +412,6 @@ class RecursiveFetcher(object):
continue
c += 1
fname = ascii_filename('img'+str(c))
if isinstance(fname, unicode_type):
fname = fname.encode('ascii', 'replace')
data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
if data is None:
continue
@ -520,7 +516,7 @@ class RecursiveFetcher(object):
dsrc = self.fetch_url(iurl)
newbaseurl = dsrc.newurl
if len(dsrc) == 0 or \
len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
raise ValueError('No content at URL %r'%iurl)
if callable(self.encoding):
dsrc = self.encoding(dsrc)
@ -544,7 +540,7 @@ class RecursiveFetcher(object):
_fname = basename(iurl)
if not isinstance(_fname, unicode_type):
_fname.decode('latin1', 'replace')
_fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
_fname = _fname.replace('%', '').replace(os.sep, '')
_fname = ascii_filename(_fname)
_fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
res = os.path.join(linkdiskpath, _fname)