py3: basic news fetching works

Just some more juggling of bytes types. Note in particular that URLs are
encoded and quoted, then decoded before being given to mechanize, since
mechanize expects unicode. Also, ascii_filename is already there to
sanitize filenames.
parent bc661a812d
commit b9224f17c4
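The round-trip the message describes — percent-quote the unsafe URL components, then hand mechanize a unicode string — can be sketched in isolation. A minimal sketch using the Python 3 stdlib names (calibre routes these imports through its polyglot layer); the URL is hypothetical, and the intermediate bytes step from the hunk below is skipped here since the py3 quote() accepts str directly:

    from urllib.parse import quote, urlparse, urlunparse

    url = 'http://example.com/some path/page name.html'

    # mechanize does not quote for us, so percent-encode the path, params,
    # query and fragment components; quote() encodes non-ASCII as UTF-8.
    if ' ' in url:
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)

    print(url)  # http://example.com/some%20path/page%20name.html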
@@ -103,7 +103,7 @@ def save_soup(soup, target):
         f.write(html.encode('utf-8'))
 
 
-class response(str):
+class response(bytes):
 
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
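The class overrides __new__, the hook where an immutable type's value is fixed, to bolt extra state onto the fetched payload. A minimal sketch of the pattern; the newurl attribute is inferred from the dsrc.newurl use later in this diff, and the values are made up:

    class response(bytes):

        def __new__(cls, *args):
            obj = super(response, cls).__new__(cls, *args)
            obj.newurl = None  # set once redirects are resolved
            return obj

    r = response(b'<html>...</html>')
    r.newurl = 'http://example.com/final'  # hypothetical post-redirect URL
    assert isinstance(r, bytes) and r.newurl is not None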
@@ -261,16 +261,13 @@ class RecursiveFetcher(object):
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(self.delay - delta)
-        if isinstance(url, unicode_type):
-            url = url.encode('utf-8')
-        # Not sure is this is really needed as I think mechanize
-        # handles quoting automatically, but leaving it
-        # in case it breaks something
+        # mechanize does not handle quoting automatically
         if re.search(r'\s+', url) is not None:
+            url = url.encode('utf-8')
             purl = list(urlparse(url))
             for i in range(2, 6):
                 purl[i] = quote(purl[i])
-            url = urlunparse(purl)
+            url = urlunparse(purl).decode('utf-8')
         open_func = getattr(self.browser, 'open_novisit', self.browser.open)
         try:
             with closing(open_func(url, timeout=self.timeout)) as f:
@@ -414,8 +411,6 @@ class RecursiveFetcher(object):
                 continue
             c += 1
             fname = ascii_filename('img'+str(c))
-            if isinstance(fname, unicode_type):
-                fname = fname.encode('ascii', 'replace')
             data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
             if data is None:
                 continue
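The deleted branch re-encoded fname to ASCII bytes, but as the commit message notes, ascii_filename already yields a safe name. A rough stand-in for calibre.utils.filenames.ascii_filename, for illustration only (the real helper is more thorough, transliterating rather than just substituting):

    def ascii_filename(orig, substitute='_'):
        # Keep printable ASCII; replace everything else.
        return ''.join(ch if 31 < ord(ch) < 127 else substitute for ch in orig)

    assert ascii_filename('img' + str(3)) == 'img3'  # already ASCII-safe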
@@ -520,7 +515,7 @@ class RecursiveFetcher(object):
                     dsrc = self.fetch_url(iurl)
                     newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
-                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
+                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %r'%iurl)
                     if callable(self.encoding):
                         dsrc = self.encoding(dsrc)
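The pattern and the replacement move to bytes literals because dsrc is now raw bytes, and on Python 3 a str pattern applied to a bytes subject raises TypeError. A quick check of the behavior; the sample payload is made up:

    import re

    dsrc = b'<!-- tracking comment -->\n  '  # hypothetical fetched body

    # Both the pattern and the replacement must be bytes to operate on bytes;
    # re.compile('<!--.*?-->') would raise TypeError when applied to dsrc.
    stripped = re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()
    assert stripped == b''  # only a comment left: 'No content at URL' fires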
@@ -544,7 +539,7 @@ class RecursiveFetcher(object):
             _fname = basename(iurl)
             if not isinstance(_fname, unicode_type):
                 _fname.decode('latin1', 'replace')
-            _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
+            _fname = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
             _fname = ascii_filename(_fname)
             _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
             res = os.path.join(linkdiskpath, _fname)
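Reordering the calls matters on Python 3: once _fname has been encoded to ASCII bytes, .replace('%', '') with str arguments raises TypeError, so the str-level replacements now run before the encode. A small demonstration with a made-up basename:

    import os

    _fname = 'na%me' + os.sep + 'page'  # hypothetical basename

    try:
        # Old order: bytes.replace() rejects the str arguments.
        _fname.encode('ascii', 'replace').replace('%', '')
    except TypeError:
        pass  # "a bytes-like object is required, not 'str'"

    # New order: do the replacements on the str first, then encode.
    cleaned = _fname.replace('%', '').replace(os.sep, '').encode('ascii', 'replace')
    assert cleaned == b'namepage'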