Make news download more robust and fetch blogs in The Atlantic recipe
commit dbe52fd1b3 (parent 7fb21ac00d)
@@ -16,7 +16,7 @@ class TheAtlantic(BasicNewsRecipe):
     INDEX = 'http://www.theatlantic.com/doc/current'
 
     remove_tags_before = dict(name='div', id='storytop')
-    remove_tags        = [dict(name='div', id=['seealso', 'storybottom', 'footer'])]
+    remove_tags        = [dict(name='div', id=['seealso', 'storybottom', 'footer', 'ad_banner_top', 'sidebar'])]
     no_stylesheets = True
 
     def parse_index(self):
@@ -35,8 +35,9 @@ class TheAtlantic(BasicNewsRecipe):
         for item in soup.findAll('div', attrs={'class':'item'}):
             a = item.find('a')
             if a and a.has_key('href'):
-                url = a['href']
-                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                url = a['href'].replace('/doc', 'doc/print')
+                if not url.startswith('http://'):
+                    url = 'http://www.theatlantic.com/'+url
                 title = self.tag_to_string(a)
                 byline = item.find(attrs={'class':'byline'})
                 date = self.tag_to_string(byline) if byline else ''
@@ -48,5 +49,4 @@ class TheAtlantic(BasicNewsRecipe):
                     'description':description
                     })
 
-
         return [('Current Issue', articles)]
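
In short, the updated parse_index rewrites each article link to its printer-friendly form and prepends the site root only when the link is relative, which is presumably what lets the blog entries (which already carry absolute URLs) through. A minimal standalone sketch of that normalization (the helper name and example URL are illustrative; the recipe does this inline):

def normalize_atlantic_url(href):
    # point at the printer-friendly version of the article
    url = href.replace('/doc', 'doc/print')
    # blog entries already carry absolute URLs; only relative paths
    # need the site root prepended
    if not url.startswith('http://'):
        url = 'http://www.theatlantic.com/' + url
    return url

# normalize_atlantic_url('/doc/200801/example-story')
#   -> 'http://www.theatlantic.com/doc/print/200801/example-story'
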
@@ -10,6 +10,7 @@ UTF-8 encoding with any charset declarations removed.
 import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
+from contextlib import closing
 
 from calibre import setup_cli_handlers, browser, sanitize_file_name, \
                     relpath, LoggingInterface
@@ -48,6 +49,11 @@ def save_soup(soup, target):
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))
 
+class response(str):
+
+    def __init__(self, *args):
+        str.__init__(self, *args)
+        self.newurl = None
 
 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
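
The response class added above is a str subclass: its value is the downloaded document itself, and the redirect-resolved URL rides along in a newurl attribute, so existing code that expects a plain string keeps working. A minimal usage sketch (Python 2, as in this module; the markup and URL are made up):

class response(str):

    def __init__(self, *args):
        str.__init__(self, *args)
        self.newurl = None      # filled in by the fetcher after reading the body

data = response('<html><body>hello</body></html>')
data.newurl = 'http://www.example.com/final'          # hypothetical final URL
assert data.startswith('<html>')                      # behaves like the raw markup
assert data.newurl == 'http://www.example.com/final'  # and remembers where it came from
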
@@ -134,13 +140,25 @@ class RecursiveFetcher(object, LoggingInterface):
 
 
     def fetch_url(self, url):
-        f = None
+        data = None
         self.log_debug('Fetching %s', url)
        delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
         try:
-            f = self.browser.open(url)
+            try:
+                with closing(self.browser.open(url)) as f:
+                    data = response(f.read())
+                    data.newurl = f.geturl()
+            except AttributeError:
+                time.sleep(2)
+                try:
+                    with closing(self.browser.open(url)) as f:
+                        data = response(f.read())
+                        data.newurl = f.geturl()
+                except AttributeError:
+                    data = response(urllib2.urlopen(url).read())
+                    data.newurl = f.geturl()
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
@@ -149,12 +167,13 @@ class RecursiveFetcher(object, LoggingInterface):
                 time.sleep(1)
                 if hasattr(f, 'close'):
                     f.close()
-                f = self.browser.open(url)
+                with closing(self.browser.open(url)) as f:
+                    data = f.read()
             else:
                 raise err
         finally:
             self.last_fetch_at = time.time()
-        return f
+        return data
 
 
     def start_fetch(self, url):
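
With these changes fetch_url hands back the downloaded bytes directly instead of an open file-like object: contextlib.closing releases the connection even if read() raises, and callers take the final URL from the returned value rather than calling geturl() on a handle they then have to close. A rough sketch of the pattern, reusing the response class above, with browser standing in for the mechanize browser the class actually holds:

from contextlib import closing

def fetch(browser, url):
    # closing() guarantees the handle is released, success or failure
    with closing(browser.open(url)) as f:
        data = response(f.read())
        data.newurl = f.geturl()   # where any redirects actually landed
    return data

# caller side: nothing to close, and the base URL travels with the data
#   dsrc = fetch(br, 'http://www.example.com/')
#   newbaseurl = dsrc.newurl
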
@@ -196,7 +215,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag['href'] = self.stylemap[iurl]
                     continue
                 try:
-                    f = self.fetch_url(iurl)
+                    data = self.fetch_url(iurl)
                 except Exception, err:
                     self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
@@ -205,8 +224,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
                 with open(stylepath, 'wb') as x:
-                    x.write(f.read())
-                f.close()
+                    x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
@@ -221,19 +239,17 @@ class RecursiveFetcher(object, LoggingInterface):
                         ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                         continue
                     try:
-                        f = self.fetch_url(iurl)
+                        data = self.fetch_url(iurl)
                     except Exception, err:
                         self.log_warning('Could not fetch stylesheet %s', iurl)
                         self.log_debug('Error: %s', str(err), exc_info=True)
-                        if hasattr(f, 'close'): f.close()
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
-                        x.write(f.read())
-                    f.close()
+                        x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
 
 
@@ -256,7 +272,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 tag['src'] = self.imagemap[iurl]
                 continue
             try:
-                f = self.fetch_url(iurl)
+                data = self.fetch_url(iurl)
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
@@ -269,8 +285,7 @@ class RecursiveFetcher(object, LoggingInterface):
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
-                x.write(f.read())
-            f.close()
+                x.write(data)
             tag['src'] = imgpath
 
     def absurl(self, baseurl, tag, key, filter=True):
@@ -337,9 +352,8 @@ class RecursiveFetcher(object, LoggingInterface):
                     os.mkdir(linkdiskpath)
                 try:
                     self.current_dir = linkdiskpath
-                    f = self.fetch_url(iurl)
-                    dsrc = f.read()
-                    f.close()
+                    dsrc = self.fetch_url(iurl)
+                    newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
                        len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %s'%iurl)
@@ -349,7 +363,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]
 
                     soup = self.get_soup(dsrc)
-                    newbaseurl = f.geturl()
+
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -372,6 +386,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 soup = self.postprocess_html_ext(soup,
                         c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                         self.job_info)
+
                 if c==0 and recursion_level == 0:
                     self.called_first = True
 