Make news download more robust and fetch blogs in The Atlantic recipe

Kovid Goyal 2008-11-06 11:06:32 -08:00
parent 7fb21ac00d
commit dbe52fd1b3
2 changed files with 37 additions and 22 deletions

View File

@@ -16,7 +16,7 @@ class TheAtlantic(BasicNewsRecipe):
     INDEX = 'http://www.theatlantic.com/doc/current'
     remove_tags_before = dict(name='div', id='storytop')
-    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer'])]
+    remove_tags = [dict(name='div', id=['seealso', 'storybottom', 'footer', 'ad_banner_top', 'sidebar'])]
     no_stylesheets = True

     def parse_index(self):
@@ -35,8 +35,9 @@ class TheAtlantic(BasicNewsRecipe):
         for item in soup.findAll('div', attrs={'class':'item'}):
             a = item.find('a')
             if a and a.has_key('href'):
-                url = a['href']
-                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                url = a['href'].replace('/doc', 'doc/print')
+                if not url.startswith('http://'):
+                    url = 'http://www.theatlantic.com/'+url
                 title = self.tag_to_string(a)
                 byline = item.find(attrs={'class':'byline'})
                 date = self.tag_to_string(byline) if byline else ''
@@ -48,5 +49,4 @@ class TheAtlantic(BasicNewsRecipe):
                     'description':description
                 })

         return [('Current Issue', articles)]
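
The recipe change above rewrites each article link to its print-friendly page and only prepends the site prefix when the link is relative, so links that already arrive as absolute URLs (presumably the newly fetched blog entries) pass through untouched. A minimal standalone sketch of that logic, with a made-up helper name and example URLs:

def print_version(href):
    # Point the link at the print-friendly page.
    url = href.replace('/doc', 'doc/print')
    # Only relative magazine links need the site prefix; absolute URLs
    # are left alone.
    if not url.startswith('http://'):
        url = 'http://www.theatlantic.com/' + url
    return url

print(print_version('/doc/200811/example-article'))
# http://www.theatlantic.com/doc/print/200811/example-article
print(print_version('http://blogs.theatlantic.com/example/2008/11/post.php'))
# unchanged, since it is already absolute (hypothetical blog URL)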

View File

@@ -10,6 +10,7 @@ UTF-8 encoding with any charset declarations removed.
 import sys, socket, os, urlparse, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
+from contextlib import closing
 from calibre import setup_cli_handlers, browser, sanitize_file_name, \
                     relpath, LoggingInterface
@@ -48,6 +49,11 @@ def save_soup(soup, target):
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))

+class response(str):
+
+    def __init__(self, *args):
+        str.__init__(self, *args)
+        self.newurl = None

 class RecursiveFetcher(object, LoggingInterface):

     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
@@ -134,13 +140,25 @@ class RecursiveFetcher(object, LoggingInterface):
     def fetch_url(self, url):
-        f = None
+        data = None
         self.log_debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
         try:
-            f = self.browser.open(url)
+            try:
+                with closing(self.browser.open(url)) as f:
+                    data = response(f.read())
+                    data.newurl = f.geturl()
+            except AttributeError:
+                time.sleep(2)
+                try:
+                    with closing(self.browser.open(url)) as f:
+                        data = response(f.read())
+                        data.newurl = f.geturl()
+                except AttributeError:
+                    data = response(urllib2.urlopen(url).read())
+                    data.newurl = f.geturl()
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
@@ -149,12 +167,13 @@ class RecursiveFetcher(object, LoggingInterface):
                 time.sleep(1)
                 if hasattr(f, 'close'):
                     f.close()
-                f = self.browser.open(url)
+                with closing(self.browser.open(url)) as f:
+                    data = f.read()
             else:
                 raise err
         finally:
             self.last_fetch_at = time.time()
-        return f
+        return data

     def start_fetch(self, url):
@@ -196,7 +215,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     tag['href'] = self.stylemap[iurl]
                     continue
                 try:
-                    f = self.fetch_url(iurl)
+                    data = self.fetch_url(iurl)
                 except Exception, err:
                     self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
@@ -205,8 +224,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
                 with open(stylepath, 'wb') as x:
-                    x.write(f.read())
-                f.close()
+                    x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
@@ -221,19 +239,17 @@ class RecursiveFetcher(object, LoggingInterface):
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                         try:
-                            f = self.fetch_url(iurl)
+                            data = self.fetch_url(iurl)
                         except Exception, err:
                             self.log_warning('Could not fetch stylesheet %s', iurl)
                             self.log_debug('Error: %s', str(err), exc_info=True)
-                            if hasattr(f, 'close'): f.close()
                             continue
                         c += 1
                         stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                         with self.stylemap_lock:
                             self.stylemap[iurl] = stylepath
                         with open(stylepath, 'wb') as x:
-                            x.write(f.read())
-                        f.close()
+                            x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
@@ -256,7 +272,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 tag['src'] = self.imagemap[iurl]
                 continue
             try:
-                f = self.fetch_url(iurl)
+                data = self.fetch_url(iurl)
             except Exception, err:
                 self.log_warning('Could not fetch image %s', iurl)
                 self.log_debug('Error: %s', str(err), exc_info=True)
@@ -269,8 +285,7 @@ class RecursiveFetcher(object, LoggingInterface):
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
-                x.write(f.read())
-            f.close()
+                x.write(data)
             tag['src'] = imgpath

     def absurl(self, baseurl, tag, key, filter=True):
@@ -337,9 +352,8 @@ class RecursiveFetcher(object, LoggingInterface):
                     os.mkdir(linkdiskpath)
                 try:
                     self.current_dir = linkdiskpath
-                    f = self.fetch_url(iurl)
-                    dsrc = f.read()
-                    f.close()
+                    dsrc = self.fetch_url(iurl)
+                    newbaseurl = dsrc.newurl
                     if len(dsrc) == 0 or \
                        len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                         raise ValueError('No content at URL %s'%iurl)
@@ -349,7 +363,7 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                     soup = self.get_soup(dsrc)
-                    newbaseurl = f.geturl()
+
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -372,6 +386,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     soup = self.postprocess_html_ext(soup,
                             c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                             self.job_info)
+
                     if c==0 and recursion_level == 0:
                         self.called_first = True
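
Taken together, the fetcher changes move fetch_url() from returning an open file-like object (which every caller had to read and then close) to returning the downloaded bytes directly: the body is read eagerly inside a closing() block, the read is retried once on AttributeError with a plain urllib2 fallback, and the result is wrapped in the new response str subclass so the final post-redirect URL travels with the data in .newurl. A simplified standalone sketch of that contract, assuming Python 2 and plain urllib2 in place of the mechanize browser that calibre configures:

from contextlib import closing
import urllib2

class response(str):
    # A str subclass that also carries the final URL seen after any
    # redirects, so one return value gives callers both the page data
    # and its base URL.
    def __init__(self, *args):
        str.__init__(self, *args)
        self.newurl = None

def fetch_url(url):
    # Read the body eagerly and close the connection, instead of handing
    # an open file-like object back to the caller.
    with closing(urllib2.urlopen(url)) as f:
        data = response(f.read())
        data.newurl = f.geturl()
    return data

data = fetch_url('http://www.theatlantic.com/doc/current')
with open('index.html', 'wb') as out:
    out.write(data)        # data behaves like an ordinary str
print(data.newurl)         # base URL after any redirects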