# calibre/recipes/economist_free.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.threadpool import ThreadPool, makeRequests
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import time, string, re
from datetime import datetime
from lxml import html
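
# NB: this is a Python 2 era recipe; unicode and cmp() used below are
# Python 2 built-ins, and the BeautifulSoup/threadpool APIs are the
# versions bundled with calibre at the time.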


class Economist(BasicNewsRecipe):
    title = 'The Economist (RSS)'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT).'
            ' Much slower than the print edition based version.')
    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'

    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
        dict(attrs={'class': ['dblClkTrk', 'ec-article-info',
            'share_inline_header']}),
        {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True

    # Strip anything trailing after the closing </html> tag
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x: '</html>')]
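
    # The full_print_edition RSS feed is a flat list of articles with no
    # section information, so parse_index() downloads every article in
    # parallel, reads the section name out of each article page, and
    # groups the results into feeds keyed by section.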
    def parse_index(self):
        from calibre.web.feeds.feedparser import parse
        if self.test:
            # Widen the window in test mode so there is something to fetch
            self.oldest_article = 14.0
        raw = self.index_to_soup(
                'http://feeds.feedburner.com/economist/full_print_edition',
                raw=True)
        entries = parse(raw).entries
        pool = ThreadPool(10)
        self.feed_dict = {}
        requests = []
        for i, item in enumerate(entries):
            title = item.get('title', _('Untitled article'))
            published = item.date_parsed
            if not published:
                published = time.gmtime()
            # Skip articles older than oldest_article days
            utctime = datetime(*published[:6])
            delta = datetime.utcnow() - utctime
            if delta.days*24*3600 + delta.seconds > 24*3600*self.oldest_article:
                self.log.debug('Skipping article %s as it is too old.'%title)
                continue
            link = item.get('link', None)
            description = item.get('description', '')
            author = item.get('author', '')
            requests.append([i, link, title, description, author, published])
        if self.test:
            requests = requests[:4]
        requests = makeRequests(self.process_eco_feed_article, requests,
                self.eco_article_found, self.eco_article_failed)
        for r in requests:
            pool.putRequest(r)
        pool.wait()
        return self.eco_sort_sections([(t, a) for t, a in
            self.feed_dict.items()])
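
    # Known print edition sections in their canonical order; sections not
    # listed here get rank 100 and therefore sort last. Note that
    # sorted(..., cmp=...) is Python 2 only; the Python 3 equivalent would
    # be sorted(feeds, key=lambda x: order.get(x[0], 100)).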
    def eco_sort_sections(self, feeds):
        if not feeds:
            raise ValueError('No new articles found')
        order = {
            'The World This Week': 1,
            'Leaders': 2,
            'Letters': 3,
            'Briefing': 4,
            'Business': 5,
            'Finance And Economics': 6,
            'Science & Technology': 7,
            'Books & Arts': 8,
            'International': 9,
            'United States': 10,
            'Asia': 11,
            'Europe': 12,
            'The Americas': 13,
            'Middle East & Africa': 14,
            'Britain': 15,
            'Obituary': 16,
        }
        return sorted(feeds, cmp=lambda x, y: cmp(order.get(x[0], 100),
            order.get(y[0], 100)))
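
    # Worker run in the thread pool: fetch one article, derive its
    # printer friendly URL and pull the section name out of the last
    # ec-article-info block (the section follows the final '|' in its text).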
    def process_eco_feed_article(self, args):
        from calibre import browser
        i, url, title, description, author, published = args
        br = browser()
        ret = br.open(url)
        raw = ret.read()
        # Printer friendly version: the canonical URL (after any feed
        # redirects, query string stripped) with /print appended
        url = br.geturl().split('?')[0]+'/print'
        root = html.fromstring(raw)
        matches = root.xpath('//*[@class = "ec-article-info"]')
        feedtitle = 'Miscellaneous'
        if matches:
            feedtitle = string.capwords(html.tostring(matches[-1], method='text',
                encoding=unicode).split('|')[-1].strip())
        return (i, feedtitle, url, title, description, author, published)
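
    # Callback fired from pool.wait() in the calling thread (the
    # threadpool library dispatches results via poll()), so appending to
    # feed_dict here should not need any locking.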
    def eco_article_found(self, req, result):
        from calibre.web.feeds import Article
        i, feedtitle, link, title, description, author, published = result
        self.log('Found print version for article:', title, 'in', feedtitle,
                'at', link)
        a = Article(i, title, link, author, description, published, '')
        article = dict(title=a.title, description=a.text_summary,
                date=time.strftime(self.timefmt, a.date), author=a.author,
                url=a.url)
        self.feed_dict.setdefault(feedtitle, []).append(article)

    def eco_article_failed(self, req, tb):
        # req.args[0] is the [i, link, title, ...] list given to makeRequests
        self.log.error('Failed to download %s with error:'%req.args[0][2])
        self.log.debug(tb)

    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x
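
    # The Economist marks up captioned images as small tables containing
    # one img and one or two font tags; rewrite each such table as a
    # plain div with the caption above the image.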
    def postprocess_html(self, soup, first):
        body = soup.find('body')
        # Iterate over a copy of attrs: deleting while iterating over the
        # live list would skip attributes
        for name, val in list(body.attrs):
            del body[name]
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            img.extract()
            del img['width']
            del img['height']
            div.insert(2, img)
            table.replaceWith(div)
        return soup
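
# A typical way to try this recipe out, assuming a calibre installation
# (--test limits the download to a couple of articles per feed):
#
#   ebook-convert economist_free.recipe output.epub --test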