Update Irish Times. Fixes #1159553 (Updated news recipe for the Irish Times)

This commit is contained in:
Kovid Goyal 2013-03-25 08:59:17 +05:30
parent 8535e21694
commit 3eacc9cadb

View File

@ -1,65 +1,62 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns" __copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl"
''' '''
irishtimes.com irishtimes.com
''' '''
import re import urlparse, re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class IrishTimes(BasicNewsRecipe): class IrishTimes(BasicNewsRecipe):
title = u'The Irish Times' title = u'The Irish Times'
encoding = 'ISO-8859-15' __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
__author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns"
language = 'en_IE' language = 'en_IE'
timefmt = ' (%A, %B %d, %Y)'
masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'
encoding = 'utf-8'
oldest_article = 1.0 oldest_article = 1.0
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
simultaneous_downloads= 5 temp_files = []
articles_are_obfuscated = True
r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
remove_tags = [dict(name='div', attrs={'class':'footer'})]
extra_css = 'p, div { margin: 0pt; border: 0pt; text-indent: 0.5em } .headline {font-size: large;} \n .fact { padding-top: 10pt }'
feeds = [ feeds = [
('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), ('News', 'http://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'), ('World', 'http://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'), ('Politics', 'http://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'), ('Business', 'http://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'), ('Culture', 'http://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'), ('Sport', 'http://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'), ('Debate', 'http://www.irishtimes.com/cmlink/debate-1.1319211'),
('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'), ('Life & Style', 'http://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'),
('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'),
('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'),
('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
('Property', 'http://www.irishtimes.com/feeds/rss/newspaper/property.rss'),
('The Tickets', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
('News features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
('Obituaries', 'http://www.irishtimes.com/feeds/rss/newspaper/obituaries.rss'),
] ]
def print_version(self, url): def get_obfuscated_article(self, url):
if url.count('rss.feedsportal.com'): # Insert a pic from the original url, but use content from the print url
#u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm') pic = None
u = url.find('irishtimes') pics = self.index_to_soup(url)
u = 'http://www.irishtimes.com' + url[u + 12:] div = pics.find('div', {'class' : re.compile('image-carousel')})
u = u.replace('0C', '/') if div:
u = u.replace('A', '') pic = div.img
u = u.replace('0Bhtml/story01.htm', '_pf.html') if pic:
else: try:
u = url.replace('.html','_pf.html') pic['src'] = urlparse.urljoin(url, pic['src'])
return u pic.extract()
except:
pic = None
content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
if pic:
content.p.insert(0, pic)
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(content.prettify())
self.temp_files[-1].close()
return self.temp_files[-1].name
def get_article_url(self, article):
    """Return the URL to fetch for one feed article.

    article: object exposing a ``link`` attribute (presumably a parsed
        feed entry -- confirm against the caller).

    Returns the value of ``article.link`` unchanged.
    """
    link = article.link
    return link