#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic , Lorenzo Vigentini '
__version__ = 'v1.02'
__date__ = '14, March 2010'
__description__ = 'Italian daily newspaper (english version)'

# NOTE: the feed URLs are broken on the main site because the permalink
# structure was changed erroneously, i.e.:
#   link actually found in the feed:
#     http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
#   this needs to be changed to the real article URL:
#     http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml

'''
http://www.corriere.it/
'''

from calibre.web.feeds.news import BasicNewsRecipe

# Italian month names, as they appear in the feed's links, mapped to the
# two-digit month numbers used by the real article URLs.
_MONTH_NUMBERS = {
    'gennaio': '01',
    'febbraio': '02',
    'marzo': '03',
    'aprile': '04',
    'maggio': '05',
    'giugno': '06',
    'luglio': '07',
    'agosto': '08',
    'settembre': '09',
    'ottobre': '10',
    'novembre': '11',
    'dicembre': '12',
}


class ilCorriereEn(BasicNewsRecipe):
    """Recipe for the English edition of Il Corriere della Sera.

    The links published in the RSS feed use an outdated permalink layout,
    so ``get_article_url`` rewrites each one to the layout described in the
    note at the top of this file.
    """

    author = 'Lorenzo Vigentini, based on Darko Miletic'
    description = 'Italian daily newspaper (english version)'

    cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'

    title = u'Il Corriere della sera (english) '
    publisher = 'RCS Digital'
    category = 'News, politics, culture, economy, general interest'

    language = 'en'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    def get_article_url(self, article):
        """Return the rewritten URL for a feed entry.

        A feed link of the form
        ``<scheme>://<host>/english/<yy>_<italian month>_<dd>/<title>_<id>-....shtml``
        is rewritten to
        ``<scheme>://<host>/International/english/articoli/20<yy>/<mm>/<dd>/<title>.shtml``.

        :param article: feed entry; only its ``'link'`` value is read.
        :return: the rewritten URL; the original link unchanged when it does
            not follow the layout above; ``None`` when the entry has no link.
        """
        link = article.get('link')
        if not link:
            return None

        segments = link.split('/')
        try:
            # segments[4] is the date folder, e.g. '10_marzo_11'.
            year, month_name, day = segments[4].split('_')[:3]
            month = _MONTH_NUMBERS[month_name]
            # segments[5] is '<title>_<8 chars>-<rest>.shtml'; keep the text
            # before the first '-' and drop its trailing 9 characters
            # ('_' plus the 8-character identifier prefix).
            slug = segments[5].split('-')[0][:-9]
        except (IndexError, ValueError, KeyError):
            # Not the legacy permalink layout: leave the link untouched
            # rather than failing on it.
            return link

        # scheme + host of the original link, followed by the real section path
        base = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
        # The feed carries two-digit years only; the century is assumed to be 20xx.
        date_path = '20' + year + '/' + month + '/' + day + '/'
        return base + date_path + slug + '.shtml'

    keep_only_tags = [
        dict(name='div', attrs={'class': ['news-dettaglio article', 'article']})
    ]

    remove_tags = [
        dict(name=['base', 'object', 'link', 'embed']),
        dict(name='div', attrs={'class': 'news-goback'}),
        dict(name='ul', attrs={'class': 'toolbar'})
    ]

    remove_tags_after = dict(name='p', attrs={'class': 'footnotes'})

    feeds = [
        (u'News', u'http://www.corriere.it/rss/english.xml')
    ]