#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic , Lorenzo Vigentini '
__version__ = 'v1.02'
__date__ = '14, March 2010'
__description__ = 'Italian daily newspaper (english version)'

# NOTE: the feed URLs are broken on the main site because the permalink
# structure was changed erroneously, i.e.:
#   actual link in feed:
#     http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
#   needs to be changed to the real article URL:
#     http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml

'''
http://www.corriere.it/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class ilCorriereEn(BasicNewsRecipe):
    """Recipe for the English edition of Il Corriere della Sera.

    The RSS feed publishes permalinks in an outdated layout, so
    ``get_article_url`` rewrites each feed link into the layout the site
    actually serves before the article is fetched.
    """

    author = 'Lorenzo Vigentini, based on Darko Miletic'
    description = 'Italian daily newspaper (english version)'

    cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title = u'Il Corriere della sera (english) '
    publisher = 'RCS Digital'
    category = 'News, politics, culture, economy, general interest'

    language = 'en'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content = False
    # NOTE(review): the option documented by calibre appears to be spelled
    # `recursions`; this attribute is presumably ignored. Confirm before
    # renaming, because enabling link recursion changes what is downloaded.
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    def get_article_url(self, article):
        """Return the fetchable URL for a feed entry.

        Feed links look like
        ``http://host/english/YY_<italian month>_DD/<slug>_<id>-....shtml``
        and are rewritten to
        ``http://host/International/english/articoli/20YY/MM/DD/<slug>.shtml``.

        :param article: feed entry exposing ``get('link')``.
        :return: the rewritten URL, or the feed link unchanged (possibly
            ``None``) when it does not match the legacy layout.
        """
        article_url = article.get('link')
        if not article_url:
            # Nothing to rewrite; hand the missing link back as-is.
            return article_url

        segments = article_url.split('/')
        if len(segments) < 6:
            return article_url

        # segments[4] carries the date as 'YY_<month name>_DD'.
        date_parts = segments[4].split('_')
        if len(date_parts) < 3:
            return article_url
        year, month_name, day = date_parts[:3]

        # All twelve months are checked; position + 1 is the month number.
        months = (
            'gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
            'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
            'dicembre',
        )
        if month_name not in months:
            return article_url
        month_number = '%02d' % (months.index(month_name) + 1)

        # Keep the text before the first '-' and drop its trailing
        # 9 characters ('_' plus the 8-character id) to get the slug.
        slug = segments[5].split('-')[0][:-9]
        if not slug:
            return article_url

        basename = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
        new_date_url = '20' + year + '/' + month_number + '/' + day + '/'
        return basename + new_date_url + slug + '.shtml'

    keep_only_tags = [
        dict(name='div', attrs={'class': ['news-dettaglio article', 'article']})
    ]

    remove_tags = [
        dict(name=['base', 'object', 'link', 'embed']),
        dict(name='div', attrs={'class': 'news-goback'}),
        dict(name='ul', attrs={'class': 'toolbar'})
    ]

    remove_tags_after = dict(name='p', attrs={'class': 'footnotes'})

    feeds = [
        (u'News', u'http://www.corriere.it/rss/english.xml')
    ]