diff --git a/resources/recipes/corriere_della_sera_en.recipe b/resources/recipes/corriere_della_sera_en.recipe index ca6aae2cc1..fad1636518 100644 --- a/resources/recipes/corriere_della_sera_en.recipe +++ b/resources/recipes/corriere_della_sera_en.recipe @@ -1,75 +1,75 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__author__ = 'Lorenzo Vigentini, based on Darko Miletic' -__copyright__ = '2009, Darko Miletic , Lorenzo Vigentini ' -__version__ = 'v1.02' -__date__ = '14, March 2010' -__description__ = 'Italian daily newspaper (english version)' -# NOTE: the feeds url are broken on the main site as the permalink structure has been changed erroneously ie: -# actual link in feed http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml -# this needs to be change to -# real feed URL http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml -''' -http://www.corriere.it/ -''' - -from calibre.web.feeds.news import BasicNewsRecipe - -class ilCorriereEn(BasicNewsRecipe): - author = 'Lorenzo Vigentini, based on Darko Miletic' - description = 'Italian daily newspaper (english version)' - - cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520' - title = u'Il Corriere della sera (english) ' - publisher = 'RCS Digital' - category = 'News, politics, culture, economy, general interest' - - language = 'en' - timefmt = '[%a, %d %b, %Y]' - - oldest_article = 5 - max_articles_per_feed = 100 - use_embedded_content = False - recursion = 10 - - remove_javascript = True - no_stylesheets = True - - def get_article_url(self, article): - articleUrl= article.get('link') - segments = articleUrl.split('/') - basename = '/'.join(segments[:3]) + '/' + 'International/english/articoli/' - - #the date has to be redone with the url structure - mlist1 = ['gennaio','febbraio','marzo','aprile','maggio','giugno','luglio','agosto','settembre','ottobre','novembre','dicembre'] - mlist2 = ['01','02','03','04','05','06','07','08','09','10','11','12'] - myDate = segments[4].split('_') - x=0 - for x in range(11): - if myDate[1] == mlist1[x]: - noMonth=mlist2[x] - break - - newDateUrl= '20'+ myDate[0] + '/' + noMonth + '/' + myDate[2] + '/' - - #clean the article title - articleURLseg=segments[5].split('-') - myArticle = (articleURLseg[0])[:-9] + '.shtml' - - myURL= basename + newDateUrl + myArticle - #print myURL - return myURL - - keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] - - remove_tags = [ - dict(name=['base','object','link','embed']), - dict(name='div', attrs={'class':'news-goback'}), - dict(name='ul', attrs={'class':'toolbar'}) - ] - - remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) - - feeds = [ - (u'News' , u'http://www.corriere.it/rss/english.xml' ) - ] +#!/usr/bin/env python +__license__ = 'GPL v3' +__author__ = 'Lorenzo Vigentini, based on Darko Miletic' +__copyright__ = '2009, Darko Miletic , Lorenzo Vigentini ' +__version__ = 'v1.02' +__date__ = '14, March 2010' +__description__ = 'Italian daily newspaper (english version)' +# NOTE: the feeds url are broken on the main site as the permalink structure has been changed erroneously ie: +# actual link in feed http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml +# this needs to be change to +# real feed URL http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml +''' +http://www.corriere.it/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class ilCorriereEn(BasicNewsRecipe): + author = 'Lorenzo Vigentini, based on Darko Miletic' + description = 'Italian daily newspaper (english version)' + + cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520' + title = u'Il Corriere della sera (english) ' + publisher = 'RCS Digital' + category = 'News, politics, culture, economy, general interest' + + language = 'en' + timefmt = '[%a, %d %b, %Y]' + + oldest_article = 5 + max_articles_per_feed = 100 + use_embedded_content = False + recursion = 10 + + remove_javascript = True + no_stylesheets = True + + def get_article_url(self, article): + articleUrl= article.get('link') + segments = articleUrl.split('/') + basename = '/'.join(segments[:3]) + '/' + 'International/english/articoli/' + + #the date has to be redone with the url structure + mlist1 = ['gennaio','febbraio','marzo','aprile','maggio','giugno','luglio','agosto','settembre','ottobre','novembre','dicembre'] + mlist2 = ['01','02','03','04','05','06','07','08','09','10','11','12'] + myDate = segments[4].split('_') + x=0 + for x in range(11): + if myDate[1] == mlist1[x]: + noMonth=mlist2[x] + break + + newDateUrl= '20'+ myDate[0] + '/' + noMonth + '/' + myDate[2] + '/' + + #clean the article title + articleURLseg=segments[5].split('-') + myArticle = (articleURLseg[0])[:-9] + '.shtml' + + myURL= basename + newDateUrl + myArticle + #print myURL + return myURL + + keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] + + remove_tags = [ + dict(name=['base','object','link','embed']), + dict(name='div', attrs={'class':'news-goback'}), + dict(name='ul', attrs={'class':'toolbar'}) + ] + + remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) + + feeds = [ + (u'News' , u'http://www.corriere.it/rss/english.xml' ) + ]