From 576cf5c26902e170bf0432f4dcc9081bc1824f2b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 15 Mar 2010 18:16:26 +0530 Subject: [PATCH] Fix Corriere Della Seraa (EN) --- .../recipes/corriere_della_sera_en.recipe | 130 ++++++++++-------- 1 file changed, 75 insertions(+), 55 deletions(-) diff --git a/resources/recipes/corriere_della_sera_en.recipe b/resources/recipes/corriere_della_sera_en.recipe index 0a15d77b9a..ca6aae2cc1 100644 --- a/resources/recipes/corriere_della_sera_en.recipe +++ b/resources/recipes/corriere_della_sera_en.recipe @@ -1,55 +1,75 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__author__ = 'Lorenzo Vigentini, based on Darko Miletic' -__copyright__ = '2009, Darko Miletic , Lorenzo Vigentini ' -__version__ = 'v1.01' -__date__ = '10, January 2010' -__description__ = 'Italian daily newspaper (english version)' -''' -http://www.corriere.it/ -''' - -from calibre.web.feeds.news import BasicNewsRecipe - -class ilCorriere(BasicNewsRecipe): - __author__ = 'Lorenzo Vigentini, based on Darko Miletic' - description = 'Italian daily newspaper (english version)' - - cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520' - title = u'Il Corriere della sera (english) ' - publisher = 'RCS Digital' - category = 'News, politics, culture, economy, general interest' - - language = 'en' - timefmt = '[%a, %d %b, %Y]' - - oldest_article = 1 - max_articles_per_feed = 100 - use_embedded_content = False - recursion = 10 - - remove_javascript = True - no_stylesheets = True - - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] - - remove_tags = [ - dict(name=['base','object','link','embed']), - dict(name='div', 
def get_article_url(self, article):
    '''
    Return the working permalink for a feed entry.

    The feed publishes links shaped like
        http://HOST/english/YY_month_DD/slug_xxxxxxxx-....shtml
    (month is the Italian month name), which the recipe header notes are
    broken on the site. They are rewritten to
        http://HOST/International/english/articoli/20YY/MM/DD/slug.shtml

    article -- feed entry; only its 'link' value is read, via .get().

    Returns the rewritten URL. If the entry has no link, or the link does
    not have the shape described above, the feed's own link is returned
    unchanged instead of raising.
    '''
    # Italian month names in calendar order; position + 1 is the month number.
    months = ('gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre')

    link = article.get('link')
    if not link:
        return link

    # 'http://host/english/10_marzo_11/name.shtml' splits into
    # ['http:', '', 'host', 'english', '10_marzo_11', 'name.shtml']
    segments = link.split('/')
    try:
        year, month_name, day = segments[4].split('_')[:3]
        # index() covers all twelve months (the earlier range(11) loop
        # never reached 'dicembre') and raises ValueError when unknown.
        month = '%02d' % (months.index(month_name) + 1)
        # Keep the text before the first '-', then drop the trailing
        # 9 characters: the '_' plus the 8-character id prefix.
        slug = segments[5].split('-')[0][:-9]
    except (IndexError, ValueError):
        # Not the known feed format; leave the link untouched.
        return link

    host_root = '/'.join(segments[:3])
    return '%s/International/english/articoli/20%s/%s/%s/%s.shtml' % (
        host_root, year, month, day, slug)