New recipes for l'Espresso, Quotidiano, La Gazzetta dello Sport and Panorama by Lorenzo Vigentini

2025-12-07 21:55:07 -05:00 · 2010-01-10 20:34:15 -07:00 · 2010-01-10 20:34:15 -07:00 · e37f0747db
commit e37f0747db
parent a58920d592
7 changed files with 365 additions and 70 deletions
--- a/resources/recipes/corriere_della_sera_en.recipe
+++ b/resources/recipes/corriere_della_sera_en.recipe
@ -1,27 +1,35 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__author__    = 'Lorenzo Vigentini, based on Darko Miletic'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.01'
 __date__        = '10, January 2010'
 __description__ = 'Italian daily newspaper (english version)'
 '''
-www.corriere.it/english
+http://www.corriere.it/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
-class Corriere_en(BasicNewsRecipe):
+class ilCorriere(BasicNewsRecipe):
-    title                 = 'Corriere della Sera in English'
+    __author__     = 'Lorenzo Vigentini, based on Darko Miletic'
-    __author__            = 'Darko Miletic'
+    description    = 'Italian daily newspaper (english version)'
    description           = 'News from Milan and Italy'
    oldest_article        = 15
    publisher             = 'Corriere della Sera'
    category              = 'news, politics, Italy'
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'cp1252'
    remove_javascript     = True
    language = 'en'
    cover_url      = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title          = u'Il Corriere della sera (english) '
    publisher      = 'RCS Digital'
    category       = 'News, politics, culture, economy, general interest'
    language       = 'en'
    timefmt        = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    recursion             = 10
    remove_javascript = True
    no_stylesheets = True
    html2lrf_options = [
                          '--comment', description
@ -35,12 +43,13 @@ class Corriere_en(BasicNewsRecipe):
    keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
    remove_tags = [
-                    dict(name=['base','object','link','embed','img'])
+                   dict(name=['base','object','link','embed']),
-                   ,dict(name='div', attrs={'class':'news-goback'})
+                   dict(name='div', attrs={'class':'news-goback'}),
-                   ,dict(name='ul', attrs={'class':'toolbar'})
+                   dict(name='ul', attrs={'class':'toolbar'})
                  ]
    remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
-    feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')]
+    feeds = [
-
+             (u'News'  , u'http://www.corriere.it/rss/english.xml'  )
            ]
--- a/resources/recipes/corriere_della_sera_it.recipe
+++ b/resources/recipes/corriere_della_sera_it.recipe
@ -1,26 +1,36 @@
 #!/usr/bin/env  python
 __license__     = 'GPL v3'
 __author__      = 'Lorenzo Vigentini, based on Darko Miletic'
 __copyright__   = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.01'
 __date__        = '10, January 2010'
 __description__ = 'Italian daily newspaper'
 __license__   = 'GPL v3'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
-www.corriere.it
+http://www.corriere.it/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class Corriere_it(BasicNewsRecipe):
    title                 = 'Corriere della Sera'
    __author__            = 'Darko Miletic'
    description           = 'News from Milan and Italy'    
    oldest_article        = 7
    publisher             = 'Corriere della Sera'
    category              = 'news, politics, Italy'        
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'cp1252'
    remove_javascript     = True
    language = 'it'
 class ilCorriere(BasicNewsRecipe):
    __author__     = 'Lorenzo Vigentini, based on Darko Miletic'
    description    = 'Italian daily newspaper'
    cover_url      = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title          = u'Il Corriere della sera '
    publisher      = 'RCS Digital'
    category       = 'News, politics, culture, economy, general interest'
    language       = 'it'
    timefmt        = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    recursion             = 10
    remove_javascript = True
    no_stylesheets = True
    html2lrf_options = [
                          '--comment', description
@ -28,29 +38,30 @@ class Corriere_it(BasicNewsRecipe):
                        , '--publisher', publisher
                        , '--ignore-tables'
                        ]
-    
+
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' 
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
    keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
    remove_tags = [
-                    dict(name=['base','object','link','embed','img'])
+                   dict(name=['base','object','link','embed']),
-                   ,dict(name='div', attrs={'class':'news-goback'})
+                   dict(name='div', attrs={'class':'news-goback'}),
-                   ,dict(name='ul', attrs={'class':'toolbar'})
+                   dict(name='ul', attrs={'class':'toolbar'})
                  ]
    remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
    feeds = [ 
              (u'Ultimora'  , u'http://www.corriere.it/rss/ultimora.xml'  )
             ,(u'Cronache'  , u'http://www.corriere.it/rss/cronache.xml'  )
             ,(u'Economia'  , u'http://www.corriere.it/rss/economia.xml'  )
             ,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml')
             ,(u'Esteri'    , u'http://www.corriere.it/rss/esteri.xml'    )
             ,(u'Politica'  , u'http://www.corriere.it/rss/politica.xml'  )
             ,(u'Salute'    , u'http://www.corriere.it/rss/salute.xml'    )
             ,(u'Scienze'   , u'http://www.corriere.it/rss/scienze.xml'   )
             ,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml')
             ,(u'Sport'     , u'http://www.corriere.it/rss/sport.xml'     )
            ]
    feeds = [
             (u'Ultimora'  , u'http://www.corriere.it/rss/ultimora.xml'  ),
             (u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml'),
             (u'Cronache'  , u'http://www.corriere.it/rss/cronache.xml'  ),
             (u'Politica'  , u'http://www.corriere.it/rss/politica.xml'  ),
             (u'Esteri'    , u'http://www.corriere.it/rss/esteri.xml'    ),
             (u'Economia'  , u'http://www.corriere.it/rss/economia.xml'  ),
             (u'Cultura'    , u'http://www.corriere.it/rss/cultura.xml'  ),
             (u'Scienze'   , u'http://www.corriere.it/rss/scienze.xml'   ),
             (u'Salute'    , u'http://www.corriere.it/rss/salute.xml'    ),
             (u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml'),
             (u'Cinema e TV', u'http://www.corriere.it/rss/cinema.xml'   ),
             (u'Sport'     , u'http://www.corriere.it/rss/sport.xml'     )
            ]
--- a/resources/recipes/l_espresso.recipe
+++ b/resources/recipes/l_espresso.recipe
@ -0,0 +1,67 @@
 #!/usr/bin/env  python
 __license__     = 'GPL v3'
 __author__      = 'Lorenzo Vigentini'
 __copyright__   = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.02'
 __date__        = '10, January 2010'
 __description__ = 'Italian weekly magazine'
 '''espresso.repubblica.it'''
 from calibre.web.feeds.news import BasicNewsRecipe
 # Recipe configuration for the 'l Espresso ' weekly magazine feeds.
 # NOTE(review): the class is called laGazzetta although its title is
 # 'l Espresso ' -- looks like a copy-paste leftover; confirm before renaming.
 class laGazzetta(BasicNewsRecipe):
    # --- publication metadata ---
    __author__ = 'Lorenzo Vigentini'
    title = 'l Espresso '
    description = 'Italian weekly magazine'
    publisher = 'Gruppo editoriale lEspresso'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://espresso.repubblica.it/images/logo_espresso.gif'
    # --- fetch settings ---
    encoding = 'cp1252'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 16
    max_articles_per_feed = 100
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    no_stylesheets = True
    # --- feeds: (section title, feed URL) pairs ---
    feeds = [
        (u'Espresso Homepage', u'http://kpm.data.kataweb.it/kpm3eolx/rss/home'),
        (u'Espresso Local', u'http://kpm.data.kataweb.it/kpm3eolx/rss/local'),
        (u'Espresso Style & Design', u'http://kpm.data.kataweb.it/kpm3eolx/rss/style_design'),
        (u'Espresso Opinioni', u'http://kpm.data.kataweb.it/kpm3eolx/rss/opinioni'),
        (u'Espresso Rubriche', u'http://kpm.data.kataweb.it/kpm3eolx/rss/rubriche'),
        (u'Espresso Limes', u'http://temi.repubblica.it/limes/feed/'),
    ]
    # --- tag selectors: tag name plus the class/id values to match ---
    keep_only_tags = [
        {
            'name': 'div',
            'attrs': {'class': [
                'testo', 'copertina', 'occhiello', 'firma', 'didascalia',
                'content-second-right', 'detail-articles', 'titolo-local',
                'generic-articles',
            ]},
        },
        {
            'name': 'div',
            'attrs': {'class': ['generic-articles', 'summary', 'detail-articles']},
        },
        {'name': 'div', 'attrs': {'id': 'content-second-right'}},
    ]
    remove_tags = [
        {
            'name': 'div',
            'attrs': {'class': [
                'servizi', 'aggiungi', 'label-web', 'bottom-mobile',
                'box-abbonamenti', 'box-cerca', 'big', 'little', 'stampaweb',
            ]},
        },
        {
            'name': 'div',
            'attrs': {'id': [
                'topheader', 'header', 'navigation-new', 'navigation',
                'content-second-left',
            ]},
        },
        {'name': ['script', 'noscript', 'iframe']},
    ]
    # --- stylesheet text (kept verbatim) ---
    extra_css = '''
                h1 {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:18px;}
                h2 {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:18px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
                h3 {color:#333333;font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
                h4 {color:#333333; font-family:"Trebuchet MS",Arial,Helvetica,sans-serif;font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
                h5 {color:#333333; font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
                .firma {color:#333333;font-family:"Trebuchet MS",Arial,Helvetica,sans-serif;font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:15px; text-decoration:none;}
                .testo {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:10px;}
                '''
    def print_version(self, url):
        """Return *url* with the '/&print=true' suffix appended."""
        print_suffix = '/&print=true'
        return url + print_suffix
--- a/resources/recipes/la_gazzeta_dello_sport.recipe
+++ b/resources/recipes/la_gazzeta_dello_sport.recipe
@ -0,0 +1,79 @@
 #!/usr/bin/env  python
 __license__     = 'GPL v3'
 __author__      = 'Lorenzo Vigentini'
 __copyright__   = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.02'
 __date__        = '10, January 2010'
 __description__ = 'Sport news from the most read sport newspaper in Italy'
 '''www.gazzetta.it'''
 from calibre.web.feeds.news import BasicNewsRecipe
 # Recipe configuration for the 'La Gazzetta dello Sport ' feeds, plus the
 # URL rewrite used to reach each article's '_print.html' page.
 class laGazzetta(BasicNewsRecipe):
    # --- publication metadata ---
    __author__        = 'Lorenzo Vigentini'
    description   = 'Sport news from the most read sport newspaper in Italy'
    cover_url      = 'http://www.gazzetta.it/primapagina/images/prima_pagina_grande.png'
    title          = 'La Gazzetta dello Sport '
    publisher      = 'RCS Digital'
    category       = 'Sport News'
    language       = 'it'
    # --- fetch settings ---
    encoding       = 'cp1252'
    timefmt        = '[%a, %d %b, %Y]'
    oldest_article = 2
    max_articles_per_feed = 20
    use_embedded_content  = False
    recursion             = 10
    remove_javascript = True
    no_stylesheets = True
    # --- tag selectors: tag name plus the class/id values to match ---
    keep_only_tags = [ dict(name='div', attrs={'id':'articolo'})]
    remove_tags = [
                dict(name='ul',attrs={'id':['service-toolbar','sections-menu']}),
                dict(name='div',attrs={'id':['header','rightcol','sponsored','vxFlashPlayer','footer','print-box']}),
                dict(name='iframe',attrs={'id':'mirago-feed'}),
                dict(name='a',attrs={'id':'commenta-up'}),
                dict(name='cite',attrs={'class':['signature','parag-title']}),
                dict(name='a',attrs={'class':['last-comment','button-bold2']}),
                dict(name=['base','object','link','a','script','noscript'])
            ]
    # --- stylesheet text (kept verbatim) ---
    extra_css      = '''
                        h1 {font: sans-serif large;}
                        h2 {font: sans-serif medium;}
                        h3 {font: sans-serif small;}
                        h4 {font: sans-serif bold small;}
                        p  {font:10pt helvetica}
                        dd {font:8pt helvetica}
                      '''
    # --- feeds: (section title, feed URL) pairs ---
    feeds       = [
                   (u'Calcio',u'http://www.gazzetta.it/rss/Calcio.xml'),
                   (u'Formula 1',u'http://www.gazzetta.it/rss/Formula1.xml'),
                   # Label spelling fixed ('Motomodiale' -> 'Motomondiale') to match the feed URL.
                   (u'Motomondiale',u'http://www.gazzetta.it/rss/Motomondiale.xml'),
                   (u'Motori',u'http://www.gazzetta.it/rss/Motori.xml'),
                   (u'Ciclismo',u'http://www.gazzetta.it/rss/Ciclismo.xml'),
                   (u'Basket',u'http://www.gazzetta.it/rss/Basket.xml'),
                   (u'Tennis',u'http://www.gazzetta.it/rss/Tennis.xml'),
                   (u'Pallavolo',u'http://www.gazzetta.it/rss/Pallavolo.xml'),
                   (u'Vela',u'http://www.gazzetta.it/rss/Vela.xml'),
                   (u'Atletica',u'http://www.gazzetta.it/rss/Atletica.xml'),
                   (u'Altri Sport',u'http://www.gazzetta.it/rss/Sport_Vari.xml')
                 ]
    def print_version(self, url):
        """Rewrite an article URL into its '_print.html' counterpart.

        The result is built from the first three '/'-separated pieces of
        *url* (scheme, empty piece, host), the next four path pieces, and
        the article name taken from the last piece with '_print.html'
        appended.

        :param url: article URL as a string
        :return: the rewritten URL string
        """
        segments = url.split('/')
        basename = '/'.join(segments[:3]) + '/'
        sub_path = '/'.join(segments[3:7]) + '/'
        # Drop the last six characters of the final piece (presumably a
        # '.shtml' extension -- confirm), then keep what precedes the
        # first remaining dot.
        article = segments[-1][:-6].split('.')[0]
        # The former debug `print` statement was removed: it is Python-2-only
        # syntax and wrote to stdout for every article processed.
        return basename + sub_path + article + '_print.html'
--- a/resources/recipes/la_republica.recipe
+++ b/resources/recipes/la_republica.recipe
@ -1,29 +1,55 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
 description   = 'Italian daily newspaper - v1.01 (04, January 2010)'
 '''
 http://www.repubblica.it/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class LaRepublica(BasicNewsRecipe):
-    title          = u'la Repubblica'
+    author        = 'Lorenzo Vigentini, based on Darko Miletic'
-    oldest_article = 1
+    description   = 'Italian daily newspaper'
    language = 'it'
-    author = 'Darko Miletic'
+    cover_url      = 'http://www.repubblica.it/images/homepage/la_repubblica_logo.gif'
    title          = u'La Repubblica'
    publisher      = 'Gruppo editoriale L\'Espresso'
    category       = 'News, politics, culture, economy, general interest'
    language       = 'it'
    timefmt        = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    recursion             = 10
    remove_javascript = True
-    no_stylesheets = True
+
    keep_only_tags     = [dict(name='div', attrs={'class':'articolo'})]
    remove_tags        = [
-                            dict(name=['object','link'])
+                            dict(name=['object','link']),
-                           ,dict(name='span',attrs={'class':'linkindice'})
+                            dict(name='span',attrs={'class':'linkindice'}),
-                           ,dict(name='div',attrs={'class':'bottom-mobile'})
+                            dict(name='div',attrs={'class':'bottom-mobile'}),
-                           ,dict(name='div',attrs={'id':['rssdiv','blocco']})
+                            dict(name='div',attrs={'id':['rssdiv','blocco']})
                         ]
-    
+
    feeds          = [
-                       (u'Repubblica homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+                       (u'Repubblica Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
                       (u'Repubblica Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
                       (u'Repubblica Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
                       (u'Repubblica Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
                       (u'Repubblica Politica', u'http://www.repubblica.it/rss/politica/rss2.0.xml'),
                       (u'Repubblica Scienze', u'http://www.repubblica.it/rss/scienze/rss2.0.xml'),
                       (u'Repubblica Tecnologia', u'http://www.repubblica.it/rss/tecnologia/rss2.0.xml'),
-                       (u'Repubblica Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml')
+                       (u'Repubblica Scuola e Universita', u'http://www.repubblica.it/rss/scuola_e_universita/rss2.0.xml'),
                       (u'Repubblica Ambiente', u'http://www.repubblica.it/rss/ambiente/rss2.0.xml'),
 		       (u'Repubblica Cultura', u'http://www.repubblica.it/rss/spettacoli_e_cultura/rss2.0.xml'),
 		       (u'Repubblica Persone', u'http://www.repubblica.it/rss/persone/rss2.0.xml'),
 		       (u'Repubblica Sport', u'http://www.repubblica.it/rss/sport/rss2.0.xml'),
 		       (u'Repubblica Calcio', u'http://www.repubblica.it/rss/sport/calcio/rss2.0.xml')
                     ]
--- a/resources/recipes/panorama.recipe
+++ b/resources/recipes/panorama.recipe
@ -0,0 +1,51 @@
 #!/usr/bin/env  python
 __license__     = 'GPL v3'
 __author__      = 'Lorenzo Vigentini'
 __copyright__   = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.01'
 __date__        = '10, January 2010'
 __description__ = 'Italian weekly magazine'
 '''
 http://www.panorama.it/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 # Recipe configuration for the u'Panorama ' weekly magazine feeds.
 class panorama(BasicNewsRecipe):
    # --- publication metadata ---
    __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
    title = u'Panorama '
    description = 'Italian weekly magazine'
    publisher = 'Mondadori'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://www.panorama.it/panorama/images/panorama_large.gif'
    # --- fetch settings ---
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    # --- tag selectors: tag name plus the class/id values to match ---
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': ['post', 'article']}},
    ]
    remove_tags = [
        {'name': ['object', 'link']},
        {
            'name': 'div',
            'attrs': {'class': [
                'post-meta', 'sharing-tools', 'related', 'comments', 'prev-next',
            ]},
        },
        {'name': 'div', 'attrs': {'id': ['related-posts', 'footer']}},
    ]
    # --- feeds: (section title, feed URL) pairs ---
    feeds = [
        (u'Panorama Italia', u'http://blog.panorama.it/italia/feed'),
        (u'Panorama Mondo', u'http://blog.panorama.it/mondo/feed'),
        (u'Panorama Cultura e societa', u'http://blog.panorama.it/culturaesocieta/feed'),
        (u'Panorama Hitech e scienza', u'http://blog.panorama.it/hitechescienza/feed'),
        (u'Panorama Motori', u'http://blog.panorama.it/autoemoto/feed'),
        (u'Panorama libri', u'http://blog.panorama.it/libri/feed'),
        (u'Panorama Opinioni', u'http://blog.panorama.it/opinioni/feed'),
    ]
--- a/resources/recipes/quotidiano.recipe
+++ b/resources/recipes/quotidiano.recipe
@ -0,0 +1,52 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini'
 __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
 __version__     = 'v1.01'
 __date__        = '10, January 2010'
 __description__ = 'Italian News Agency'
 '''
 http://www.quotidianonet.ilsole24ore.com/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 # Recipe configuration for the u'Quotidiano Net ' news feeds.
 # NOTE(review): the class is called panorama although its title is
 # u'Quotidiano Net ' -- looks like a copy-paste leftover; confirm before renaming.
 class panorama(BasicNewsRecipe):
    # --- publication metadata ---
    __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
    title = u'Quotidiano Net '
    description = 'Italian News Agency'
    publisher = 'italiaNews'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://quotidianonet.ilsole24ore.com/file_generali/img/logo_quotidianonet-top.gif'
    # --- fetch settings ---
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    # --- tag selectors: tag name plus the class/id values to match ---
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'box_contenuto articolo'}},
    ]
    remove_tags = [
        {'name': ['object', 'link']},
        {
            'name': 'div',
            'attrs': {'class': [
                'post-meta', 'sharing-tools', 'related', 'comments',
                'prev-next', 'box_contenuto adsense',
            ]},
        },
        {
            'name': 'div',
            'attrs': {'id': [
                'strumenti', 'related-posts', 'footer', 'inline_boxes',
                'inline_boxes_header', 'inline_boxes_body', 'bottom',
            ]},
        },
        {'name': 'span', 'attrs': {'class': 'titolosezione default'}},
    ]
    # --- feeds: (section title, feed URL) pairs ---
    feeds = [
        (u'Prima pagina', u'http://quotidianonet.ilsole24ore.com/rss/home.xml'),
        (u'Cronaca', u'http://quotidianonet.ilsole24ore.com/rss/cronaca.xml'),
        (u'Economia', u'http://quotidianonet.ilsole24ore.com/rss/economia.xml'),
        (u'Esteri', u'http://quotidianonet.ilsole24ore.com/rss/esteri.xml'),
        (u'Politica', u'http://quotidianonet.ilsole24ore.com/rss/politica.xml'),
        (u'Salute', u'http://quotidianonet.ilsole24ore.com/rss/salute.xml'),
        (u'Tecnologia', u'http://quotidianonet.ilsole24ore.com/rss/tecnologia.xml'),
    ]