New recipes for l'Espresso, Quotidiano, La Gazzetta dello Sport and Panorama by Lorenzo Vigentini

This commit is contained in:
Kovid Goyal 2010-01-10 20:34:15 -07:00
parent a58920d592
commit e37f0747db
7 changed files with 365 additions and 70 deletions

View File

@ -1,27 +1,35 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '10, January 2010'
__description__ = 'Italian daily newspaper (english version)'
''' '''
www.corriere.it/english http://www.corriere.it/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_en(BasicNewsRecipe): class ilCorriere(BasicNewsRecipe):
title = 'Corriere della Sera in English' __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__author__ = 'Darko Miletic' description = 'Italian daily newspaper (english version)'
description = 'News from Milan and Italy'
oldest_article = 15
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
language = 'en'
cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
title = u'Il Corriere della sera (english) '
publisher = 'RCS Digital'
category = 'News, politics, culture, economy, general interest'
language = 'en'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
@ -35,12 +43,13 @@ class Corriere_en(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [ remove_tags = [
dict(name=['base','object','link','embed','img']) dict(name=['base','object','link','embed']),
,dict(name='div', attrs={'class':'news-goback'}) dict(name='div', attrs={'class':'news-goback'}),
,dict(name='ul', attrs={'class':'toolbar'}) dict(name='ul', attrs={'class':'toolbar'})
] ]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [(u'Italian Life', u'http://www.corriere.it/rss/english.xml')] feeds = [
(u'News' , u'http://www.corriere.it/rss/english.xml' )
]

View File

@ -1,26 +1,36 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '10, January 2010'
__description__ = 'Italian daily newspaper'
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.corriere.it http://www.corriere.it/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Corriere_it(BasicNewsRecipe):
title = 'Corriere della Sera'
__author__ = 'Darko Miletic'
description = 'News from Milan and Italy'
oldest_article = 7
publisher = 'Corriere della Sera'
category = 'news, politics, Italy'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
remove_javascript = True
language = 'it'
class ilCorriere(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
description = 'Italian daily newspaper'
cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
title = u'Il Corriere della sera '
publisher = 'RCS Digital'
category = 'News, politics, culture, economy, general interest'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
@ -28,29 +38,30 @@ class Corriere_it(BasicNewsRecipe):
, '--publisher', publisher , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]
remove_tags = [ remove_tags = [
dict(name=['base','object','link','embed','img']) dict(name=['base','object','link','embed']),
,dict(name='div', attrs={'class':'news-goback'}) dict(name='div', attrs={'class':'news-goback'}),
,dict(name='ul', attrs={'class':'toolbar'}) dict(name='ul', attrs={'class':'toolbar'})
] ]
remove_tags_after = dict(name='p', attrs={'class':'footnotes'}) remove_tags_after = dict(name='p', attrs={'class':'footnotes'})
feeds = [
(u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' )
,(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' )
,(u'Economia' , u'http://www.corriere.it/rss/economia.xml' )
,(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml')
,(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' )
,(u'Politica' , u'http://www.corriere.it/rss/politica.xml' )
,(u'Salute' , u'http://www.corriere.it/rss/salute.xml' )
,(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' )
,(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml')
,(u'Sport' , u'http://www.corriere.it/rss/sport.xml' )
]
feeds = [
(u'Ultimora' , u'http://www.corriere.it/rss/ultimora.xml' ),
(u'Editoriali', u'http://www.corriere.it/rss/editoriali.xml'),
(u'Cronache' , u'http://www.corriere.it/rss/cronache.xml' ),
(u'Politica' , u'http://www.corriere.it/rss/politica.xml' ),
(u'Esteri' , u'http://www.corriere.it/rss/esteri.xml' ),
(u'Economia' , u'http://www.corriere.it/rss/economia.xml' ),
(u'Cultura' , u'http://www.corriere.it/rss/cultura.xml' ),
(u'Scienze' , u'http://www.corriere.it/rss/scienze.xml' ),
(u'Salute' , u'http://www.corriere.it/rss/salute.xml' ),
(u'Spettacolo', u'http://www.corriere.it/rss/spettacoli.xml'),
(u'Cinema e TV', u'http://www.corriere.it/rss/cinema.xml' ),
(u'Sport' , u'http://www.corriere.it/rss/sport.xml' )
]

View File

@ -0,0 +1,67 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.02'
__date__ = '10, January 2010'
__description__ = 'Italian weekly magazine'
'''espresso.repubblica.it'''
from calibre.web.feeds.news import BasicNewsRecipe
class laGazzetta(BasicNewsRecipe):
    """Recipe definition for the Italian weekly magazine l'Espresso.

    NOTE(review): the class name matches the La Gazzetta dello Sport recipe
    rather than this publication -- presumably a copy/paste leftover. It is
    kept unchanged so that nothing referring to it by name breaks.
    """

    # -- Publication metadata -------------------------------------------
    title = 'l Espresso '
    description = 'Italian weekly magazine'
    __author__ = 'Lorenzo Vigentini'
    publisher = 'Gruppo editoriale lEspresso'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://espresso.repubblica.it/images/logo_espresso.gif'

    # -- Download / conversion settings ---------------------------------
    encoding = 'cp1252'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 16
    max_articles_per_feed = 100
    recursion = 10
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True

    # -- RSS sources as (section title, feed URL) pairs ------------------
    feeds = [
        (u'Espresso Homepage',
         u'http://kpm.data.kataweb.it/kpm3eolx/rss/home'),
        (u'Espresso Local',
         u'http://kpm.data.kataweb.it/kpm3eolx/rss/local'),
        (u'Espresso Style & Design',
         u'http://kpm.data.kataweb.it/kpm3eolx/rss/style_design'),
        (u'Espresso Opinioni',
         u'http://kpm.data.kataweb.it/kpm3eolx/rss/opinioni'),
        (u'Espresso Rubriche',
         u'http://kpm.data.kataweb.it/kpm3eolx/rss/rubriche'),
        (u'Espresso Limes',
         u'http://temi.repubblica.it/limes/feed/'),
    ]

    def print_version(self, url):
        """Return ``url`` with the ``/&print=true`` suffix appended."""
        print_suffix = '/&print=true'
        return url + print_suffix

    # -- Tag filters: <div> elements to keep, selected by class or id ----
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': [
            'testo', 'copertina', 'occhiello', 'firma', 'didascalia',
            'content-second-right', 'detail-articles', 'titolo-local',
            'generic-articles',
        ]}},
        {'name': 'div', 'attrs': {'class': [
            'generic-articles', 'summary', 'detail-articles',
        ]}},
        {'name': 'div', 'attrs': {'id': 'content-second-right'}},
    ]

    # -- Tag filters: elements to strip (site chrome, widgets, scripts) --
    remove_tags = [
        {'name': 'div', 'attrs': {'class': [
            'servizi', 'aggiungi', 'label-web', 'bottom-mobile',
            'box-abbonamenti', 'box-cerca', 'big', 'little', 'stampaweb',
        ]}},
        {'name': 'div', 'attrs': {'id': [
            'topheader', 'header', 'navigation-new', 'navigation',
            'content-second-left',
        ]}},
        {'name': ['script', 'noscript', 'iframe']},
    ]

    # -- Extra CSS; the literal is kept flush-left so its bytes are unchanged
    extra_css = '''
h1 {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:18px;}
h2 {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:18px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
h3 {color:#333333;font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
h4 {color:#333333; font-family:"Trebuchet MS",Arial,Helvetica,sans-serif;font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
h5 {color:#333333; font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
.firma {color:#333333;font-family:"Trebuchet MS",Arial,Helvetica,sans-serif;font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:15px; text-decoration:none;}
.testo {font-family:"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:10px;}
'''

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.02'
__date__ = '10, January 2010'
__description__ = 'Sport news from the most read sport newspaper in Italy'
'''www.gazzetta.it'''
from calibre.web.feeds.news import BasicNewsRecipe
class laGazzetta(BasicNewsRecipe):
    """Recipe definition for La Gazzetta dello Sport (www.gazzetta.it).

    Declares the publication metadata, the RSS feeds to read, the tag
    filters for article pages, some extra CSS, and ``print_version``, which
    maps an article URL to its ``_print.html`` counterpart.
    """

    # -- Publication metadata -------------------------------------------
    __author__ = 'Lorenzo Vigentini'
    description = 'Sport news from the most read sport newspaper in Italy'
    cover_url = 'http://www.gazzetta.it/primapagina/images/prima_pagina_grande.png'
    title = 'La Gazzetta dello Sport '
    publisher = 'RCS Digital'
    category = 'Sport News'
    language = 'it'

    # -- Download / conversion settings ---------------------------------
    encoding = 'cp1252'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 2
    max_articles_per_feed = 20
    use_embedded_content = False
    recursion = 10
    remove_javascript = True
    no_stylesheets = True

    # -- Tag filters ------------------------------------------------------
    keep_only_tags = [dict(name='div', attrs={'id': 'articolo'})]

    remove_tags = [
        dict(name='ul', attrs={'id': ['service-toolbar', 'sections-menu']}),
        dict(name='div', attrs={'id': ['header', 'rightcol', 'sponsored', 'vxFlashPlayer', 'footer', 'print-box']}),
        dict(name='iframe', attrs={'id': 'mirago-feed'}),
        dict(name='a', attrs={'id': 'commenta-up'}),
        dict(name='cite', attrs={'class': ['signature', 'parag-title']}),
        dict(name='a', attrs={'class': ['last-comment', 'button-bold2']}),
        dict(name=['base', 'object', 'link', 'a', 'script', 'noscript'])
    ]

    # -- Extra CSS; the literal is kept flush-left so its bytes are unchanged
    extra_css = '''
h1 {font: sans-serif large;}
h2 {font: sans-serif medium;}
h3 {font: sans-serif small;}
h4 {font: sans-serif bold small;}
p {font:10pt helvetica}
dd {font:8pt helvetica}
'''

    # -- RSS sources as (section title, feed URL) pairs ------------------
    feeds = [
        (u'Calcio', u'http://www.gazzetta.it/rss/Calcio.xml'),
        (u'Formula 1', u'http://www.gazzetta.it/rss/Formula1.xml'),
        # Title previously read 'Motomodiale'; spelling now matches the URL.
        (u'Motomondiale', u'http://www.gazzetta.it/rss/Motomondiale.xml'),
        (u'Motori', u'http://www.gazzetta.it/rss/Motori.xml'),
        (u'Ciclismo', u'http://www.gazzetta.it/rss/Ciclismo.xml'),
        (u'Basket', u'http://www.gazzetta.it/rss/Basket.xml'),
        (u'Tennis', u'http://www.gazzetta.it/rss/Tennis.xml'),
        (u'Pallavolo', u'http://www.gazzetta.it/rss/Pallavolo.xml'),
        (u'Vela', u'http://www.gazzetta.it/rss/Vela.xml'),
        (u'Atletica', u'http://www.gazzetta.it/rss/Atletica.xml'),
        (u'Altri Sport', u'http://www.gazzetta.it/rss/Sport_Vari.xml')
    ]

    def print_version(self, url):
        """Build the ``_print.html`` URL that corresponds to ``url``.

        The URL is split on ``/`` and reassembled from three pieces:

        * the first three segments (scheme, empty segment, host) joined
          back together with a trailing ``/``;
        * segments 3..6 (at most four path components) joined with a
          trailing ``/``;
        * the last segment with its final six characters removed
          (presumably a ``.shtml`` extension -- TODO confirm), cut at the
          first ``.`` and suffixed with ``_print.html``.

        Example::

            http://www.gazzetta.it/a/b/c/d/story.shtml
            -> http://www.gazzetta.it/a/b/c/d/story_print.html

        The debug ``print`` that used to echo the computed URL has been
        removed: it wrote to stdout on every call and is not valid syntax
        on Python 3. The returned value is unchanged.
        """
        segments = url.split('/')
        base = '/'.join(segments[:3]) + '/'
        sub_path = '/'.join(segments[3:7]) + '/'
        # Drop the trailing six characters, then anything after the first dot.
        article = segments[-1][:-6].split('.')[0]
        return base + sub_path + article + '_print.html'

View File

@ -1,29 +1,55 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010)'
'''
http://www.repubblica.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaRepublica(BasicNewsRecipe): class LaRepublica(BasicNewsRecipe):
title = u'la Repubblica' author = 'Lorenzo Vigentini, based on Darko Miletic'
oldest_article = 1 description = 'Italian daily newspaper'
language = 'it'
author = 'Darko Miletic' cover_url = 'http://www.repubblica.it/images/homepage/la_repubblica_logo.gif'
title = u'La Repubblica'
publisher = 'Gruppo editoriale L\'Espresso'
category = 'News, politics, culture, economy, general interest'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
remove_javascript = True remove_javascript = True
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'class':'articolo'})] keep_only_tags = [dict(name='div', attrs={'class':'articolo'})]
remove_tags = [ remove_tags = [
dict(name=['object','link']) dict(name=['object','link']),
,dict(name='span',attrs={'class':'linkindice'}) dict(name='span',attrs={'class':'linkindice'}),
,dict(name='div',attrs={'class':'bottom-mobile'}) dict(name='div',attrs={'class':'bottom-mobile'}),
,dict(name='div',attrs={'id':['rssdiv','blocco']}) dict(name='div',attrs={'id':['rssdiv','blocco']})
] ]
feeds = [ feeds = [
(u'Repubblica homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'), (u'Repubblica Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Repubblica Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
(u'Repubblica Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
(u'Repubblica Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
(u'Repubblica Politica', u'http://www.repubblica.it/rss/politica/rss2.0.xml'),
(u'Repubblica Scienze', u'http://www.repubblica.it/rss/scienze/rss2.0.xml'), (u'Repubblica Scienze', u'http://www.repubblica.it/rss/scienze/rss2.0.xml'),
(u'Repubblica Tecnologia', u'http://www.repubblica.it/rss/tecnologia/rss2.0.xml'), (u'Repubblica Tecnologia', u'http://www.repubblica.it/rss/tecnologia/rss2.0.xml'),
(u'Repubblica Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml') (u'Repubblica Scuola e Universita', u'http://www.repubblica.it/rss/scuola_e_universita/rss2.0.xml'),
(u'Repubblica Ambiente', u'http://www.repubblica.it/rss/ambiente/rss2.0.xml'),
(u'Repubblica Cultura', u'http://www.repubblica.it/rss/spettacoli_e_cultura/rss2.0.xml'),
(u'Repubblica Persone', u'http://www.repubblica.it/rss/persone/rss2.0.xml'),
(u'Repubblica Sport', u'http://www.repubblica.it/rss/sport/rss2.0.xml'),
(u'Repubblica Calcio', u'http://www.repubblica.it/rss/sport/calcio/rss2.0.xml')
] ]

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '10, January 2010'
__description__ = 'Italian weekly magazine'
'''
http://www.panorama.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class panorama(BasicNewsRecipe):
    """Recipe definition for the Italian weekly magazine Panorama."""

    # -- Publication metadata -------------------------------------------
    title = u'Panorama '
    description = 'Italian weekly magazine'
    __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
    publisher = 'Mondadori'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://www.panorama.it/panorama/images/panorama_large.gif'

    # -- Download / conversion settings ---------------------------------
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    recursion = 10
    use_embedded_content = False
    remove_javascript = True

    # -- Tag filters: <div> containers to keep, then elements to strip ----
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': ['post', 'article']}},
    ]
    remove_tags = [
        {'name': ['object', 'link']},
        {'name': 'div', 'attrs': {'class': [
            'post-meta', 'sharing-tools', 'related', 'comments', 'prev-next',
        ]}},
        {'name': 'div', 'attrs': {'id': ['related-posts', 'footer']}},
    ]

    # -- RSS sources as (section title, feed URL) pairs ------------------
    feeds = [
        (u'Panorama Italia',
         u'http://blog.panorama.it/italia/feed'),
        (u'Panorama Mondo',
         u'http://blog.panorama.it/mondo/feed'),
        (u'Panorama Cultura e societa',
         u'http://blog.panorama.it/culturaesocieta/feed'),
        (u'Panorama Hitech e scienza',
         u'http://blog.panorama.it/hitechescienza/feed'),
        (u'Panorama Motori',
         u'http://blog.panorama.it/autoemoto/feed'),
        (u'Panorama libri',
         u'http://blog.panorama.it/libri/feed'),
        (u'Panorama Opinioni',
         u'http://blog.panorama.it/opinioni/feed'),
    ]

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '10, January 2010'
__description__ = 'Italian News Agency'
'''
http://www.quotidianonet.ilsole24ore.com/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class panorama(BasicNewsRecipe):
    """Recipe definition for the Italian news site Quotidiano Net.

    NOTE(review): the class is named ``panorama`` although every attribute
    below describes Quotidiano Net -- presumably a copy/paste leftover. The
    name is kept unchanged so that nothing referring to it breaks.
    """

    # -- Publication metadata -------------------------------------------
    title = u'Quotidiano Net '
    description = 'Italian News Agency'
    __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
    publisher = 'italiaNews'
    category = 'News, politics, culture, economy, general interest'
    language = 'it'
    cover_url = 'http://quotidianonet.ilsole24ore.com/file_generali/img/logo_quotidianonet-top.gif'

    # -- Download / conversion settings ---------------------------------
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    recursion = 10
    use_embedded_content = False
    remove_javascript = True

    # -- Tag filters: the article container to keep, then elements to strip
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'box_contenuto articolo'}},
    ]
    remove_tags = [
        {'name': ['object', 'link']},
        {'name': 'div', 'attrs': {'class': [
            'post-meta', 'sharing-tools', 'related', 'comments', 'prev-next',
            'box_contenuto adsense',
        ]}},
        {'name': 'div', 'attrs': {'id': [
            'strumenti', 'related-posts', 'footer', 'inline_boxes',
            'inline_boxes_header', 'inline_boxes_body', 'bottom',
        ]}},
        {'name': 'span', 'attrs': {'class': 'titolosezione default'}},
    ]

    # -- RSS sources as (section title, feed URL) pairs ------------------
    feeds = [
        (u'Prima pagina',
         u'http://quotidianonet.ilsole24ore.com/rss/home.xml'),
        (u'Cronaca',
         u'http://quotidianonet.ilsole24ore.com/rss/cronaca.xml'),
        (u'Economia',
         u'http://quotidianonet.ilsole24ore.com/rss/economia.xml'),
        (u'Esteri',
         u'http://quotidianonet.ilsole24ore.com/rss/esteri.xml'),
        (u'Politica',
         u'http://quotidianonet.ilsole24ore.com/rss/politica.xml'),
        (u'Salute',
         u'http://quotidianonet.ilsole24ore.com/rss/salute.xml'),
        (u'Tecnologia',
         u'http://quotidianonet.ilsole24ore.com/rss/tecnologia.xml'),
    ]