Various French news sources by Aurelien Chabot. Update USA Today, CNN, Liberation

This commit is contained in:
Kovid Goyal 2011-10-17 04:10:06 +05:30
commit 2c6098794b
12 changed files with 502 additions and 51 deletions

71
recipes/20minutes.recipe Normal file
View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
20minutes.fr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class Minutes(BasicNewsRecipe):
    '''
    Recipe for 20minutes.fr, built from the site's per-section RSS feeds.
    '''

    # -- Publication metadata
    title = '20 minutes'
    __author__ = 'calibre'
    publisher = '20minutes.fr'
    description = 'Actualités'
    category = 'Actualités, France, Monde'
    language = 'fr'
    encoding = 'cp1252'
    timefmt = ' [%d %b %Y]'

    # -- Download behaviour
    use_embedded_content = False
    max_articles_per_feed = 15
    remove_empty_feeds = True
    filterDuplicates = True
    no_stylesheets = True

    # -- Styling applied to the kept article markup
    extra_css = (
        '\n'
        'h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}\n'
        '.mna-details {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '.mna-image {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '.mna-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}\n'
    )

    # -- Page clean-up: keep the article container, drop the listed elements
    keep_only_tags = [{'id': 'mn-article'}]
    remove_tags_after = {
        'name': 'div',
        'attrs': {'class': ['mna-body', 'mna-signature']},
    }
    remove_tags = [
        {'name': 'iframe'},
        {'name': 'div', 'attrs': {'class': ['mn-section-heading']}},
        {'name': 'a', 'attrs': {'href': ['#commentaires']}},
        {'name': 'div', 'attrs': {'class': ['mn-right']}},
        {'name': 'div', 'attrs': {'class': ['mna-box']}},
        {'name': 'div', 'attrs': {'class': ['mna-comment-call']}},
        {'name': 'div', 'attrs': {'class': ['mna-tools']}},
        {'name': 'div', 'attrs': {'class': ['mn-trilist']}},
    ]

    # -- (section title, RSS URL) pairs
    feeds = [
        ('France', 'http://www.20minutes.fr/rss/actu-france.xml'),
        ('International', 'http://www.20minutes.fr/rss/monde.xml'),
        ('Tech/Web', 'http://www.20minutes.fr/rss/hightech.xml'),
        ('Sciences', 'http://www.20minutes.fr/rss/sciences.xml'),
        ('Economie', 'http://www.20minutes.fr/rss/economie.xml'),
        ('Politique', 'http://www.20minutes.fr/rss/politique.xml'),
        (u'Médias', 'http://www.20minutes.fr/rss/media.xml'),
        ('Cinema', 'http://www.20minutes.fr/rss/cinema.xml'),
        ('People', 'http://www.20minutes.fr/rss/people.xml'),
        ('Culture', 'http://www.20minutes.fr/rss/culture.xml'),
        ('Sport', 'http://www.20minutes.fr/rss/sport.xml'),
        ('Paris', 'http://www.20minutes.fr/rss/paris.xml'),
        ('Lyon', 'http://www.20minutes.fr/rss/lyon.xml'),
        ('Toulouse', 'http://www.20minutes.fr/rss/toulouse.xml'),
    ]

    def preprocess_html(self, soup):
        '''
        Remove the inline ``style`` attribute from every tag that carries one
        and return the same soup object.
        '''
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        return soup

View File

@ -22,6 +22,14 @@ class CNN(BasicNewsRecipe):
#match_regexps = [r'http://sportsillustrated.cnn.com/.*/[1-9].html']
max_articles_per_feed = 25
extra_css = '''
h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
.cnn_story_author, .cnn_stryathrtmp {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycaptiontxt, .cnnArticleGalleryPhotoContainer {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycbftrtxt, .cnnEditorialNote {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycntntlft {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
preprocess_regexps = [
(re.compile(r'<!--\[if.*if\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
@ -32,7 +40,12 @@ class CNN(BasicNewsRecipe):
remove_tags = [
{'class':['cnn_strybtntools', 'cnn_strylftcntnt',
'cnn_strybtntools', 'cnn_strybtntoolsbttm', 'cnn_strybtmcntnt',
'cnn_strycntntrgt', 'hed_side', 'foot']},
'cnn_strycntntrgt', 'hed_side', 'foot', 'cnn_strylftcntnt cnn_strylftcexpbx']},
{'class':['cnn_html_media_title_new', 'cnn_html_media_title_new cnn_html_media_title_none',
'cnnArticleGalleryCaptionControlText', 'articleGalleryNavContainer']},
{'id':['articleGalleryNav00JumpPrev', 'articleGalleryNav00Prev',
'articleGalleryNav00Next', 'articleGalleryNav00JumpNext']},
{'style':['display:none']},
dict(id=['ie_column']),
]
@ -58,3 +71,12 @@ class CNN(BasicNewsRecipe):
ans = BasicNewsRecipe.get_article_url(self, article)
return ans.partition('?')[0]
def get_masthead_url(self):
    '''
    Return the masthead image URL, or None when the image cannot be opened.

    The URL is opened once with the recipe's browser; if that raises, a
    message is logged and None is returned instead of the URL.
    '''
    masthead = 'http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif'
    # Ask the instance for its browser: calling get_browser() on the class
    # without an instance only works if it is a class/static method.
    br = self.get_browser()
    try:
        br.open(masthead)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        self.log("\nCover unavailable")
        masthead = None
    return masthead

8
recipes/frandroid.recipe Normal file
View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
class BasicUserRecipe1318572550(AutomaticNewsRecipe):
    '''
    Recipe for the FrAndroid feed, relying on automatic article clean-up.
    '''

    title = u'FrAndroid'

    # Single (title, URL) feed entry.
    feeds = [
        (u'FrAndroid', u'http://feeds.feedburner.com/Frandroid'),
    ]

    # Article selection limits.
    max_articles_per_feed = 100
    oldest_article = 2

    auto_cleanup = True

View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
class BasicUserRecipe1318572445(AutomaticNewsRecipe):
    '''
    Recipe for the Google Mobile Blog Atom feed, relying on automatic
    article clean-up.
    '''

    title = u'Google Mobile Blog'

    # Single (title, URL) feed entry.
    feeds = [
        (u'Google Mobile Blog', u'http://googlemobile.blogspot.com/atom.xml'),
    ]

    # Article selection limits.
    max_articles_per_feed = 100
    oldest_article = 7

    auto_cleanup = True

18
recipes/korben.recipe Normal file
View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
class BasicUserRecipe1318619728(AutomaticNewsRecipe):
    '''
    Recipe for the Korben blog feed, relying on automatic article clean-up.
    '''

    title = u'Korben'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # Single (title, URL) feed entry.
    feeds = [(u'Korben', u'http://feeds2.feedburner.com/KorbensBlog-UpgradeYourMind')]

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None when the image cannot be opened.

        The URL is opened once with the recipe's browser; if that raises, a
        message is logged and None is returned instead of the URL.
        '''
        masthead = 'http://korben.info/wp-content/themes/korben-steaw/hab/logo.png'
        # Ask the instance for its browser: calling get_browser() on the class
        # without an instance only works if it is a class/static method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.log("\nCover unavailable")
            masthead = None
        return masthead

76
recipes/lepoint.recipe Normal file
View File

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
LePoint.fr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class lepoint(BasicNewsRecipe):
    '''
    Recipe for LePoint.fr, built from the site's per-section RSS feeds.
    '''

    # -- Publication metadata
    title = 'Le Point'
    __author__ = 'calibre'
    description = 'Actualités'
    encoding = 'utf-8'
    publisher = 'LePoint.fr'
    category = 'news, France, world'
    language = 'fr'

    # -- Download behaviour
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 15
    no_stylesheets = True
    remove_empty_feeds = True
    filterDuplicates = True

    # -- Styling applied to the kept article markup
    extra_css = (
        '\n'
        'h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}\n'
        '.chapo {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}\n'
        '.info_article {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '.media_article {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '.article {font-size:medium; font-family:Arial,Helvetica,sans-serif;}\n'
    )

    # -- Page clean-up: keep the article container, drop the listed elements
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': ['entete_chroniqueur']}),
        dict(name='div', attrs={'class': ['col_article']}),
        dict(name='div', attrs={'class': ['signature_article']}),
        dict(name='div', attrs={'class': ['util_font util_article']}),
        dict(name='div', attrs={'class': ['util_article bottom']}),
    ]
    keep_only_tags = [dict(name='div', attrs={'class': ['page_article']})]
    remove_tags_after = dict(name='div', attrs={'class': ['util_article bottom']})

    # -- (section title, RSS URL) pairs
    feeds = [
        (u'À la une', 'http://www.lepoint.fr/rss.xml'),
        ('International', 'http://www.lepoint.fr/monde/rss.xml'),
        ('Tech/Web', 'http://www.lepoint.fr/high-tech-internet/rss.xml'),
        ('Sciences', 'http://www.lepoint.fr/science/rss.xml'),
        ('Economie', 'http://www.lepoint.fr/economie/rss.xml'),
        # Section title spelling corrected (was 'Socièté').
        (u'Société', 'http://www.lepoint.fr/societe/rss.xml'),
        ('Politique', 'http://www.lepoint.fr/politique/rss.xml'),
        (u'Médias', 'http://www.lepoint.fr/medias/rss.xml'),
        ('Culture', 'http://www.lepoint.fr/culture/rss.xml'),
        (u'Santé', 'http://www.lepoint.fr/sante/rss.xml'),
        ('Sport', 'http://www.lepoint.fr/sport/rss.xml'),
    ]

    def preprocess_html(self, soup):
        '''
        Remove the inline ``style`` attribute from every tag that carries one
        and return the same soup object.
        '''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None when the image cannot be opened.

        The URL is opened once with the recipe's browser; if that raises, a
        message is logged and None is returned instead of the URL.
        '''
        masthead = 'http://www.lepoint.fr/images/commun/logo.png'
        # Ask the instance for its browser: calling get_browser() on the class
        # without an instance only works if it is a class/static method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.log("\nCover unavailable")
            masthead = None
        return masthead

74
recipes/lexpress.recipe Normal file
View File

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
Lexpress.fr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class lepoint(BasicNewsRecipe):
    '''
    Recipe for lexpress.fr, built from the site's per-section RSS feeds.
    '''
    # NOTE(review): the class is named ``lepoint`` although it targets
    # L'Express; the name is left unchanged so the recipe's public name stays
    # the same.

    # -- Publication metadata
    title = 'L\'express'
    __author__ = 'calibre'
    description = 'Actualités'
    encoding = 'cp1252'
    publisher = 'LExpress.fr'
    category = 'Actualité, France, Monde'
    language = 'fr'

    # -- Download behaviour
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 15
    no_stylesheets = True
    remove_empty_feeds = True
    filterDuplicates = True

    # -- Styling applied to the kept article markup
    # The last rule used the misspelled property ``font-weiht``; it is
    # corrected to ``font-weight``.
    extra_css = (
        '\n'
        'h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}\n'
        '.current_parent, p.heure, .ouverture {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '#contenu-article {font-size:medium; font-family:Arial,Helvetica,sans-serif;}\n'
        '.entete { font-weight:bold;}\n'
    )

    # -- Page clean-up: keep the article container, drop the listed elements
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': ['barre-outil-fb']}),
        dict(name='div', attrs={'class': ['barre-outils']}),
        dict(id='bloc-sommaire'),
        dict(id='footer-article'),
    ]
    keep_only_tags = [dict(name='div', attrs={'class': ['bloc-article']})]
    remove_tags_after = dict(id='content-article')

    # -- (section title, RSS URL) pairs
    feeds = [
        (u'À la une', 'http://www.lexpress.fr/rss/alaune.xml'),
        ('International', 'http://www.lexpress.fr/rss/monde.xml'),
        ('Tech/Web', 'http://www.lexpress.fr/rss/high-tech.xml'),
        (u'Sciences/Santé', 'http://www.lexpress.fr/rss/science-et-sante.xml'),
        # Section title spelling corrected (was 'Envronnement').
        (u'Environnement', 'http://www.lexpress.fr/rss/environnement.xml'),
        # This entry pointed at lepoint.fr, whose pages do not belong to this
        # recipe. NOTE(review): the replacement follows the /rss/<section>.xml
        # pattern of the other entries -- confirm that the URL resolves.
        ('Economie', 'http://www.lexpress.fr/rss/economie.xml'),
        # Section title spelling corrected (was 'Socièté').
        (u'Société', 'http://www.lexpress.fr/rss/societe.xml'),
        ('Politique', 'http://www.lexpress.fr/rss/politique.xml'),
        (u'Médias', 'http://www.lexpress.fr/rss/medias.xml'),
        ('Culture', 'http://www.lexpress.fr/rss/culture.xml'),
        ('Sport', 'http://www.lexpress.fr/rss/sport.xml'),
    ]

    def preprocess_html(self, soup):
        '''
        Remove the inline ``style`` attribute from every tag that carries one
        and return the same soup object.
        '''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None when the image cannot be opened.

        The URL is opened once with the recipe's browser; if that raises, a
        message is logged and None is returned instead of the URL.
        '''
        masthead = 'http://static.lexpress.fr/imgstat/logo_lexpress.gif'
        # Ask the instance for its browser: calling get_browser() on the class
        # without an instance only works if it is a class/static method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.log("\nCover unavailable")
            masthead = None
        return masthead

View File

@ -9,39 +9,72 @@ liberation.fr
from calibre.web.feeds.news import BasicNewsRecipe
class Liberation(BasicNewsRecipe):
title = u'Liberation'
__author__ = 'Darko Miletic'
description = 'News from France'
language = 'fr'
__author__ = 'calibre'
description = 'Actualités'
category = 'Actualités, France, Monde'
language = 'fr'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 15
no_stylesheets = True
remove_empty_feeds = True
filterDuplicates = True
html2lrf_options = ['--base-font-size', '10']
extra_css = '''
h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;}
h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
keep_only_tags = [
dict(name='h1')
#,dict(name='div', attrs={'class':'object-content text text-item'})
,dict(name='div', attrs={'class':'article'})
#,dict(name='div', attrs={'class':'articleContent'})
,dict(name='div', attrs={'class':'entry'})
]
remove_tags_after = [ dict(name='div',attrs={'class':'toolbox extra_toolbox'}) ]
dict(name='div', attrs={'class':'article'})
,dict(name='div', attrs={'class':'text-article m-bot-s1'})
,dict(name='div', attrs={'class':'entry'})
,dict(name='div', attrs={'class':'col_contenu'})
]
remove_tags_after = [
dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']})
,dict(name='p',attrs={'class':['chapo']})
,dict(id='_twitter_facebook')
]
remove_tags = [
dict(name='p', attrs={'class':'clear'})
,dict(name='ul', attrs={'class':'floatLeft clear'})
,dict(name='div', attrs={'class':'clear floatRight'})
,dict(name='object')
,dict(name='div', attrs={'class':'toolbox'})
,dict(name='div', attrs={'class':'cartridge cartridge-basic-bubble cat-zoneabo'})
#,dict(name='div', attrs={'class':'clear block block-call-items'})
,dict(name='div', attrs={'class':'block-content'})
dict(name='iframe')
,dict(name='a', attrs={'class':'lnk-comments'})
,dict(name='div', attrs={'class':'toolbox'})
,dict(name='ul', attrs={'class':'share-box'})
,dict(name='ul', attrs={'class':'tool-box'})
,dict(name='ul', attrs={'class':'rub'})
,dict(name='p',attrs={'class':['chapo']})
,dict(name='p',attrs={'class':['tag']})
,dict(name='div',attrs={'class':['blokLies']})
,dict(name='div',attrs={'class':['alire']})
,dict(id='_twitter_facebook')
]
feeds = [
(u'La une', u'http://www.liberation.fr/rss/laune')
,(u'Monde' , u'http://www.liberation.fr/rss/monde')
,(u'Sports', u'http://www.liberation.fr/rss/sports')
(u'La une', u'http://rss.liberation.fr/rss/9/')
,(u'Monde' , u'http://www.liberation.fr/rss/10/')
,(u'Économie', u'http://www.liberation.fr/rss/13/')
,(u'Politiques', u'http://www.liberation.fr/rss/11/')
,(u'Société', u'http://www.liberation.fr/rss/12/')
,(u'Cinéma', u'http://www.liberation.fr/rss/58/')
,(u'Écran', u'http://www.liberation.fr/rss/53/')
,(u'Sports', u'http://www.liberation.fr/rss/12/')
]
def get_masthead_url(self):
    '''
    Return the masthead image URL, or None when the image cannot be opened.

    The URL is opened once with the recipe's browser; if that raises, a
    message is logged and None is returned instead of the URL.
    '''
    masthead = 'http://s0.libe.com/libe/img/common/logo-liberation-150.png'
    # Ask the instance for its browser: calling get_browser() on the class
    # without an instance only works if it is a class/static method.
    br = self.get_browser()
    try:
        br.open(masthead)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        self.log("\nCover unavailable")
        masthead = None
    return masthead

18
recipes/omgubuntu.recipe Normal file
View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
class BasicUserRecipe1318619832(AutomaticNewsRecipe):
    '''
    Recipe for the OMG! Ubuntu feed, relying on automatic article clean-up.
    '''

    title = u'OmgUbuntu'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # Single (title, URL) feed entry.
    feeds = [(u'Omg Ubuntu', u'http://feeds.feedburner.com/d0od')]

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None when the image cannot be opened.

        The URL is opened once with the recipe's browser; if that raises, a
        message is logged and None is returned instead of the URL.
        '''
        masthead = 'http://cdn.omgubuntu.co.uk/wp-content/themes/omgubuntu/images/logo.png'
        # Ask the instance for its browser: calling get_browser() on the class
        # without an instance only works if it is a class/static method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.log("\nCover unavailable")
            masthead = None
        return masthead

47
recipes/phoronix.recipe Normal file
View File

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
Fetch phoronix.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class cdnet(BasicNewsRecipe):
    '''
    Recipe for phoronix.com, built from the site's FeedBurner feed.
    '''

    # -- Publication metadata
    title = 'Phoronix'
    __author__ = 'calibre'
    publisher = 'Phoronix.com'
    description = 'Actualités Phoronix'
    category = 'news, IT, linux'
    language = 'en'
    encoding = 'utf-8'
    timefmt = ' [%d %b %Y]'

    # -- Download behaviour
    use_embedded_content = False
    max_articles_per_feed = 25
    remove_empty_feeds = True
    filterDuplicates = True
    no_stylesheets = True

    # -- Styling applied to the kept article markup
    extra_css = (
        '\n'
        'h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}\n'
        'h2 {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '.KonaBody {font-size:medium; font-family:Arial,Helvetica,sans-serif;}\n'
    )

    # -- Page clean-up: trim before/after the content markers, remove nothing else
    remove_tags_before = {'id': 'phxcms_content_phx'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'KonaBody'}}
    remove_tags = []

    # -- Single (title, URL) feed entry
    feeds = [
        ('Phoronix', 'http://feeds.feedburner.com/Phoronix'),
    ]

    def preprocess_html(self, soup):
        '''
        Remove the inline ``style`` attribute from every tag that carries one
        and return the same soup object.
        '''
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        return soup

View File

@ -10,27 +10,28 @@ from calibre.web.feeds.news import BasicNewsRecipe
class USAToday(BasicNewsRecipe):
title = 'USA Today'
__author__ = 'Kovid Goyal'
oldest_article = 1
publication_type = 'newspaper'
timefmt = ''
max_articles_per_feed = 20
language = 'en'
no_stylesheets = True
extra_css = '.headline {text-align: left;}\n \
.byline {font-family: monospace; \
text-align: left; \
margin-bottom: 1em;}\n \
.image {text-align: center;}\n \
.caption {text-align: center; \
font-size: smaller; \
font-style: italic}\n \
.credit {text-align: right; \
margin-bottom: 0em; \
font-size: smaller;}\n \
.articleBody {text-align: left;}\n '
#simultaneous_downloads = 1
title = 'USA Today'
__author__ = 'calibre'
description = 'newspaper'
encoding = 'utf-8'
publisher = 'usatoday.com'
category = 'news, usa'
language = 'en'
use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 15
no_stylesheets = True
remove_empty_feeds = True
filterDuplicates = True
extra_css = '''
h1, h2 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
#post-attributes, .info, .clear {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
#post-body, #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
@ -43,15 +44,18 @@ class USAToday(BasicNewsRecipe):
('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'),
('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories')
]
keep_only_tags = [dict(attrs={'class':'story'})]
remove_tags = [
dict(attrs={'class':[
'share',
'reprints',
'inline-h3',
'info-extras',
'info-extras rounded',
'inset',
'ppy-outer',
'ppy-caption',
'comments',
@ -61,9 +65,13 @@ class USAToday(BasicNewsRecipe):
'tags',
'bottom-tools',
'sponsoredlinks',
'corrections'
]}),
dict(name='ul', attrs={'class':'inside-copy'}),
dict(id=['pluck']),
]
dict(id=['updated']),
dict(id=['post-date-updated'])
]
def get_masthead_url(self):

68
recipes/zdnet.fr.recipe Normal file
View File

@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
Fetch zdnet.fr
'''
from calibre.web.feeds.news import BasicNewsRecipe
class zdnet(BasicNewsRecipe):
    '''
    Recipe for ZDNet.fr, built from the site's news RSS feeds.
    '''

    # -- Publication metadata
    title = 'ZDNet.fr'
    __author__ = 'calibre'
    description = 'Actualités'
    encoding = 'utf-8'
    publisher = 'ZDNet.fr'
    category = 'Actualité, Informatique, IT'
    language = 'fr'

    # -- Download behaviour
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 15
    no_stylesheets = True
    remove_empty_feeds = True
    filterDuplicates = True

    # -- Styling applied to the kept article markup
    extra_css = (
        '\n'
        'h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}\n'
        '.contentmetadata p {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}\n'
        '#content {font-size:medium; font-family:Arial,Helvetica,sans-serif;}\n'
    )

    # -- Page clean-up: trim before/after the content markers, drop the listed elements
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': ['toolbox']}),
        dict(name='div', attrs={'class': ['clear clearfix']}),
        dict(id='emailtoafriend'),
        dict(id='storyaudio'),
        dict(id='fbtwContainer'),
        dict(name='h5'),
    ]
    remove_tags_before = dict(id='leftcol')
    remove_tags_after = dict(id='content')

    # -- (section title, RSS URL) pairs
    feeds = [
        ('Informatique', 'http://www.zdnet.fr/feeds/rss/actualites/informatique/'),
        ('Internet', 'http://www.zdnet.fr/feeds/rss/actualites/internet/'),
        ('Telecom', 'http://www.zdnet.fr/feeds/rss/actualites/telecoms/'),
    ]

    def preprocess_html(self, soup):
        '''
        Remove the inline ``style`` attribute from every tag that carries one
        and return the same soup object.
        '''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None when the image cannot be opened.

        The URL is opened once with the recipe's browser; if that raises, a
        message is logged and None is returned instead of the URL.
        '''
        masthead = 'http://www.zdnet.fr/images/base/logo.png'
        # Ask the instance for its browser: calling get_browser() on the class
        # without an instance only works if it is a class/static method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.log("\nCover unavailable")
            masthead = None
        return masthead