From 4bc485f6a4d6f745ce2962df5aa20e7b38d20516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:23:56 +0200 Subject: [PATCH 01/12] Fix Liberation news website recipe --- recipes/liberation.recipe | 85 +++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/recipes/liberation.recipe b/recipes/liberation.recipe index 20f12b2d57..7183e26909 100644 --- a/recipes/liberation.recipe +++ b/recipes/liberation.recipe @@ -9,39 +9,72 @@ liberation.fr from calibre.web.feeds.news import BasicNewsRecipe class Liberation(BasicNewsRecipe): + title = u'Liberation' - __author__ = 'Darko Miletic' - description = 'News from France' - language = 'fr' + __author__ = 'calibre' + description = 'Actualités' + category = 'Actualités, France, Monde' + language = 'fr' - oldest_article = 7 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True - html2lrf_options = ['--base-font-size', '10'] + extra_css = ''' + h1, h2, h3 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + p.subtitle {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;} + h4, h5, h2.rubrique, {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .ref, .date, .author, .legende {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .mna-body, entry-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' keep_only_tags = [ - dict(name='h1') - #,dict(name='div', attrs={'class':'object-content text text-item'}) - ,dict(name='div', attrs={'class':'article'}) - #,dict(name='div', attrs={'class':'articleContent'}) - ,dict(name='div', attrs={'class':'entry'}) - ] - remove_tags_after = [ dict(name='div',attrs={'class':'toolbox extra_toolbox'}) ] + dict(name='div', attrs={'class':'article'}) + ,dict(name='div', attrs={'class':'text-article m-bot-s1'}) + ,dict(name='div', attrs={'class':'entry'}) + ,dict(name='div', attrs={'class':'col_contenu'}) + ] + + remove_tags_after = [ + dict(name='div',attrs={'class':['object-content text text-item', 'object-content', 'entry-content', 'col01', 'bloc_article_01']}) + ,dict(name='p',attrs={'class':['chapo']}) + ,dict(id='_twitter_facebook') + ] + remove_tags = [ - dict(name='p', attrs={'class':'clear'}) - ,dict(name='ul', attrs={'class':'floatLeft clear'}) - ,dict(name='div', attrs={'class':'clear floatRight'}) - ,dict(name='object') - ,dict(name='div', attrs={'class':'toolbox'}) - ,dict(name='div', attrs={'class':'cartridge cartridge-basic-bubble cat-zoneabo'}) - #,dict(name='div', attrs={'class':'clear block block-call-items'}) - ,dict(name='div', attrs={'class':'block-content'}) + dict(name='iframe') + ,dict(name='a', attrs={'class':'lnk-comments'}) + ,dict(name='div', attrs={'class':'toolbox'}) + ,dict(name='ul', attrs={'class':'share-box'}) + ,dict(name='ul', attrs={'class':'tool-box'}) + ,dict(name='ul', attrs={'class':'rub'}) + ,dict(name='p',attrs={'class':['chapo']}) + ,dict(name='p',attrs={'class':['tag']}) + ,dict(name='div',attrs={'class':['blokLies']}) + ,dict(name='div',attrs={'class':['alire']}) + ,dict(id='_twitter_facebook') ] feeds = [ - (u'La une', u'http://www.liberation.fr/rss/laune') - ,(u'Monde' , u'http://www.liberation.fr/rss/monde') - ,(u'Sports', u'http://www.liberation.fr/rss/sports') + (u'La une', u'http://rss.liberation.fr/rss/9/') + ,(u'Monde' , u'http://www.liberation.fr/rss/10/') + ,(u'Économie', u'http://www.liberation.fr/rss/13/') + ,(u'Politiques', u'http://www.liberation.fr/rss/11/') + ,(u'Société', u'http://www.liberation.fr/rss/12/') + ,(u'Cinéma', u'http://www.liberation.fr/rss/58/') + ,(u'Écran', u'http://www.liberation.fr/rss/53/') + ,(u'Sports', u'http://www.liberation.fr/rss/12/') ] + + def get_masthead_url(self): + masthead = 'http://s0.libe.com/libe/img/common/logo-liberation-150.png' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From 1bb39ffd77b8371411968c43ceb3669479071822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:24:15 +0200 Subject: [PATCH 02/12] Fix USA Today news website recipe --- recipes/usatoday.recipe | 56 +++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe index 18aeab2648..62c5f1c2da 100644 --- a/recipes/usatoday.recipe +++ b/recipes/usatoday.recipe @@ -10,27 +10,28 @@ from calibre.web.feeds.news import BasicNewsRecipe class USAToday(BasicNewsRecipe): - title = 'USA Today' - __author__ = 'Kovid Goyal' - oldest_article = 1 - publication_type = 'newspaper' - timefmt = '' - max_articles_per_feed = 20 - language = 'en' - no_stylesheets = True - extra_css = '.headline {text-align: left;}\n \ - .byline {font-family: monospace; \ - text-align: left; \ - margin-bottom: 1em;}\n \ - .image {text-align: center;}\n \ - .caption {text-align: center; \ - font-size: smaller; \ - font-style: italic}\n \ - .credit {text-align: right; \ - margin-bottom: 0em; \ - font-size: smaller;}\n \ - .articleBody {text-align: left;}\n ' - #simultaneous_downloads = 1 + title = 'USA Today' + __author__ = 'calibre' + description = 'newspaper' + encoding = 'utf-8' + publisher = 'usatoday.com' + category = 'news, usa' + language = 'en' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1, h2 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + #post-attributes, .info, .clear {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + #post-body, #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + feeds = [ ('Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'), ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'), @@ -43,15 +44,18 @@ class USAToday(BasicNewsRecipe): ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'), ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'), ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'), - ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories'), + ('Offbeat News', 'http://rssfeeds.usatoday.com/UsatodaycomOffbeat-TopStories') ] + keep_only_tags = [dict(attrs={'class':'story'})] + remove_tags = [ dict(attrs={'class':[ 'share', 'reprints', 'inline-h3', - 'info-extras', + 'info-extras rounded', + 'inset', 'ppy-outer', 'ppy-caption', 'comments', @@ -61,9 +65,13 @@ class USAToday(BasicNewsRecipe): 'tags', 'bottom-tools', 'sponsoredlinks', + 'corrections' ]}), + dict(name='ul', attrs={'class':'inside-copy'}), dict(id=['pluck']), - ] + dict(id=['updated']), + dict(id=['post-date-updated']) + ] def get_masthead_url(self): From 086582cb2d167945e875e76f8909c1931915352c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:25:19 +0200 Subject: [PATCH 03/12] Add 20minutes news website recipe --- recipes/20minutes.recipe | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 recipes/20minutes.recipe diff --git a/recipes/20minutes.recipe b/recipes/20minutes.recipe new file mode 100644 index 0000000000..ec9121f2b5 --- /dev/null +++ b/recipes/20minutes.recipe @@ -0,0 +1,70 @@ +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' +''' +20minutes.fr +''' +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class Minutes(BasicNewsRecipe): + + title = '20 minutes' + __author__ = 'calibre' + description = 'Actualités' + encoding = 'cp1252' + publisher = '20minutes.fr' + category = 'Actualités, France, Monde' + language = 'fr' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .mna-details {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .mna-image {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .mna-body {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['mn-section-heading']}), + dict(name='a', attrs={'href':['#commentaires']}), + dict(name='div', attrs={'class':['mn-right']}), + dict(name='div', attrs={'class':['mna-box']}), + dict(name='div', attrs={'class':['mna-comment-call']}), + dict(name='div', attrs={'class':['mna-tools']}), + dict(name='div', attrs={'class':['mn-trilist']}) + ] + + keep_only_tags = [dict(id='mn-article')] + + remove_tags_after = dict(name='div', attrs={'class':['mna-body','mna-signature']}) + + + feeds = [ + ('France', 'http://www.20minutes.fr/rss/actu-france.xml'), + ('International', 'http://www.20minutes.fr/rss/monde.xml'), + ('Tech/Web', 'http://www.20minutes.fr/rss/hightech.xml'), + ('Sciences', 'http://www.20minutes.fr/rss/sciences.xml'), + ('Economie', 'http://www.20minutes.fr/rss/economie.xml'), + ('Politique', 'http://www.20minutes.fr/rss/politique.xml'), + (u'Médias', 'http://www.20minutes.fr/rss/media.xml'), + ('Cinema', 'http://www.20minutes.fr/rss/cinema.xml'), + ('People', 'http://www.20minutes.fr/rss/people.xml'), + ('Culture', 'http://www.20minutes.fr/rss/culture.xml'), + ('Sport', 'http://www.20minutes.fr/rss/sport.xml'), + ('Paris', 'http://www.20minutes.fr/rss/paris.xml'), + ('Lyon', 'http://www.20minutes.fr/rss/lyon.xml'), + ('Toulouse', 'http://www.20minutes.fr/rss/toulouse.xml') + ] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup From 10680496d9b64bde6885f80f809dfe9fa3aedfdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:25:58 +0200 Subject: [PATCH 04/12] Add FrAndroid news website recipe --- recipes/frandroid.recipe | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 recipes/frandroid.recipe diff --git a/recipes/frandroid.recipe b/recipes/frandroid.recipe new file mode 100644 index 0000000000..38d164190b --- /dev/null +++ b/recipes/frandroid.recipe @@ -0,0 +1,7 @@ +class BasicUserRecipe1318572550(AutomaticNewsRecipe): + title = u'FrAndroid' + oldest_article = 2 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'FrAndroid', u'http://feeds.feedburner.com/Frandroid')] From 7f091a5ffe0af5e37dfd8a0175ae7f8cc7dec08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:26:34 +0200 Subject: [PATCH 05/12] Add lepoint news website recipe --- recipes/lepoint.recipe | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 recipes/lepoint.recipe diff --git a/recipes/lepoint.recipe b/recipes/lepoint.recipe new file mode 100644 index 0000000000..2cdc42fa5f --- /dev/null +++ b/recipes/lepoint.recipe @@ -0,0 +1,75 @@ +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' +''' +LePoint.fr +''' +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class lepoint(BasicNewsRecipe): + + title = 'Le Point' + __author__ = 'calibre' + description = 'Actualités' + encoding = 'utf-8' + publisher = 'LePoint.fr' + category = 'news, France, world' + language = 'fr' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .chapo {font-size:xx-small; font-family:Arial,Helvetica,sans-serif;} + .info_article {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .media_article {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .article {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['entete_chroniqueur']}), + dict(name='div', attrs={'class':['col_article']}), + dict(name='div', attrs={'class':['signature_article']}), + dict(name='div', attrs={'class':['util_font util_article']}), + dict(name='div', attrs={'class':['util_article bottom']}) + ] + + keep_only_tags = [dict(name='div', attrs={'class':['page_article']})] + + remove_tags_after = dict(name='div', attrs={'class':['util_article bottom']}) + + feeds = [ + (u'À la une', 'http://www.lepoint.fr/rss.xml'), + ('International', 'http://www.lepoint.fr/monde/rss.xml'), + ('Tech/Web', 'http://www.lepoint.fr/high-tech-internet/rss.xml'), + ('Sciences', 'http://www.lepoint.fr/science/rss.xml'), + ('Economie', 'http://www.lepoint.fr/economie/rss.xml'), + (u'Socièté', 'http://www.lepoint.fr/societe/rss.xml'), + ('Politique', 'http://www.lepoint.fr/politique/rss.xml'), + (u'Médias', 'http://www.lepoint.fr/medias/rss.xml'), + ('Culture', 'http://www.lepoint.fr/culture/rss.xml'), + (u'Santé', 'http://www.lepoint.fr/sante/rss.xml'), + ('Sport', 'http://www.lepoint.fr/sport/rss.xml') + ] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + def get_masthead_url(self): + masthead = 'http://www.lepoint.fr/images/commun/logo.png' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From fcdfe4a44a2edf787e4c776bf0a6af2fac9006fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:27:02 +0200 Subject: [PATCH 06/12] Add lexpress news website recipe --- recipes/lexpress.recipe | 73 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 recipes/lexpress.recipe diff --git a/recipes/lexpress.recipe b/recipes/lexpress.recipe new file mode 100644 index 0000000000..faf6d46b6b --- /dev/null +++ b/recipes/lexpress.recipe @@ -0,0 +1,73 @@ +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' +''' +Lexpress.fr +''' +import re +from calibre.web.feeds.recipes import BasicNewsRecipe + +class lepoint(BasicNewsRecipe): + + title = 'L\'express' + __author__ = 'calibre' + description = 'Actualités' + encoding = 'cp1252' + publisher = 'LExpress.fr' + category = 'Actualité, France, Monde' + language = 'fr' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .current_parent, p.heure, .ouverture {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + #contenu-article {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + .entete { font-weiht:bold;} + ''' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['barre-outil-fb']}), + dict(name='div', attrs={'class':['barre-outils']}), + dict(id='bloc-sommaire'), + dict(id='footer-article') + ] + + keep_only_tags = [dict(name='div', attrs={'class':['bloc-article']})] + + remove_tags_after = dict(id='content-article') + + feeds = [ + (u'À la une', 'http://www.lexpress.fr/rss/alaune.xml'), + ('International', 'http://www.lexpress.fr/rss/monde.xml'), + ('Tech/Web', 'http://www.lexpress.fr/rss/high-tech.xml'), + (u'Sciences/Santé', 'http://www.lexpress.fr/rss/science-et-sante.xml'), + (u'Envronnement', 'http://www.lexpress.fr/rss/environnement.xml'), + ('Economie', 'http://www.lepoint.fr/economie/rss.xml'), + (u'Socièté', 'http://www.lexpress.fr/rss/societe.xml'), + ('Politique', 'http://www.lexpress.fr/rss/politique.xml'), + (u'Médias', 'http://www.lexpress.fr/rss/medias.xml'), + ('Culture', 'http://www.lexpress.fr/rss/culture.xml'), + ('Sport', 'http://www.lexpress.fr/rss/sport.xml') + ] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + def get_masthead_url(self): + masthead = 'http://static.lexpress.fr/imgstat/logo_lexpress.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From 7bee77bbaa430077ca432c3eef4f7bcc32b14aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:27:33 +0200 Subject: [PATCH 07/12] Add zdnet.fr news website recipe --- recipes/zdnet.fr.recipe | 67 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 recipes/zdnet.fr.recipe diff --git a/recipes/zdnet.fr.recipe b/recipes/zdnet.fr.recipe new file mode 100644 index 0000000000..b5c1afe62f --- /dev/null +++ b/recipes/zdnet.fr.recipe @@ -0,0 +1,67 @@ +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' + +''' +Fetch zdnet.fr +''' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class zdnet(BasicNewsRecipe): + + title = 'ZDNet.fr' + __author__ = 'calibre' + description = 'Actualités' + encoding = 'utf-8' + publisher = 'ZDNet.fr' + category = 'Actualité, Informatique, IT' + language = 'fr' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .contentmetadata p {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + #content {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['toolbox']}), + dict(name='div', attrs={'class':['clear clearfix']}), + dict(id='emailtoafriend'), + dict(id='storyaudio'), + dict(id='fbtwContainer'), + dict(name='h5') + ] + + remove_tags_before = dict(id='leftcol') + remove_tags_after = dict(id='content') + + feeds = [ + ('Informatique', 'http://www.zdnet.fr/feeds/rss/actualites/informatique/'), + ('Internet', 'http://www.zdnet.fr/feeds/rss/actualites/internet/'), + ('Telecom', 'http://www.zdnet.fr/feeds/rss/actualites/telecoms/') + ] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + def get_masthead_url(self): + masthead = 'http://www.zdnet.fr/images/base/logo.png' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From 4ca1db0c81c977a1fea4a2cef35714abf5cd933e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:28:07 +0200 Subject: [PATCH 08/12] Add omgubuntu news website recipe --- recipes/omgubuntu.recipe | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 recipes/omgubuntu.recipe diff --git a/recipes/omgubuntu.recipe b/recipes/omgubuntu.recipe new file mode 100644 index 0000000000..c5bf1fecef --- /dev/null +++ b/recipes/omgubuntu.recipe @@ -0,0 +1,17 @@ +class BasicUserRecipe1318619832(AutomaticNewsRecipe): + title = u'OmgUbuntu' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Omg Ubuntu', u'http://feeds.feedburner.com/d0od')] + + def get_masthead_url(self): + masthead = 'http://cdn.omgubuntu.co.uk/wp-content/themes/omgubuntu/images/logo.png' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From 15c5ad672ca004b3f95c9a8ce252056a2f71c28e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:28:39 +0200 Subject: [PATCH 09/12] Add phoronix blog/news website recipe --- recipes/phoronix.recipe | 46 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 recipes/phoronix.recipe diff --git a/recipes/phoronix.recipe b/recipes/phoronix.recipe new file mode 100644 index 0000000000..3d3397d61f --- /dev/null +++ b/recipes/phoronix.recipe @@ -0,0 +1,46 @@ +__license__ = 'GPL v3' +__copyright__ = '2011 Aurélien Chabot ' + +''' +Fetch phoronix.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + + +class cdnet(BasicNewsRecipe): + + title = 'Phoronix' + __author__ = 'calibre' + description = 'Actualités Phoronix' + encoding = 'utf-8' + publisher = 'Phoronix.com' + category = 'news, IT, linux' + language = 'en' + + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 25 + no_stylesheets = True + remove_empty_feeds = True + filterDuplicates = True + + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + h2 {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .KonaBody {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + + remove_tags = [] + + remove_tags_before = dict(id='phxcms_content_phx') + remove_tags_after = dict(name='div', attrs={'class':'KonaBody'}) + + feeds = [('Phoronix', 'http://feeds.feedburner.com/Phoronix')] + + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + From dce91f13a234d1716f61f6f865af8e454bd52ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:29:20 +0200 Subject: [PATCH 10/12] Add Google Mobile blog website recipe --- recipes/googlemobileblog.recipe | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 recipes/googlemobileblog.recipe diff --git a/recipes/googlemobileblog.recipe b/recipes/googlemobileblog.recipe new file mode 100644 index 0000000000..5c897304d7 --- /dev/null +++ b/recipes/googlemobileblog.recipe @@ -0,0 +1,7 @@ +class BasicUserRecipe1318572445(AutomaticNewsRecipe): + title = u'Google Mobile Blog' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Google Mobile Blog', u'http://googlemobile.blogspot.com/atom.xml')] From a24ba3ff592e074de6555ea97c4a8cf28127fd83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 11:29:56 +0200 Subject: [PATCH 11/12] Add korben blog website recipe --- recipes/korben.recipe | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 recipes/korben.recipe diff --git a/recipes/korben.recipe b/recipes/korben.recipe new file mode 100644 index 0000000000..62e50df78b --- /dev/null +++ b/recipes/korben.recipe @@ -0,0 +1,17 @@ +class BasicUserRecipe1318619728(AutomaticNewsRecipe): + title = u'Korben' + oldest_article = 7 + max_articles_per_feed = 100 + auto_cleanup = True + + feeds = [(u'Korben', u'http://feeds2.feedburner.com/KorbensBlog-UpgradeYourMind')] + + def get_masthead_url(self): + masthead = 'http://korben.info/wp-content/themes/korben-steaw/hab/logo.png' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead From 5660b08fbbfa5cb6059e044540c8dacd9a42420a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Chabot?= Date: Sun, 16 Oct 2011 13:39:28 +0200 Subject: [PATCH 12/12] Fix CNN website recipe * Add style * Add CNN logo * Fix problem with image gallery --- recipes/cnn.recipe | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/recipes/cnn.recipe b/recipes/cnn.recipe index 096c370706..6043f8b401 100644 --- a/recipes/cnn.recipe +++ b/recipes/cnn.recipe @@ -22,6 +22,14 @@ class CNN(BasicNewsRecipe): #match_regexps = [r'http://sportsillustrated.cnn.com/.*/[1-9].html'] max_articles_per_feed = 25 + extra_css = ''' + h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;} + .cnn_story_author, .cnn_stryathrtmp {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .cnn_strycaptiontxt, .cnnArticleGalleryPhotoContainer {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .cnn_strycbftrtxt, .cnnEditorialNote {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;} + .cnn_strycntntlft {font-size:medium; font-family:Arial,Helvetica,sans-serif;} + ''' + preprocess_regexps = [ (re.compile(r'', re.DOTALL), lambda m: ''), (re.compile(r'', re.DOTALL), lambda m: ''), @@ -32,7 +40,12 @@ class CNN(BasicNewsRecipe): remove_tags = [ {'class':['cnn_strybtntools', 'cnn_strylftcntnt', 'cnn_strybtntools', 'cnn_strybtntoolsbttm', 'cnn_strybtmcntnt', - 'cnn_strycntntrgt', 'hed_side', 'foot']}, + 'cnn_strycntntrgt', 'hed_side', 'foot', 'cnn_strylftcntnt cnn_strylftcexpbx']}, + {'class':['cnn_html_media_title_new', 'cnn_html_media_title_new cnn_html_media_title_none', + 'cnnArticleGalleryCaptionControlText', 'articleGalleryNavContainer']}, + {'id':['articleGalleryNav00JumpPrev', 'articleGalleryNav00Prev', + 'articleGalleryNav00Next', 'articleGalleryNav00JumpNext']}, + {'style':['display:none']}, dict(id=['ie_column']), ] @@ -58,3 +71,12 @@ class CNN(BasicNewsRecipe): ans = BasicNewsRecipe.get_article_url(self, article) return ans.partition('?')[0] + def get_masthead_url(self): + masthead = 'http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif' + br = BasicNewsRecipe.get_browser() + try: + br.open(masthead) + except: + self.log("\nCover unavailable") + masthead = None + return masthead