diff --git a/recipes/10minutos.recipe b/recipes/10minutos.recipe index 77903c363f..fcb4ba642b 100644 --- a/recipes/10minutos.recipe +++ b/recipes/10minutos.recipe @@ -1,16 +1,16 @@ #!/usr/bin/env python2 ## -## Title: Diario 10minutos.com.uy News and Sports Calibre Recipe -## Contact: Carlos Alves - +# Title: Diario 10minutos.com.uy News and Sports Calibre Recipe +# Contact: Carlos Alves - ## -## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html -## Copyright: Carlos Alves - +# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +# Copyright: Carlos Alves - ## -## Written: September 2013 -## Last Edited: 2016-01-11 +# Written: September 2013 +# Last Edited: 2016-01-11 ## -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = '2016, Carlos Alves ' ''' 10minutos.com.uy @@ -18,29 +18,30 @@ __author__ = '2016, Carlos Alves ' from calibre.web.feeds.news import BasicNewsRecipe + class General(BasicNewsRecipe): - title = '10minutos' - __author__ = 'Carlos Alves' - description = 'Noticias de Salto - Uruguay' + title = '10minutos' + __author__ = 'Carlos Alves' + description = 'Noticias de Salto - Uruguay' tags = 'news, sports' - language = 'es_UY' - timefmt = '[%a, %d %b, %Y]' - use_embedded_content = False - recursion = 5 + language = 'es_UY' + timefmt = '[%a, %d %b, %Y]' + use_embedded_content = False + recursion = 5 encoding = 'utf8' remove_javascript = True no_stylesheets = True - oldest_article = 2 + oldest_article = 2 max_articles_per_feed = 100 - keep_only_tags = [dict(name='div', attrs={'class':'post-content'})] + keep_only_tags = [dict(name='div', attrs={'class': 'post-content'})] remove_tags = [ - dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}), - dict(name='div', attrs={'class':'sharedaddy sd-sharing-enabled'}), - dict(name='p', attrs={'class':'post-meta'}), - dict(name=['object','link']) - ] + dict(name='div', attrs={'class': ['hr', 'titlebar', 'navigation']}), + dict(name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'}), + dict(name='p', attrs={'class': 'post-meta'}), + dict(name=['object', 'link']) + ] extra_css = ''' h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;} @@ -49,8 +50,8 @@ class General(BasicNewsRecipe): p {font-family:Arial,Helvetica,sans-serif;} ''' feeds = [ - (u'Articulos', u'http://10minutos.com.uy/?feed=rss2') - ] + (u'Articulos', u'http://10minutos.com.uy/?feed=rss2') + ] def get_cover_url(self): return 'http://10minutos.com.uy/a/img/logo.png' diff --git a/recipes/180.recipe b/recipes/180.recipe index 4be825a81e..a579a165ad 100644 --- a/recipes/180.recipe +++ b/recipes/180.recipe @@ -1,9 +1,9 @@ #!/usr/bin/env python2 ## -## Last Edited: 2016-01-11 Carlos Alves +# Last Edited: 2016-01-11 Carlos Alves ## -__license__ = 'GPL v3' +__license__ = 'GPL v3' __author__ = '2010, Gustavo Azambuja ' ''' 180.com.uy @@ -11,31 +11,32 @@ __author__ = '2010, Gustavo Azambuja ' from calibre.web.feeds.news import BasicNewsRecipe + class Noticias(BasicNewsRecipe): - title = '180.com.uy' - __author__ = 'Gustavo Azambuja' - description = 'Noticias de Uruguay' - language = 'es_UY' - timefmt = '[%a, %d %b, %Y]' - use_embedded_content = False - recursion = 5 + title = '180.com.uy' + __author__ = 'Gustavo Azambuja' + description = 'Noticias de Uruguay' + language = 'es_UY' + timefmt = '[%a, %d %b, %Y]' + use_embedded_content = False + recursion = 5 encoding = 'utf-8' remove_javascript = True no_stylesheets = True - oldest_article = 2 + oldest_article = 2 max_articles_per_feed = 100 - remove_tags_after = dict(name='article') + remove_tags_after = dict(name='article') keep_only_tags = [ - dict(name='h3', attrs={'class':'title'}), - dict(name='div', attrs={'class':'copete'}), - dict(name='article', attrs={'class':'texto'}) - ] + dict(name='h3', attrs={'class': 'title'}), + dict(name='div', attrs={'class': 'copete'}), + dict(name='article', attrs={'class': 'texto'}) + ] remove_tags = [ - dict(name=['object','link']) - ] + dict(name=['object', 'link']) + ] - remove_attributes = ['width','height', 'style', 'font', 'color'] + remove_attributes = ['width', 'height', 'style', 'font', 'color'] extra_css = ''' h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;} @@ -44,15 +45,13 @@ class Noticias(BasicNewsRecipe): p {font-family:Arial,Helvetica,sans-serif;} ''' feeds = [ - (u'Titulares', u'http://www.180.com.uy/feed.php') - ] + (u'Titulares', u'http://www.180.com.uy/feed.php') + ] def get_cover_url(self): - pass - + pass def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] return soup - diff --git a/recipes/1843.recipe b/recipes/1843.recipe index 037eedac86..199f576948 100644 --- a/recipes/1843.recipe +++ b/recipes/1843.recipe @@ -22,7 +22,7 @@ class E1843(BasicNewsRecipe): encoding = 'utf-8' keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}), + dict(name='h1', attrs={'class': 'title'}), classes('field-name-field-rubric-summary article-header__overlay-main-image meta-info__author article__body'), ] @@ -54,7 +54,8 @@ class E1843(BasicNewsRecipe): r = div.find(**classes('article-rubric')) if r is not None: desc = self.tag_to_string(r) - articles.append({'title':title, 'url':url, 'description':desc}) + articles.append( + {'title': title, 'url': url, 'description': desc}) if current_section and articles: ans.append((current_section, articles)) diff --git a/recipes/20_minutos.recipe b/recipes/20_minutos.recipe index 106c0dcffa..208fbc7401 100644 --- a/recipes/20_minutos.recipe +++ b/recipes/20_minutos.recipe @@ -1,8 +1,8 @@ -__license__ = 'GPL v3' -__author__ = 'Luis Hernandez' +__license__ = 'GPL v3' +__author__ = 'Luis Hernandez' __copyright__ = 'Luis Hernandez' -__version__ = 'v0.85' -__date__ = '31 January 2011' +__version__ = 'v0.85' +__date__ = '31 January 2011' ''' www.20minutos.es @@ -10,46 +10,39 @@ www.20minutos.es import re from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1294946868(BasicNewsRecipe): - title = u'20 Minutos new' - publisher = u'Grupo 20 Minutos' + title = u'20 Minutos new' + publisher = u'Grupo 20 Minutos' - __author__ = 'Luis Hernandez' - description = 'Free spanish newspaper' - cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif' + __author__ = 'Luis Hernandez' + description = 'Free spanish newspaper' + cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif' oldest_article = 2 max_articles_per_feed = 100 remove_javascript = True - no_stylesheets = True - use_embedded_content = False + no_stylesheets = True + use_embedded_content = False - encoding = 'ISO-8859-1' - language = 'es' - timefmt = '[%a, %d %b, %Y]' - remove_empty_feeds = True + encoding = 'ISO-8859-1' + language = 'es' + timefmt = '[%a, %d %b, %Y]' + remove_empty_feeds = True - keep_only_tags = [ - dict(name='div', attrs={'id':['content','vinetas',]}) - ,dict(name='div', attrs={'class':['boxed','description','lead','article-content','cuerpo estirar']}) - ,dict(name='span', attrs={'class':['photo-bar']}) - ,dict(name='ul', attrs={'class':['article-author']}) - ] + keep_only_tags = [ + dict(name='div', attrs={'id': ['content', 'vinetas', ]}), dict(name='div', attrs={'class': ['boxed', 'description', 'lead', 'article-content', 'cuerpo estirar']}), dict(name='span', attrs={'class': ['photo-bar']}), dict(name='ul', attrs={'class': ['article-author']}) # noqa + ] - remove_tags_before = dict(name='ul' , attrs={'class':['servicios-sub']}) - remove_tags_after = dict(name='div' , attrs={'class':['related-news','col']}) + remove_tags_before = dict(name='ul', attrs={'class': ['servicios-sub']}) + remove_tags_after = dict( + name='div', attrs={'class': ['related-news', 'col']}) remove_tags = [ - dict(name='ol', attrs={'class':['navigation',]}) - ,dict(name='span', attrs={'class':['action']}) - ,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']}) - ,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']}) - ,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']}) - ,dict(name='ul', attrs={'id':['site-links']}) - ,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']}) - ] + dict(name='ol', attrs={'class': ['navigation', ]}), dict(name='span', attrs={'class': ['action']}), dict(name='div', attrs={'class': ['twitter comments-list hidden', 'related-news', 'col', 'photo-gallery', 'photo-gallery side-art-block', 'calendario', 'article-comment', 'postto estirar', 'otras_vinetas estirar', 'kment', 'user-actions']}), dict( name='div', attrs={'id': ['twitter-destacados', 'eco-tabs', 'inner', 'vineta_calendario', 'vinetistas clearfix', 'otras_vinetas estirar', 'MIN1', 'main', 'SUP1', 'INT']}), dict(name='ul', attrs={'class': ['article-user-actions', 'stripped-list']}), dict(name='ul', attrs={'id': ['site-links']}), dict(name='li', attrs={'class': ['puntuacion', 'enviar', 'compartir']}) # noqa + ] extra_css = """ p{text-align: justify; font-size: 100%} @@ -57,23 +50,25 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe): h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; } """ - preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] + preprocess_regexps = [(re.compile( + r'', re.DOTALL), lambda m: '')] feeds = [ - (u'Portada' , u'http://www.20minutos.es/rss/') - ,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/') - ,(u'Internacional' , u'http://www.20minutos.es/rss/internacional/') - ,(u'Economia' , u'http://www.20minutos.es/rss/economia/') - ,(u'Deportes' , u'http://www.20minutos.es/rss/deportes/') - ,(u'Tecnologia' , u'http://www.20minutos.es/rss/tecnologia/') - ,(u'Gente - TV' , u'http://www.20minutos.es/rss/gente-television/') - ,(u'Motor' , u'http://www.20minutos.es/rss/motor/') - ,(u'Salud' , u'http://www.20minutos.es/rss/belleza-y-salud/') - ,(u'Viajes' , u'http://www.20minutos.es/rss/viajes/') - ,(u'Vivienda' , u'http://www.20minutos.es/rss/vivienda/') - ,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/') - ,(u'Cine' , u'http://www.20minutos.es/rss/cine/') - ,(u'Musica' , u'http://www.20minutos.es/rss/musica/') - ,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/') - ,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/') - ] + + (u'Portada', u'http://www.20minutos.es/rss/'), + (u'Nacional', u'http://www.20minutos.es/rss/nacional/'), + (u'Internacional', u'http://www.20minutos.es/rss/internacional/'), + (u'Economia', u'http://www.20minutos.es/rss/economia/'), + (u'Deportes', u'http://www.20minutos.es/rss/deportes/'), + (u'Tecnologia', u'http://www.20minutos.es/rss/tecnologia/'), + (u'Gente - TV', u'http://www.20minutos.es/rss/gente-television/'), + (u'Motor', u'http://www.20minutos.es/rss/motor/'), + (u'Salud', u'http://www.20minutos.es/rss/belleza-y-salud/'), + (u'Viajes', u'http://www.20minutos.es/rss/viajes/'), + (u'Vivienda', u'http://www.20minutos.es/rss/vivienda/'), + (u'Empleo', u'http://www.20minutos.es/rss/empleo/'), + (u'Cine', u'http://www.20minutos.es/rss/cine/'), + (u'Musica', u'http://www.20minutos.es/rss/musica/'), + (u'Vinetas', u'http://www.20minutos.es/rss/vinetas/'), + (u'Comunidad20', u'http://www.20minutos.es/rss/zona20/') + ] diff --git a/recipes/20minutes.recipe b/recipes/20minutes.recipe index 3ba27d73f6..b8e20afa20 100644 --- a/recipes/20minutes.recipe +++ b/recipes/20minutes.recipe @@ -1,33 +1,34 @@ # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011 Aurélien Chabot ' ''' 20minutes.fr ''' from calibre.web.feeds.recipes import BasicNewsRecipe + class Minutes(BasicNewsRecipe): - title = '20 minutes' - __author__ = u'Aurélien Chabot' - description = 'Actualités' - encoding = 'utf-8' - publisher = '20minutes.fr' - category = 'Actualités, France, Monde' - language = 'fr' + title = '20 minutes' + __author__ = u'Aurélien Chabot' + description = 'Actualités' + encoding = 'utf-8' + publisher = '20minutes.fr' + category = 'Actualités, France, Monde' + language = 'fr' - use_embedded_content = False - timefmt = ' [%d %b %Y]' - max_articles_per_feed = 15 - no_stylesheets = True - remove_empty_feeds = True + use_embedded_content = False + timefmt = ' [%d %b %Y]' + max_articles_per_feed = 15 + no_stylesheets = True + remove_empty_feeds = True keep_only_tags = [ dict(name='h1'), - dict(attrs={'class':lambda x: x and 'lt-content' in x.split()}), + dict(attrs={'class': lambda x: x and 'lt-content' in x.split()}), ] remove_tags = [ - dict(attrs={'class':lambda x:x and 'content-related' in x.split()}), + dict(attrs={'class': lambda x: x and 'content-related' in x.split()}), ] remove_tags_after = dict(id='ob_holder') diff --git a/recipes/20minutos.recipe b/recipes/20minutos.recipe index d7657f77c7..725278c045 100644 --- a/recipes/20minutos.recipe +++ b/recipes/20minutos.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Darko Miletic ' ''' www.20minutos.es @@ -6,45 +6,44 @@ www.20minutos.es from calibre.web.feeds.news import BasicNewsRecipe + class t20Minutos(BasicNewsRecipe): - title = '20 Minutos' - __author__ = 'Darko Miletic' - description = 'Diario de informacion general y local mas leido de Espania, noticias de ultima hora de Espania, el mundo, local, deportes, noticias curiosas y mas' - publisher = '20 Minutos Online SL' - category = 'news, politics, Spain' - oldest_article = 2 + title = '20 Minutos' + __author__ = 'Darko Miletic' + description = 'Diario de informacion general y local mas leido de Espania, noticias de ultima hora de Espania, el mundo, local, deportes, noticias curiosas y mas' # noqa + publisher = '20 Minutos Online SL' + category = 'news, politics, Spain' + oldest_article = 2 max_articles_per_feed = 200 - no_stylesheets = True - encoding = 'utf8' - use_embedded_content = True - language = 'es' - remove_empty_feeds = True - publication_type = 'newspaper' - masthead_url = 'http://estaticos.20minutos.es/css4/img/ui/logo-301x54.png' + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = True + language = 'es' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://estaticos.20minutos.es/css4/img/ui/logo-301x54.png' extra_css = """ body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em; display:block} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - remove_tags = [dict(attrs={'class':'mf-viral'})] - remove_attributes=['border'] + remove_tags = [dict(attrs={'class': 'mf-viral'})] + remove_attributes = ['border'] feeds = [ - (u'Principal' , u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss') - ,(u'Cine' , u'http://20minutos.feedsportal.com/c/32489/f/478285/index.rss') - ,(u'Internacional' , u'http://20minutos.feedsportal.com/c/32489/f/492689/index.rss') - ,(u'Deportes' , u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss') - ,(u'Nacional' , u'http://20minutos.feedsportal.com/c/32489/f/492688/index.rss') - ,(u'Economia' , u'http://20minutos.feedsportal.com/c/32489/f/492690/index.rss') - ,(u'Tecnologia' , u'http://20minutos.feedsportal.com/c/32489/f/478292/index.rss') - ] + + (u'Principal', u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss'), + (u'Cine', u'http://20minutos.feedsportal.com/c/32489/f/478285/index.rss'), + (u'Internacional', u'http://20minutos.feedsportal.com/c/32489/f/492689/index.rss'), + (u'Deportes', u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss'), + (u'Nacional', u'http://20minutos.feedsportal.com/c/32489/f/492688/index.rss'), + (u'Economia', u'http://20minutos.feedsportal.com/c/32489/f/492690/index.rss'), + (u'Tecnologia', u'http://20minutos.feedsportal.com/c/32489/f/478292/index.rss') + ] def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -52,17 +51,16 @@ class t20Minutos(BasicNewsRecipe): for item in soup.findAll('a'): limg = item.find('img') if item.string is not None: - str = item.string - item.replaceWith(str) + str = item.string + item.replaceWith(str) else: - if limg: - item.name = 'div' - item.attrs = [] - else: - str = self.tag_to_string(item) - item.replaceWith(str) + if limg: + item.name = 'div' + item.attrs = [] + else: + str = self.tag_to_string(item) + item.replaceWith(str) for item in soup.findAll('img'): - if not item.has_key('alt'): - item['alt'] = 'image' + if not item.has_key('alt'): # noqa + item['alt'] = 'image' return soup - diff --git a/recipes/24sata.recipe b/recipes/24sata.recipe index 49d3a6b0b5..d7f080e139 100644 --- a/recipes/24sata.recipe +++ b/recipes/24sata.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009, Darko Miletic ' ''' @@ -11,51 +11,50 @@ import re from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag + class Cro24Sata(BasicNewsRecipe): - title = '24 Sata - Hr' - __author__ = 'Darko Miletic' - description = "News Portal from Croatia" - publisher = '24sata.hr' - category = 'news, politics, Croatia' - oldest_article = 2 + title = '24 Sata - Hr' + __author__ = 'Darko Miletic' + description = "News Portal from Croatia" + publisher = '24sata.hr' + category = 'news, politics, Croatia' + oldest_article = 2 max_articles_per_feed = 100 - delay = 4 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False + delay = 4 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False language = 'hr' - lang = 'hr-HR' + lang = 'hr-HR' - extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' # noqa conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : lang - , 'pretty_print' : True - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] remove_tags = [ - dict(name=['object','link','embed']) - ,dict(name='table', attrs={'class':'enumbox'}) - ] + dict(name=['object', 'link', 'embed']), dict( + name='table', attrs={'class': 'enumbox'}) + ] - feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')] + feeds = [(u'Najnovije Vijesti', + u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')] def preprocess_html(self, soup): - soup.html['lang'] = self.lang - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) - soup.head.insert(0,mlang) - soup.head.insert(1,mcharset) + soup.html['lang'] = self.lang + mlang = Tag(soup, 'meta', [ + ("http-equiv", "Content-Language"), ("content", self.lang)]) + mcharset = Tag(soup, 'meta', [ + ("http-equiv", "Content-Type"), ("content", "text/html; charset=UTF-8")]) + soup.head.insert(0, mlang) + soup.head.insert(1, mcharset) for item in soup.findAll(style=True): del item['style'] return soup def print_version(self, url): return url + '&action=ispis' - diff --git a/recipes/24sata_rs.recipe b/recipes/24sata_rs.recipe index a51323f21f..cb12819ce3 100644 --- a/recipes/24sata_rs.recipe +++ b/recipes/24sata_rs.recipe @@ -1,6 +1,6 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009-2012, Darko Miletic ' ''' @@ -10,40 +10,38 @@ __copyright__ = '2009-2012, Darko Miletic ' import re from calibre.web.feeds.recipes import BasicNewsRecipe + class Ser24Sata(BasicNewsRecipe): - title = '24 Sata - Sr' - __author__ = 'Darko Miletic' - description = '24 sata portal vesti iz Srbije' - publisher = 'Ringier d.o.o.' - category = 'news, politics, entertainment, Serbia' - oldest_article = 2 + title = '24 Sata - Sr' + __author__ = 'Darko Miletic' + description = '24 sata portal vesti iz Srbije' + publisher = 'Ringier d.o.o.' + category = 'news, politics, entertainment, Serbia' + oldest_article = 2 max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - language = 'sr' - publication_type = 'newsportal' + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'sr' + publication_type = 'newsportal' extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] feeds = [ - (u'Vesti' , u'http://www.24sata.rs/rss/vesti.xml' ), - (u'Sport' , u'http://www.24sata.rs/rss/sport.xml' ), - (u'Šou' , u'http://www.24sata.rs/rss/sou.xml' ), - (u'Specijal', u'http://www.24sata.rs/rss/specijal.xml'), - (u'Novi Sad', u'http://www.24sata.rs/rss/ns.xml' ) - ] + (u'Vesti', u'http://www.24sata.rs/rss/vesti.xml'), + (u'Sport', u'http://www.24sata.rs/rss/sport.xml'), + (u'Šou', u'http://www.24sata.rs/rss/sou.xml'), + (u'Specijal', u'http://www.24sata.rs/rss/specijal.xml'), + (u'Novi Sad', u'http://www.24sata.rs/rss/ns.xml') + ] def print_version(self, url): dpart, spart, apart = url.rpartition('/') diff --git a/recipes/3dnews.recipe b/recipes/3dnews.recipe index 4cc74f3f1e..b88188865b 100644 --- a/recipes/3dnews.recipe +++ b/recipes/3dnews.recipe @@ -3,44 +3,63 @@ from __future__ import unicode_literals, division, absolute_import, print_function from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1438446837(BasicNewsRecipe): - title = '3DNews: Daily Digital Digest' + title = '3DNews: Daily Digital Digest' __author__ = 'bugmen00t' - description = 'Независимое российское онлайн-издание, посвященное цифровым технологиям' - publisher = '3DNews' - category = 'news' + description = 'Независимое российское онлайн-издание, посвященное цифровым технологиям' + publisher = '3DNews' + category = 'news' cover_url = u'http://www.3dnews.ru/assets/images/logo.png' - language = 'ru' - auto_cleanup = True + language = 'ru' + auto_cleanup = True oldest_article = 15 max_articles_per_feed = 60 - feeds = [ - ('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Hardware', 'http://www.3dnews.ru/news/rss/'), - ('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Software', 'http://www.3dnews.ru/software-news/rss/'), - ('\u0423\u043c\u043d\u044b\u0435 \u0432\u0435\u0449\u0438', 'http://www.3dnews.ru/smart-things/rss/'), - ('\u0410\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u0430', 'http://www.3dnews.ru/editorial/rss/'), - ('\u041f\u0440\u043e\u0446\u0435\u0441\u0441\u043e\u0440\u044b \u0438 \u043f\u0430\u043c\u044f\u0442\u044c', 'http://www.3dnews.ru/cpu/rss/'), - ('\u041c\u0430\u0442\u0435\u0440\u0438\u043d\u0441\u043a\u0438\u0435 \u043f\u043b\u0430\u0442\u044b', 'http://www.3dnews.ru/motherboard/rss/'), + feeds = [ + ('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Hardware', + 'http://www.3dnews.ru/news/rss/'), + ('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Software', + 'http://www.3dnews.ru/software-news/rss/'), + ('\u0423\u043c\u043d\u044b\u0435 \u0432\u0435\u0449\u0438', + 'http://www.3dnews.ru/smart-things/rss/'), + ('\u0410\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u0430', + 'http://www.3dnews.ru/editorial/rss/'), + ('\u041f\u0440\u043e\u0446\u0435\u0441\u0441\u043e\u0440\u044b \u0438 \u043f\u0430\u043c\u044f\u0442\u044c', + 'http://www.3dnews.ru/cpu/rss/'), + ('\u041c\u0430\u0442\u0435\u0440\u0438\u043d\u0441\u043a\u0438\u0435 \u043f\u043b\u0430\u0442\u044b', + 'http://www.3dnews.ru/motherboard/rss/'), ('\u041a\u043e\u0440\u043f\u0443\u0441\u0430, \u0411\u041f \u0438 \u043e\u0445\u043b\u0430\u0436\u0434\u0435\u043d\u0438\u0435', 'http://www.3dnews.ru/cooling/rss/'), - ('\u0412\u0438\u0434\u0435\u043e\u043a\u0430\u0440\u0442\u044b', 'http://www.3dnews.ru/video/rss/'), - ('\u041c\u043e\u043d\u0438\u0442\u043e\u0440\u044b \u0438 \u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0440\u044b', 'http://www.3dnews.ru/display/rss/'), - ('\u041d\u0430\u043a\u043e\u043f\u0438\u0442\u0435\u043b\u0438', 'http://www.3dnews.ru/storage/rss/'), - ('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0439 \u0430\u0432\u0442\u043e\u043c\u043e\u0431\u0438\u043b\u044c', 'http://www.3dnews.ru/auto/rss/'), - ('\u0421\u043e\u0442\u043e\u0432\u0430\u044f \u0441\u0432\u044f\u0437\u044c', 'http://www.3dnews.ru/phone/rss/'), - ('\u041f\u0435\u0440\u0438\u0444\u0435\u0440\u0438\u044f', 'http://www.3dnews.ru/peripheral/rss/'), - ('\u041d\u043e\u0443\u0442\u0431\u0443\u043a\u0438 \u0438 \u041f\u041a', 'http://www.3dnews.ru/mobile/rss/'), - ('\u041f\u043b\u0430\u043d\u0448\u0435\u0442\u044b', 'http://www.3dnews.ru/tablets/rss/'), - ('\u0417\u0432\u0443\u043a \u0438 \u0430\u043a\u0443\u0441\u0442\u0438\u043a\u0430', 'http://www.3dnews.ru/multimedia/rss/'), + ('\u0412\u0438\u0434\u0435\u043e\u043a\u0430\u0440\u0442\u044b', + 'http://www.3dnews.ru/video/rss/'), + ('\u041c\u043e\u043d\u0438\u0442\u043e\u0440\u044b \u0438 \u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0440\u044b', + 'http://www.3dnews.ru/display/rss/'), + ('\u041d\u0430\u043a\u043e\u043f\u0438\u0442\u0435\u043b\u0438', + 'http://www.3dnews.ru/storage/rss/'), + ('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0439 \u0430\u0432\u0442\u043e\u043c\u043e\u0431\u0438\u043b\u044c', + 'http://www.3dnews.ru/auto/rss/'), + ('\u0421\u043e\u0442\u043e\u0432\u0430\u044f \u0441\u0432\u044f\u0437\u044c', + 'http://www.3dnews.ru/phone/rss/'), + ('\u041f\u0435\u0440\u0438\u0444\u0435\u0440\u0438\u044f', + 'http://www.3dnews.ru/peripheral/rss/'), + ('\u041d\u043e\u0443\u0442\u0431\u0443\u043a\u0438 \u0438 \u041f\u041a', + 'http://www.3dnews.ru/mobile/rss/'), + ('\u041f\u043b\u0430\u043d\u0448\u0435\u0442\u044b', + 'http://www.3dnews.ru/tablets/rss/'), + ('\u0417\u0432\u0443\u043a \u0438 \u0430\u043a\u0443\u0441\u0442\u0438\u043a\u0430', + 'http://www.3dnews.ru/multimedia/rss/'), ('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0435 \u0444\u043e\u0442\u043e \u0438 \u0432\u0438\u0434\u0435\u043e', 'http://www.3dnews.ru/digital/rss/'), - ('\u0421\u0435\u0442\u0438 \u0438 \u043a\u043e\u043c\u043c\u0443\u043d\u0438\u043a\u0430\u0446\u0438\u0438', 'http://www.3dnews.ru/communication/rss/'), + ('\u0421\u0435\u0442\u0438 \u0438 \u043a\u043e\u043c\u043c\u0443\u043d\u0438\u043a\u0430\u0446\u0438\u0438', + 'http://www.3dnews.ru/communication/rss/'), ('\u0418\u0433\u0440\u044b', 'http://www.3dnews.ru/games/rss/'), ('\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u043d\u043e\u0435 \u043e\u0431\u0435\u0441\u043f\u0435\u0447\u0435\u043d\u0438\u0435', 'http://www.3dnews.ru/software/rss/'), - ('Off-\u0441\u044f\u043d\u043a\u0430', 'http://www.3dnews.ru/offsyanka/rss/'), - ('\u041c\u0430\u0441\u0442\u0435\u0440\u0441\u043a\u0430\u044f', 'http://www.3dnews.ru/workshop/rss/'), + ('Off-\u0441\u044f\u043d\u043a\u0430', + 'http://www.3dnews.ru/offsyanka/rss/'), + ('\u041c\u0430\u0441\u0442\u0435\u0440\u0441\u043a\u0430\u044f', + 'http://www.3dnews.ru/workshop/rss/'), ('ServerNews', 'http://servernews.ru/rss'), ] diff --git a/recipes/7dias.recipe b/recipes/7dias.recipe index 6e9856b0a7..f3b59200a0 100644 --- a/recipes/7dias.recipe +++ b/recipes/7dias.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009, Darko Miletic ' ''' elargentino.com @@ -9,33 +9,26 @@ elargentino.com from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag + class SieteDias(BasicNewsRecipe): - title = '7 dias' - __author__ = 'Darko Miletic' - description = 'Revista Argentina' - publisher = 'ElArgentino.com' - category = 'news, politics, show, Argentina' - oldest_article = 7 + title = '7 dias' + __author__ = 'Darko Miletic' + description = 'Revista Argentina' + publisher = 'ElArgentino.com' + category = 'news, politics, show, Argentina' + oldest_article = 7 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' language = 'es_AR' - lang = 'es-AR' - direction = 'ltr' - INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html' - extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} ' + lang = 'es-AR' + direction = 'ltr' + INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html' + extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} ' - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "' - - keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})] + keep_only_tags = [dict(name='div', attrs={'class': 'ContainerPop'})] remove_tags = [dict(name='link')] @@ -50,20 +43,23 @@ class SieteDias(BasicNewsRecipe): for item in soup.findAll(style=True): del item['style'] soup.html['lang'] = self.lang - soup.html['dir' ] = self.direction - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")]) - soup.head.insert(0,mlang) - soup.head.insert(1,mcharset) + soup.html['dir'] = self.direction + mlang = Tag(soup, 'meta', [ + ("http-equiv", "Content-Language"), ("content", self.lang)]) + mcharset = Tag(soup, 'meta', [ + ("http-equiv", "Content-Type"), ("content", "text/html; charset=utf-8")]) + soup.head.insert(0, mlang) + soup.head.insert(1, mcharset) return soup def get_cover_url(self): cover_url = None soup = self.index_to_soup(self.INDEX) - cover_item = soup.find('div',attrs={'class':'colder'}) + cover_item = soup.find('div', attrs={'class': 'colder'}) if cover_item: - clean_url = self.image_url_processor(None,cover_item.div.img['src']) - cover_url = 'http://www.elargentino.com' + clean_url + '&height=600' + clean_url = self.image_url_processor( + None, cover_item.div.img['src']) + cover_url = 'http://www.elargentino.com' + clean_url + '&height=600' return cover_url def image_url_processor(self, baseurl, url): diff --git a/recipes/7seri.recipe b/recipes/7seri.recipe index 2063500f03..0c852d103a 100644 --- a/recipes/7seri.recipe +++ b/recipes/7seri.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' sapteseri.ro @@ -9,43 +9,40 @@ sapteseri.ro from calibre.web.feeds.news import BasicNewsRecipe + class SapteSeri(BasicNewsRecipe): - title = u'Sapte Seri' - __author__ = u'Silviu Cotoar\u0103' - description = u'Sapte Seri' - publisher = u'Sapte Seri' - oldest_article = 5 - language = 'ro' + title = u'Sapte Seri' + __author__ = u'Silviu Cotoar\u0103' + description = u'Sapte Seri' + publisher = u'Sapte Seri' + oldest_article = 5 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Oras,Distractie,Fun' - encoding = 'utf-8' - remove_empty_feeds = True - remove_javascript = True - cover_url = 'http://www.sapteseri.ro/Images/logo.jpg' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Oras,Distractie,Fun' + encoding = 'utf-8' + remove_empty_feeds = True + remove_javascript = True + cover_url = 'http://www.sapteseri.ro/Images/logo.jpg' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='h1', attrs={'id':'title'}) - , dict(name='div', attrs={'class':'mt10 mb10'}) - , dict(name='div', attrs={'class':'mb20 mt10'}) - , dict(name='div', attrs={'class':'mt5 mb20'}) - ] + dict(name='h1', attrs={'id': 'title'}), dict(name='div', attrs={'class': 'mt10 mb10'}), dict( + name='div', attrs={'class': 'mb20 mt10'}), dict(name='div', attrs={'class': 'mt5 mb20'}) + ] remove_tags = [ - dict(name='div', attrs={'id':['entityimgworking']}) - ] + dict(name='div', attrs={'id': ['entityimgworking']}) + ] - feeds = [ - (u'Ce se intampla azi in Bucuresti', u'http://www.sapteseri.ro/ro/feed/ce-se-intampla-azi/bucuresti/') - ] + feeds = [ + (u'Ce se intampla azi in Bucuresti', + u'http://www.sapteseri.ro/ro/feed/ce-se-intampla-azi/bucuresti/') + ] def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/Ansa.recipe b/recipes/Ansa.recipe index 397ae7620d..b4dccb7106 100644 --- a/recipes/Ansa.recipe +++ b/recipes/Ansa.recipe @@ -1,69 +1,70 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'Gabriele Marini, based on Darko Miletic' +__license__ = 'GPL v3' +__author__ = 'Gabriele Marini, based on Darko Miletic' __copyright__ = '2010, Darko Miletic ' -description = 'Italian daily newspaper - 01-05-2010' +description = 'Italian daily newspaper - 01-05-2010' ''' http://www.ansa.it/ ''' from calibre.web.feeds.news import BasicNewsRecipe + class Ansa(BasicNewsRecipe): - __author__ = 'Gabriele Marini' - description = 'Italian News Agency' + __author__ = 'Gabriele Marini' + description = 'Italian News Agency' - cover_url = 'http://www.ansa.it/web/images/logo_ansa_interna.gif' - title = u'Ansa' - publisher = 'Ansa' - category = 'News, politics, culture, economy, general interest' + cover_url = 'http://www.ansa.it/web/images/logo_ansa_interna.gif' + title = u'Ansa' + publisher = 'Ansa' + category = 'News, politics, culture, economy, general interest' - language = 'it' - timefmt = '[%a, %d %b, %Y]' + language = 'it' + timefmt = '[%a, %d %b, %Y]' oldest_article = 1 max_articles_per_feed = 10 - use_embedded_content = False - recursion = 10 + use_embedded_content = False + recursion = 10 remove_javascript = True - no_stylesheets = True - conversion_options = {'linearize_tables':True} + no_stylesheets = True + conversion_options = {'linearize_tables': True} remove_attributes = ['colspan'] - keep_only_tags = [dict(name='div', attrs={'class':['path','header-content','corpo']}), - ] - - - remove_tags = [ - dict(name='div', attrs={'class':'tools-bar'}), - dict(name='div', attrs={'id':['rssdiv','blocco']}) - ] - - - feeds = [ - (u'HomePage', u'http://www.ansa.it/web/ansait_web_rss_homepage.xml'), - (u'Top New', u'http://www.ansa.it/web/notizie/rubriche/topnews/topnews_rss.xml'), - (u'Cronaca', u'http://www.ansa.it/web/notizie/rubriche/cronaca/cronaca_rss.xml'), - (u'Mondo', u'http://www.ansa.it/web/notizie/rubriche/mondo/mondo_rss.xml'), - (u'Economia', u'http://www.ansa.it/web/notizie/rubriche/economia/economia_rss.xml'), - (u'Politica', u'http://www.ansa.it/web/notizie/rubriche/politica/politica_rss.xml'), - (u'Scienze', u'http://www.ansa.it/web/notizie/rubriche/scienza/scienza_rss.xml'), - (u'Cinema', u'http://www.ansa.it/web/notizie/rubriche/cinema/cinema_rss.xml'), - (u'Tecnologia e Internet', u'http://www.ansa.it/web/notizie/rubriche/tecnologia/tecnologia_rss.xml'), - (u'Spettacolo', u'http://www.ansa.it/web/notizie/rubriche/spettacolo/spettacolo_rss.xml'), - (u'Cultura e Tendenze', u'http://www.ansa.it/web/notizie/rubriche/cultura/cultura_rss.xml'), - (u'Sport', u'http://www.ansa.it/web/notizie/rubriche/altrisport/altrisport_rss.xml'), - (u'Calcio', u'http://www.ansa.it/web/notizie/rubriche/calcio/calcio_rss.xml'), - (u'Lazio', u'http://www.ansa.it/web/notizie/regioni/lazio/lazio_rss.xml'), - (u'Lombardia', u'http://www.ansa.it/web/notizie/regioni/lombardia/lombardia.shtml'), - (u'Veneto', u'http://www.ansa.it/web/notizie/regioni/veneto/veneto.shtml'), - (u'Campanioa', u'http://www.ansa.it/web/notizie/regioni/campania/campania.shtml'), - (u'Sicilia', u'http://www.ansa.it/web/notizie/regioni/sicilia/sicilia.shtml'), - (u'Toscana', u'http://www.ansa.it/web/notizie/regioni/toscana/toscana.shtml'), - (u'Trentino', u'http://www.ansa.it/web/notizie/regioni/trentino/trentino.shtml') + keep_only_tags = [dict(name='div', attrs={'class': ['path', 'header-content', 'corpo']}), ] + remove_tags = [ + dict(name='div', attrs={'class': 'tools-bar'}), + dict(name='div', attrs={'id': ['rssdiv', 'blocco']}) + ] + + feeds = [ + (u'HomePage', u'http://www.ansa.it/web/ansait_web_rss_homepage.xml'), + (u'Top New', u'http://www.ansa.it/web/notizie/rubriche/topnews/topnews_rss.xml'), + (u'Cronaca', u'http://www.ansa.it/web/notizie/rubriche/cronaca/cronaca_rss.xml'), + (u'Mondo', u'http://www.ansa.it/web/notizie/rubriche/mondo/mondo_rss.xml'), + (u'Economia', u'http://www.ansa.it/web/notizie/rubriche/economia/economia_rss.xml'), + (u'Politica', u'http://www.ansa.it/web/notizie/rubriche/politica/politica_rss.xml'), + (u'Scienze', u'http://www.ansa.it/web/notizie/rubriche/scienza/scienza_rss.xml'), + (u'Cinema', u'http://www.ansa.it/web/notizie/rubriche/cinema/cinema_rss.xml'), + (u'Tecnologia e Internet', + u'http://www.ansa.it/web/notizie/rubriche/tecnologia/tecnologia_rss.xml'), + (u'Spettacolo', u'http://www.ansa.it/web/notizie/rubriche/spettacolo/spettacolo_rss.xml'), + (u'Cultura e Tendenze', + u'http://www.ansa.it/web/notizie/rubriche/cultura/cultura_rss.xml'), + (u'Sport', u'http://www.ansa.it/web/notizie/rubriche/altrisport/altrisport_rss.xml'), + (u'Calcio', u'http://www.ansa.it/web/notizie/rubriche/calcio/calcio_rss.xml'), + (u'Lazio', u'http://www.ansa.it/web/notizie/regioni/lazio/lazio_rss.xml'), + (u'Lombardia', u'http://www.ansa.it/web/notizie/regioni/lombardia/lombardia.shtml'), + (u'Veneto', u'http://www.ansa.it/web/notizie/regioni/veneto/veneto.shtml'), + (u'Campanioa', u'http://www.ansa.it/web/notizie/regioni/campania/campania.shtml'), + (u'Sicilia', u'http://www.ansa.it/web/notizie/regioni/sicilia/sicilia.shtml'), + (u'Toscana', u'http://www.ansa.it/web/notizie/regioni/toscana/toscana.shtml'), + (u'Trentino', u'http://www.ansa.it/web/notizie/regioni/trentino/trentino.shtml') + ] + extra_css = ''' .path{font-style: italic; font-size: small} .header-content h1{font-weight: bold; font-size: xx-large} diff --git a/recipes/DrawAndCook.recipe b/recipes/DrawAndCook.recipe index 8db4f71014..976b7c5393 100644 --- a/recipes/DrawAndCook.recipe +++ b/recipes/DrawAndCook.recipe @@ -1,21 +1,22 @@ from calibre.web.feeds.news import BasicNewsRecipe import re + class DrawAndCook(BasicNewsRecipe): - title = 'DrawAndCook' - __author__ = 'Starson17' - __version__ = 'v1.10' - __date__ = '13 March 2011' - description = 'Drawings of recipes!' - language = 'en' - publisher = 'Starson17' - category = 'news, food, recipes' - use_embedded_content= False - no_stylesheets = True - oldest_article = 24 - remove_javascript = True - remove_empty_feeds = True - cover_url = 'http://farm5.static.flickr.com/4043/4471139063_4dafced67f_o.jpg' + title = 'DrawAndCook' + __author__ = 'Starson17' + __version__ = 'v1.10' + __date__ = '13 March 2011' + description = 'Drawings of recipes!' + language = 'en' + publisher = 'Starson17' + category = 'news, food, recipes' + use_embedded_content = False + no_stylesheets = True + oldest_article = 24 + remove_javascript = True + remove_empty_feeds = True + cover_url = 'http://farm5.static.flickr.com/4043/4471139063_4dafced67f_o.jpg' INDEX = 'http://www.theydrawandcook.com' max_articles_per_feed = 30 @@ -24,8 +25,8 @@ class DrawAndCook(BasicNewsRecipe): def parse_index(self): feeds = [] for title, url in [ - ("They Draw and Cook", "http://www.theydrawandcook.com/") - ]: + ("They Draw and Cook", "http://www.theydrawandcook.com/") + ]: articles = self.make_links(url) if articles: feeds.append((title, articles)) @@ -38,22 +39,24 @@ class DrawAndCook(BasicNewsRecipe): date = '' current_articles = [] soup = self.index_to_soup(url) - featured_major_slider = soup.find(name='div', attrs={'id':'featured_major_slider'}) - recipes = featured_major_slider.findAll('li', attrs={'data-id': re.compile(r'artwork_entry_\d+', re.DOTALL)}) + featured_major_slider = soup.find( + name='div', attrs={'id': 'featured_major_slider'}) + recipes = featured_major_slider.findAll( + 'li', attrs={'data-id': re.compile(r'artwork_entry_\d+', re.DOTALL)}) for recipe in recipes: page_url = self.INDEX + recipe.a['href'] print 'page_url is: ', page_url title = recipe.find('strong').string print 'title is: ', title - current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':date}) + current_articles.append( + {'title': title, 'url': page_url, 'description': '', 'date': date}) return current_articles - keep_only_tags = [dict(name='h1', attrs={'id':'page_title'}) - ,dict(name='section', attrs={'id':'artwork'}) - ] + keep_only_tags = [dict(name='h1', attrs={'id': 'page_title'}), dict(name='section', attrs={'id': 'artwork'}) + ] - remove_tags = [dict(name='article', attrs={'id':['recipe_actions', 'metadata']}) - ] + remove_tags = [dict(name='article', attrs={'id': ['recipe_actions', 'metadata']}) + ] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} @@ -61,5 +64,4 @@ class DrawAndCook(BasicNewsRecipe): img {max-width:100%; min-width:100%;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' - + ''' diff --git a/recipes/ZIVE.sk.recipe b/recipes/ZIVE.sk.recipe index e5bfd56cef..f1d5c2febb 100644 --- a/recipes/ZIVE.sk.recipe +++ b/recipes/ZIVE.sk.recipe @@ -2,9 +2,8 @@ from calibre.web.feeds.news import BasicNewsRecipe import re - class ZiveRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' + __license__ = 'GPL v3' __author__ = 'Abelturd' language = 'sk' version = 1 @@ -25,21 +24,20 @@ class ZiveRecipe(BasicNewsRecipe): cover_url = 'http://www.zive.sk/Client.Images/Logos/logo-zive-sk.gif' feeds = [] - feeds.append((u'V\u0161etky \u010dl\xe1nky', u'http://www.zive.sk/rss/sc-47/default.aspx')) + feeds.append((u'V\u0161etky \u010dl\xe1nky', + u'http://www.zive.sk/rss/sc-47/default.aspx')) preprocess_regexps = [ - (re.compile(r'

Pokra.*ie

', re.DOTALL|re.IGNORECASE), - lambda match: ''), - - ] + (re.compile(r'

Pokra.*ie

', re.DOTALL | re.IGNORECASE), + lambda match: ''), + ] remove_tags = [] - keep_only_tags = [dict(name='h1'), dict(name='span', attrs={'class':'arlist-data-info-author'}), dict(name='div', attrs={'class':'bbtext font-resizer-area'}),] + keep_only_tags = [dict(name='h1'), dict(name='span', attrs={ + 'class': 'arlist-data-info-author'}), dict(name='div', attrs={'class': 'bbtext font-resizer-area'}), ] extra_css = ''' h1 {font-size:140%;font-family:georgia,serif; font-weight:bold} h3 {font-size:115%;font-family:georgia,serif; font-weight:bold} ''' - - diff --git a/recipes/aachener_nachrichten.recipe b/recipes/aachener_nachrichten.recipe index cdcb6895bc..7424426f70 100644 --- a/recipes/aachener_nachrichten.recipe +++ b/recipes/aachener_nachrichten.recipe @@ -1,71 +1,113 @@ from calibre.web.feeds.recipes import BasicNewsRecipe + + class AdvancedUserRecipe(BasicNewsRecipe): - title = u'Aachener Nachrichten' - __author__ = 'schuster' #AGE update 2012-11-28 - oldest_article = 1 + title = u'Aachener Nachrichten' + __author__ = 'schuster' # AGE update 2012-11-28 + oldest_article = 1 max_articles_per_feed = 100 - no_stylesheets = True - remove_javascript = True - remove_empty_feeds = True - language = 'de' + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True + language = 'de' # cover_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png' - masthead_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png' + masthead_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png' - keep_only_tags = [ - dict(name='article', attrs={'class':['single']}) - ] + keep_only_tags = [ + dict(name='article', attrs={'class': ['single']}) + ] remove_tags = [ - dict(name='div', attrs={'class':["clearfix navi-wrapper"]}), - dict(name='div', attrs={'id':["article_actions"]}), - dict(name='style', attrs={'type':["text/css"]}), - dict(name='aside'), - dict(name='a', attrs={'class':["btn btn-action"]}) - ] + dict(name='div', attrs={'class': ["clearfix navi-wrapper"]}), + dict(name='div', attrs={'id': ["article_actions"]}), + dict(name='style', attrs={'type': ["text/css"]}), + dict(name='aside'), + dict(name='a', attrs={'class': ["btn btn-action"]}) + ] feeds = [ - (u'Lokales - Euregio', u'http://www.aachener-nachrichten.de/cmlink/euregio-rss-1.357285'), - (u'Lokales - Aachen', u'http://www.aachener-nachrichten.de/cmlink/aachen-rss-1.357286'), - (u'Lokales - Nordkreis', u'http://www.aachener-nachrichten.de/cmlink/nordkreis-rss-1.358150'), - (u'Lokales - Düren', u'http://www.aachener-nachrichten.de/cmlink/dueren-rss-1.358626'), - (u'Lokales - Eiffel', u'http://www.aachener-nachrichten.de/cmlink/eifel-rss-1.358978'), - (u'Lokales - Eschweiler', u'http://www.aachener-nachrichten.de/cmlink/eschweiler-rss-1.359332'), - (u'Lokales - Geilenkirchen', u'http://www.aachener-nachrichten.de/cmlink/geilenkirchen-rss-1.359643'), - (u'Lokales - Heinsberg', u'http://www.aachener-nachrichten.de/cmlink/heinsberg-rss-1.359724'), - (u'Lokales - Jülich', u'http://www.aachener-nachrichten.de/cmlink/juelich-rss-1.359725'), - (u'Lokales - Stolberg', u'http://www.aachener-nachrichten.de/cmlink/stolberg-rss-1.359726'), - (u'News - Politik', u'http://www.aachener-nachrichten.de/cmlink/politik-rss-1.359727'), - (u'News - Aus aller Welt', u'http://www.aachener-nachrichten.de/cmlink/ausallerwelt-rss-1.453282'), - (u'News - Wirtschaft', u'http://www.aachener-nachrichten.de/cmlink/wirtschaft-rss-1.359872'), - (u'News - Kultur', u'http://www.aachener-nachrichten.de/cmlink/kultur-rss-1.365018'), - (u'News - Kino', u'http://www.aachener-nachrichten.de/cmlink/kino-rss-1.365019'), - (u'News - Digital', u'http://www.aachener-nachrichten.de/cmlink/digital-rss-1.365020'), - (u'News - Wissenschaft', u'http://www.aachener-nachrichten.de/cmlink/wissenschaft-rss-1.365021'), - (u'News - Hochschule', u'http://www.aachener-nachrichten.de/cmlink/hochschule-rss-1.365022'), - (u'News - Auto', u'http://www.aachener-nachrichten.de/cmlink/auto-rss-1.365023'), - (u'News - Kurioses', u'http://www.aachener-nachrichten.de/cmlink/kurioses-rss-1.365067'), - (u'News - Musik', u'http://www.aachener-nachrichten.de/cmlink/musik-rss-1.365305'), - (u'News - Tagesthema', u'http://www.aachener-nachrichten.de/cmlink/tagesthema-rss-1.365519'), - (u'News - Newsticker', u'http://www.aachener-nachrichten.de/cmlink/newsticker-rss-1.451948'), - (u'Sport - Aktuell', u'http://www.aachener-nachrichten.de/cmlink/aktuell-rss-1.366716'), - (u'Sport - Fußball', u'http://www.aachener-nachrichten.de/cmlink/fussball-rss-1.367060'), - (u'Sport - Bundesliga', u'http://www.aachener-nachrichten.de/cmlink/bundesliga-rss-1.453367'), - (u'Sport - Alemannia Aachen', u'http://www.aachener-nachrichten.de/cmlink/alemanniaaachen-rss-1.366057'), - (u'Sport - Volleyball', u'http://www.aachener-nachrichten.de/cmlink/volleyball-rss-1.453370'), - (u'Sport - Chio', u'http://www.aachener-nachrichten.de/cmlink/chio-rss-1.453371'), - (u'Dossier - Kinderuni', u'http://www.aachener-nachrichten.de/cmlink/kinderuni-rss-1.453375'), - (u'Dossier - Karlspreis', u'http://www.aachener-nachrichten.de/cmlink/karlspreis-rss-1.453376'), - (u'Dossier - Ritterorden', u'http://www.aachener-nachrichten.de/cmlink/ritterorden-rss-1.453377'), - (u'Dossier - ZAB-Aachen', u'http://www.aachener-nachrichten.de/cmlink/zabaachen-rss-1.453380'), - (u'Dossier - Karneval', u'http://www.aachener-nachrichten.de/cmlink/karneval-rss-1.453384'), - (u'Ratgeber - Geld', u'http://www.aachener-nachrichten.de/cmlink/geld-rss-1.453385'), - (u'Ratgeber - Recht', u'http://www.aachener-nachrichten.de/cmlink/recht-rss-1.453386'), - (u'Ratgeber - Gesundheit', u'http://www.aachener-nachrichten.de/cmlink/gesundheit-rss-1.453387'), - (u'Ratgeber - Familie', u'http://www.aachener-nachrichten.de/cmlink/familie-rss-1.453388'), - (u'Ratgeber - Livestyle', u'http://www.aachener-nachrichten.de/cmlink/lifestyle-rss-1.453389'), - (u'Ratgeber - Reisen', u'http://www.aachener-nachrichten.de/cmlink/reisen-rss-1.453390'), - (u'Ratgeber - Bauen und Wohnen', u'http://www.aachener-nachrichten.de/cmlink/bauen-rss-1.453398'), - (u'Ratgeber - Bildung und Beruf', u'http://www.aachener-nachrichten.de/cmlink/bildung-rss-1.453400'), - ] + (u'Lokales - Euregio', + u'http://www.aachener-nachrichten.de/cmlink/euregio-rss-1.357285'), + (u'Lokales - Aachen', + u'http://www.aachener-nachrichten.de/cmlink/aachen-rss-1.357286'), + (u'Lokales - Nordkreis', + u'http://www.aachener-nachrichten.de/cmlink/nordkreis-rss-1.358150'), + (u'Lokales - Düren', + u'http://www.aachener-nachrichten.de/cmlink/dueren-rss-1.358626'), + (u'Lokales - Eiffel', + u'http://www.aachener-nachrichten.de/cmlink/eifel-rss-1.358978'), + (u'Lokales - Eschweiler', + u'http://www.aachener-nachrichten.de/cmlink/eschweiler-rss-1.359332'), + (u'Lokales - Geilenkirchen', + u'http://www.aachener-nachrichten.de/cmlink/geilenkirchen-rss-1.359643'), + (u'Lokales - Heinsberg', + u'http://www.aachener-nachrichten.de/cmlink/heinsberg-rss-1.359724'), + (u'Lokales - Jülich', + u'http://www.aachener-nachrichten.de/cmlink/juelich-rss-1.359725'), + (u'Lokales - Stolberg', + u'http://www.aachener-nachrichten.de/cmlink/stolberg-rss-1.359726'), + (u'News - Politik', + u'http://www.aachener-nachrichten.de/cmlink/politik-rss-1.359727'), + (u'News - Aus aller Welt', + u'http://www.aachener-nachrichten.de/cmlink/ausallerwelt-rss-1.453282'), + (u'News - Wirtschaft', + u'http://www.aachener-nachrichten.de/cmlink/wirtschaft-rss-1.359872'), + (u'News - Kultur', + u'http://www.aachener-nachrichten.de/cmlink/kultur-rss-1.365018'), + (u'News - Kino', u'http://www.aachener-nachrichten.de/cmlink/kino-rss-1.365019'), + (u'News - Digital', + u'http://www.aachener-nachrichten.de/cmlink/digital-rss-1.365020'), + (u'News - Wissenschaft', + u'http://www.aachener-nachrichten.de/cmlink/wissenschaft-rss-1.365021'), + (u'News - Hochschule', + u'http://www.aachener-nachrichten.de/cmlink/hochschule-rss-1.365022'), + (u'News - Auto', u'http://www.aachener-nachrichten.de/cmlink/auto-rss-1.365023'), + (u'News - Kurioses', + u'http://www.aachener-nachrichten.de/cmlink/kurioses-rss-1.365067'), + (u'News - Musik', + u'http://www.aachener-nachrichten.de/cmlink/musik-rss-1.365305'), + (u'News - Tagesthema', + u'http://www.aachener-nachrichten.de/cmlink/tagesthema-rss-1.365519'), + (u'News - Newsticker', + u'http://www.aachener-nachrichten.de/cmlink/newsticker-rss-1.451948'), + (u'Sport - Aktuell', + u'http://www.aachener-nachrichten.de/cmlink/aktuell-rss-1.366716'), + (u'Sport - Fußball', + u'http://www.aachener-nachrichten.de/cmlink/fussball-rss-1.367060'), + (u'Sport - Bundesliga', + u'http://www.aachener-nachrichten.de/cmlink/bundesliga-rss-1.453367'), + (u'Sport - Alemannia Aachen', + u'http://www.aachener-nachrichten.de/cmlink/alemanniaaachen-rss-1.366057'), + (u'Sport - Volleyball', + u'http://www.aachener-nachrichten.de/cmlink/volleyball-rss-1.453370'), + (u'Sport - Chio', + u'http://www.aachener-nachrichten.de/cmlink/chio-rss-1.453371'), + (u'Dossier - Kinderuni', + u'http://www.aachener-nachrichten.de/cmlink/kinderuni-rss-1.453375'), + (u'Dossier - Karlspreis', + u'http://www.aachener-nachrichten.de/cmlink/karlspreis-rss-1.453376'), + (u'Dossier - Ritterorden', + u'http://www.aachener-nachrichten.de/cmlink/ritterorden-rss-1.453377'), + (u'Dossier - ZAB-Aachen', + u'http://www.aachener-nachrichten.de/cmlink/zabaachen-rss-1.453380'), + (u'Dossier - Karneval', + u'http://www.aachener-nachrichten.de/cmlink/karneval-rss-1.453384'), + (u'Ratgeber - Geld', + u'http://www.aachener-nachrichten.de/cmlink/geld-rss-1.453385'), + (u'Ratgeber - Recht', + u'http://www.aachener-nachrichten.de/cmlink/recht-rss-1.453386'), + (u'Ratgeber - Gesundheit', + u'http://www.aachener-nachrichten.de/cmlink/gesundheit-rss-1.453387'), + (u'Ratgeber - Familie', + u'http://www.aachener-nachrichten.de/cmlink/familie-rss-1.453388'), + (u'Ratgeber - Livestyle', + u'http://www.aachener-nachrichten.de/cmlink/lifestyle-rss-1.453389'), + (u'Ratgeber - Reisen', + u'http://www.aachener-nachrichten.de/cmlink/reisen-rss-1.453390'), + (u'Ratgeber - Bauen und Wohnen', + u'http://www.aachener-nachrichten.de/cmlink/bauen-rss-1.453398'), + (u'Ratgeber - Bildung und Beruf', + u'http://www.aachener-nachrichten.de/cmlink/bildung-rss-1.453400'), + ] diff --git a/recipes/abc.recipe b/recipes/abc.recipe index c4ae0aa308..990307eaeb 100644 --- a/recipes/abc.recipe +++ b/recipes/abc.recipe @@ -1,43 +1,45 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class ABCRecipe(BasicNewsRecipe): - title = u'ABC Linuxu' - oldest_article = 5 - max_articles_per_feed = 3#5 - __author__ = 'Funthomas' - language = 'cs' + title = u'ABC Linuxu' + oldest_article = 5 + max_articles_per_feed = 3 # 5 + __author__ = 'Funthomas' + language = 'cs' - feeds = [ - #(u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'), - (u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'), - (u'Zprávičky','http://www.abclinuxu.cz/auto/zpravicky.rss') - ] + feeds = [ + # (u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'), + (u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'), + (u'Zprávičky', 'http://www.abclinuxu.cz/auto/zpravicky.rss') + ] - remove_javascript = True - no_stylesheets = True - remove_attributes = ['width','height'] + remove_javascript = True + no_stylesheets = True + remove_attributes = ['width', 'height'] - remove_tags_before = dict(name='h1') - remove_tags = [ - dict(attrs={'class':['meta-vypis','page_tools','cl_perex']}), - dict(attrs={'class':['cl_nadpis-link','komix-nav']}) - ] + remove_tags_before = dict(name='h1') + remove_tags = [ + dict(attrs={'class': ['meta-vypis', 'page_tools', 'cl_perex']}), + dict(attrs={'class': ['cl_nadpis-link', 'komix-nav']}) + ] - remove_tags_after = [ - dict(name='div',attrs={'class':['cl_perex','komix-nav']}), - dict(attrs={'class':['meta-vypis','page_tools']}), - dict(name='',attrs={'':''}), - ] + remove_tags_after = [ + dict(name='div', attrs={'class': ['cl_perex', 'komix-nav']}), + dict(attrs={'class': ['meta-vypis', 'page_tools']}), + dict(name='', attrs={'': ''}), + ] + preprocess_regexps = [ + (re.compile(r'.*

', re.DOTALL), + lambda match: '

') + ] - preprocess_regexps = [ - (re.compile(r'.*

', re.DOTALL),lambda match: '

') - ] - def print_version(self, url): - return url + '?varianta=print&noDiz' + def print_version(self, url): + return url + '?varianta=print&noDiz' - extra_css = ''' + extra_css = ''' h1 {font-size:130%; font-weight:bold} h3 {font-size:111%; font-weight:bold} ''' diff --git a/recipes/abc_au.recipe b/recipes/abc_au.recipe index f08beb4dae..ac21e6d730 100644 --- a/recipes/abc_au.recipe +++ b/recipes/abc_au.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Pat Stapleton ' ''' abc.net.au/news @@ -6,51 +6,50 @@ abc.net.au/news import re from calibre.web.feeds.recipes import BasicNewsRecipe + class ABCNews(BasicNewsRecipe): - title = 'ABC News' - __author__ = 'Pat Stapleton, Dean Cording' - description = 'News from Australia' - masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' - cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' + title = 'ABC News' + __author__ = 'Pat Stapleton, Dean Cording' + description = 'News from Australia' + masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' + cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = False - #delay = 1 - use_embedded_content = False - encoding = 'utf8' - publisher = 'ABC News' - category = 'News, Australia, World' - language = 'en_AU' - publication_type = 'newsportal' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = False + use_embedded_content = False + encoding = 'utf8' + publisher = 'ABC News' + category = 'News, Australia, World' + language = 'en_AU' + publication_type = 'newsportal' # preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')] -#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google) - preprocess_regexps = [(re.compile(r'(.*) - Acrimed \| Action Critique M.*dias'), lambda m: '' + m.group(1) + ''), +class Acrimed(BasicNewsRecipe): + title = u'Acrimed' + __author__ = 'Gaëtan Lehmann' + oldest_article = 30 + max_articles_per_feed = 100 + auto_cleanup = True + auto_cleanup_keep = '//div[@class="crayon article-chapo-4112 chapo"]' + language = 'fr' + masthead_url = 'http://www.acrimed.org/IMG/siteon0.gif' + feeds = [(u'Acrimed', u'http://www.acrimed.org/spip.php?page=backend')] + + preprocess_regexps = [ + (re.compile(r'(.*) - Acrimed \| Action Critique M.*dias'), + lambda m: '' + m.group(1) + ''), (re.compile(r'

(.*) - Acrimed \| Action Critique M.*dias

'), lambda m: '

' + m.group(1) + '

')] extra_css = """ diff --git a/recipes/ad.recipe b/recipes/ad.recipe index bc3fe40dad..2b7f8eb4ad 100644 --- a/recipes/ad.recipe +++ b/recipes/ad.recipe @@ -1,8 +1,9 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class ADRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' + __license__ = 'GPL v3' __author__ = 'kwetal' language = 'nl' country = 'NL' @@ -22,41 +23,54 @@ class ADRecipe(BasicNewsRecipe): remove_javascript = True keep_only_tags = [] - keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'})) - keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'})) + keep_only_tags.append(dict(name='div', attrs={'id': 'art_box2'})) + keep_only_tags.append(dict(name='p', attrs={'class': 'gen_footnote3'})) remove_tags = [] - remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'})) - remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')})) + remove_tags.append(dict(name='div', attrs={'class': 'gen_clear'})) + remove_tags.append( + dict(name='div', attrs={'class': re.compile(r'gen_spacer.*')})) remove_attributes = ['style'] - # feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml + # feeds from + # http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml feeds = [] - feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml')) - feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml')) + feeds.append( + (u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml')) + feeds.append( + (u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml')) feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml')) - feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml')) + feeds.append((u'Gezondheid & Wetenschap', + u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml')) feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml')) - feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml')) - feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml')) - feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml')) - feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml')) + feeds.append((u'Nederlands Voetbal', + u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml')) + feeds.append((u'Buitenlands Voetbal', + u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml')) + feeds.append((u'Champions League/Europa League', + u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml')) + feeds.append( + (u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml')) feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml')) - feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml')) - feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml')) + feeds.append( + (u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml')) + feeds.append( + (u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml')) feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml')) feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml')) feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml')) feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml')) - feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml')) + feeds.append((u'Kunst & Literatuur', + u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml')) feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml')) feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml')) feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml')) feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml')) feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml')) feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml')) - feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml')) + feeds.append( + (u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml')) feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml')) extra_css = ''' @@ -71,7 +85,8 @@ class ADRecipe(BasicNewsRecipe): def print_version(self, url): parts = url.split('/') print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \ - + parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13] + + parts[10] + '/' + parts[7] + '/print/' + \ + parts[8] + '/' + parts[9] + '/' + parts[13] return print_url diff --git a/recipes/adevarul.recipe b/recipes/adevarul.recipe index 92c3c21cfb..617170d2c9 100644 --- a/recipes/adevarul.recipe +++ b/recipes/adevarul.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' adevarul.ro @@ -9,51 +9,38 @@ adevarul.ro from calibre.web.feeds.news import BasicNewsRecipe + class Adevarul(BasicNewsRecipe): - title = u'Adev\u0103rul' - language = 'ro' - __author__ = u'Silviu Cotoar\u0103' - description = u'\u0218tiri din Rom\u00e2nia' - publisher = 'Adevarul' - category = 'Ziare,Stiri,Romania' - oldest_article = 5 + title = u'Adev\u0103rul' + language = 'ro' + __author__ = u'Silviu Cotoar\u0103' + description = u'\u0218tiri din Rom\u00e2nia' + publisher = 'Adevarul' + category = 'Ziare,Stiri,Romania' + oldest_article = 5 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - remove_javascript = True - cover_url = 'http://upload.wikimedia.org/wikipedia/en/d/d6/Logo_noul_adevarul.png' + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + remove_javascript = True + cover_url = 'http://upload.wikimedia.org/wikipedia/en/d/d6/Logo_noul_adevarul.png' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } - - keep_only_tags = [ dict(name='div', attrs={'class':'article_header'}) - ,dict(name='div', attrs={'class':'bb-tu first-t bb-article-body'}) - ] + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } + keep_only_tags = [dict(name='div', attrs={'class': 'article_header'}), dict(name='div', attrs={'class': 'bb-tu first-t bb-article-body'}) + ] remove_tags = [ - dict(name='li', attrs={'class':'author'}) - ,dict(name='li', attrs={'class':'date'}) - ,dict(name='li', attrs={'class':'comments'}) - ,dict(name='div', attrs={'class':'bb-wg-article_related_attachements'}) - ,dict(name='div', attrs={'class':'bb-md bb-md-article_comments'}) - ,dict(name='form', attrs={'id':'bb-comment-create-form'}) - ,dict(name='div', attrs={'id':'mediatag'}) - ,dict(name='div', attrs={'id':'ft'}) - ,dict(name='div', attrs={'id':'comment_wrapper'}) - ] + dict(name='li', attrs={'class': 'author'}), dict(name='li', attrs={'class': 'date'}), dict(name='li', attrs={'class': 'comments'}), dict(name='div', attrs={'class': 'bb-wg-article_related_attachements'}), dict(name='div', attrs={'class': 'bb-md bb-md-article_comments'}), dict(name='form', attrs={'id': 'bb-comment-create-form'}), dict(name='div', attrs={'id': 'mediatag'}), dict(name='div', attrs={'id': 'ft'}), dict(name='div', attrs={'id': 'comment_wrapper'}) # noqa + ] remove_tags_after = [ - dict(name='div', attrs={'id':'comment_wrapper'}), - ] + dict(name='div', attrs={'id': 'comment_wrapper'}), + ] - feeds = [ (u'\u0218tiri', u'http://www.adevarul.ro/rss/latest') ] + feeds = [(u'\u0218tiri', u'http://www.adevarul.ro/rss/latest')] def preprocess_html(self, soup): return self.adeify_images(soup) - diff --git a/recipes/adnkronos.recipe b/recipes/adnkronos.recipe index 353cbabbbf..67ad8a26b5 100644 --- a/recipes/adnkronos.recipe +++ b/recipes/adnkronos.recipe @@ -1,8 +1,8 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'Gabriele Marini, based on Darko Miletic' +__license__ = 'GPL v3' +__author__ = 'Gabriele Marini, based on Darko Miletic' __copyright__ = '2009-2010, Darko Miletic ' -description = 'Italian daily newspaper - 02-05-2010' +description = 'Italian daily newspaper - 02-05-2010' ''' http://www.adnkronos.com/ @@ -10,50 +10,49 @@ http://www.adnkronos.com/ from calibre.web.feeds.news import BasicNewsRecipe -class Adnkronos(BasicNewsRecipe): - __author__ = 'Gabriele Marini' - description = 'News agency' - cover_url = 'http://www.adnkronos.com/IGN6/img/popup_ign.jpg' - title = u'Adnkronos' - publisher = 'Adnkronos Group - ews agency' - category = 'News, politics, culture, economy, general interest' - language = 'it' - timefmt = '[%a, %d %b, %Y]' +class Adnkronos(BasicNewsRecipe): + __author__ = 'Gabriele Marini' + description = 'News agency' + cover_url = 'http://www.adnkronos.com/IGN6/img/popup_ign.jpg' + title = u'Adnkronos' + publisher = 'Adnkronos Group - ews agency' + category = 'News, politics, culture, economy, general interest' + + language = 'it' + timefmt = '[%a, %d %b, %Y]' oldest_article = 7 max_articles_per_feed = 80 - use_embedded_content = False - recursion = 10 + use_embedded_content = False + recursion = 10 remove_javascript = True + def get_article_url(self, article): link = article.get('id', article.get('guid', None)) return link extra_css = ' .newsAbstract{font-style: italic} ' - keep_only_tags = [dict(name='div', attrs={'class':['breadCrumbs','newsTop','newsText']}) - ] - - - remove_tags = [ - dict(name='div', attrs={'class':['leogoo','leogoo2']}) - ] - - - feeds = [ - (u'Prima Pagina', u'http://rss.adnkronos.com/RSS_PrimaPagina.xml'), - (u'Ultima Ora', u'http://rss.adnkronos.com/RSS_Ultimora.xml'), - (u'Politica', u'http://rss.adnkronos.com/RSS_Politica.xml'), - (u'Esteri', u'http://rss.adnkronos.com/RSS_Esteri.xml'), - (u'Cronoca', u'http://rss.adnkronos.com/RSS_Cronaca.xml'), - (u'Economia', u'http://rss.adnkronos.com/RSS_Economia.xml'), - (u'Finanza', u'http://rss.adnkronos.com/RSS_Finanza.xml'), - (u'CyberNews', u'http://rss.adnkronos.com/RSS_CyberNews.xml'), - (u'Spettacolo', u'http://rss.adnkronos.com/RSS_Spettacolo.xml'), - (u'Cultura', u'http://rss.adnkronos.com/RSS_Cultura.xml'), - (u'Sport', u'http://rss.adnkronos.com/RSS_Sport.xml'), - (u'Sostenibilita', u'http://rss.adnkronos.com/RSS_Sostenibilita.xml'), - (u'Salute', u'http://rss.adnkronos.com/RSS_Salute.xml') + keep_only_tags = [dict(name='div', attrs={'class': ['breadCrumbs', 'newsTop', 'newsText']}) ] + remove_tags = [ + dict(name='div', attrs={'class': ['leogoo', 'leogoo2']}) + ] + + feeds = [ + (u'Prima Pagina', u'http://rss.adnkronos.com/RSS_PrimaPagina.xml'), + (u'Ultima Ora', u'http://rss.adnkronos.com/RSS_Ultimora.xml'), + (u'Politica', u'http://rss.adnkronos.com/RSS_Politica.xml'), + (u'Esteri', u'http://rss.adnkronos.com/RSS_Esteri.xml'), + (u'Cronoca', u'http://rss.adnkronos.com/RSS_Cronaca.xml'), + (u'Economia', u'http://rss.adnkronos.com/RSS_Economia.xml'), + (u'Finanza', u'http://rss.adnkronos.com/RSS_Finanza.xml'), + (u'CyberNews', u'http://rss.adnkronos.com/RSS_CyberNews.xml'), + (u'Spettacolo', u'http://rss.adnkronos.com/RSS_Spettacolo.xml'), + (u'Cultura', u'http://rss.adnkronos.com/RSS_Cultura.xml'), + (u'Sport', u'http://rss.adnkronos.com/RSS_Sport.xml'), + (u'Sostenibilita', u'http://rss.adnkronos.com/RSS_Sostenibilita.xml'), + (u'Salute', u'http://rss.adnkronos.com/RSS_Salute.xml') + ] diff --git a/recipes/ads_of_the_world.recipe b/recipes/ads_of_the_world.recipe index 11224f2382..d62766da98 100644 --- a/recipes/ads_of_the_world.recipe +++ b/recipes/ads_of_the_world.recipe @@ -1,26 +1,26 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1336986047(BasicNewsRecipe): - title = u'Ads of the World' + title = u'Ads of the World' oldest_article = 7 max_articles_per_feed = 100 auto_cleanup = False - description = 'The best international advertising campaigns' + description = 'The best international advertising campaigns' language = 'en' __author__ = 'faber1971' no_stylesheets = True keep_only_tags = [ - dict(name='div', attrs={'id':'primary'}) - ] + dict(name='div', attrs={'id': 'primary'}) + ] remove_tags = [ - dict(name='ul', attrs={'class':'links inline'}) - ,dict(name='div', attrs={'class':'form-item'}) - ,dict(name='div', attrs={'id':['options', 'comments']}) - ,dict(name='ul', attrs={'id':'nodePager'}) - ] + dict(name='ul', attrs={'class': 'links inline'}), dict(name='div', attrs={'class': 'form-item'}), dict( + name='div', attrs={'id': ['options', 'comments']}), dict(name='ul', attrs={'id': 'nodePager'}) + ] reverse_article_order = True - masthead_url = 'http://bigcatgroup.co.uk/files/2011/01/05-ads-of-the-world.png' - feeds = [(u'Ads of the world', u'http://feeds.feedburner.com/adsoftheworld-latest')] + masthead_url = 'http://bigcatgroup.co.uk/files/2011/01/05-ads-of-the-world.png' + feeds = [ + (u'Ads of the world', u'http://feeds.feedburner.com/adsoftheworld-latest')] diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe index 0845fc7046..3f0c71fdcb 100644 --- a/recipes/adventure_zone_pl.recipe +++ b/recipes/adventure_zone_pl.recipe @@ -1,10 +1,12 @@ from calibre.web.feeds.news import BasicNewsRecipe + + class Adventure_zone(BasicNewsRecipe): - title = u'Adventure Zone' - __author__ = 'fenuks' - description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.' - category = 'games' - language = 'pl' + title = u'Adventure Zone' + __author__ = 'fenuks' + description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.' # noqa + category = 'games' + language = 'pl' BASEURL = 'http://www.adventure-zone.info/fusion/' no_stylesheets = True extra_css = '.image {float: left; margin-right: 5px;}' @@ -13,20 +15,20 @@ class Adventure_zone(BasicNewsRecipe): cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png' remove_attributes = ['style'] use_embedded_content = False - keep_only_tags = [dict(attrs={'class':'content'})] - remove_tags = [dict(attrs={'class':'footer'})] - feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')] + keep_only_tags = [dict(attrs={'class': 'content'})] + remove_tags = [dict(attrs={'class': 'footer'})] + feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')] def skip_ad_pages(self, soup): - skip_tag = soup.body.find(attrs={'class':'content'}) + skip_tag = soup.body.find(attrs={'class': 'content'}) skip_tag = skip_tag.findAll(name='a') title = soup.title.string.lower() - if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): + if (('zapowied' in title) or ('recenzj' in title) or ('solucj' in title) or ('poradnik' in title)): for r in skip_tag: if r.strong and r.strong.string: - word=r.strong.string.lower() - if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): - return self.index_to_soup(self.BASEURL+r['href'], raw=True) + word = r.strong.string.lower() + if (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)): + return self.index_to_soup(self.BASEURL + r['href'], raw=True) def preprocess_html(self, soup): for link in soup.findAll('a', href=True): diff --git a/recipes/adventuregamers.recipe b/recipes/adventuregamers.recipe index b82bb7d02d..e1c5ddfd20 100644 --- a/recipes/adventuregamers.recipe +++ b/recipes/adventuregamers.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009-2012, Darko Miletic ' ''' www.adventuregamers.com @@ -6,21 +6,21 @@ www.adventuregamers.com from calibre.web.feeds.news import BasicNewsRecipe + class AdventureGamers(BasicNewsRecipe): - title = u'Adventure Gamers' - language = 'en' - __author__ = 'Darko Miletic' - description = 'Adventure games portal' - publisher = 'Adventure Gamers' - category = 'news, games, adventure, technology' - oldest_article = 10 - #delay = 10 + title = u'Adventure Gamers' + language = 'en' + __author__ = 'Darko Miletic' + description = 'Adventure games portal' + publisher = 'Adventure Gamers' + category = 'news, games, adventure, technology' + oldest_article = 10 max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf8' - remove_javascript = True - use_embedded_content = False - INDEX = u'http://www.adventuregamers.com' + no_stylesheets = True + encoding = 'utf8' + remove_javascript = True + use_embedded_content = False + INDEX = u'http://www.adventuregamers.com' extra_css = """ .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74} .pageheader_title,.page_title{font-size: xx-large; color: #394128} @@ -29,59 +29,54 @@ class AdventureGamers(BasicNewsRecipe): .score_column_1{ padding-left: 10px; font-size: small; width: 50%} .score_column_2{ padding-left: 10px; font-size: small; width: 50%} .score_column_3{ padding-left: 10px; font-size: small; width: 50%} - .score_header{font-size: large; color: #50544A} + .score_header{font-size: large; color: #50544A} img{margin-bottom: 1em;} - body{font-family: 'Open Sans',Helvetica,Arial,sans-serif} + body{font-family: 'Open Sans',Helvetica,Arial,sans-serif} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - keep_only_tags = [dict(name='div', attrs={'class':'cleft_inn'})] + keep_only_tags = [dict(name='div', attrs={'class': 'cleft_inn'})] remove_tags = [ - dict(name=['object','link','embed','form','iframe','meta']) - ,dict(name='a', attrs={'href':'http://www.adventuregamers.com/about/scoring'}) - ,dict(name='a', attrs={'href':'http://www.adventuregamers.com/about/policies'}) - ] - remove_tags_after = [dict(name='div', attrs={'class':'bodytext'})] - remove_attributes = ['width','height'] + dict(name=['object', 'link', 'embed', 'form', 'iframe', 'meta']), dict(name='a', attrs={ + 'href': 'http://www.adventuregamers.com/about/scoring'}), dict(name='a', attrs={'href': 'http://www.adventuregamers.com/about/policies'}) + ] + remove_tags_after = [dict(name='div', attrs={'class': 'bodytext'})] + remove_attributes = ['width', 'height'] feeds = [(u'Articles', u'http://www.adventuregamers.com/rss/')] def get_article_url(self, article): url = BasicNewsRecipe.get_article_url(self, article) if '/videos/' in url or '/hypeometer/' in url: - return None + return None return url def append_page(self, soup, appendtag, position): - pager = soup.find('div', attrs={'class':'pagination_big'}) + pager = soup.find('div', attrs={'class': 'pagination_big'}) if pager: - nextpage = soup.find('a', attrs={'class':'next-page'}) - if nextpage: - nexturl = nextpage['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'class':'bodytext'}) - for it in texttag.findAll(style=True): - del it['style'] - newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) - texttag.extract() - pager.extract() - appendtag.insert(position,texttag) - + nextpage = soup.find('a', attrs={'class': 'next-page'}) + if nextpage: + nexturl = nextpage['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class': 'bodytext'}) + for it in texttag.findAll(style=True): + del it['style'] + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + pager.extract() + appendtag.insert(position, texttag) def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] - for item in soup.findAll('div', attrs={'class':'floatright'}): + for item in soup.findAll('div', attrs={'class': 'floatright'}): item.extract() self.append_page(soup, soup.body, 3) - pager = soup.find('div',attrs={'class':'pagination_big'}) + pager = soup.find('div', attrs={'class': 'pagination_big'}) if pager: - pager.extract() + pager.extract() return self.adeify_images(soup) diff --git a/recipes/aftenposten.recipe b/recipes/aftenposten.recipe index fcc73658e9..fea850fc00 100644 --- a/recipes/aftenposten.recipe +++ b/recipes/aftenposten.recipe @@ -1,20 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe + class Aftenposten(BasicNewsRecipe): - title = u'Aftenposten' - __author__ = 'davotibarna' - description = 'Norske nyheter' - language = 'no' - oldest_article = 5 - max_articles_per_feed = 100 - recipe_disabled = ('The recipe to download Aftenposten has been ' - 'temporarily disabled at the publisher\'s request, while ' - 'they finalize their digital strategy.') - no_stylesheets = True - encoding = 'ISO-8859-1' + title = u'Aftenposten' + __author__ = 'davotibarna' + description = 'Norske nyheter' + language = 'no' + oldest_article = 5 + max_articles_per_feed = 100 + recipe_disabled = ('The recipe to download Aftenposten has been ' + 'temporarily disabled at the publisher\'s request, while ' + 'they finalize their digital strategy.') + no_stylesheets = True + encoding = 'ISO-8859-1' - feeds = [(u'Aftenposten', u'http://www.aftenposten.no/eksport/rss-1_0/')] - - def print_version(self, url): - return url.replace('#xtor=RSS-3', '?service=print') + feeds = [(u'Aftenposten', u'http://www.aftenposten.no/eksport/rss-1_0/')] + def print_version(self, url): + return url.replace('#xtor=RSS-3', '?service=print') diff --git a/recipes/agrogerila.recipe b/recipes/agrogerila.recipe index 8ca13af4dd..70abbe0960 100644 --- a/recipes/agrogerila.recipe +++ b/recipes/agrogerila.recipe @@ -1,5 +1,5 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' ''' boljevac.blogspot.com @@ -8,25 +8,23 @@ boljevac.blogspot.com import re from calibre.web.feeds.news import BasicNewsRecipe + class AgroGerila(BasicNewsRecipe): - title = 'Agro Gerila' - __author__ = 'Darko Miletic' - description = 'Politicki nekorektan blog.' - oldest_article = 45 + title = 'Agro Gerila' + __author__ = 'Darko Miletic' + description = 'Politicki nekorektan blog.' + oldest_article = 45 max_articles_per_feed = 100 - language = 'sr' - encoding = 'utf-8' - no_stylesheets = True - use_embedded_content = True - publication_type = 'blog' - extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } ' + language = 'sr' + encoding = 'utf-8' + no_stylesheets = True + use_embedded_content = True + publication_type = 'blog' + extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } ' # noqa conversion_options = { - 'comment' : description - , 'tags' : 'film, blog, srbija' - , 'publisher': 'Dry-Na-Nord' - , 'language' : language - } + 'comment': description, 'tags': 'film, blog, srbija', 'publisher': 'Dry-Na-Nord', 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -36,5 +34,3 @@ class AgroGerila(BasicNewsRecipe): for item in soup.findAll(style=True): del item['style'] return self.adeify_images(soup) - - diff --git a/recipes/aif_ru.recipe b/recipes/aif_ru.recipe index ac82a2f90c..c742e5f838 100644 --- a/recipes/aif_ru.recipe +++ b/recipes/aif_ru.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010 - 2014, Darko Miletic ' ''' www.aif.ru @@ -6,35 +6,32 @@ www.aif.ru from calibre.web.feeds.news import BasicNewsRecipe + class AIF_ru(BasicNewsRecipe): - title = 'Arguments & Facts - Russian' - __author__ = 'Darko Miletic' - description = 'News from Russia' - publisher = 'AIF' - category = 'news, politics, Russia' - oldest_article = 2 + title = 'Arguments & Facts - Russian' + __author__ = 'Darko Miletic' + description = 'News from Russia' + publisher = 'AIF' + category = 'news, politics, Russia' + oldest_article = 2 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf8' - language = 'ru' - publication_type = 'magazine' - masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png' + no_stylesheets = True + use_embedded_content = False + encoding = 'utf8' + language = 'ru' + publication_type = 'magazine' + masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png' extra_css = """ body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif} img{display: block} """ - keep_only_tags = [ - dict(name='h1', attrs={'class':'title'}) - ,dict(name='div', attrs={'class':'prew_tags'}) - ,dict(name='article', attrs={'class':lambda x: x and 'articl_body' in x.split()}) - ] - remove_tags = [ - dict(name=['iframe','object','link','base','input','meta']) - ,dict(name='div',attrs={'class':'in-topic'}) - ,dict(name='div', attrs={'class':lambda x: x and 'related_article' in x.split()}) - ,dict(name='div', attrs={'class':lambda x: x and 'articl_tag' in x.split()}) - ] - - feeds = [(u'News', u'http://www.aif.ru/rss/all.php')] + keep_only_tags = [ + dict(name='h1', attrs={'class': 'title'}), dict(name='div', attrs={'class': 'prew_tags'}), dict( + name='article', attrs={'class': lambda x: x and 'articl_body' in x.split()}) + ] + remove_tags = [ + dict(name=['iframe', 'object', 'link', 'base', 'input', 'meta']), dict(name='div', attrs={'class': 'in-topic'}), dict(name='div', attrs={ + 'class': lambda x: x and 'related_article' in x.split()}), dict(name='div', attrs={'class': lambda x: x and 'articl_tag' in x.split()}) + ] + feeds = [(u'News', u'http://www.aif.ru/rss/all.php')] diff --git a/recipes/air_force_times.recipe b/recipes/air_force_times.recipe index e4f223bf4b..5ffd4fa81e 100644 --- a/recipes/air_force_times.recipe +++ b/recipes/air_force_times.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AirForceTimes(BasicNewsRecipe): title = 'Air Force Times' __author__ = 'jde' @@ -12,7 +13,7 @@ class AirForceTimes(BasicNewsRecipe): tags = 'news, U.S. Air Force' cover_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg' masthead_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg' - oldest_article = 7 #days + oldest_article = 7 # days max_articles_per_feed = 25 publication_type = 'newspaper' no_stylesheets = True @@ -24,20 +25,14 @@ class AirForceTimes(BasicNewsRecipe): remove_empty_feeds = True auto_cleanup = True - - feeds = [ - ('News', 'http://www.airforcetimes.com/rss_news.php'), - ('Benefits', 'http://www.airforcetimes.com/rss_benefits.php'), - ('Money', 'http://www.airforcetimes.com/rss_money.php'), - ('Careers & Education', 'http://www.airforcetimes.com/rss_careers.php'), - ('Community', 'http://www.airforcetimes.com/rss_community.php'), - ('Off Duty', 'http://www.airforcetimes.com/rss_off_duty.php'), - ('Entertainment', 'http://www.airforcetimes.com/rss_entertainment.php'), - ('Guard & Reserve', 'http://www.airforcetimes.com/rss_guard.php'), - ] - - - - + ('News', 'http://www.airforcetimes.com/rss_news.php'), + ('Benefits', 'http://www.airforcetimes.com/rss_benefits.php'), + ('Money', 'http://www.airforcetimes.com/rss_money.php'), + ('Careers & Education', 'http://www.airforcetimes.com/rss_careers.php'), + ('Community', 'http://www.airforcetimes.com/rss_community.php'), + ('Off Duty', 'http://www.airforcetimes.com/rss_off_duty.php'), + ('Entertainment', 'http://www.airforcetimes.com/rss_entertainment.php'), + ('Guard & Reserve', 'http://www.airforcetimes.com/rss_guard.php'), + ] diff --git a/recipes/ajc.recipe b/recipes/ajc.recipe index 3729731548..f27a2fc8e1 100644 --- a/recipes/ajc.recipe +++ b/recipes/ajc.recipe @@ -1,15 +1,17 @@ #!/usr/bin/env python2 -__license__ = 'Creative Commons Attribution 4.0 International License' -__author__ = 'John McDole' +__license__ = 'Creative Commons Attribution 4.0 International License' +__author__ = 'John McDole' __copyright__ = '' -__version__ = '0.1' -__date__ = '2015/01/10' +__version__ = '0.1' +__date__ = '2015/01/10' __docformat__ = 'restructuredtext en' -import datetime, re +import datetime +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag + class AdvancedUserRecipe1282101454(BasicNewsRecipe): now = datetime.datetime.now() title = 'The AJC' @@ -24,72 +26,81 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True - # The AJC lists identical articles in multiple feeds; this removes them based on their URL + # The AJC lists identical articles in multiple feeds; this removes them + # based on their URL ignore_duplicate_articles = {'title', 'url'} # And this says "Hey, AJC, different feeds should mean something!" remove_empty_feeds = True - # Sets whether a feed has full articles embedded in it. The AJC feeds do not. + # Sets whether a feed has full articles embedded in it. The AJC feeds do + # not. use_embedded_content = False masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif' # Pick your poison. Business seems to be mostly cross-linked articles. Premium and cross-linked # articels will be dropped. - feeds = [ - ('Breaking News', 'http://www.ajc.com/list/rss/online/ajc-auto-list-iphone-topnews/aFKq/'), - ('Metro and Georgia', 'http://www.ajc.com/list/rss/news/local/news-georgia-and-region/aCxP/'), - ('Business', 'http://www.ajc.com/feeds/categories/business/'), - ('Health', 'http://www.ajc.com/feeds/categories/health/'), - # ('Braves', 'http://www.ajc.com/list/rss/sports/baseball/atlanta-braves-news/aGpN/'), - # ('Falcons', 'http://www.ajc.com/list/rss/sports/football/falcons-news/aGK4/'), - # ('Georgia Tech Yellow Jackets', 'http://www.ajc.com/list/rss/sports/college/georgia-tech-headlines/aGK6/'), - ] + feeds = [ + ('Breaking News', 'http://www.ajc.com/list/rss/online/ajc-auto-list-iphone-topnews/aFKq/'), + ('Metro and Georgia', + 'http://www.ajc.com/list/rss/news/local/news-georgia-and-region/aCxP/'), + ('Business', 'http://www.ajc.com/feeds/categories/business/'), + ('Health', 'http://www.ajc.com/feeds/categories/health/'), + # ('Braves', 'http://www.ajc.com/list/rss/sports/baseball/atlanta-braves-news/aGpN/'), + # ('Falcons', 'http://www.ajc.com/list/rss/sports/football/falcons-news/aGK4/'), + # ('Georgia Tech Yellow Jackets', 'http://www.ajc.com/list/rss/sports/college/georgia-tech-headlines/aGK6/'), + ] headline_reg_exp = '^.*cm-story-headline.*$' story_body_reg_exp = '^.*cm-story-body.*$' author_reg_exp = '^.*cm-story-author.*$' keep_only_tags = [ - dict(name='div', attrs={'class':re.compile(headline_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class':'cm-story-meta'}), - dict(name='div', attrs={'class':re.compile(author_reg_exp, re.IGNORECASE)}), - dict(name='meta', attrs={'name':'description'}), - dict(name='div', attrs={'class':re.compile(story_body_reg_exp, re.IGNORECASE)}), - ] + dict(name='div', attrs={'class': re.compile( + headline_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class': 'cm-story-meta'}), + dict(name='div', attrs={'class': re.compile( + author_reg_exp, re.IGNORECASE)}), + dict(name='meta', attrs={'name': 'description'}), + dict(name='div', attrs={'class': re.compile( + story_body_reg_exp, re.IGNORECASE)}), + ] premium_reg_exp = '^.*cmPremiumContent.*$' footer_reg_exp = '^.*cm-story-footer.*$' remove_tags = [ - dict(name='div', attrs={'class':re.compile(footer_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class':'cm-inline-related-group'}) - ] + dict(name='div', attrs={'class': re.compile( + footer_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class': 'cm-inline-related-group'}) + ] extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ .cm-story-headline h1 { text-align: center; font-size: 175%; font-weight: bold; } \ .cm-story-meta { font-size: 80%; } \ - .cm-related-caption, .cmPhotoImageAttribution, img { display: block; font-size: 75%; font-style: italic; text-align: center; margin: 5px auto;} \ + .cm-related-caption, .cmPhotoImageAttribution, img { display: block; font-size: 75%; font-style: italic; text-align: center; margin: 5px auto;} \ .cm-story-author { display: block; font-size: 80%; font-style: italic; }' # I would love to remove these completely from the finished product, but I can't see how at the momemnt. - # Retuning "None" from preprocess_html(soup) as suggested in mobileread forums leads to errors. + # Retuning "None" from preprocess_html(soup) as suggested in mobileread + # forums leads to errors. def preprocess_html(self, soup): - premium = soup.find('div', attrs={'class':re.compile(self.premium_reg_exp, re.IGNORECASE)}) + premium = soup.find('div', attrs={'class': re.compile( + self.premium_reg_exp, re.IGNORECASE)}) if premium: return None - crosslink = soup.find('a', attrs={'class':'cm-feed-story-more-link'}) + crosslink = soup.find('a', attrs={'class': 'cm-feed-story-more-link'}) if crosslink: return None return soup def populate_article_metadata(self, article, soup, first): - for meta in soup.findAll('meta', attrs={'name':'description'}): + for meta in soup.findAll('meta', attrs={'name': 'description'}): article.text_summary = meta['content'] article.summary = meta['content'] - lead = soup.find('div', attrs={'class':'cm-story-photo'}) + lead = soup.find('div', attrs={'class': 'cm-story-photo'}) if lead: lead = lead.find('img') else: @@ -98,10 +109,10 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): self.add_toc_thumbnail(article, lead['src']) names = '' comma = '' - for div in soup.findAll('div', attrs={'class':re.compile(self.author_reg_exp, re.IGNORECASE)}): + for div in soup.findAll('div', attrs={'class': re.compile(self.author_reg_exp, re.IGNORECASE)}): div.extract() for auth in div.findAll('a'): - if (auth.has_key('class') and auth['class'] == 'cm-source-image'): + if (auth.has_key('class') and auth['class'] == 'cm-source-image'): # noqa continue names = names + comma + auth.contents[0] comma = ', ' @@ -110,7 +121,6 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): tag = Tag(soup, 'div', [('class', 'cm-story-author')]) tag.append("by: ") tag.append(names) - meta = soup.find('div', attrs={'class':'cm-story-meta'}) + meta = soup.find('div', attrs={'class': 'cm-story-meta'}) meta_idx = meta.parent.contents.index(meta) meta.parent.insert(meta_idx + 1, tag) - diff --git a/recipes/ajiajin.recipe b/recipes/ajiajin.recipe index 344d3d21fb..4bf89af412 100644 --- a/recipes/ajiajin.recipe +++ b/recipes/ajiajin.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Hiroshi Miura ' ''' ajiajin.com/blog @@ -6,18 +6,17 @@ ajiajin.com/blog from calibre.web.feeds.news import BasicNewsRecipe + class AjiajinBlog(BasicNewsRecipe): - title = u'Ajiajin blog' - __author__ = 'Hiroshi Miura' + title = u'Ajiajin blog' + __author__ = 'Hiroshi Miura' oldest_article = 5 publication_type = 'blog' max_articles_per_feed = 100 - description = 'The next generation internet trends in Japan and Asia' - publisher = '' - category = 'internet, asia, japan' - language = 'en' - encoding = 'utf-8' - - feeds = [(u'blog', u'http://feeds.feedburner.com/Asiajin')] - + description = 'The next generation internet trends in Japan and Asia' + publisher = '' + category = 'internet, asia, japan' + language = 'en' + encoding = 'utf-8' + feeds = [(u'blog', u'http://feeds.feedburner.com/Asiajin')] diff --git a/recipes/aksiyon_derigisi.recipe b/recipes/aksiyon_derigisi.recipe index d7be418413..e0b8526bd1 100644 --- a/recipes/aksiyon_derigisi.recipe +++ b/recipes/aksiyon_derigisi.recipe @@ -2,46 +2,51 @@ from calibre.web.feeds.news import BasicNewsRecipe + class Aksiyon (BasicNewsRecipe): - title = u'Aksiyon Dergisi' - __author__ = u'thomass' - description = 'Haftalık haber dergisi ' - oldest_article =13 - max_articles_per_feed =100 - no_stylesheets = True - #delay = 1 - #use_embedded_content = False - encoding = 'utf-8' - publisher = 'Aksiyon' - category = 'news, haberler,TR,gazete' - language = 'tr' + title = u'Aksiyon Dergisi' + __author__ = u'thomass' + description = 'Haftalık haber dergisi ' + oldest_article = 13 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'utf-8' + publisher = 'Aksiyon' + category = 'news, haberler,TR,gazete' + language = 'tr' publication_type = 'magazine' auto_cleanup = True cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg' masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg' - ignore_duplicate_articles = { 'title', 'url' } - remove_empty_feeds= True - feeds = [ - ( u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'), - ( u'ANASAYFA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=0'), - ( u'EKONOMİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=35'), - ( u'EKOANALİZ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=284'), - ( u'YAZARLAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=17'), - ( u'KİTAPLIK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=13'), - ( u'SİNEMA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=14'), - ( u'ARKA PENCERE', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=27'), - ( u'DÜNYA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=32'), - ( u'DOSYALAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=34'), - ( u'KARAKUTU', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=11'), - ( u'KÜLTÜR & SANAT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=12'), - ( u'SPOR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=38'), - ( u'BİLİŞİM - TEKNOLOJİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=39'), - ( u'3. BOYUT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=172'), - ( u'HAYAT BİLGİSİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'), - ( u'İŞ DÜNYASI', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'), - ] - - #def print_version(self, url): - #return url.replace('http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&', 'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?') + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True + feeds = [ + (u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'), + (u'ANASAYFA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=0'), + (u'EKONOMİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=35'), + (u'EKOANALİZ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=284'), + (u'YAZARLAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=17'), + (u'KİTAPLIK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=13'), + (u'SİNEMA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=14'), + (u'ARKA PENCERE', + u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=27'), + (u'DÜNYA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=32'), + (u'DOSYALAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=34'), + (u'KARAKUTU', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=11'), + (u'KÜLTÜR & SANAT', + u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=12'), + (u'SPOR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=38'), + (u'BİLİŞİM - TEKNOLOJİ', + u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=39'), + (u'3. BOYUT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=172'), + (u'HAYAT BİLGİSİ', + u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'), + (u'İŞ DÜNYASI', + u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'), + ] + # def print_version(self, url): + # return + # url.replace('http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&', + # 'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?') diff --git a/recipes/akter.recipe b/recipes/akter.recipe index 83625c240b..20d9860225 100644 --- a/recipes/akter.recipe +++ b/recipes/akter.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010-2012, Darko Miletic ' ''' akter.co.rs @@ -7,37 +7,35 @@ akter.co.rs import re from calibre.web.feeds.news import BasicNewsRecipe + class Akter(BasicNewsRecipe): - title = 'AKTER - Nedeljnik' - __author__ = 'Darko Miletic' - description = 'AKTER - nedeljni politicki magazin savremene Srbije' - publisher = 'Akter Media Group d.o.o.' - category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics' - oldest_article = 8 + title = 'AKTER - Nedeljnik' + __author__ = 'Darko Miletic' + description = 'AKTER - nedeljni politicki magazin savremene Srbije' + publisher = 'Akter Media Group d.o.o.' + category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics' # noqa + oldest_article = 8 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - masthead_url = 'http://www.akter.co.rs/gfx/logoneover.png' - language = 'sr' - publication_type = 'magazine' - remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://www.akter.co.rs/gfx/logoneover.png' + language = 'sr' + publication_type = 'magazine' + remove_empty_feeds = True extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Tahoma,Geneva,sans1,sans-serif} - img{margin-bottom: 0.8em; display: block;} + img{margin-bottom: 0.8em; display: block;} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})] - feeds = [(u'Nedeljnik', u'http://akter.co.rs/rss/nedeljnik')] + keep_only_tags = [dict(name='div', attrs={'id': 'section_to_print'})] + feeds = [(u'Nedeljnik', u'http://akter.co.rs/rss/nedeljnik')] def print_version(self, url): dpart, spart, apart = url.rpartition('/') @@ -45,10 +43,9 @@ class Akter(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.akter.co.rs/weekly.html') - divt = soup.find('div', attrs={'class':'lastissue'}) + divt = soup.find('div', attrs={'class': 'lastissue'}) if divt: - imgt = divt.find('img') - if imgt: - return 'http://www.akter.co.rs' + imgt['src'] + imgt = divt.find('img') + if imgt: + return 'http://www.akter.co.rs' + imgt['src'] return None - diff --git a/recipes/akter_dnevnik.recipe b/recipes/akter_dnevnik.recipe index 7322baf4ec..7de7d107fd 100644 --- a/recipes/akter_dnevnik.recipe +++ b/recipes/akter_dnevnik.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2012, Darko Miletic ' ''' akter.co.rs @@ -7,37 +7,34 @@ akter.co.rs import re from calibre.web.feeds.news import BasicNewsRecipe + class Akter(BasicNewsRecipe): - title = 'AKTER - Dnevnik' - __author__ = 'Darko Miletic' - description = 'AKTER - Najnovije vesti iz Srbije' - publisher = 'Akter Media Group d.o.o.' - category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics' - oldest_article = 8 + title = 'AKTER - Dnevnik' + __author__ = 'Darko Miletic' + description = 'AKTER - Najnovije vesti iz Srbije' + publisher = 'Akter Media Group d.o.o.' + oldest_article = 8 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - masthead_url = 'http://www.akter.co.rs/gfx/logodnover.png' - language = 'sr' - publication_type = 'magazine' - remove_empty_feeds = True + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://www.akter.co.rs/gfx/logodnover.png' + language = 'sr' + publication_type = 'magazine' + remove_empty_feeds = True extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Tahoma,Geneva,sans1,sans-serif} - img{margin-bottom: 0.8em; display: block;} + img{margin-bottom: 0.8em; display: block;} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + 'comment': description, 'publisher': publisher, 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})] - feeds = [(u'Vesti', u'http://akter.co.rs/rss/dnevni')] + keep_only_tags = [dict(name='div', attrs={'id': 'section_to_print'})] + feeds = [(u'Vesti', u'http://akter.co.rs/rss/dnevni')] def print_version(self, url): dpart, spart, apart = url.rpartition('/') diff --git a/recipes/aktualne.cz.recipe b/recipes/aktualne.cz.recipe index 1adebe7cd1..833f044da2 100644 --- a/recipes/aktualne.cz.recipe +++ b/recipes/aktualne.cz.recipe @@ -3,8 +3,9 @@ from __future__ import unicode_literals from calibre.web.feeds.recipes import BasicNewsRecipe import re + class aktualneRecipe(BasicNewsRecipe): - __author__ = 'bubak' + __author__ = 'bubak' title = u'aktualne.cz' publisher = u'Centrum holdings' description = 'aktuálně.cz' @@ -13,13 +14,13 @@ class aktualneRecipe(BasicNewsRecipe): encoding = 'utf-8' feeds = [ - (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'), - (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'), - (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'), - (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'), - (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'), - (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php') - ] + (u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'), + (u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'), + (u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'), + (u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'), + (u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'), + (u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php') + ] language = 'cs' cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png' @@ -27,29 +28,31 @@ class aktualneRecipe(BasicNewsRecipe): no_stylesheets = True remove_attributes = [] - remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']}) + remove_tags_before = dict(name='h1', attrs={'class': ['titulek-clanku']}) filter_regexps = [r'img.aktualne.centrum.cz'] - remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}), - dict(name='div', attrs={'class':['box1', 'svazane-tagy']}), - dict(name='div', attrs={'class':'itemcomment id0'}), - dict(name='div', attrs={'class':'hlavicka'}), - dict(name='div', attrs={'class':'hlavni-menu'}), - dict(name='div', attrs={'class':'top-standard-brand-obal'}), - dict(name='div', attrs={'class':'breadcrumb'}), - dict(name='div', attrs={'id':'start-standard'}), - dict(name='div', attrs={'id':'forum'}), - dict(name='span', attrs={'class':'akce'}), - dict(name='span', attrs={'class':'odrazka vetsi'}), - dict(name='div', attrs={'class':'boxP'}), - dict(name='div', attrs={'class':'box2'})] + remove_tags = [dict(name='div', attrs={'id': ['social-bookmark']}), + dict(name='div', attrs={'class': ['box1', 'svazane-tagy']}), + dict(name='div', attrs={'class': 'itemcomment id0'}), + dict(name='div', attrs={'class': 'hlavicka'}), + dict(name='div', attrs={'class': 'hlavni-menu'}), + dict(name='div', attrs={ + 'class': 'top-standard-brand-obal'}), + dict(name='div', attrs={'class': 'breadcrumb'}), + dict(name='div', attrs={'id': 'start-standard'}), + dict(name='div', attrs={'id': 'forum'}), + dict(name='span', attrs={'class': 'akce'}), + dict(name='span', attrs={'class': 'odrazka vetsi'}), + dict(name='div', attrs={'class': 'boxP'}), + dict(name='div', attrs={'class': 'box2'})] preprocess_regexps = [ - (re.compile(r'
'), - (re.compile(r'
')] + (re.compile(r'
'), + (re.compile(r'
')] keep_only_tags = [] visited_urls = {} + def get_article_url(self, article): url = BasicNewsRecipe.get_article_url(self, article) if url in self.visited_urls: diff --git a/recipes/al_ahram.recipe b/recipes/al_ahram.recipe index c7fda681e7..d22f3331ee 100644 --- a/recipes/al_ahram.recipe +++ b/recipes/al_ahram.recipe @@ -1,66 +1,76 @@ # coding=utf-8 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011-2016, Hassan Williamson ' ''' ahram.org.eg ''' from calibre.web.feeds.recipes import BasicNewsRecipe + class AlAhram(BasicNewsRecipe): - title = u'Al-Ahram (الأهرام)' - __author__ = 'Hassan Williamson' - description = 'The Arabic version of the Al-Ahram newspaper.' - language = 'ar' - encoding = 'utf8' - cover_url = 'http://www.ahram.org.eg/Media/News/2015/3/14/2015-635619650946000713-600.jpg' - oldest_article = 7 - max_articles_per_feed = 100 - no_stylesheets = True - #delay = 1 - use_embedded_content = False - publisher = 'Al-Ahram' - category = 'News' - publication_type = 'newsportal' + title = u'Al-Ahram (الأهرام)' + __author__ = 'Hassan Williamson' + description = 'The Arabic version of the Al-Ahram newspaper.' + language = 'ar' + encoding = 'utf8' + cover_url = 'http://www.ahram.org.eg/Media/News/2015/3/14/2015-635619650946000713-600.jpg' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + publisher = 'Al-Ahram' + category = 'News' + publication_type = 'newsportal' - extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .bbtitle{ font-weight: bold; font-size: 2em; } .bbsubtitle{ font-size: 1.3em; } #WriterImage{ height: 10px; } ' + extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .bbtitle{ font-weight: bold; font-size: 2em; } .bbsubtitle{ font-size: 1.3em; } #WriterImage{ height: 10px; } ' # noqa - keep_only_tags = [ - dict(name='div', attrs={'class':['bbcolright']}) - ] + keep_only_tags = [ + dict(name='div', attrs={'class': ['bbcolright']}) + ] - remove_tags = [ - dict(name='div', attrs={'class':['bbnav', 'bbsp']}), - dict(name='div', attrs={'id':['AddThisButton']}), - dict(name='a', attrs={'class':['twitter-share-button']}), - dict(name='div', attrs={'id':['ReaderCount']}), - ] + remove_tags = [ + dict(name='div', attrs={'class': ['bbnav', 'bbsp']}), + dict(name='div', attrs={'id': ['AddThisButton']}), + dict(name='a', attrs={'class': ['twitter-share-button']}), + dict(name='div', attrs={'id': ['ReaderCount']}), + ] remove_attributes = [ - 'width','height','style' - ] + 'width', 'height', 'style' + ] - feeds = [ - (u'الأولى', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=25'), - (u'الصفحة الثانية', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=74'), - (u'مصر', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=27'), - (u'المشهد السياسي', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=60'), - (u'المحافظات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=29'), - (u'الوطن العربي', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=31'), - (u'العالم', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=26'), - (u'تقارير المراسلين', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=2'), - (u'تحقيقات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=3'), - (u'قضايا واراء', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=4'), - (u'اقتصاد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=5'), - (u'رياضة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=6'), - (u'حوادث', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=38'), - (u'دنيا الثقافة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=7'), - (u'المراة والطفل', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=8'), - (u'يوم جديد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=9'), - (u'الكتاب', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=10'), - (u'الاعمدة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=11'), - (u'أراء حرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=59'), - (u'ملفات الاهرام', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=12'), - (u'بريد الاهرام', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=15'), - (u'برلمان الثورة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=61'), - (u'الاخيرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=16'), - ] + feeds = [ + (u'الأولى', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=25'), + (u'الصفحة الثانية', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=74'), + (u'مصر', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=27'), + (u'المشهد السياسي', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=60'), + (u'المحافظات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=29'), + (u'الوطن العربي', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=31'), + (u'العالم', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=26'), + (u'تقارير المراسلين', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=2'), + (u'تحقيقات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=3'), + (u'قضايا واراء', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=4'), + (u'اقتصاد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=5'), + (u'رياضة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=6'), + (u'حوادث', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=38'), + (u'دنيا الثقافة', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=7'), + (u'المراة والطفل', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=8'), + (u'يوم جديد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=9'), + (u'الكتاب', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=10'), + (u'الاعمدة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=11'), + (u'أراء حرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=59'), + (u'ملفات الاهرام', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=12'), + (u'بريد الاهرام', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=15'), + (u'برلمان الثورة', + 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=61'), + (u'الاخيرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=16'), + ] diff --git a/recipes/al_jazeera.recipe b/recipes/al_jazeera.recipe index 8fad320c05..d2f33239cf 100644 --- a/recipes/al_jazeera.recipe +++ b/recipes/al_jazeera.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009-2010, Darko Miletic ' ''' @@ -6,57 +6,62 @@ english.aljazeera.net ''' from calibre.web.feeds.news import BasicNewsRecipe + def has_cls(x): - return dict(attrs={'class':lambda cls: cls and x in cls.split()}) + return dict(attrs={'class': lambda cls: cls and x in cls.split()}) + class AlJazeera(BasicNewsRecipe): - title = 'Al Jazeera in English' - __author__ = 'Darko Miletic' - description = 'News from Middle East' - language = 'en' - publisher = 'Al Jazeera' - category = 'news, politics, middle east' - delay = 1 - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False + title = 'Al Jazeera in English' + __author__ = 'Darko Miletic' + description = 'News from Middle East' + language = 'en' + publisher = 'Al Jazeera' + category = 'news, politics, middle east' + delay = 1 + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False extra_css = """ body{font-family: Arial,sans-serif} #ctl00_cphBody_dvSummary{font-weight: bold} #dvArticleDate{font-size: small; color: #999999} """ conversion_options = { - 'comment' : description , 'tags' : category , - 'publisher' : publisher , 'language' : language + 'comment': description, 'tags': category, + 'publisher': publisher, 'language': language } keep_only_tags = [ dict(id='main-story'), ] remove_tags = [ - has_cls('MoreOnTheStory'), has_cls('ArticleBottomToolbar'), dict(smtitle="ShowMore"), - dict(name=['object','link','table','meta','base','iframe','embed']), + has_cls('MoreOnTheStory'), has_cls( + 'ArticleBottomToolbar'), dict(smtitle="ShowMore"), + dict(name=['object', 'link', 'table', + 'meta', 'base', 'iframe', 'embed']), ] - feeds = [(u'Al Jazeera English', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989')] + feeds = [(u'Al Jazeera English', + u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989')] def get_article_url(self, article): - artlurl = article.get('link', None) - return artlurl.replace('http://english.aljazeera.net//','http://english.aljazeera.net/') + artlurl = article.get('link', None) + return artlurl.replace('http://english.aljazeera.net//', 'http://english.aljazeera.net/') def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for item in soup.findAll(face=True): del item['face'] - td = soup.find('td',attrs={'class':'DetailedSummary'}) + td = soup.find('td', attrs={'class': 'DetailedSummary'}) if td: td.name = 'div' - spn = soup.find('span',attrs={'id':'DetailedTitle'}) + spn = soup.find('span', attrs={'id': 'DetailedTitle'}) if spn: - spn.name='h1' - for itm in soup.findAll('span', attrs={'id':['dvArticleDate','ctl00_cphBody_lblDate']}): + spn.name = 'h1' + for itm in soup.findAll('span', attrs={'id': ['dvArticleDate', 'ctl00_cphBody_lblDate']}): itm.name = 'div' for alink in soup.findAll('a'): if alink.string is not None: diff --git a/recipes/al_masry_alyoum_arabic.recipe b/recipes/al_masry_alyoum_arabic.recipe index f32e3c0d9d..c3696a27e3 100644 --- a/recipes/al_masry_alyoum_arabic.recipe +++ b/recipes/al_masry_alyoum_arabic.recipe @@ -1,79 +1,85 @@ # coding=utf-8 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2016, Hassan Williamson ' ''' almasryalyoum.com ''' from calibre.web.feeds.recipes import BasicNewsRecipe + class AlMasryAlyoum(BasicNewsRecipe): - title = u'Al-Masry Alyoum (المصري اليوم)' - __author__ = 'Hassan Williamson' - description = 'The Arabic version of the Al-Masry Alyoum (Egypt Independent) newspaper.' - language = 'ar' - encoding = 'utf8' - cover_url = 'http://www.almasryalyoum.com/content/images/header_logo.png' - oldest_article = 7 - max_articles_per_feed = 100 - no_stylesheets = True - #delay = 1 - use_embedded_content = False - publisher = 'Al-Masry Alyoum' - category = 'News' - publication_type = 'newsportal' + title = u'Al-Masry Alyoum (المصري اليوم)' + __author__ = 'Hassan Williamson' + description = 'The Arabic version of the Al-Masry Alyoum (Egypt Independent) newspaper.' + language = 'ar' + encoding = 'utf8' + cover_url = 'http://www.almasryalyoum.com/content/images/header_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + publisher = 'Al-Masry Alyoum' + category = 'News' + publication_type = 'newsportal' - extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .tit_2{ font-weight: bold; font-size: 2em; } .pinfo{ font-size: 1.3em; } .articleimg img{ max-width: 100%; } .imgauther{ display: block; font-size: 0.7em; } .caption{ font-size: 0.7em; } ' + extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .tit_2{ font-weight: bold; font-size: 2em; } .pinfo{ font-size: 1.3em; } .articleimg img{ max-width: 100%; } .imgauther{ display: block; font-size: 0.7em; } .caption{ font-size: 0.7em; } ' # noqa - keep_only_tags = [ - dict(name='div', attrs={'class':['article']}) - ] + keep_only_tags = [ + dict(name='div', attrs={'class': ['article']}) + ] - remove_tags = [ - dict(name='div', attrs={'class':['share_buttons_container']}), - dict(name='div', attrs={'class':['min_related']}), - dict(name='div', attrs={'id':['feedback']}), - dict(name='div', attrs={'class':['news_SMSBox']}), - dict(name='div', attrs={'class':['tags']}), - dict(name='div', attrs={'class':['ads', 'y_logo_news']}), - dict(name='div', attrs={'class':['ads']}), - dict(name='div', attrs={'class':['option']}), - dict(name='div', attrs={'class':['seealso']}), - dict(name='div', attrs={'id':['comments']}), - ] + remove_tags = [ + dict(name='div', attrs={'class': ['share_buttons_container']}), + dict(name='div', attrs={'class': ['min_related']}), + dict(name='div', attrs={'id': ['feedback']}), + dict(name='div', attrs={'class': ['news_SMSBox']}), + dict(name='div', attrs={'class': ['tags']}), + dict(name='div', attrs={'class': ['ads', 'y_logo_news']}), + dict(name='div', attrs={'class': ['ads']}), + dict(name='div', attrs={'class': ['option']}), + dict(name='div', attrs={'class': ['seealso']}), + dict(name='div', attrs={'id': ['comments']}), + ] remove_attributes = [ - 'width','height','style' - ] + 'width', 'height', 'style' + ] - feeds = [ - (u'أخر الأخبار', 'http://www.almasryalyoum.com/rss/RssFeeds'), - (u'الصفحة الرئيسية', 'http://www.almasryalyoum.com/rss/RssFeeds?homePage=true'), - (u'أقلام وآراء', 'http://www.almasryalyoum.com/rss/RssFeeds?typeId=2&homePage=false'), - (u'أخبار مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=3'), - (u'رياضة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=8'), - (u'اقتصاد', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=4'), - (u'حوادث', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=7'), - (u'فنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=10'), - (u'منوعاتنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=12'), - (u'ثقافة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=6'), - (u'علوم وتكنولوجيا', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=9'), - (u'تحقيقات وحوارات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=5'), - (u'المرأة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=69'), - (u'رأي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=2'), - (u'وسط الناس', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=13'), - (u'مركز المصري للدراسات و المعلومات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=56'), - (u'مطبخ', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=81'), - (u'برلمان مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=78'), - (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=54'), - (u'تحليلات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=60'), - (u'عروض نقدية', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=61'), - (u'دراسات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=62'), - (u'كتاب المصري اليوم', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=65'), - (u'فعاليات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=66'), - (u'إسلامي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=75'), - (u'مطبخي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=76'), - (u'مسلسلاتيطبخي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=77'), - (u'رمضان زمان', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=82'), - (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=85'), - (u'سيارات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=86'), - ] + feeds = [ + (u'أخر الأخبار', 'http://www.almasryalyoum.com/rss/RssFeeds'), + (u'الصفحة الرئيسية', + 'http://www.almasryalyoum.com/rss/RssFeeds?homePage=true'), + (u'أقلام وآراء', 'http://www.almasryalyoum.com/rss/RssFeeds?typeId=2&homePage=false'), + (u'أخبار مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=3'), + (u'رياضة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=8'), + (u'اقتصاد', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=4'), + (u'حوادث', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=7'), + (u'فنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=10'), + (u'منوعاتنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=12'), + (u'ثقافة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=6'), + (u'علوم وتكنولوجيا', + 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=9'), + (u'تحقيقات وحوارات', + 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=5'), + (u'المرأة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=69'), + (u'رأي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=2'), + (u'وسط الناس', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=13'), + (u'مركز المصري للدراسات و المعلومات', + 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=56'), + (u'مطبخ', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=81'), + (u'برلمان مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=78'), + (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=54'), + (u'تحليلات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=60'), + (u'عروض نقدية', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=61'), + (u'دراسات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=62'), + (u'كتاب المصري اليوم', + 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=65'), + (u'فعاليات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=66'), + (u'إسلامي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=75'), + (u'مطبخي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=76'), + (u'مسلسلاتيطبخي', + 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=77'), + (u'رمضان زمان', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=82'), + (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=85'), + (u'سيارات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=86'), + ] diff --git a/recipes/al_monitor.recipe b/recipes/al_monitor.recipe index c450cec9e1..93ae4b4d02 100644 --- a/recipes/al_monitor.recipe +++ b/recipes/al_monitor.recipe @@ -1,14 +1,18 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2014, spswerling' ''' http://www.al-monitor.com/ ''' -import string, inspect, datetime, re +import string +import inspect +import datetime +import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup + class AlMonitor(BasicNewsRecipe): title = u'Al Monitor' __author__ = u'spswerling' @@ -26,39 +30,39 @@ class AlMonitor(BasicNewsRecipe): recursions = 0 compress_news_images = True compress_news_images_max_size = 7 - scale_news_images = (150,200) # (kindle touch: 600x800) + scale_news_images = (150, 200) # (kindle touch: 600x800) useHighResImages = False oldest_article = 1.5 max_articles_per_section = 15 sections = [ - (u'egypt',u'http://www.al-monitor.com/pulse/egypt-pulse'), - (u'gulf',u'http://www.al-monitor.com/pulse/gulf-pulse'), - (u'iran',u'http://www.al-monitor.com/pulse/iran-pulse'), - (u'iraq',u'http://www.al-monitor.com/pulse/iraq-pulse'), - (u'israel',u'http://www.al-monitor.com/pulse/israel-pulse'), - (u'lebanon',u'http://www.al-monitor.com/pulse/lebanon-pulse'), - (u'palistine',u'http://www.al-monitor.com/pulse/palistine-pulse'), - (u'syria',u'http://www.al-monitor.com/pulse/syria-pulse'), - (u'turkey',u'http://www.al-monitor.com/pulse/turkey-pulse'), - ] + (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'), + (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'), + (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'), + (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'), + (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'), + (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'), + (u'palistine', u'http://www.al-monitor.com/pulse/palistine-pulse'), + (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'), + (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'), + ] # util for creating remove_tags and keep_tags style regex matchers def tag_matcher(elt, attr, rgx_str): - return dict(name=elt, attrs={attr:re.compile(rgx_str, re.IGNORECASE)}) + return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)}) remove_tags = [ - dict(attrs={'id':[ - 'header', - 'pulsebanner', - 'relatedarticles', - 'sidecolumn', - 'disqus', - 'footer', - 'footer2', - 'footer3', - 'mobile-extras', - ]}), + dict(attrs={'id': [ + 'header', + 'pulsebanner', + 'relatedarticles', + 'sidecolumn', + 'disqus', + 'footer', + 'footer2', + 'footer3', + 'mobile-extras', + ]}), tag_matcher('hr', 'id', 'spacer'), tag_matcher('a', 'title', 'print this article'), tag_matcher('div', 'class', 'extras'), @@ -118,12 +122,12 @@ class AlMonitor(BasicNewsRecipe): if len(self.articles[section]) >= self.max_articles_per_section: return self.articles[section].append( - dict(title=title, - url=full_url, - date='', - description='', - author='', - content='')) + dict(title=title, + url=full_url, + date='', + description='', + author='', + content='')) def preprocess_raw_html(self, raw_html, url): reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html)) @@ -136,7 +140,7 @@ class AlMonitor(BasicNewsRecipe): return super(self.__class__, self).preprocess_raw_html(raw_html, url) def populate_article_metadata(self, article, soup, first): - summary_node = soup.find('div', {'id':'summary'}) + summary_node = soup.find('div', {'id': 'summary'}) if summary_node: summary = self.text(summary_node) self._p('Summary: ' + summary) @@ -167,7 +171,7 @@ class AlMonitor(BasicNewsRecipe): def date_from_string(self, datestring): try: # eg: Posted September 17, 2014 - dt = datetime.datetime.strptime(datestring,"Posted %B %d, %Y") + dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y") except: dt = None @@ -192,14 +196,14 @@ class AlMonitor(BasicNewsRecipe): return abs_url - def text(self,n): + def text(self, n): return self.tag_to_string(n).strip() def _dbg_soup_node(self, node): s = ' cls: ' + str(node.get('class')).strip() + \ - ' id: ' + str(node.get('id')).strip() + \ - ' role: ' + str(node.get('role')).strip() + \ - ' txt: ' + self.text(node) + ' id: ' + str(node.get('id')).strip() + \ + ' role: ' + str(node.get('role')).strip() + \ + ' txt: ' + self.text(node) return s def _p(self, msg): diff --git a/recipes/albert_mohler.recipe b/recipes/albert_mohler.recipe index fca16ccae9..a85063290a 100644 --- a/recipes/albert_mohler.recipe +++ b/recipes/albert_mohler.recipe @@ -1,10 +1,11 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2012, Peter Grungi

' from calibre.web.feeds.news import BasicNewsRecipe + class AlbertMohlersBlog(BasicNewsRecipe): - title = u'Albert Mohler\'s Blog' + title = u'Albert Mohler\'s Blog' __author__ = 'Peter Grungi' language = 'en' oldest_article = 90 @@ -15,4 +16,5 @@ class AlbertMohlersBlog(BasicNewsRecipe): language = 'en' author = 'Albert Mohler' - feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')] + feeds = [(u'Albert Mohler\'s Blog', + u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')] diff --git a/recipes/alejakomiksu_com.recipe b/recipes/alejakomiksu_com.recipe index f34fd1183d..34cdede724 100644 --- a/recipes/alejakomiksu_com.recipe +++ b/recipes/alejakomiksu_com.recipe @@ -2,16 +2,16 @@ __license__ = 'GPL v3' import re from calibre.web.feeds.news import BasicNewsRecipe + class AlejaKomiksu(BasicNewsRecipe): - title = u'Aleja Komiksu' - __author__ = 'fenuks' - description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.' - category = 'comics' - #publication_type = '' - language = 'pl' - #encoding = '' + title = u'Aleja Komiksu' + __author__ = 'fenuks' + description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.' + category = 'comics' + language = 'pl' extra_css = 'ul {list-style-type: none;} .gfx_news {float: right;}' - preprocess_regexps = [(re.compile(ur'((

  • (Do poczytania)|(Nowości):
  • )|(

    Komentarze

    )).*', re.DOTALL|re.IGNORECASE), lambda match: '')] + preprocess_regexps = [(re.compile(ur'((
  • (Do poczytania)|(Nowości):
  • )|(

    Komentarze

    )).*', + re.DOTALL | re.IGNORECASE), lambda match: '')] cover_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' masthead_url = 'http://www.alejakomiksu.com/gfx/build/logo.png' use_embedded_content = False @@ -23,15 +23,13 @@ class AlejaKomiksu(BasicNewsRecipe): remove_attributes = ['style', 'font'] ignore_duplicate_articles = {'title', 'url'} - keep_only_tags = [dict(attrs={'class':'cont_tresc'})] - #remove_tags = [dict()] - #remove_tags_before = dict() + keep_only_tags = [dict(attrs={'class': 'cont_tresc'})] feeds = [(u'Wiadomości', 'http://www.alejakomiksu.com/rss.php5')] def skip_ad_pages(self, soup): - tag = soup.find(attrs={'class':'rodzaj'}) + tag = soup.find(attrs={'class': 'rodzaj'}) if tag and tag.a.string.lower().strip() == 'recenzje': link = soup.find(text=re.compile('recenzuje')) if link: - return self.index_to_soup(link.parent['href'], raw=True) \ No newline at end of file + return self.index_to_soup(link.parent['href'], raw=True) diff --git a/recipes/alo_novine.recipe b/recipes/alo_novine.recipe index 45e53e99e8..45709c2001 100644 --- a/recipes/alo_novine.recipe +++ b/recipes/alo_novine.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' ''' @@ -8,19 +8,20 @@ www.alo.rs import re from calibre.web.feeds.recipes import BasicNewsRecipe + class Alo_Novine(BasicNewsRecipe): - title = 'Alo!' - __author__ = 'Darko Miletic' - description = "News Portal from Serbia" - publisher = 'Alo novine d.o.o.' - category = 'news, politics, Serbia' - oldest_article = 2 + title = 'Alo!' + __author__ = 'Darko Miletic' + description = "News Portal from Serbia" + publisher = 'Alo novine d.o.o.' + category = 'news, politics, Serbia' + oldest_article = 2 max_articles_per_feed = 100 - delay = 4 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - language = 'sr' + delay = 4 + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + language = 'sr' extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif} @@ -30,25 +31,23 @@ class Alo_Novine(BasicNewsRecipe): img{margin-bottom: 0.8em} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - remove_tags = [dict(name=['object','link','embed'])] - remove_attributes = ['height','width'] + remove_tags = [dict(name=['object', 'link', 'embed'])] + remove_attributes = ['height', 'width'] feeds = [ - (u'Najnovije Vijesti', u'http://www.alo.rs/rss/danasnje_vesti') - ,(u'Politika' , u'http://www.alo.rs/rss/politika') - ,(u'Vesti' , u'http://www.alo.rs/rss/vesti') - ,(u'Sport' , u'http://www.alo.rs/rss/sport') - ,(u'Ljudi' , u'http://www.alo.rs/rss/ljudi') - ,(u'Saveti' , u'http://www.alo.rs/rss/saveti') - ] + + (u'Najnovije Vijesti', u'http://www.alo.rs/rss/danasnje_vesti'), + (u'Politika', u'http://www.alo.rs/rss/politika'), + (u'Vesti', u'http://www.alo.rs/rss/vesti'), + (u'Sport', u'http://www.alo.rs/rss/sport'), + (u'Ljudi', u'http://www.alo.rs/rss/ljudi'), + (u'Saveti', u'http://www.alo.rs/rss/saveti') + ] def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -61,5 +60,4 @@ class Alo_Novine(BasicNewsRecipe): return 'http://www.alo.rs/resources/templates/tools/print.php?id=' + artid def image_url_processor(self, baseurl, url): - return url.replace('alo.rs//','alo.rs/') - + return url.replace('alo.rs//', 'alo.rs/') diff --git a/recipes/alt_om_herning.recipe b/recipes/alt_om_herning.recipe index c60d142a85..f7e757165b 100644 --- a/recipes/alt_om_herning.recipe +++ b/recipes/alt_om_herning.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Rasmus Lauritsen ' ''' aoh.dk @@ -6,38 +6,35 @@ aoh.dk from calibre.web.feeds.news import BasicNewsRecipe + class aoh_dk(BasicNewsRecipe): - title = 'Alt om Herning' - __author__ = 'Rasmus Lauritsen' - description = 'Nyheder fra Herning om omegn' - publisher = 'Mediehuset Herning Folkeblad' - category = 'news, local, Denmark' - oldest_article = 14 + title = 'Alt om Herning' + __author__ = 'Rasmus Lauritsen' + description = 'Nyheder fra Herning om omegn' + publisher = 'Mediehuset Herning Folkeblad' + category = 'news, local, Denmark' + oldest_article = 14 max_articles_per_feed = 50 - no_stylesheets = True - delay = 1 - encoding = 'utf8' - use_embedded_content = False - language = 'da' + no_stylesheets = True + delay = 1 + encoding = 'utf8' + use_embedded_content = False + language = 'da' extra_css = """ body{font-family: Verdana,Arial,sans-serif } img{margin-bottom: 0.4em} .txtContent,.stamp{font-size: small} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } feeds = [(u'All news', u'http://aoh.dk/rss.xml')] - keep_only_tags = [ - dict(name='h1') - ,dict(name='span', attrs={'class':['frontpage_body']}) - ] + keep_only_tags = [ + dict(name='h1'), dict(name='span', attrs={'class': ['frontpage_body']}) + ] - remove_tags = [ - dict(name=['object','link']) - ] + remove_tags = [ + dict(name=['object', 'link']) + ] diff --git a/recipes/alternet.recipe b/recipes/alternet.recipe index 0bd608e0e7..fc4f2d9a5f 100644 --- a/recipes/alternet.recipe +++ b/recipes/alternet.recipe @@ -1,34 +1,35 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe + class Alternet(BasicNewsRecipe): - title = u'Alternet' - __author__= 'rty' + title = u'Alternet' + __author__ = 'rty' oldest_article = 7 max_articles_per_feed = 100 publisher = 'alternet.org' category = 'News, Magazine' description = 'News magazine and online community' - feeds = [ + feeds = [ (u'Front Page', u'http://feeds.feedblitz.com/alternet') - ] + ] - remove_attributes = ['width', 'align','cellspacing'] + remove_attributes = ['width', 'align', 'cellspacing'] remove_javascript = True - use_embedded_content = True + use_embedded_content = True no_stylesheets = True language = 'en' - encoding = 'UTF-8' + encoding = 'UTF-8' temp_files = [] articles_are_obfuscated = True def get_article_url(self, article): - return article.get('link', None) + return article.get('link', None) def get_obfuscated_article(self, url): br = self.get_browser() br.open(url) - response = br.follow_link(url_regex = r'/printversion/[0-9]+', nr = 0) + response = br.follow_link(url_regex=r'/printversion/[0-9]+', nr=0) html = response.read() self.temp_files.append(PersistentTemporaryFile('_fa.html')) self.temp_files[-1].write(html) diff --git a/recipes/am730.recipe b/recipes/am730.recipe index d491c49ef1..0d8e56c9cf 100644 --- a/recipes/am730.recipe +++ b/recipes/am730.recipe @@ -1,6 +1,6 @@ # vim:fileencoding=UTF-8 from __future__ import unicode_literals -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2013, Eddie Lau' __Date__ = '' @@ -12,7 +12,9 @@ Change Log: from calibre import (__appname__, force_unicode, strftime) from calibre.utils.date import now as nowf -import os, datetime, re +import os +import datetime +import re from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -21,10 +23,11 @@ from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.utils.localization import canonicalize_lang + class AppleDaily(BasicNewsRecipe): - title = u'AM730' - __author__ = 'Eddie Lau' - publisher = 'AM730' + title = u'AM730' + __author__ = 'Eddie Lau' + publisher = 'AM730' oldest_article = 1 max_articles_per_feed = 100 auto_cleanup = False @@ -35,46 +38,46 @@ class AppleDaily(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True description = 'http://www.am730.com.hk' - category = 'Chinese, News, Hong Kong' + category = 'Chinese, News, Hong Kong' masthead_url = 'http://www.am730.com.hk/images/logo.jpg' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}' - keep_only_tags = [dict(name='h2', attrs={'class':'printTopic'}), - dict(name='div', attrs={'id':'article_content'}), - dict(name='div', attrs={'id':'slider'})] - remove_tags = [dict(name='img', attrs={'src':'images/am730_article_logo.jpg'}), - dict(name='img', attrs={'src':'images/am_endmark.gif'})] + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}' # noqa + keep_only_tags = [dict(name='h2', attrs={'class': 'printTopic'}), + dict(name='div', attrs={'id': 'article_content'}), + dict(name='div', attrs={'id': 'slider'})] + remove_tags = [dict(name='img', attrs={'src': 'images/am730_article_logo.jpg'}), + dict(name='img', attrs={'src': 'images/am_endmark.gif'})] def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at HKT 6am, all news are available - return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24) + return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24) def get_fetchdate(self): - if __Date__ <> '': + if __Date__ != '': return __Date__ else: return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): - if __Date__ <> '': - return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] + if __Date__ != '': + return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchyear(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[0:4] else: return self.get_dtlocal().strftime("%Y") def get_fetchmonth(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[4:6] else: return self.get_dtlocal().strftime("%m") def get_fetchday(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[6:8] else: return self.get_dtlocal().strftime("%d") @@ -85,7 +88,9 @@ class AppleDaily(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://www.am730.com.hk') - cover = 'http://www.am730.com.hk/' + soup.find(attrs={'id':'mini_news_img'}).find('img').get('src', False) + cover = 'http://www.am730.com.hk/' + \ + soup.find(attrs={'id': 'mini_news_img'}).find( + 'img').get('src', False) br = BasicNewsRecipe.get_browser(self) try: br.open(cover) @@ -97,7 +102,7 @@ class AppleDaily(BasicNewsRecipe): if first and hasattr(self, 'add_toc_thumbnail'): picdiv = soup.find('img') if picdiv is not None: - self.add_toc_thumbnail(article,picdiv['src']) + self.add_toc_thumbnail(article, picdiv['src']) def parse_index(self): feeds = [] @@ -123,7 +128,8 @@ class AppleDaily(BasicNewsRecipe): mi.publisher = __appname__ mi.author_sort = __appname__ if self.publication_type: - mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + mi.publication_type = 'periodical:' + \ + self.publication_type + ':' + self.short_title() mi.timestamp = nowf() article_titles, aseen = [], set() for f in feeds: @@ -136,15 +142,15 @@ class AppleDaily(BasicNewsRecipe): if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - '\n\n'.join(article_titles)) + '\n\n'.join(article_titles)) language = canonicalize_lang(self.language) if language is not None: mi.language = language # This one affects the pub date shown in kindle title - #mi.pubdate = nowf() # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) + mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( + self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') @@ -153,12 +159,14 @@ class AppleDaily(BasicNewsRecipe): mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref = Guide.Reference(os.path.basename( + self.masthead_path), os.getcwdu()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) - manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest = [os.path.join(dir, 'feed_%d' % i) + for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) @@ -167,7 +175,7 @@ class AppleDaily(BasicNewsRecipe): if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): - cpath = pf.name + cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) @@ -189,12 +197,11 @@ class AppleDaily(BasicNewsRecipe): self.play_order_counter = 0 self.play_order_map = {} - def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/'%(num, j) + adir = 'feed_%d/article_%d/' % (num, j) auth = a.author if not auth: auth = None @@ -204,16 +211,18 @@ class AppleDaily(BasicNewsRecipe): else: desc = self.description_limiter(desc) tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html'%adir) + entries.append('%sindex.html' % adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter - parent.add_item('%sindex.html'%adir, None, - a.title if a.title else _('Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + parent.add_item('%sindex.html' % adir, None, + a.title if a.title else _( + 'Untitled Article'), + play_order=po, author=auth, + description=desc, toc_thumbnail=tt) + last = os.path.join( + self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] @@ -226,12 +235,14 @@ class AppleDaily(BasicNewsRecipe): soup = BeautifulSoup(src) body = soup.find('body') if body is not None: - prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + prefix = '/'.join('..'for i in range(2 * + len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render( + doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) @@ -240,7 +251,7 @@ class AppleDaily(BasicNewsRecipe): if len(feeds) > 1: for i, f in enumerate(feeds): - entries.append('feed_%d/index.html'%i) + entries.append('feed_%d/index.html' % i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 @@ -251,11 +262,11 @@ class AppleDaily(BasicNewsRecipe): desc = getattr(f, 'description', None) if not desc: desc = None - feed_index(i, toc.add_item('feed_%d/index.html'%i, None, - f.title, play_order=po, description=desc, author=auth)) + feed_index(i, toc.add_item('feed_%d/index.html' % i, None, + f.title, play_order=po, description=desc, author=auth)) else: - entries.append('feed_%d/index.html'%0) + entries.append('feed_%d/index.html' % 0) feed_index(0, toc) for i, p in enumerate(entries): @@ -265,5 +276,3 @@ class AppleDaily(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) - - diff --git a/recipes/ambito.recipe b/recipes/ambito.recipe index 6cbfb2d1dc..c1d3e424d6 100644 --- a/recipes/ambito.recipe +++ b/recipes/ambito.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2008-2015, Darko Miletic ' ''' ambito.com @@ -6,46 +6,46 @@ ambito.com from calibre.web.feeds.news import BasicNewsRecipe + class Ambito(BasicNewsRecipe): - title = 'Ambito.com' - __author__ = 'Darko Miletic' - description = 'Ambito.com con noticias del Diario Ambito Financiero de Buenos Aires' - publisher = 'Editorial Nefir S.A.' - category = 'news, politics, economy, finances, Argentina' - oldest_article = 2 - no_stylesheets = True - encoding = 'cp1252' - masthead_url = 'http://www.ambito.com/img/logo.jpg' - use_embedded_content = False - remove_empty_feeds = True - language = 'es_AR' - publication_type = 'newsportal' + title = 'Ambito.com' + __author__ = 'Darko Miletic' + description = 'Ambito.com con noticias del Diario Ambito Financiero de Buenos Aires' + publisher = 'Editorial Nefir S.A.' + category = 'news, politics, economy, finances, Argentina' + oldest_article = 2 + no_stylesheets = True + encoding = 'cp1252' + masthead_url = 'http://www.ambito.com/img/logo.jpg' + use_embedded_content = False + remove_empty_feeds = True + language = 'es_AR' + publication_type = 'newsportal' extra_css = """ body{font-family: "Trebuchet MS",Verdana,sans-serif} .volanta{font-size: small} .t2_portada{font-size: xx-large; font-family: Georgia,serif; color: #026698} """ - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - keep_only_tags = [dict(attrs={'id':['tituloDespliegue','imgDesp','textoDespliegue']})] - remove_tags = [dict(name=['object','link','embed','iframe','meta','link'])] + keep_only_tags = [ + dict(attrs={'id': ['tituloDespliegue', 'imgDesp', 'textoDespliegue']})] + remove_tags = [ + dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link'])] feeds = [ - (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' ) - ,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' ) - ,(u'Politica' , u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica' ) - ,(u'Informacion General' , u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General') - ,(u'Campo' , u'http://www.ambito.com/rss/noticias.asp?S=Agro' ) - ,(u'Internacionales' , u'http://www.ambito.com/rss/noticias.asp?S=Internacionales' ) - ,(u'Deportes' , u'http://www.ambito.com/rss/noticias.asp?S=Deportes' ) - ,(u'Espectaculos' , u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos' ) - ,(u'Tecnologia' , u'http://www.ambito.com/rss/noticias.asp?S=Tecnolog%EDa' ) - ,(u'Ambito Nacional' , u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional' ) - ] + + (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp'), + (u'Economia', u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa'), + (u'Politica', u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica'), + (u'Informacion General', u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General'), + (u'Campo', u'http://www.ambito.com/rss/noticias.asp?S=Agro'), + (u'Internacionales', u'http://www.ambito.com/rss/noticias.asp?S=Internacionales'), + (u'Deportes', u'http://www.ambito.com/rss/noticias.asp?S=Deportes'), + (u'Espectaculos', u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos'), + (u'Tecnologia', u'http://www.ambito.com/rss/noticias.asp?S=Tecnolog%EDa'), + (u'Ambito Nacional', u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional') + ] diff --git a/recipes/ambito_financiero.recipe b/recipes/ambito_financiero.recipe index c45fa8fbce..8cb8fe6281 100644 --- a/recipes/ambito_financiero.recipe +++ b/recipes/ambito_financiero.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Darko Miletic ' ''' ambito.com/diario @@ -8,22 +8,23 @@ import time from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe + class Ambito_Financiero(BasicNewsRecipe): - title = 'Ambito Financiero' - __author__ = 'Darko Miletic' - description = 'Informacion Libre las 24 horas' - publisher = 'Editorial Nefir S.A.' - category = 'news, politics, economy, Argentina' - no_stylesheets = True - encoding = 'cp1252' - masthead_url = 'http://www.ambito.com/diario/img/logo_af.gif' - publication_type = 'newspaper' - needs_subscription = 'optional' - use_embedded_content = False - language = 'es_AR' - PREFIX = 'http://www.ambito.com' - INDEX = PREFIX + '/diario/index.asp' - LOGIN = PREFIX + '/diario/login/entrada.asp' + title = 'Ambito Financiero' + __author__ = 'Darko Miletic' + description = 'Informacion Libre las 24 horas' + publisher = 'Editorial Nefir S.A.' + category = 'news, politics, economy, Argentina' + no_stylesheets = True + encoding = 'cp1252' + masthead_url = 'http://www.ambito.com/diario/img/logo_af.gif' + publication_type = 'newspaper' + needs_subscription = 'optional' + use_embedded_content = False + language = 'es_AR' + PREFIX = 'http://www.ambito.com' + INDEX = PREFIX + '/diario/index.asp' + LOGIN = PREFIX + '/diario/login/entrada.asp' extra_css = """ body{font-family: "Trebuchet MS",Verdana,sans-serif} .volanta{font-size: small} @@ -31,14 +32,12 @@ class Ambito_Financiero(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - keep_only_tags = [dict(name='div', attrs={'align':'justify'})] - remove_tags = [dict(name=['object','link','embed','iframe','meta','link','table','img'])] + keep_only_tags = [dict(name='div', attrs={'align': 'justify'})] + remove_tags = [dict(name=['object', 'link', 'embed', + 'iframe', 'meta', 'link', 'table', 'img'])] remove_attributes = ['align'] def get_browser(self): @@ -53,7 +52,7 @@ class Ambito_Financiero(BasicNewsRecipe): return br def print_version(self, url): - return url.replace('/diario/noticia.asp?','/noticias/imprimir.asp?') + return url.replace('/diario/noticia.asp?', '/noticias/imprimir.asp?') def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -61,27 +60,24 @@ class Ambito_Financiero(BasicNewsRecipe): for item in soup.findAll('a'): str = item.string if str is None: - str = self.tag_to_string(item) + str = self.tag_to_string(item) item.replaceWith(str) return soup def parse_index(self): soup = self.index_to_soup(self.INDEX) - cover_item = soup.find('img',attrs={'class':'fotodespliegue'}) + cover_item = soup.find('img', attrs={'class': 'fotodespliegue'}) if cover_item: - self.cover_url = self.PREFIX + cover_item['src'] + self.cover_url = self.PREFIX + cover_item['src'] articles = [] - checker = [] - for feed_link in soup.findAll('a', attrs={'class':['t0_portada','t2_portada','bajada']}): - url = self.PREFIX + feed_link['href'] + checker = [] + for feed_link in soup.findAll('a', attrs={'class': ['t0_portada', 't2_portada', 'bajada']}): + url = self.PREFIX + feed_link['href'] title = self.tag_to_string(feed_link) - date = strftime("%a, %d %b %Y %H:%M:%S +0000",time.gmtime()) + date = strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) if url not in checker: checker.append(url) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':u'' - }) + 'title': title, 'date': date, 'url': url, 'description': u'' + }) return [(self.title, articles)] diff --git a/recipes/american_thinker.recipe b/recipes/american_thinker.recipe index 5390a19eb8..ef022451c5 100644 --- a/recipes/american_thinker.recipe +++ b/recipes/american_thinker.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Walt Anthony ' ''' www.americanthinker.com @@ -8,37 +8,34 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre.utils.cleantext import clean_xml_chars from lxml import etree + class AmericanThinker(BasicNewsRecipe): - title = u'American Thinker' - description = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans." - __author__ = 'Walt Anthony' - publisher = 'Thomas Lifson' - category = 'news, politics, USA' - oldest_article = 7 # days + title = u'American Thinker' + description = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans." + __author__ = 'Walt Anthony' + publisher = 'Thomas Lifson' + category = 'news, politics, USA' + oldest_article = 7 # days max_articles_per_feed = 50 - summary_length = 150 - language = 'en' + summary_length = 150 + language = 'en' ignore_duplicate_articles = {'title', 'url'} - remove_javascript = True + remove_javascript = True auto_cleanup = True conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - , 'linearize_tables' : True - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True + } def preprocess_raw_html(self, raw, url): root = html5lib.parse( clean_xml_chars(raw), treebuilder='lxml', namespaceHTMLElements=False) - for x in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' article_body ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' bottom '))]'''): + for x in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' article_body ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' bottom '))]'''): # noqa x.getparent().remove(x) return etree.tostring(root, encoding=unicode) feeds = [(u'http://feeds.feedburner.com/americanthinker'), - (u'http://feeds.feedburner.com/AmericanThinkerBlog') - ] + (u'http://feeds.feedburner.com/AmericanThinkerBlog') + ] diff --git a/recipes/amspec.recipe b/recipes/amspec.recipe index e2d35b21aa..f7880d49c5 100644 --- a/recipes/amspec.recipe +++ b/recipes/amspec.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009-2010, Darko Miletic ' ''' spectator.org @@ -7,20 +7,22 @@ spectator.org from calibre.web.feeds.news import BasicNewsRecipe from css_selectors import Select + class TheAmericanSpectator(BasicNewsRecipe): - title = 'The American Spectator' - __author__ = 'Kovid Goyal' - description = 'News from USA' - oldest_article = 7 + title = 'The American Spectator' + __author__ = 'Kovid Goyal' + description = 'News from USA' + oldest_article = 7 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'en' + no_stylesheets = True + use_embedded_content = False + language = 'en' auto_cleanup = True encoding = 'utf-8' def parse_index(self): - root = self.index_to_soup('http://spectator.org/issues/current', as_tree=True) + root = self.index_to_soup( + 'http://spectator.org/issues/current', as_tree=True) select = Select(root) main = tuple(select('div#block-system-main'))[0] feeds = [] @@ -43,7 +45,8 @@ class TheAmericanSpectator(BasicNewsRecipe): for x in select('div.views-field-field-short-summary', li): desc = self.tag_to_string(x) break - articles.append({'title':title, 'url':url, 'description':desc}) + articles.append( + {'title': title, 'url': url, 'description': desc}) self.log('\t', title, 'at', url) feeds.append((section_title, articles)) return feeds diff --git a/recipes/an_druma_mor.recipe b/recipes/an_druma_mor.recipe index d214cf8025..d1ff817b6e 100644 --- a/recipes/an_druma_mor.recipe +++ b/recipes/an_druma_mor.recipe @@ -1,12 +1,13 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AnDrumaMor(BasicNewsRecipe): - title = u'An Druma M\xf3r' + title = u'An Druma M\xf3r' __author__ = "David O'Callaghan" oldest_article = 7 max_articles_per_feed = 100 language = 'ga' use_embedded_content = True - feeds = [(u'Nuacht Laeth\xfail', u'http://feeds.feedburner.com/NuachtLneLaethilArAnDrumaMr')] - + feeds = [(u'Nuacht Laeth\xfail', + u'http://feeds.feedburner.com/NuachtLneLaethilArAnDrumaMr')] diff --git a/recipes/anandtech.recipe b/recipes/anandtech.recipe index 45b28a2e11..aa29ed443c 100644 --- a/recipes/anandtech.recipe +++ b/recipes/anandtech.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' @@ -13,10 +13,10 @@ class anan(BasicNewsRecipe): title = 'Anandtech' description = 'comprehensive Hardware Tests' __author__ = 'Oliver Niesner, Armin Geller' # 2014-02-27 AGE: update - use_embedded_content = False + use_embedded_content = False language = 'en' timefmt = ' [%d %b %Y]' - oldest_article = 7 + oldest_article = 7 max_articles_per_feed = 40 no_stylesheets = True remove_javascript = True @@ -26,17 +26,17 @@ class anan(BasicNewsRecipe): masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png' keep_only_tags = [ - dict(name='section', attrs={'class':['main_cont']}), - ] - remove_tags=[ - dict(name='div', attrs={'class':['print', - 'breadcrumb_area noprint', - 'fl-rt noprint', - 'blog_top_right',]}) - ] + dict(name='section', attrs={'class': ['main_cont']}), + ] + remove_tags = [ + dict(name='div', attrs={'class': ['print', + 'breadcrumb_area noprint', + 'fl-rt noprint', + 'blog_top_right', ]}) + ] - feeds = [('Anandtech', 'http://www.anandtech.com/rss/')] + feeds = [('Anandtech', 'http://www.anandtech.com/rss/')] - def print_version(self,url): + def print_version(self, url): # return url.replace("0Cshow0C", "0Cprint0C") # 2013-09-07 AGE: update return url.replace("/show/", "/print/") # 2014-02-27 AGE: update diff --git a/recipes/anchorage_daily.recipe b/recipes/anchorage_daily.recipe index 530bd5528c..9e2a65c1d2 100644 --- a/recipes/anchorage_daily.recipe +++ b/recipes/anchorage_daily.recipe @@ -1,38 +1,28 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1278347258(BasicNewsRecipe): - title = u'Anchorage Daily News' + title = u'Anchorage Daily News' __author__ = 'rty' oldest_article = 7 max_articles_per_feed = 100 auto_cleanup = True - - feeds = [(u'Alaska News', u'http://www.adn.com/rss-feeds/feed/all'), - (u'Politics', u'http://www.adn.com/rss-feeds/feed/politics'), - ] + feeds = [(u'Alaska News', u'http://www.adn.com/rss-feeds/feed/all'), + (u'Politics', u'http://www.adn.com/rss-feeds/feed/politics'), + ] description = ''''Alaska's Newspaper''' - publisher = 'http://www.adn.com' - category = 'news, Alaska, Anchorage' + publisher = 'http://www.adn.com' + category = 'news, Alaska, Anchorage' language = 'en' extra_css = ''' p{font-weight: normal;text-align: justify} ''' remove_javascript = True - use_embedded_content = False + use_embedded_content = False no_stylesheets = True language = 'en' - encoding = 'utf-8' - conversion_options = {'linearize_tables':True} + encoding = 'utf-8' + conversion_options = {'linearize_tables': True} masthead_url = 'http://media.adn.com/includes/assets/images/adn_logo.2.gif' - #keep_only_tags = [ - #dict(name='div', attrs={'class':'left_col story_mainbar'}), - #] - #remove_tags = [ - #dict(name='div', attrs={'class':'story_tools'}), - #dict(name='p', attrs={'class':'ad_label'}), - #] - #remove_tags_after = [ - #dict(name='div', attrs={'class':'advertisement'}), - #] diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe index 4b5455cfc6..b7ad096f79 100644 --- a/recipes/android_com_pl.recipe +++ b/recipes/android_com_pl.recipe @@ -1,15 +1,17 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class Android_com_pl(BasicNewsRecipe): - title = u'Android.com.pl' - __author__ = 'fenuks' - description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.' - category = 'Android, mobile' - language = 'pl' + title = u'Android.com.pl' + __author__ = 'fenuks' + description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.' + category = 'Android, mobile' + language = 'pl' use_embedded_content = True cover_url = 'http://android.com.pl/wp-content/themes/android/images/logo.png' oldest_article = 8 max_articles_per_feed = 100 - preprocess_regexps = [(re.compile(ur'

    .{,1}

    ', re.DOTALL), lambda match: '')] - feeds = [(u'Android', u'http://android.com.pl/feed/')] \ No newline at end of file + preprocess_regexps = [ + (re.compile(ur'

    .{,1}

    ', re.DOTALL), lambda match: '')] + feeds = [(u'Android', u'http://android.com.pl/feed/')] diff --git a/recipes/animal_politico.recipe b/recipes/animal_politico.recipe index 0a4cbbb4b6..adbd6d7b3a 100644 --- a/recipes/animal_politico.recipe +++ b/recipes/animal_politico.recipe @@ -3,43 +3,43 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1290663986(BasicNewsRecipe): - title = u'Animal Pol\u00EDtico' - publisher = u'Animal Pol\u00EDtico' - category = u'News, Mexico' - description = u'Noticias Pol\u00EDticas' - __author__ = 'leamsi' - masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png' + title = u'Animal Pol\u00EDtico' + publisher = u'Animal Pol\u00EDtico' + category = u'News, Mexico' + description = u'Noticias Pol\u00EDticas' + __author__ = 'leamsi' + masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png' oldest_article = 1 max_articles_per_feed = 100 - language = 'es_MX' - - #feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')] + language = 'es_MX' remove_tags_before = dict(name='div', id='main') - remove_tags = [dict(name='div', attrs={'class':'fb-like-button'})] - keep_only_tags = [dict(name='h1', attrs={'class':'entry-title'}), - dict(name='div', attrs={'class':'entry-content'})] + remove_tags = [dict(name='div', attrs={'class': 'fb-like-button'})] + keep_only_tags = [dict(name='h1', attrs={'class': 'entry-title'}), + dict(name='div', attrs={'class': 'entry-content'})] remove_javascript = True INDEX = 'http://www.animalpolitico.com/' def generic_parse(self, soup): articles = [] - for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'): - article_url = entry.a['href'] + '?print=yes' - article_title= entry.find('h3', 'entry-title') - article_title= self.tag_to_string(article_title) + # soup.findAll('li', 'hentry'): + for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): # noqa + article_url = entry.a['href'] + '?print=yes' + article_title = entry.find('h3', 'entry-title') + article_title = self.tag_to_string(article_title) article_date = entry.find('span', 'the-time') article_date = self.tag_to_string(article_date) article_desc = self.tag_to_string(entry.find('p')) - #print 'Article:',article_title, article_date,article_url - #print entry['class'] + # print 'Article:',article_title, article_date,article_url + # print entry['class'] - articles.append({'title' : article_title, - 'date' : article_date, - 'description' : article_desc, - 'url' : article_url}) + articles.append({'title': article_title, + 'date': article_date, + 'description': article_desc, + 'url': article_url}) # Avoid including the multimedia stuff. if entry['class'].find('last') != -1: break @@ -48,56 +48,57 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe): def plumaje_parse(self, soup): articles = [] - blogs_soup = soup.find(lambda tag: tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1) + blogs_soup = soup.find(lambda tag: tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1) # noqa for entry in blogs_soup.findAll('li'): article_title = entry.p - article_url = article_title.a['href'] + '?print=yes' - article_date = article_title.nextSibling + article_url = article_title.a['href'] + '?print=yes' + article_date = article_title.nextSibling article_title = self.tag_to_string(article_title) - article_date = self.tag_to_string(article_date).replace(u'Last Updated: ', '') - article_desc = self.tag_to_string(entry.find('h4')) + article_date = self.tag_to_string( + article_date).replace(u'Last Updated: ', '') + article_desc = self.tag_to_string(entry.find('h4')) - #print 'Article:',article_title, article_date,article_url - articles.append({'title' : article_title, - 'date' : article_date, - 'description' : article_desc, - 'url' : article_url}) + # print 'Article:',article_title, article_date,article_url + articles.append({'title': article_title, + 'date': article_date, + 'description': article_desc, + 'url': article_url}) return articles def boca_parse(self, soup): articles = [] - for entry in soup.findAll(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'): - article_title= entry.find('h2', 'entry-title') - article_url = article_title.a['href'] + '?print=yes' - article_title= self.tag_to_string(article_title) + # soup.findAll('li', 'hentry'): + for entry in soup.findAll(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1): # noqa + article_title = entry.find('h2', 'entry-title') + article_url = article_title.a['href'] + '?print=yes' + article_title = self.tag_to_string(article_title) article_date = entry.find('span', 'entry-date') article_date = self.tag_to_string(article_date) - article_desc = self.tag_to_string(entry.find('div', 'entry-content')) + article_desc = self.tag_to_string( + entry.find('div', 'entry-content')) - #print 'Article:',article_title, article_date,article_url - #print entry['class'] + # print 'Article:',article_title, article_date,article_url + # print entry['class'] - articles.append({'title' : article_title, - 'date' : article_date, - 'description' : article_desc, - 'url' : article_url}) + articles.append({'title': article_title, + 'date': article_date, + 'description': article_desc, + 'url': article_url}) # Avoid including the multimedia stuff. if entry['class'].find('last') != -1: break return articles - - - def parse_index(self): - gobierno_soup = self.index_to_soup(self.INDEX+'gobierno/') - congreso_soup = self.index_to_soup(self.INDEX+'congreso/') - seguridad_soup = self.index_to_soup(self.INDEX+'seguridad/') - comunidad_soup = self.index_to_soup(self.INDEX+'comunidad/') - plumaje_soup = self.index_to_soup(self.INDEX+'plumaje/') - la_boca_del_lobo_soup = self.index_to_soup(self.INDEX+'category/la-boca-del-lobo/') + gobierno_soup = self.index_to_soup(self.INDEX + 'gobierno/') + congreso_soup = self.index_to_soup(self.INDEX + 'congreso/') + seguridad_soup = self.index_to_soup(self.INDEX + 'seguridad/') + comunidad_soup = self.index_to_soup(self.INDEX + 'comunidad/') + plumaje_soup = self.index_to_soup(self.INDEX + 'plumaje/') + la_boca_del_lobo_soup = self.index_to_soup( + self.INDEX + 'category/la-boca-del-lobo/') gobierno_articles = self.generic_parse(gobierno_soup) congreso_articles = self.generic_parse(congreso_soup) @@ -106,6 +107,5 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe): plumaje_articles = self.plumaje_parse(plumaje_soup) la_boca_del_lobo_articles = self.boca_parse(la_boca_del_lobo_soup) - - return [ (u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles), - (u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ] + return [(u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles), + (u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ] diff --git a/recipes/antyweb.recipe b/recipes/antyweb.recipe index cd7d792c4a..3871579c44 100644 --- a/recipes/antyweb.recipe +++ b/recipes/antyweb.recipe @@ -1,6 +1,7 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- from calibre.web.feeds.news import BasicNewsRecipe + class AntywebRecipe(BasicNewsRecipe): encoding = 'utf-8' __license__ = 'GPL v3' @@ -10,38 +11,40 @@ class AntywebRecipe(BasicNewsRecipe): title = u'Antyweb' category = u'News' description = u'Blog o internecie i nowych technologiach' - cover_url='' - remove_empty_feeds= True + cover_url = '' + remove_empty_feeds = True auto_cleanup = False - no_stylesheets=True + no_stylesheets = True use_embedded_content = False oldest_article = 7 max_articles_per_feed = 100 remove_javascript = True simultaneous_downloads = 10 - ignore_duplicate_articles = {'title', 'url'} # zignoruj zduplikowane artykuły o takich samych tytułach LUB adresach - scale_news_images =True - conversion_options = { 'tags' : u'news, aplikacje mobilne, Android, iOS, Windows Phone ', - 'smarten_punctuation' : True, - 'publisher' : 'AntyWeb' - } # opcje konwersji. + # zignoruj zduplikowane artykuły o takich samych tytułach LUB adresach + ignore_duplicate_articles = {'title', 'url'} + scale_news_images = True + conversion_options = {'tags': u'news, aplikacje mobilne, Android, iOS, Windows Phone ', + 'smarten_punctuation': True, + 'publisher': 'AntyWeb' + } # opcje konwersji. - keep_only_tags=[] - keep_only_tags.append(dict(name = 'h1')) - keep_only_tags.append(dict(name = 'article', attrs = {'class' : 'article'})) - remove_tags =[] - remove_tags.append(dict(name = 'div', attrs = {'class' : 'ac-footer group'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='h1')) + keep_only_tags.append(dict(name='article', attrs={'class': 'article'})) + remove_tags = [] + remove_tags.append(dict(name='div', attrs={'class': 'ac-footer group'})) + + feeds = [ + (u'News', 'http://feeds.feedburner.com/antyweb'), + (u'Felietony', 'http://feeds.feedburner.com/AntywebFelietony'), + (u'Apple', 'http://feeds.feedburner.com/AntywebApple'), + (u'Gry', 'http://feeds.feedburner.com/AntywebGry'), + (u'Mobile', 'http://feeds.feedburner.com/AntywebMobile'), + (u'Startups', 'http://feeds.feedburner.com/AntywebStartups'), + (u'Google', 'http://feeds.feedburner.com/AntywebGoogle'), + (u'Microsoft', 'http://feeds.feedburner.com/AntywebMicrosoft') + ] - feeds = [ - (u'News', 'http://feeds.feedburner.com/antyweb'), - (u'Felietony', 'http://feeds.feedburner.com/AntywebFelietony'), - (u'Apple', 'http://feeds.feedburner.com/AntywebApple'), - (u'Gry', 'http://feeds.feedburner.com/AntywebGry'), - (u'Mobile', 'http://feeds.feedburner.com/AntywebMobile'), - (u'Startups', 'http://feeds.feedburner.com/AntywebStartups'), - (u'Google', 'http://feeds.feedburner.com/AntywebGoogle'), - (u'Microsoft', 'http://feeds.feedburner.com/AntywebMicrosoft') - ] def preprocess_html(self, soup): for alink in soup.findAll('a'): if alink.string is not None: diff --git a/recipes/ap.recipe b/recipes/ap.recipe index 9936c37c7e..37e594819e 100644 --- a/recipes/ap.recipe +++ b/recipes/ap.recipe @@ -6,21 +6,23 @@ class AssociatedPress(BasicNewsRecipe): title = u'Associated Press' description = 'Global news' __author__ = 'Krittika Goyal' - use_embedded_content = False + use_embedded_content = False language = 'en' no_stylesheets = True conversion_options = { - 'linearize_tables' : True + 'linearize_tables': True } - keep_only_tags = {'name':'table', 'attrs':{'class':lambda x: x and 'ap-story-table' in x.split()}} + keep_only_tags = {'name': 'table', 'attrs': { + 'class': lambda x: x and 'ap-story-table' in x.split()}} remove_tags = [ - {'class':['ap-mediabox-table']}, - {'name':'img', 'src':lambda x: x and '//analytics.' in x}, + {'class': ['ap-mediabox-table']}, + {'name': 'img', 'src': lambda x: x and '//analytics.' in x}, ] def parse_index(self): feeds = [] - fronts = ('HOME', 'US', 'WORLD', 'BUSINESS', 'TECHNOLOGY', 'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE', 'POLITICS') + fronts = ('HOME', 'US', 'WORLD', 'BUSINESS', 'TECHNOLOGY', + 'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE', 'POLITICS') for front in fronts: feeds.append([front.capitalize(), self.parse_section(front)]) feeds[0][0] = 'Top Stories' @@ -28,19 +30,20 @@ class AssociatedPress(BasicNewsRecipe): def parse_section(self, front): self.log('Processing section:', front) - soup = self.index_to_soup('http://hosted.ap.org/dynamic/fronts/%s?SITE=AP' % front) + soup = self.index_to_soup( + 'http://hosted.ap.org/dynamic/fronts/%s?SITE=AP' % front) articles = [] - for x in soup.findAll('p', attrs={'class':['ap-newsbriefitem-p', 'ap-topheadlineitem-p']}): + for x in soup.findAll('p', attrs={'class': ['ap-newsbriefitem-p', 'ap-topheadlineitem-p']}): a = x.find('a', href=True) title = self.tag_to_string(a) url = "http://hosted.ap.org" + a['href'] - p = x.find(attrs={'class':'topheadlinebody'}) + p = x.find(attrs={'class': 'topheadlinebody'}) desc = '' if p is not None: desc = self.tag_to_string(p) self.log('\tFound article:', title, '\n\t\t', desc) - articles.append({'title':title, 'url':url}) + articles.append({'title': title, 'url': url}) self.log('\n\n') diff --git a/recipes/apcom.recipe b/recipes/apcom.recipe index 4839f94eaf..2a97def60f 100644 --- a/recipes/apcom.recipe +++ b/recipes/apcom.recipe @@ -1,8 +1,8 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'Gabriele Marini, based on Darko Miletic' +__license__ = 'GPL v3' +__author__ = 'Gabriele Marini, based on Darko Miletic' __copyright__ = '2009-2010, Darko Miletic ' -description = 'Italian daily newspaper - 14-05-2010' +description = 'Italian daily newspaper - 14-05-2010' ''' http://www.apcom.NET/ @@ -10,39 +10,38 @@ http://www.apcom.NET/ from calibre.web.feeds.news import BasicNewsRecipe + class Apcom(BasicNewsRecipe): - __author__ = 'Marini Gabriele' - description = 'Italian daily newspaper' + __author__ = 'Marini Gabriele' + description = 'Italian daily newspaper' - cover_url = 'http://www.apcom.net/img/logoAP.gif' - title = u'Apcom' - publisher = 'TM News S.p.A.' - category = 'News, politics, culture, economy, general interest' + cover_url = 'http://www.apcom.net/img/logoAP.gif' + title = u'Apcom' + publisher = 'TM News S.p.A.' + category = 'News, politics, culture, economy, general interest' - language = 'it' - timefmt = '[%a, %d %b, %Y]' + language = 'it' + timefmt = '[%a, %d %b, %Y]' oldest_article = 7 max_articles_per_feed = 50 - use_embedded_content = False - recursion = 100 + use_embedded_content = False + recursion = 100 - no_stylesheets = True - conversion_options = {'linearize_tables':True} + no_stylesheets = True + conversion_options = {'linearize_tables': True} remove_javascript = True - keep_only_tags = [ - dict(name='div', attrs={'id':'ag_center'}) - ] - - + keep_only_tags = [ + dict(name='div', attrs={'id': 'ag_center'}) + ] feeds = [ - (u'Globale', u'http://www.apcom.net/rss/globale.xml '), - (u'Politica', u'http://www.apcom.net/rss/politica.xml'), - (u'Cronaca', u'http://www.apcom.net/rss/cronaca.xml'), - (u'Econimia', u'http://www.apcom.net/rss/economia.xml'), - (u'Esteri', u'http://www.apcom.net/rss/esteri.xml'), - (u'Cultura', u'http://www.apcom.net/rss/cultura.xml'), - (u'Sport', u'http://www.apcom.net/rss/sport.xml') - ] + (u'Globale', u'http://www.apcom.net/rss/globale.xml '), + (u'Politica', u'http://www.apcom.net/rss/politica.xml'), + (u'Cronaca', u'http://www.apcom.net/rss/cronaca.xml'), + (u'Econimia', u'http://www.apcom.net/rss/economia.xml'), + (u'Esteri', u'http://www.apcom.net/rss/esteri.xml'), + (u'Cultura', u'http://www.apcom.net/rss/cultura.xml'), + (u'Sport', u'http://www.apcom.net/rss/sport.xml') + ] diff --git a/recipes/apod.recipe b/recipes/apod.recipe index 7bb3161954..c85b67252b 100644 --- a/recipes/apod.recipe +++ b/recipes/apod.recipe @@ -1,28 +1,30 @@ from calibre.web.feeds.news import BasicNewsRecipe + class APOD(BasicNewsRecipe): - title = u'Astronomy Picture of the Day' - __author__ = 'Starson17' + title = u'Astronomy Picture of the Day' + __author__ = 'Starson17' description = 'Astronomy Pictures' language = 'en' - use_embedded_content = False - no_stylesheets = True - cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg' + use_embedded_content = False + no_stylesheets = True + cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg' remove_javascript = True recursions = 0 - oldest_article = 14 + oldest_article = 14 remove_attributes = ['onmouseover', 'onmouseout'] feeds = [ - (u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss') - ] + (u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss') + ] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' + ''' + def postprocess_html(self, soup, first_fetch): center_tags = soup.findAll(['center']) p_tags = soup.findAll(['p']) @@ -35,4 +37,3 @@ class APOD(BasicNewsRecipe): for tag in last2_p: tag.extract() return soup - diff --git a/recipes/app_funds.recipe b/recipes/app_funds.recipe index 84df9dcffc..a8703bcdfc 100644 --- a/recipes/app_funds.recipe +++ b/recipes/app_funds.recipe @@ -9,18 +9,19 @@ appfunds.blogspot.com from calibre.web.feeds.news import BasicNewsRecipe + class app_funds(BasicNewsRecipe): - title = u'APP Funds' + title = u'APP Funds' __author__ = 'teepel ' - language = 'pl' - description ='Blog inwestora dla inwestorów i oszczędzających' - INDEX='http://appfunds.blogspot.com' - remove_empty_feeds= True + language = 'pl' + description = 'Blog inwestora dla inwestorów i oszczędzających' + INDEX = 'http://appfunds.blogspot.com' + remove_empty_feeds = True oldest_article = 7 max_articles_per_feed = 100 simultaneous_downloads = 5 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True auto_cleanup = True - feeds = [(u'blog', u'http://feeds.feedburner.com/blogspot/etVI')] + feeds = [(u'blog', u'http://feeds.feedburner.com/blogspot/etVI')] diff --git a/recipes/apple_daily.recipe b/recipes/apple_daily.recipe index 0c4a13b7dd..f34b30f8b6 100644 --- a/recipes/apple_daily.recipe +++ b/recipes/apple_daily.recipe @@ -1,12 +1,14 @@ # vim:fileencoding=UTF-8 from __future__ import unicode_literals -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2013-2015, Eddie Lau' __Date__ = '' from calibre import (__appname__, force_unicode, strftime) from calibre.utils.date import now as nowf -import os, datetime, re +import os +import datetime +import re from calibre.web.feeds.recipes import BasicNewsRecipe from contextlib import nested from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -15,10 +17,11 @@ from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.utils.localization import canonicalize_lang + class AppleDaily(BasicNewsRecipe): - title = u'蘋果日報 (香港)' - __author__ = 'Eddie Lau' - publisher = '蘋果日報' + title = u'蘋果日報 (香港)' + __author__ = 'Eddie Lau' + publisher = '蘋果日報' oldest_article = 1 max_articles_per_feed = 100 auto_cleanup = False @@ -26,48 +29,48 @@ class AppleDaily(BasicNewsRecipe): encoding = 'utf-8' auto_cleanup = False remove_javascript = True - use_embedded_content = False + use_embedded_content = False no_stylesheets = True description = 'http://hkm.appledaily.com/' - category = 'Chinese, News, Hong Kong' + category = 'Chinese, News, Hong Kong' masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png' - extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}' - keep_only_tags = [dict(name='div', attrs={'id':'content-article'})] - remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}), - dict(name='p', attrs={'class':'next'})] + extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}' # noqa + keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})] + remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}), + dict(name='p', attrs={'class': 'next'})] def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at HKT 6am, all news are available - return dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(6.0/24) + return dt_utc + datetime.timedelta(8.0 / 24) - datetime.timedelta(6.0 / 24) def get_fetchdate(self): - if __Date__ <> '': + if __Date__ != '': return __Date__ else: return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): - if __Date__ <> '': - return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] + if __Date__ != '': + return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8] else: return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchyear(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[0:4] else: return self.get_dtlocal().strftime("%Y") def get_fetchmonth(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[4:6] else: return self.get_dtlocal().strftime("%m") def get_fetchday(self): - if __Date__ <> '': + if __Date__ != '': return __Date__[6:8] else: return self.get_dtlocal().strftime("%d") @@ -78,7 +81,7 @@ class AppleDaily(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('http://hkm.appledaily.com/') - cover = soup.find(attrs={'class':'top-news'}).get('src', False) + cover = soup.find(attrs={'class': 'top-news'}).get('src', False) br = BasicNewsRecipe.get_browser(self) try: br.open(cover) @@ -90,12 +93,12 @@ class AppleDaily(BasicNewsRecipe): if first and hasattr(self, 'add_toc_thumbnail'): picdiv = soup.find('img') if picdiv is not None: - self.add_toc_thumbnail(article,picdiv['src']) + self.add_toc_thumbnail(article, picdiv['src']) def parse_index(self): feeds = [] soup = self.index_to_soup('http://hkm.appledaily.com/') - ul = soup.find(attrs={'class':'menu'}) + ul = soup.find(attrs={'class': 'menu'}) sectionList = [] for li in ul.findAll('li'): relativea = li.find('a', href=True).get('href', False) @@ -111,13 +114,14 @@ class AppleDaily(BasicNewsRecipe): def parse_section(self, url): soup = self.index_to_soup(url) - ul = soup.find(attrs={'class':'list'}) + ul = soup.find(attrs={'class': 'list'}) current_articles = [] for li in ul.findAll('li'): a = li.find('a', href=True) title = li.find('p', text=True).strip() if a is not None: - current_articles.append({'title': title, 'url':'http://hkm.appledaily.com/' + a.get('href', False)}) + current_articles.append( + {'title': title, 'url': 'http://hkm.appledaily.com/' + a.get('href', False)}) pass return current_articles @@ -131,7 +135,8 @@ class AppleDaily(BasicNewsRecipe): mi.publisher = __appname__ mi.author_sort = __appname__ if self.publication_type: - mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() + mi.publication_type = 'periodical:' + \ + self.publication_type + ':' + self.short_title() mi.timestamp = nowf() article_titles, aseen = [], set() for f in feeds: @@ -144,15 +149,16 @@ class AppleDaily(BasicNewsRecipe): if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' + - '\n\n'.join(article_titles)) + '\n\n'.join(article_titles)) language = canonicalize_lang(self.language) if language is not None: mi.language = language # This one affects the pub date shown in kindle title - #mi.pubdate = nowf() + # mi.pubdate = nowf() # now appears to need the time field to be > 12.00noon as well - mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) + mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int( + self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0) opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') @@ -161,12 +167,14 @@ class AppleDaily(BasicNewsRecipe): mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide - ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) + ref = Guide.Reference(os.path.basename( + self.masthead_path), os.getcwdu()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) - manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] + manifest = [os.path.join(dir, 'feed_%d' % i) + for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) @@ -175,7 +183,7 @@ class AppleDaily(BasicNewsRecipe): if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): - cpath = pf.name + cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) @@ -197,12 +205,11 @@ class AppleDaily(BasicNewsRecipe): self.play_order_counter = 0 self.play_order_map = {} - def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): - adir = 'feed_%d/article_%d/'%(num, j) + adir = 'feed_%d/article_%d/' % (num, j) auth = a.author if not auth: auth = None @@ -212,16 +219,18 @@ class AppleDaily(BasicNewsRecipe): else: desc = self.description_limiter(desc) tt = a.toc_thumbnail if a.toc_thumbnail else None - entries.append('%sindex.html'%adir) + entries.append('%sindex.html' % adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter - parent.add_item('%sindex.html'%adir, None, - a.title if a.title else _('Untitled Article'), - play_order=po, author=auth, - description=desc, toc_thumbnail=tt) - last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) + parent.add_item('%sindex.html' % adir, None, + a.title if a.title else _( + 'Untitled Article'), + play_order=po, author=auth, + description=desc, toc_thumbnail=tt) + last = os.path.join( + self.output_dir, ('%sindex.html' % adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] @@ -234,12 +243,14 @@ class AppleDaily(BasicNewsRecipe): soup = BeautifulSoup(src) body = soup.find('body') if body is not None: - prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) + prefix = '/'.join('..'for i in range(2 * + len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), - not self.has_single_feed, - a.orig_url, __appname__, prefix=prefix, - center=self.center_navbar) - elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') + not self.has_single_feed, + a.orig_url, __appname__, prefix=prefix, + center=self.center_navbar) + elem = BeautifulSoup(templ.render( + doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) @@ -248,7 +259,7 @@ class AppleDaily(BasicNewsRecipe): if len(feeds) > 1: for i, f in enumerate(feeds): - entries.append('feed_%d/index.html'%i) + entries.append('feed_%d/index.html' % i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 @@ -259,11 +270,11 @@ class AppleDaily(BasicNewsRecipe): desc = getattr(f, 'description', None) if not desc: desc = None - feed_index(i, toc.add_item('feed_%d/index.html'%i, None, - f.title, play_order=po, description=desc, author=auth)) + feed_index(i, toc.add_item('feed_%d/index.html' % i, None, + f.title, play_order=po, description=desc, author=auth)) else: - entries.append('feed_%d/index.html'%0) + entries.append('feed_%d/index.html' % 0) feed_index(0, toc) for i, p in enumerate(entries): @@ -273,5 +284,3 @@ class AppleDaily(BasicNewsRecipe): with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) - - diff --git a/recipes/appledaily_tw.recipe b/recipes/appledaily_tw.recipe index 0f5d07090a..4a2949a3ec 100644 --- a/recipes/appledaily_tw.recipe +++ b/recipes/appledaily_tw.recipe @@ -34,12 +34,12 @@ class AppledailyTW(BasicNewsRecipe): {'name': 'hr'} ] conversion_options = { - 'title' : title, - 'comments' : description, - 'tags' : category, - 'language' : language, - 'publisher' : publisher, - 'authors' : publisher, + 'title': title, + 'comments': description, + 'tags': category, + 'language': language, + 'publisher': publisher, + 'authors': publisher, 'linearize_tables': True } feeds = [ @@ -105,5 +105,6 @@ class AppledailyTW(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): raw_html = re.sub(ur'
    = self.test[1]: break url = prev_page_url diff --git a/recipes/archeowiesci.recipe b/recipes/archeowiesci.recipe index c0fc576c9f..43517e3465 100644 --- a/recipes/archeowiesci.recipe +++ b/recipes/archeowiesci.recipe @@ -1,33 +1,35 @@ from calibre.web.feeds.news import BasicNewsRecipe + class Archeowiesci(BasicNewsRecipe): - title = u'Archeowieści' - __author__ = 'fenuks' - category = 'archeology' - language = 'pl' + title = u'Archeowieści' + __author__ = 'fenuks' + category = 'archeology' + language = 'pl' description = u'Z pasją o przeszłości' - cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' + cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg' oldest_article = 7 - needs_subscription='optional' + needs_subscription = 'optional' max_articles_per_feed = 100 auto_cleanup = True - remove_tags=[dict(name='span', attrs={'class':['post-ratings', 'post-ratings-loading']})] - feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')] + remove_tags = [ + dict(name='span', attrs={'class': ['post-ratings', 'post-ratings-loading']})] + feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')] - def parse_feeds (self): - feeds = BasicNewsRecipe.parse_feeds(self) - for feed in feeds: - for article in feed.articles[:]: - if self.username is None and 'subskrypcja' in article.title: - feed.articles.remove(article) - return feeds + def parse_feeds(self): + feeds = BasicNewsRecipe.parse_feeds(self) + for feed in feeds: + for article in feed.articles[:]: + if self.username is None and 'subskrypcja' in article.title: + feed.articles.remove(article) + return feeds def get_browser(self): br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('http://archeowiesci.pl/wp-login.php') br.select_form(name='loginform') - br['log'] = self.username + br['log'] = self.username br['pwd'] = self.password br.submit() return br diff --git a/recipes/argnoticias.recipe b/recipes/argnoticias.recipe index 03e3627064..e46d618706 100644 --- a/recipes/argnoticias.recipe +++ b/recipes/argnoticias.recipe @@ -1,5 +1,5 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2013, Darko Miletic ' ''' @@ -10,87 +10,85 @@ import time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe + class ArgNoticias(BasicNewsRecipe): - title = 'ARG Noticias' - __author__ = 'Darko Miletic' - description = 'Ultimas noticias de Argentina' - publisher = 'ARG Noticias' - category = 'news, politics, Argentina' - oldest_article = 2 + title = 'ARG Noticias' + __author__ = 'Darko Miletic' + description = 'Ultimas noticias de Argentina' + publisher = 'ARG Noticias' + category = 'news, politics, Argentina' + oldest_article = 2 max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - masthead_url = 'http://www.argnoticias.com/images/arg-logo-footer.png' - language = 'es_AR' - publication_type = 'newsportal' - INDEX = 'http://www.argnoticias.com' - extra_css = '' + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + masthead_url = 'http://www.argnoticias.com/images/arg-logo-footer.png' + language = 'es_AR' + publication_type = 'newsportal' + INDEX = 'http://www.argnoticias.com' + extra_css = '' conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - keep_only_tags = [dict(name='div', attrs={'class':['itemHeader','itemBody','itemAuthorBlock']})] + keep_only_tags = [ + dict(name='div', attrs={'class': ['itemHeader', 'itemBody', 'itemAuthorBlock']})] remove_tags = [ - dict(name=['object','link','base','iframe']), - dict(name='div', attrs={'class':['b2jsocial_parent','itemSocialSharing']}) - ] + dict(name=['object', 'link', 'base', 'iframe']), + dict(name='div', attrs={ + 'class': ['b2jsocial_parent', 'itemSocialSharing']}) + ] feeds = [ - (u'Politica' , u'http://www.argnoticias.com/index.php/politica' ) - ,(u'Economia' , u'http://www.argnoticias.com/index.php/economia' ) - ,(u'Sociedad' , u'http://www.argnoticias.com/index.php/sociedad' ) - ,(u'Mundo' , u'http://www.argnoticias.com/index.php/mundo' ) - ,(u'Deportes' , u'http://www.argnoticias.com/index.php/deportes' ) - ,(u'Espectaculos', u'http://www.argnoticias.com/index.php/espectaculos') - ,(u'Tendencias' , u'http://www.argnoticias.com/index.php/tendencias' ) - ] + + (u'Politica', u'http://www.argnoticias.com/index.php/politica'), + (u'Economia', u'http://www.argnoticias.com/index.php/economia'), + (u'Sociedad', u'http://www.argnoticias.com/index.php/sociedad'), + (u'Mundo', u'http://www.argnoticias.com/index.php/mundo'), + (u'Deportes', u'http://www.argnoticias.com/index.php/deportes'), + (u'Espectaculos', u'http://www.argnoticias.com/index.php/espectaculos'), + (u'Tendencias', u'http://www.argnoticias.com/index.php/tendencias') + ] def parse_index(self): totalfeeds = [] lfeeds = self.get_feeds() - checker = [] + checker = [] for feedobj in lfeeds: feedtitle, feedurl = feedobj - self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + self.report_progress(0, _('Fetching feed') + ' %s...' % + (feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) - for item in soup.findAll('div', attrs={'class':'Nota'}): - atag = item.find('a', attrs={'class':'moduleItemTitle'}) - ptag = item.find('div', attrs={'class':'moduleItemIntrotext'}) - url = self.INDEX + atag['href'] - title = self.tag_to_string(atag) + for item in soup.findAll('div', attrs={'class': 'Nota'}): + atag = item.find('a', attrs={'class': 'moduleItemTitle'}) + ptag = item.find('div', attrs={'class': 'moduleItemIntrotext'}) + url = self.INDEX + atag['href'] + title = self.tag_to_string(atag) description = self.tag_to_string(ptag) - date = strftime("%a, %d %b %Y %H:%M:%S +0000",time.gmtime()) + date = strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) if url not in checker: checker.append(url) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) + 'title': title, 'date': date, 'url': url, 'description': description + }) for item in soup.findAll('li'): - atag = item.find('a', attrs={'class':'moduleItemTitle'}) + atag = item.find('a', attrs={'class': 'moduleItemTitle'}) if atag: - ptag = item.find('div', attrs={'class':'moduleItemIntrotext'}) - url = self.INDEX + atag['href'] - title = self.tag_to_string(atag) + ptag = item.find( + 'div', attrs={'class': 'moduleItemIntrotext'}) + url = self.INDEX + atag['href'] + title = self.tag_to_string(atag) description = self.tag_to_string(ptag) - date = strftime("%a, %d %b %Y %H:%M:%S +0000",time.gmtime()) + date = strftime( + "%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) if url not in checker: checker.append(url) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) + 'title': title, 'date': date, 'url': url, 'description': description + }) totalfeeds.append((feedtitle, articles)) return totalfeeds diff --git a/recipes/arizona_republic.recipe b/recipes/arizona_republic.recipe index 5bc2140946..99fe68f274 100644 --- a/recipes/arizona_republic.recipe +++ b/recipes/arizona_republic.recipe @@ -1,68 +1,77 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, jolo' ''' azrepublic.com ''' from calibre.web.feeds.recipes import BasicNewsRecipe + class AdvancedUserRecipe1307301031(BasicNewsRecipe): - title = u'AZRepublic' - __author__ = 'Jim Olo' - language = 'en' - description = "The Arizona Republic is Arizona's leading provider of news and information, and has published a daily newspaper in Phoenix for more than 110 years" - publisher = 'AZRepublic/AZCentral' - masthead_url = 'http://freedom2t.com/wp-content/uploads/press_az_republic_v2.gif' - cover_url = 'http://www.valleyleadership.org/Common/Img/2line4c_AZRepublic%20with%20azcentral%20logo.jpg' - category = 'news, politics, USA, AZ, Arizona' + title = u'AZRepublic' + __author__ = 'Jim Olo' + language = 'en' + description = "The Arizona Republic is Arizona's leading provider of news and information, and has published a daily newspaper in Phoenix for more than 110 years" # noqa + publisher = 'AZRepublic/AZCentral' + masthead_url = 'http://freedom2t.com/wp-content/uploads/press_az_republic_v2.gif' + cover_url = 'http://www.valleyleadership.org/Common/Img/2line4c_AZRepublic%20with%20azcentral%20logo.jpg' + category = 'news, politics, USA, AZ, Arizona' oldest_article = 7 max_articles_per_feed = 100 remove_empty_feeds = True - no_stylesheets = True - remove_javascript = True + no_stylesheets = True + remove_javascript = True # extra_css = '.headline {font-size: medium;} \n .fact { padding-top: 10pt }' - extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .headline {font-size: medium} .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' + extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .headline {font-size: medium} .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} ' # noqa - remove_attributes = ['width','height','h2','subHeadline','style'] + remove_attributes = ['width', 'height', 'h2', 'subHeadline', 'style'] remove_tags = [ - dict(name='div', attrs={'id':['slidingBillboard', 'top728x90', 'subindex-header', 'topSearch']}), - dict(name='div', attrs={'id':['simplesearch', 'azcLoginBox', 'azcLoginBoxInner', 'topNav']}), - dict(name='div', attrs={'id':['carsDrop', 'homesDrop', 'rentalsDrop', 'classifiedDrop']}), - dict(name='div', attrs={'id':['nav', 'mp', 'subnav', 'jobsDrop']}), - dict(name='h6', attrs={'class':['section-header']}), - dict(name='a', attrs={'href':['#comments']}), - dict(name='div', attrs={'class':['articletools clearfix', 'floatRight']}), - dict(name='div', attrs={'id':['fbFrame', 'ob', 'storyComments', 'storyGoogleAdBox']}), - dict(name='div', attrs={'id':['storyTopHomes', 'openRight', 'footerwrap', 'copyright']}), - dict(name='div', attrs={'id':['blogsHed', 'blog_comments', 'blogByline','blogTopics']}), - dict(name='div', attrs={'id':['membersRightMain', 'dealsfooter', 'azrTopHed', 'azrRightCol']}), - dict(name='div', attrs={'id':['ttdHeader', 'ttdTimeWeather']}), - dict(name='div', attrs={'id':['membersRightMain', 'deals-header-wrap']}), - dict(name='div', attrs={'id':['todoTopSearchBar', 'byline clearfix', 'subdex-topnav']}), - dict(name='h1', attrs={'id':['SEOtext']}), - dict(name='table', attrs={'class':['ap-mediabox-table']}), - dict(name='p', attrs={'class':['ap_para']}), - dict(name='span', attrs={'class':['source-org vcard', 'org fn']}), - dict(name='a', attrs={'href':['http://hosted2.ap.org/APDEFAULT/privacy']}), - dict(name='a', attrs={'href':['http://hosted2.ap.org/APDEFAULT/terms']}), - dict(name='div', attrs={'id':['onespot_nextclick']}), - ] - - feeds = [ - (u'FrontPage', u'http://www.azcentral.com/rss/feeds/republicfront.xml'), - (u'TopUS-News', u'http://hosted.ap.org/lineups/USHEADS.rss?SITE=AZPHG&SECTION=HOME'), - (u'WorldNews', u'http://hosted.ap.org/lineups/WORLDHEADS.rss?SITE=AZPHG&SECTION=HOME'), - (u'TopBusiness', u'http://hosted.ap.org/lineups/BUSINESSHEADS.rss?SITE=AZPHG&SECTION=HOME'), - (u'Entertainment', u'http://hosted.ap.org/lineups/ENTERTAINMENT.rss?SITE=AZPHG&SECTION=HOME'), - (u'ArizonaNews', u'http://www.azcentral.com/rss/feeds/news.xml'), - (u'Gilbert', u'http://www.azcentral.com/rss/feeds/gilbert.xml'), - (u'Chandler', u'http://www.azcentral.com/rss/feeds/chandler.xml'), - (u'DiningReviews', u'http://www.azcentral.com/rss/feeds/diningreviews.xml'), - (u'AZBusiness', u'http://www.azcentral.com/rss/feeds/business.xml'), - (u'ArizonaDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog'), - (u'GroceryDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog/tag/2646') - ] - - - + dict(name='div', attrs={ + 'id': ['slidingBillboard', 'top728x90', 'subindex-header', 'topSearch']}), + dict(name='div', attrs={ + 'id': ['simplesearch', 'azcLoginBox', 'azcLoginBoxInner', 'topNav']}), + dict(name='div', attrs={ + 'id': ['carsDrop', 'homesDrop', 'rentalsDrop', 'classifiedDrop']}), + dict(name='div', attrs={'id': ['nav', 'mp', 'subnav', 'jobsDrop']}), + dict(name='h6', attrs={'class': ['section-header']}), + dict(name='a', attrs={'href': ['#comments']}), + dict(name='div', attrs={ + 'class': ['articletools clearfix', 'floatRight']}), + dict(name='div', attrs={ + 'id': ['fbFrame', 'ob', 'storyComments', 'storyGoogleAdBox']}), + dict(name='div', attrs={ + 'id': ['storyTopHomes', 'openRight', 'footerwrap', 'copyright']}), + dict(name='div', attrs={ + 'id': ['blogsHed', 'blog_comments', 'blogByline', 'blogTopics']}), + dict(name='div', attrs={ + 'id': ['membersRightMain', 'dealsfooter', 'azrTopHed', 'azrRightCol']}), + dict(name='div', attrs={'id': ['ttdHeader', 'ttdTimeWeather']}), + dict(name='div', attrs={ + 'id': ['membersRightMain', 'deals-header-wrap']}), + dict(name='div', attrs={ + 'id': ['todoTopSearchBar', 'byline clearfix', 'subdex-topnav']}), + dict(name='h1', attrs={'id': ['SEOtext']}), + dict(name='table', attrs={'class': ['ap-mediabox-table']}), + dict(name='p', attrs={'class': ['ap_para']}), + dict(name='span', attrs={'class': ['source-org vcard', 'org fn']}), + dict(name='a', attrs={ + 'href': ['http://hosted2.ap.org/APDEFAULT/privacy']}), + dict(name='a', attrs={ + 'href': ['http://hosted2.ap.org/APDEFAULT/terms']}), + dict(name='div', attrs={'id': ['onespot_nextclick']}), + ] + feeds = [ + (u'FrontPage', u'http://www.azcentral.com/rss/feeds/republicfront.xml'), + (u'TopUS-News', u'http://hosted.ap.org/lineups/USHEADS.rss?SITE=AZPHG&SECTION=HOME'), + (u'WorldNews', u'http://hosted.ap.org/lineups/WORLDHEADS.rss?SITE=AZPHG&SECTION=HOME'), + (u'TopBusiness', u'http://hosted.ap.org/lineups/BUSINESSHEADS.rss?SITE=AZPHG&SECTION=HOME'), + (u'Entertainment', u'http://hosted.ap.org/lineups/ENTERTAINMENT.rss?SITE=AZPHG&SECTION=HOME'), + (u'ArizonaNews', u'http://www.azcentral.com/rss/feeds/news.xml'), + (u'Gilbert', u'http://www.azcentral.com/rss/feeds/gilbert.xml'), + (u'Chandler', u'http://www.azcentral.com/rss/feeds/chandler.xml'), + (u'DiningReviews', u'http://www.azcentral.com/rss/feeds/diningreviews.xml'), + (u'AZBusiness', u'http://www.azcentral.com/rss/feeds/business.xml'), + (u'ArizonaDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog'), + (u'GroceryDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog/tag/2646') + ] diff --git a/recipes/army_times.recipe b/recipes/army_times.recipe index 2cb5164106..9cbd2967a7 100644 --- a/recipes/army_times.recipe +++ b/recipes/army_times.recipe @@ -1,42 +1,39 @@ from calibre.web.feeds.news import BasicNewsRecipe + + class ArmyTimes(BasicNewsRecipe): - title = 'Army Times' - __author__ = 'jde' - __date__ = '16 May 2012' - __version__ = '1.0' - description = 'News of the U.S. Army' - language = 'en' - publisher = 'ArmyTimes.com' - category = 'news, U.S. Army' - tags = 'news, U.S. Army' - cover_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg' - masthead_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg' - oldest_article = 7 #days - max_articles_per_feed = 25 - publication_type = 'newspaper' - no_stylesheets = True - use_embedded_content = False - encoding = None - recursions = 0 - needs_subscription = False - remove_javascript = True - remove_empty_feeds = True - auto_cleanup = True + title = 'Army Times' + __author__ = 'jde' + __date__ = '16 May 2012' + __version__ = '1.0' + description = 'News of the U.S. Army' + language = 'en' + publisher = 'ArmyTimes.com' + category = 'news, U.S. Army' + tags = 'news, U.S. Army' + cover_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg' + masthead_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg' + oldest_article = 7 # days + max_articles_per_feed = 25 + publication_type = 'newspaper' + no_stylesheets = True + use_embedded_content = False + encoding = None + recursions = 0 + needs_subscription = False + remove_javascript = True + remove_empty_feeds = True + auto_cleanup = True + feeds = [ - - feeds = [ - -('News', 'http://www.armytimes.com/rss_news.php'), -('Benefits', 'http://www.armytimes.com/rss_benefits.php'), -('Money', 'http://www.armytimes.com/rss_money.php'), -('Careers & Education', 'http://www.armytimes.com/rss_careers.php'), -('Community', 'http://www.armytimes.com/rss_community.php'), -('Off Duty', 'http://www.armytimes.com/rss_off_duty.php'), -('Entertainment', 'http://www.armytimes.com/rss_entertainment.php'), -('Guard & Reserve', 'http://www.armytimes.com/rss_guard.php'), + ('News', 'http://www.armytimes.com/rss_news.php'), + ('Benefits', 'http://www.armytimes.com/rss_benefits.php'), + ('Money', 'http://www.armytimes.com/rss_money.php'), + ('Careers & Education', 'http://www.armytimes.com/rss_careers.php'), + ('Community', 'http://www.armytimes.com/rss_community.php'), + ('Off Duty', 'http://www.armytimes.com/rss_off_duty.php'), + ('Entertainment', 'http://www.armytimes.com/rss_entertainment.php'), + ('Guard & Reserve', 'http://www.armytimes.com/rss_guard.php'), ] - - - diff --git a/recipes/arret_sur_images.recipe b/recipes/arret_sur_images.recipe index fac2983231..797d52127d 100644 --- a/recipes/arret_sur_images.recipe +++ b/recipes/arret_sur_images.recipe @@ -7,10 +7,11 @@ __description__ = 'Get some fresh news from Arrêt sur images' from calibre.web.feeds.recipes import BasicNewsRecipe + class Asi(BasicNewsRecipe): - title = 'Arrêt sur images' - __author__ = 'François D. (aka franek)' + title = 'Arrêt sur images' + __author__ = 'François D. (aka franek)' description = 'Global news in french from news site "Arrêt sur images"' oldest_article = 7.0 @@ -26,15 +27,16 @@ class Asi(BasicNewsRecipe): no_stylesheets = True remove_javascript = True - feeds = [ + feeds = [ ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'), ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'), ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'), ] - conversion_options = { 'smarten_punctuation' : True } + conversion_options = {'smarten_punctuation': True} - remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'), dict(name='div', attrs={'class':'bloc-chroniqueur-2'}), dict(id='footercontainer')] + remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'), + dict(name='div', attrs={'class': 'bloc-chroniqueur-2'}), dict(id='footercontainer')] def print_version(self, url): return url.replace('contenu.php', 'contenu-imprimable.php') @@ -51,4 +53,3 @@ class Asi(BasicNewsRecipe): br['password'] = self.password br.submit() return br - diff --git a/recipes/ars_technica.recipe b/recipes/ars_technica.recipe index a71e2ebb8b..dd385484f8 100644 --- a/recipes/ars_technica.recipe +++ b/recipes/ars_technica.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2008-2012, Darko Miletic ' ''' arstechnica.com @@ -7,20 +7,21 @@ arstechnica.com from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup + class ArsTechnica(BasicNewsRecipe): - title = u'Ars Technica' - language = 'en' - __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks' - description = 'Ars Technica: Serving the technologist for 1.2 decades' - publisher = 'Conde Nast Publications' - category = 'news, IT, technology' - oldest_article = 5 + title = u'Ars Technica' + language = 'en' + __author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks' + description = 'Ars Technica: Serving the technologist for 1.2 decades' + publisher = 'Conde Nast Publications' + category = 'news, IT, technology' + oldest_article = 5 max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = False - remove_empty_feeds = True - publication_type = 'newsportal' + no_stylesheets = True + encoding = 'utf-8' + use_embedded_content = False + remove_empty_feeds = True + publication_type = 'newsportal' extra_css = ''' body {font-family: Arial,sans-serif} .heading{font-family: "Times New Roman",serif} @@ -31,56 +32,48 @@ class ArsTechnica(BasicNewsRecipe): ''' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(attrs={'class':'standalone'}) - ,dict(attrs={'id':'article-guts'}) - ] + dict(attrs={'class': 'standalone'}), dict(attrs={'id': 'article-guts'}) + ] remove_tags = [ - dict(name=['object','link','embed','iframe','meta']) - ,dict(attrs={'class':'corner-info'}) - ,dict(attrs={'id': 'article-footer-wrap'}) - ,dict(attrs={'class': 'article-expander'}) - ,dict(name='nav',attrs={'class': 'subheading'}) - ] + dict(name=['object', 'link', 'embed', 'iframe', 'meta']), dict(attrs={'class': 'corner-info'}), dict(attrs={ + 'id': 'article-footer-wrap'}), dict(attrs={'class': 'article-expander'}), dict(name='nav', attrs={'class': 'subheading'}) + ] remove_attributes = ['lang'] feeds = [ - - (u'Ars Features (All our long-form feature articles)' , u'http://feeds.arstechnica.com/arstechnica/features') - , (u'Technology Lab (Information Technology)' , u'http://feeds.arstechnica.com/arstechnica/technology-lab') - ,(u'Gear & Gadgets' , u'http://feeds.arstechnica.com/arstechnica/gadgets') - ,(u'Ministry of Innovation (Business of Technology)' , u'http://feeds.arstechnica.com/arstechnica/business') - ,(u'Risk Assessment (Security & Hacktivism)' , u'http://feeds.arstechnica.com/arstechnica/security') - ,(u'Law & Disorder (Civilizations & Discontents)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy') - ,(u'Infinite Loop (Apple Ecosystem)' , u'http://feeds.arstechnica.com/arstechnica/apple') - ,(u'Opposable Thumbs (Gaming & Entertainment)' , u'http://feeds.arstechnica.com/arstechnica/gaming') - ,(u'Scientific Method (Science & Exploration)' , u'http://feeds.arstechnica.com/arstechnica/science') - ,(u'Multiverse (Exploratoins & Meditations on Sci-Fi)' , u'http://feeds.arstechnica.com/arstechnica/multiverse') - ,(u'Cars Technica (All Things Automotive)' , u'http://feeds.arstechnica.com/arstechnica/cars') - ,(u'Staff Blogs (From the Minds of Ars)' , u'http://feeds.arstechnica.com/arstechnica/staff-blogs') - ] + (u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'), + (u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab'), + (u'Gear & Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets'), + (u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'), + (u'Risk Assessment (Security & Hacktivism)', u'http://feeds.arstechnica.com/arstechnica/security'), + (u'Law & Disorder (Civilizations & Discontents)', u'http://feeds.arstechnica.com/arstechnica/tech-policy'), + (u'Infinite Loop (Apple Ecosystem)', u'http://feeds.arstechnica.com/arstechnica/apple'), + (u'Opposable Thumbs (Gaming & Entertainment)', u'http://feeds.arstechnica.com/arstechnica/gaming'), + (u'Scientific Method (Science & Exploration)', u'http://feeds.arstechnica.com/arstechnica/science'), + (u'Multiverse (Exploratoins & Meditations on Sci-Fi)', u'http://feeds.arstechnica.com/arstechnica/multiverse'), + (u'Cars Technica (All Things Automotive)', u'http://feeds.arstechnica.com/arstechnica/cars'), + (u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs') + ] def append_page(self, soup, appendtag, position): - pager = soup.find(attrs={'class':'numbers'}) + pager = soup.find(attrs={'class': 'numbers'}) if pager: - nexttag = pager.find(attrs={'class':'next'}) + nexttag = pager.find(attrs={'class': 'next'}) if nexttag: nurl = nexttag.parent['href'] - rawc = self.index_to_soup(nurl,True) + rawc = self.index_to_soup(nurl, True) soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding) - texttag = soup2.find(attrs={'id':'article-guts'}) + texttag = soup2.find(attrs={'id': 'article-guts'}) newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) + self.append_page(soup2, texttag, newpos) texttag.extract() pager.extract() - appendtag.insert(position,texttag) + appendtag.insert(position, texttag) def preprocess_html(self, soup): self.append_page(soup, soup.body, 3) @@ -102,4 +95,4 @@ class ArsTechnica(BasicNewsRecipe): return soup def preprocess_raw_html(self, raw, url): - return ''+raw[raw.find(''):] + return '' + raw[raw.find(''):] diff --git a/recipes/asco_de_vida.recipe b/recipes/asco_de_vida.recipe index fa1944f95d..8756ca2238 100644 --- a/recipes/asco_de_vida.recipe +++ b/recipes/asco_de_vida.recipe @@ -1,20 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe + class HindustanTimes(BasicNewsRecipe): - title = u'Asco de vida' - language = 'es' - __author__ = 'Krittika Goyal' - oldest_article = 1 #days + title = u'Asco de vida' + language = 'es' + __author__ = 'Krittika Goyal' + oldest_article = 1 # days max_articles_per_feed = 25 - #encoding = 'cp1252' use_embedded_content = False no_stylesheets = True - keep_only_tags = dict(name='div', attrs={'class':'box story'}) - - - feeds = [ -('News', - 'http://feeds2.feedburner.com/AscoDeVida'), -] + keep_only_tags = dict(name='div', attrs={'class': 'box story'}) + feeds = [ + ('News', + 'http://feeds2.feedburner.com/AscoDeVida'), + ] diff --git a/recipes/asia_one.recipe b/recipes/asia_one.recipe index 7d8c2f4572..7ffc9c770d 100644 --- a/recipes/asia_one.recipe +++ b/recipes/asia_one.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009, Bruce ' ''' asiaone.com @@ -8,23 +8,25 @@ asiaone.com from calibre.web.feeds.news import BasicNewsRecipe + class AsiaOne(BasicNewsRecipe): - title = u'AsiaOne' + title = u'AsiaOne' oldest_article = 2 max_articles_per_feed = 100 - __author__ = 'Bruce' - description = 'News from Singapore Press Holdings Portal' + __author__ = 'Bruce' + description = 'News from Singapore Press Holdings Portal' no_stylesheets = False language = 'en_SG' remove_javascript = True - remove_tags = [dict(name='span', attrs={'class':'footer'})] + remove_tags = [dict(name='span', attrs={'class': 'footer'})] keep_only_tags = [ - dict(name='h1', attrs={'class':'headline'}), - dict(name='div', attrs={'class':['article-content','person-info row']}) - ] + dict(name='h1', attrs={'class': 'headline'}), + dict(name='div', attrs={ + 'class': ['article-content', 'person-info row']}) + ] - feeds = [ - ('Singapore', 'http://asiaone.feedsportal.com/c/34151/f/618415/index.rss'), - ('Asia', 'http://asiaone.feedsportal.com/c/34151/f/618416/index.rss') + feeds = [ + ('Singapore', 'http://asiaone.feedsportal.com/c/34151/f/618415/index.rss'), + ('Asia', 'http://asiaone.feedsportal.com/c/34151/f/618416/index.rss') - ] + ] diff --git a/recipes/asianreviewofbooks.recipe b/recipes/asianreviewofbooks.recipe index 0763f040ea..9a208cf61b 100644 --- a/recipes/asianreviewofbooks.recipe +++ b/recipes/asianreviewofbooks.recipe @@ -1,5 +1,5 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2012, Darko Miletic ' ''' www.asianreviewofbooks.com @@ -7,20 +7,21 @@ www.asianreviewofbooks.com from calibre.web.feeds.news import BasicNewsRecipe + class AsianReviewOfBooks(BasicNewsRecipe): - title = 'The Asian Review of Books' - __author__ = 'Darko Miletic' - description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.' - publisher = 'The Asian Review of Books' - category = 'literature, books, reviews, Asia' - oldest_article = 30 + title = 'The Asian Review of Books' + __author__ = 'Darko Miletic' + description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.' # noqa + publisher = 'The Asian Review of Books' + category = 'literature, books, reviews, Asia' + oldest_article = 30 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'cp1252' - language = 'en_CN' - publication_type = 'magazine' - masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png' + no_stylesheets = True + use_embedded_content = False + encoding = 'cp1252' + language = 'en_CN' + publication_type = 'magazine' + masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png' extra_css = """ body{font-family: serif} .big {font-size: xx-large} @@ -31,21 +32,16 @@ class AsianReviewOfBooks(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - - remove_tags = [dict(name=['object','script','iframe','embed'])] + remove_tags = [dict(name=['object', 'script', 'iframe', 'embed'])] remove_attributes = ['style', 'onclick'] - feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')] + feeds = [(u'Articles', u'http://www.asianreviewofbooks.com/new/rss.php')] def print_version(self, url): root, sep, artid = url.rpartition('?ID=') return root + 'getarticle.php?articleID=' + artid + '&stats=web' def preprocess_raw_html(self, raw, url): - return 'title' + raw + '' - + return 'title' + raw + '' diff --git a/recipes/astro_news_pl.recipe b/recipes/astro_news_pl.recipe index 5189154f3b..8f9911dcd8 100644 --- a/recipes/astro_news_pl.recipe +++ b/recipes/astro_news_pl.recipe @@ -1,18 +1,19 @@ from calibre.web.feeds.news import BasicNewsRecipe + + class AstroNEWS(BasicNewsRecipe): - title = u'AstroNEWS' - __author__ = 'fenuks' - description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.' - category = 'astronomy, science' - language = 'pl' + title = u'AstroNEWS' + __author__ = 'fenuks' + description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.' # noqa + category = 'astronomy, science' + language = 'pl' oldest_article = 8 max_articles_per_feed = 100 - #extra_css= 'table {text-align: left;}' - no_stylesheets=True - cover_url='http://news.astronet.pl/img/logo_news.jpg' + no_stylesheets = True + cover_url = 'http://news.astronet.pl/img/logo_news.jpg' remove_attributes = ['width', 'align'] - remove_tags=[dict(name='hr')] - feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')] + remove_tags = [dict(name='hr')] + feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')] def print_version(self, url): return url.replace('astronet.pl/', 'astronet.pl/print.cgi?') diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe index 902f99c2c8..d44aa994e5 100644 --- a/recipes/astroflesz.recipe +++ b/recipes/astroflesz.recipe @@ -1,11 +1,12 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai from calibre.web.feeds.news import BasicNewsRecipe + class Astroflesz(BasicNewsRecipe): title = u'Astroflesz' oldest_article = 7 __author__ = 'fenuks' - description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' + description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' # noqa category = 'astronomy' language = 'pl' cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png' @@ -16,12 +17,13 @@ class Astroflesz(BasicNewsRecipe): remove_empty_feeds = True remove_attributes = ['style'] keep_only_tags = [dict(id="k2Container")] - remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) - remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] + remove_tags_after = dict(name='div', attrs={'class': 'itemLinks'}) + remove_tags = [dict(name='div', attrs={ + 'class': ['itemLinks', 'itemToolbar', 'itemRatingBlock']})] feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')] def postprocess_html(self, soup, first_fetch): - t = soup.find(attrs={'class':'itemIntroText'}) + t = soup.find(attrs={'class': 'itemIntroText'}) if t: for i in t.findAll('img'): i['style'] = 'float: left; margin-right: 5px;' diff --git a/recipes/athens_news.recipe b/recipes/athens_news.recipe index 6667faaf0c..654dc85db2 100644 --- a/recipes/athens_news.recipe +++ b/recipes/athens_news.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2011, Darko Miletic ' ''' www.athensnews.gr @@ -6,21 +6,22 @@ www.athensnews.gr from calibre.web.feeds.news import BasicNewsRecipe + class AthensNews(BasicNewsRecipe): - title = 'Athens News' - __author__ = 'Darko Miletic' - description = 'Greece in English since 1952' - publisher = 'NEP Publishing Company SA' - category = 'news, politics, Greece, Athens' - oldest_article = 1 + title = 'Athens News' + __author__ = 'Darko Miletic' + description = 'Greece in English since 1952' + publisher = 'NEP Publishing Company SA' + category = 'news, politics, Greece, Athens' + oldest_article = 1 max_articles_per_feed = 200 - no_stylesheets = True - encoding = 'utf8' - use_embedded_content = False - language = 'en_GR' - remove_empty_feeds = True - publication_type = 'newspaper' - masthead_url = 'http://www.athensnews.gr/sites/athensnews/themes/athensnewsv3/images/logo.jpg' + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_GR' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.athensnews.gr/sites/athensnews/themes/athensnewsv3/images/logo.jpg' extra_css = """ body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em; display:block} @@ -30,36 +31,32 @@ class AthensNews(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - , 'linearize_tables' : True - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True + } remove_tags = [ - dict(name=['meta','link']) - ] - keep_only_tags=[ - dict(name='span',attrs={'class':'big'}) - ,dict(name='td', attrs={'class':['articlepubdate','text']}) - ] - remove_attributes=['lang'] - + dict(name=['meta', 'link']) + ] + keep_only_tags = [ + dict(name='span', attrs={'class': 'big'}), dict( + name='td', attrs={'class': ['articlepubdate', 'text']}) + ] + remove_attributes = ['lang'] feeds = [ - (u'News' , u'http://www.athensnews.gr/category/1/feed' ) - ,(u'Politics' , u'http://www.athensnews.gr/category/8/feed' ) - ,(u'Business' , u'http://www.athensnews.gr/category/2/feed' ) - ,(u'Economy' , u'http://www.athensnews.gr/category/11/feed') - ,(u'Community' , u'http://www.athensnews.gr/category/5/feed' ) - ,(u'Arts' , u'http://www.athensnews.gr/category/3/feed' ) - ,(u'Living in Athens', u'http://www.athensnews.gr/category/7/feed' ) - ,(u'Sports' , u'http://www.athensnews.gr/category/4/feed' ) - ,(u'Travel' , u'http://www.athensnews.gr/category/6/feed' ) - ,(u'Letters' , u'http://www.athensnews.gr/category/44/feed') - ,(u'Media' , u'http://www.athensnews.gr/multimedia/feed' ) - ] + + (u'News', u'http://www.athensnews.gr/category/1/feed'), + (u'Politics', u'http://www.athensnews.gr/category/8/feed'), + (u'Business', u'http://www.athensnews.gr/category/2/feed'), + (u'Economy', u'http://www.athensnews.gr/category/11/feed'), + (u'Community', u'http://www.athensnews.gr/category/5/feed'), + (u'Arts', u'http://www.athensnews.gr/category/3/feed'), + (u'Living in Athens', u'http://www.athensnews.gr/category/7/feed'), + (u'Sports', u'http://www.athensnews.gr/category/4/feed'), + (u'Travel', u'http://www.athensnews.gr/category/6/feed'), + (u'Letters', u'http://www.athensnews.gr/category/44/feed'), + (u'Media', u'http://www.athensnews.gr/multimedia/feed') + ] def print_version(self, url): return url + '?action=print' diff --git a/recipes/atlantic.recipe b/recipes/atlantic.recipe index 83a006f8e0..00928e498b 100644 --- a/recipes/atlantic.recipe +++ b/recipes/atlantic.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python2 from __future__ import unicode_literals -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' ''' theatlantic.com @@ -9,13 +9,15 @@ import html5lib from lxml import html from calibre.web.feeds.news import BasicNewsRecipe + def classes(classes): q = frozenset(classes.split(' ')) - return dict(attrs={'class':lambda x:x and frozenset(x.split()).intersection(q)}) + return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) + class TheAtlantic(BasicNewsRecipe): - title = 'The Atlantic' + title = 'The Atlantic' __author__ = 'Kovid Goyal' description = 'Current affairs and politics focussed on the US' INDEX = 'http://www.theatlantic.com/magazine/' @@ -23,13 +25,14 @@ class TheAtlantic(BasicNewsRecipe): encoding = 'utf-8' keep_only_tags = [ - classes('article-header article-body article-magazine metadata article-cover-content lead-img'), + classes( + 'article-header article-body article-magazine metadata article-cover-content lead-img'), ] - remove_tags = [ + remove_tags = [ {'name': ['meta', 'link', 'noscript']}, - {'attrs':{'class':['offset-wrapper', 'ad-boxfeatures-wrapper']}}, - {'attrs':{'class':lambda x: x and 'article-tools' in x}}, - {'src':lambda x:x and 'spotxchange.com' in x}, + {'attrs': {'class': ['offset-wrapper', 'ad-boxfeatures-wrapper']}}, + {'attrs': {'class': lambda x: x and 'article-tools' in x}}, + {'src': lambda x: x and 'spotxchange.com' in x}, ] remove_tags_after = classes('article-body') @@ -48,7 +51,7 @@ class TheAtlantic(BasicNewsRecipe): return url + '?single_page=true' def preprocess_html(self, soup): - for img in soup.findAll('img', attrs={'data-src':True}): + for img in soup.findAll('img', attrs={'data-src': True}): img['src'] = img['data-src'] return soup @@ -61,8 +64,8 @@ class TheAtlantic(BasicNewsRecipe): self.cover_url = img['src'] current_section, current_articles = 'Cover Story', [] feeds = [] - for div in soup.findAll('div', attrs={'class':lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}): - for h2 in div.findAll('h2', attrs={'class':True}): + for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}): + for h2 in div.findAll('h2', attrs={'class': True}): if 'section-name' in h2['class'].split(): if current_articles: feeds.append((current_section, current_articles)) @@ -75,18 +78,22 @@ class TheAtlantic(BasicNewsRecipe): url = a['href'] if url.startswith('/'): url = 'http://www.theatlantic.com' + url - li = a.findParent('li', attrs={'class':lambda x: x and 'article' in x.split()}) + li = a.findParent( + 'li', attrs={'class': lambda x: x and 'article' in x.split()}) desc = '' - dek = li.find(attrs={'class':lambda x:x and 'dek' in x.split()}) + dek = li.find( + attrs={'class': lambda x: x and 'dek' in x.split()}) if dek is not None: desc += self.tag_to_string(dek) - byline = li.find(attrs={'class':lambda x:x and 'byline' in x.split()}) + byline = li.find( + attrs={'class': lambda x: x and 'byline' in x.split()}) if byline is not None: desc += ' -- ' + self.tag_to_string(byline) self.log('\t', title, 'at', url) if desc: self.log('\t\t', desc) - current_articles.append({'title':title, 'url':url, 'description':desc}) + current_articles.append( + {'title': title, 'url': url, 'description': desc}) if current_articles: feeds.append((current_section, current_articles)) return feeds diff --git a/recipes/atlantic_com.recipe b/recipes/atlantic_com.recipe index b42fbdec87..6e81186ec3 100644 --- a/recipes/atlantic_com.recipe +++ b/recipes/atlantic_com.recipe @@ -3,20 +3,21 @@ from __future__ import unicode_literals, division, absolute_import, print_function from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1421956712(BasicNewsRecipe): - title = 'TheAtlantic.com' - __author__ = 'ebrandon' - language = 'en' - description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine' + title = 'TheAtlantic.com' + __author__ = 'ebrandon' + language = 'en' + description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine' oldest_article = 7 max_articles_per_feed = 100 - auto_cleanup = True + auto_cleanup = True ignore_duplicate_articles = {'title', 'url'} def print_version(self, url): return url.replace('/archive/', '/print/') - feeds = [ + feeds = [ ('Politics', 'http://feeds.feedburner.com/AtlanticPoliticsChannel'), ('International', 'http://feeds.feedburner.com/AtlanticInternational'), ('National', 'http://feeds.feedburner.com/AtlanticNational'), diff --git a/recipes/attac_es.recipe b/recipes/attac_es.recipe index e57d321423..9e0d11f438 100644 --- a/recipes/attac_es.recipe +++ b/recipes/attac_es.recipe @@ -2,14 +2,15 @@ from __future__ import unicode_literals from calibre.web.feeds.news import BasicNewsRecipe + class AttacEspanaRecipe (BasicNewsRecipe): __author__ = 'Marc Busqué ' __url__ = 'http://www.lamarciana.com' __version__ = '1.0.2' - __license__ = 'GPL v3' + __license__ = 'GPL v3' __copyright__ = '2012, Marc Busqué ' title = u'attac.es' - description = u'La Asociación por la Tasación de las Transacciones Financieras y por la Ayuda a los Ciudadanos (ATTAC) es un movimiento internacional altermundialista que promueve el control democrático de los mercados financieros y las instituciones encargadas de su control mediante la reflexión política y la movilización social.' + description = u'La Asociación por la Tasación de las Transacciones Financieras y por la Ayuda a los Ciudadanos (ATTAC) es un movimiento internacional altermundialista que promueve el control democrático de los mercados financieros y las instituciones encargadas de su control mediante la reflexión política y la movilización social.' # noqa url = 'http://www.attac.es' language = 'es' tags = 'contrainformación, información alternativa' @@ -27,5 +28,5 @@ class AttacEspanaRecipe (BasicNewsRecipe): cover_url = u'http://www.attac.es/wp-content/themes/attacweb/images/attaces.jpg' feeds = [ - (u'Attac', u'http://www.attac.es/feed'), - ] + (u'Attac', u'http://www.attac.es/feed'), + ] diff --git a/recipes/auto.recipe b/recipes/auto.recipe index d8faf8d1f3..3327ef6e41 100644 --- a/recipes/auto.recipe +++ b/recipes/auto.recipe @@ -1,9 +1,9 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'GabrieleMarini, based on Darko Miletic' -__copyright__ = '2009, Darko Miletic , Gabriele Marini' -__version__ = 'v1.02 Marini Gabriele ' -__date__ = '14062010' +__license__ = 'GPL v3' +__author__ = 'GabrieleMarini, based on Darko Miletic' +__copyright__ = '2009, Darko Miletic , Gabriele Marini' +__version__ = 'v1.02 Marini Gabriele ' +__date__ = '14062010' __description__ = 'Italian daily newspaper' ''' @@ -11,53 +11,46 @@ http://www.corrieredellosport.it/ ''' from calibre.web.feeds.news import BasicNewsRecipe + class Auto(BasicNewsRecipe): - __author__ = 'Gabriele Marini' - description = 'Auto and Formula 1' + __author__ = 'Gabriele Marini' + description = 'Auto and Formula 1' - cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png' + cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png' + title = u'Auto' + publisher = 'CONTE Editore' + category = 'Sport' - title = u'Auto' - publisher = 'CONTE Editore' - category = 'Sport' - - language = 'it' - timefmt = '[%a, %d %b, %Y]' + language = 'it' + timefmt = '[%a, %d %b, %Y]' oldest_article = 60 max_articles_per_feed = 30 - use_embedded_content = False - recursion = 10 + use_embedded_content = False + recursion = 10 remove_javascript = True no_stylesheets = True html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] + '--comment', description, '--category', category, '--publisher', publisher, '--ignore-tables' + ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + \ + description + '"\ntags="' + category + '"\nlinearize_tables=True' keep_only_tags = [ - dict(name='h2', attrs={'class':['tit_Article y_Txt']}), - dict(name='h2', attrs={'class':['tit_Article']}), - dict(name='div', attrs={'class':['box_Img newsdet_new ']}), - dict(name='div', attrs={'class':['box_Img newsdet_as ']}), - dict(name='table', attrs={'class':['table_A']}), - dict(name='div', attrs={'class':['txt_Article txtBox_cms']}), - dict(name='testoscheda')] - + dict(name='h2', attrs={'class': ['tit_Article y_Txt']}), + dict(name='h2', attrs={'class': ['tit_Article']}), + dict(name='div', attrs={'class': ['box_Img newsdet_new ']}), + dict(name='div', attrs={'class': ['box_Img newsdet_as ']}), + dict(name='table', attrs={'class': ['table_A']}), + dict(name='div', attrs={'class': ['txt_Article txtBox_cms']}), + dict(name='testoscheda')] feeds = [ - (u'Tutte le News' , u'http://www.auto.it/rss/articoli.xml' ), - (u'Prove su Strada' , u'http://www.auto.it/rss/prove+6.xml'), - (u'Novit\xe0' , u'http://www.auto.it/rss/novita+3.xml') - ] - - - - + (u'Tutte le News', u'http://www.auto.it/rss/articoli.xml'), + (u'Prove su Strada', u'http://www.auto.it/rss/prove+6.xml'), + (u'Novit\xe0', u'http://www.auto.it/rss/novita+3.xml') + ] diff --git a/recipes/auto_blog.recipe b/recipes/auto_blog.recipe index 33e2c7095e..8584058be2 100644 --- a/recipes/auto_blog.recipe +++ b/recipes/auto_blog.recipe @@ -1,16 +1,15 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AutoBlog(BasicNewsRecipe): - title = u'Auto Blog' - __author__ = 'Welovelucy' + title = u'Auto Blog' + __author__ = 'Welovelucy' language = 'en' description = 'Auto industry news' oldest_article = 7 max_articles_per_feed = 100 - feeds = [(u'AutoBlog', u'http://www.autoblog.com/rss.xml')] + feeds = [(u'AutoBlog', u'http://www.autoblog.com/rss.xml')] def print_version(self, url): return url + 'print/' - - diff --git a/recipes/auto_prove.recipe b/recipes/auto_prove.recipe index 806c933656..1bd7f6e7ad 100644 --- a/recipes/auto_prove.recipe +++ b/recipes/auto_prove.recipe @@ -1,9 +1,9 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'GabrieleMarini, based on Darko Miletic' -__copyright__ = '2009, Darko Miletic , Gabriele Marini' -__version__ = 'v1.02 Marini Gabriele ' -__date__ = '10, January 2010' +__license__ = 'GPL v3' +__author__ = 'GabrieleMarini, based on Darko Miletic' +__copyright__ = '2009, Darko Miletic , Gabriele Marini' +__version__ = 'v1.02 Marini Gabriele ' +__date__ = '10, January 2010' __description__ = 'Italian daily newspaper' ''' @@ -11,80 +11,80 @@ http://www.corrieredellosport.it/ ''' from calibre.web.feeds.news import BasicNewsRecipe + class AutoPR(BasicNewsRecipe): - __author__ = 'Gabriele Marini' - description = 'Auto and Formula 1' + __author__ = 'Gabriele Marini' + description = 'Auto and Formula 1' - cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png' + cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png' + title = u'Auto Prove' + publisher = 'CONTE Editore' + category = 'Sport' - title = u'Auto Prove' - publisher = 'CONTE Editore' - category = 'Sport' - - language = 'it' - timefmt = '[%a, %d %b, %Y]' + language = 'it' + timefmt = '[%a, %d %b, %Y]' oldest_article = 60 max_articles_per_feed = 20 - use_embedded_content = False - recursion = 100 + use_embedded_content = False + recursion = 100 remove_javascript = True no_stylesheets = True - #html2lrf_options = [ + # html2lrf_options = [ # '--comment', description # , '--category', category # , '--publisher', publisher # , '--ignore-tables' # ] - #html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - keep_only_tags = [ - dict(name='h2', attrs={'class':['tit_Article y_Txt']}), - dict(name='h2', attrs={'class':['tit_Article']}), - dict(name='div', attrs={'class':['box_Img newsdet_new ']}), - dict(name='div', attrs={'class':['box_Img newsdet_as ']}), - dict(name='table', attrs={'class':['table_A']}), - dict(name='div', attrs={'class':['txt_Article txtBox_cms']}), - dict(name='testoscheda')] + dict(name='h2', attrs={'class': ['tit_Article y_Txt']}), + dict(name='h2', attrs={'class': ['tit_Article']}), + dict(name='div', attrs={'class': ['box_Img newsdet_new ']}), + dict(name='div', attrs={'class': ['box_Img newsdet_as ']}), + dict(name='table', attrs={'class': ['table_A']}), + dict(name='div', attrs={'class': ['txt_Article txtBox_cms']}), + dict(name='testoscheda')] def parse_index(self): feeds = [] for title, url in [ - ("Prove su Strada" , "http://www.auto.it/rss/prove+6.xml") - ]: + ("Prove su Strada", "http://www.auto.it/rss/prove+6.xml") + ]: soup = self.index_to_soup(url) soup = soup.find('channel') print soup for article in soup.findAllNext('item'): - title = self.tag_to_string(article.title) + title = self.tag_to_string(article.title) date = self.tag_to_string(article.pubDate) description = self.tag_to_string(article.description) link = self.tag_to_string(article.guid) # print article articles = self.create_links_append(link, date, description) if articles: - feeds.append((title, articles)) + feeds.append((title, articles)) return feeds def create_links_append(self, link, date, description): current_articles = [] - current_articles.append({'title': 'Generale', 'url': link,'description':description, 'date':date}), - current_articles.append({'title': 'Design', 'url': link.replace('scheda','design'),'description':'scheda', 'date':''}), - current_articles.append({'title': 'Interni', 'url': link.replace('scheda','interni'),'description':'Interni', 'date':''}), - current_articles.append({'title': 'Tecnica', 'url': link.replace('scheda','tecnica'),'description':'Tecnica', 'date':''}), - current_articles.append({'title': 'Su Strada', 'url': link.replace('scheda','su_strada'),'description':'Su Strada', 'date':''}), - current_articles.append({'title': 'Pagella', 'url': link.replace('scheda','pagella'),'description':'Pagella', 'date':''}), - current_articles.append({'title': 'Rilevamenti', 'url': link.replace('scheda','telemetria'),'description':'Rilevamenti', 'date':''}) + current_articles.append( + {'title': 'Generale', 'url': link, 'description': description, 'date': date}), + current_articles.append({'title': 'Design', 'url': link.replace( + 'scheda', 'design'), 'description': 'scheda', 'date': ''}), + current_articles.append({'title': 'Interni', 'url': link.replace( + 'scheda', 'interni'), 'description': 'Interni', 'date': ''}), + current_articles.append({'title': 'Tecnica', 'url': link.replace( + 'scheda', 'tecnica'), 'description': 'Tecnica', 'date': ''}), + current_articles.append({'title': 'Su Strada', 'url': link.replace( + 'scheda', 'su_strada'), 'description': 'Su Strada', 'date': ''}), + current_articles.append({'title': 'Pagella', 'url': link.replace( + 'scheda', 'pagella'), 'description': 'Pagella', 'date': ''}), + current_articles.append({'title': 'Rilevamenti', 'url': link.replace( + 'scheda', 'telemetria'), 'description': 'Rilevamenti', 'date': ''}) return current_articles - - - - - diff --git a/recipes/autobild.recipe b/recipes/autobild.recipe index ad63252ae9..95dd2aedb5 100644 --- a/recipes/autobild.recipe +++ b/recipes/autobild.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' auto-bild.ro @@ -9,47 +9,42 @@ auto-bild.ro from calibre.web.feeds.news import BasicNewsRecipe + class AutoBild(BasicNewsRecipe): - title = u'Auto Bild' - __author__ = u'Silviu Cotoar\u0103' - description = 'Auto' - publisher = 'Auto Bild' - oldest_article = 50 - language = 'ro' + title = u'Auto Bild' + __author__ = u'Silviu Cotoar\u0103' + description = 'Auto' + publisher = 'Auto Bild' + oldest_article = 50 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Reviste,Auto' - encoding = 'utf-8' - cover_url = 'http://www.auto-bild.ro/images/autobild.gif' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Reviste,Auto' + encoding = 'utf-8' + cover_url = 'http://www.auto-bild.ro/images/autobild.gif' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } - + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='div', attrs={'class':'box_2 articol clearfix'}) - ] + dict(name='div', attrs={'class': 'box_2 articol clearfix'}) + ] remove_tags = [ - dict(name='div', attrs={'class':['detail']}) - , dict(name='a', attrs={'id':['zoom_link']}) - , dict(name='div', attrs={'class':['icons clearfix']}) - , dict(name='div', attrs={'class':['pub_articol clearfix']}) + dict(name='div', attrs={'class': ['detail']}), dict(name='a', attrs={'id': ['zoom_link']}), dict( + name='div', attrs={'class': ['icons clearfix']}), dict(name='div', attrs={'class': ['pub_articol clearfix']}) - ] + ] remove_tags_after = [ - dict(name='div', attrs={'class':['pub_articol clearfix']}) - ] + dict(name='div', attrs={'class': ['pub_articol clearfix']}) + ] - feeds = [ + feeds = [ (u'Feeds', u'http://www.auto-bild.ro/rss/toate') - ] + ] def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/automatiseringgids.recipe b/recipes/automatiseringgids.recipe index 8794f1b424..4f429e1e9b 100644 --- a/recipes/automatiseringgids.recipe +++ b/recipes/automatiseringgids.recipe @@ -1,27 +1,28 @@ import re from calibre.web.feeds.news import BasicNewsRecipe + class autogids(BasicNewsRecipe): title = u'Automatiseringgids IT' oldest_article = 7 - __author__ = 'DrMerry' - description = 'IT-nieuws van Automatiseringgids' - language = 'nl' - publisher = 'AutomatiseringGids' - category = 'Nieuws, IT, Nederlandstalig' + __author__ = 'DrMerry' + description = 'IT-nieuws van Automatiseringgids' + language = 'nl' + publisher = 'AutomatiseringGids' + category = 'Nieuws, IT, Nederlandstalig' simultaneous_downloads = 5 - timefmt = ' [%a, %d %B, %Y]' + timefmt = ' [%a, %d %B, %Y]' no_stylesheets = True remove_javascript = True remove_empty_feeds = True publication_type = 'newspaper' - encoding = 'utf-8' - cover_url = 'http://www.automatiseringgids.nl/binaries/content/gallery/ag/marketing/ag-avatar-100x50.jpg' - keep_only_tags = [dict(name='div', attrs={'class':['content']})] + encoding = 'utf-8' + cover_url = 'http://www.automatiseringgids.nl/binaries/content/gallery/ag/marketing/ag-avatar-100x50.jpg' + keep_only_tags = [dict(name='div', attrs={'class': ['content']})] preprocess_regexps = [ - (re.compile(r'(

    Reacties

    |

    Zie ook:

    |
    |]*>|)', re.DOTALL | re.IGNORECASE), + lambda match: ''), ] - feeds = [(u'Actueel', u'http://www.automatiseringgids.nl/rss.aspx')] + feeds = [(u'Actueel', u'http://www.automatiseringgids.nl/rss.aspx')] diff --git a/recipes/autosport.recipe b/recipes/autosport.recipe index 5c4465f652..5b9f91516d 100644 --- a/recipes/autosport.recipe +++ b/recipes/autosport.recipe @@ -9,22 +9,25 @@ www.autosport.com from calibre.web.feeds.news import BasicNewsRecipe + class autosport(BasicNewsRecipe): title = u'Autosport' __author__ = 'MrStefan ' language = 'en_GB' - description =u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...' - masthead_url='http://cdn.images.autosport.com/asdotcom.gif' - remove_empty_feeds= True + description = u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...' # noqa + masthead_url = 'http://cdn.images.autosport.com/asdotcom.gif' + remove_empty_feeds = True oldest_article = 1 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True - keep_only_tags =[] - keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'news_headline'})) - keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_author'})) - keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_date'})) - keep_only_tags.append(dict(name = 'p')) + keep_only_tags = [] + keep_only_tags.append(dict(name='h1', attrs={'class': 'news_headline'})) + keep_only_tags.append( + dict(name='td', attrs={'class': 'news_article_author'})) + keep_only_tags.append( + dict(name='td', attrs={'class': 'news_article_date'})) + keep_only_tags.append(dict(name='p')) feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')] diff --git a/recipes/avantaje.recipe b/recipes/avantaje.recipe index dbd6b75d4a..ec67bb3c82 100644 --- a/recipes/avantaje.recipe +++ b/recipes/avantaje.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' avantaje.ro @@ -9,49 +9,41 @@ avantaje.ro from calibre.web.feeds.news import BasicNewsRecipe + class Avantaje(BasicNewsRecipe): - title = u'Avantaje' - __author__ = u'Silviu Cotoar\u0103' - description = u'' - publisher = u'Avantaje' - oldest_article = 25 - language = 'ro' + title = u'Avantaje' + __author__ = u'Silviu Cotoar\u0103' + description = u'' + publisher = u'Avantaje' + oldest_article = 25 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Reviste,Stiri' - encoding = 'utf-8' - cover_url = 'http://www.avantaje.ro/images/default/logo.gif' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Reviste,Stiri' + encoding = 'utf-8' + cover_url = 'http://www.avantaje.ro/images/default/logo.gif' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='div', attrs={'id':'articol'}) - , dict(name='div', attrs={'class':'gallery clearfix'}) - , dict(name='div', attrs={'align':'justify'}) - ] + dict(name='div', attrs={'id': 'articol'}), dict(name='div', attrs={ + 'class': 'gallery clearfix'}), dict(name='div', attrs={'align': 'justify'}) + ] remove_tags = [ - dict(name='div', attrs={'id':['color_sanatate_box']}) - , dict(name='div', attrs={'class':['nav']}) - , dict(name='div', attrs={'class':['voteaza_art']}) - , dict(name='div', attrs={'class':['bookmark']}) - , dict(name='div', attrs={'class':['links clearfix']}) - , dict(name='div', attrs={'class':['title']}) - ] + dict(name='div', attrs={'id': ['color_sanatate_box']}), dict(name='div', attrs={'class': ['nav']}), dict(name='div', attrs={'class': ['voteaza_art']}), dict(name='div', attrs={'class': ['bookmark']}), dict(name='div', attrs={'class': ['links clearfix']}), dict(name='div', attrs={'class': ['title']}) # noqa + ] remove_tags_after = [ - dict(name='div', attrs={'class':['title']}) - ] + dict(name='div', attrs={'class': ['title']}) + ] - feeds = [ - (u'Feeds', u'http://feeds.feedburner.com/Avantaje') - ] + feeds = [ + (u'Feeds', u'http://feeds.feedburner.com/Avantaje') + ] def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/aventurilapescuit.recipe b/recipes/aventurilapescuit.recipe index 6eac5ad24c..7321226175 100644 --- a/recipes/aventurilapescuit.recipe +++ b/recipes/aventurilapescuit.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' aventurilapescuit.ro @@ -9,43 +9,41 @@ aventurilapescuit.ro from calibre.web.feeds.news import BasicNewsRecipe + class AventuriLaPescuit(BasicNewsRecipe): - title = u'Aventuri La Pescuit' - __author__ = u'Silviu Cotoar\u0103' - description = 'Aventuri La Pescuit' - publisher = 'Aventuri La Pescuit' - oldest_article = 5 - language = 'ro' + title = u'Aventuri La Pescuit' + __author__ = u'Silviu Cotoar\u0103' + description = 'Aventuri La Pescuit' + publisher = 'Aventuri La Pescuit' + oldest_article = 5 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Pescuit,Hobby' - encoding = 'utf-8' - cover_url = 'http://www.aventurilapescuit.ro/images/logo.gif' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Pescuit,Hobby' + encoding = 'utf-8' + cover_url = 'http://www.aventurilapescuit.ro/images/logo.gif' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='div', attrs={'id':'Article'}) - ] + dict(name='div', attrs={'id': 'Article'}) + ] remove_tags = [ - dict(name='div', attrs={'class':['right option']}) - , dict(name='iframe', attrs={'scrolling':['no']}) - ] + dict(name='div', attrs={'class': ['right option']}), dict( + name='iframe', attrs={'scrolling': ['no']}) + ] remove_tags_after = [ - dict(name='iframe', attrs={'scrolling':['no']}) - ] + dict(name='iframe', attrs={'scrolling': ['no']}) + ] - feeds = [ - (u'Feeds', u'http://www.aventurilapescuit.ro/sections/rssread/1') - ] + feeds = [ + (u'Feeds', u'http://www.aventurilapescuit.ro/sections/rssread/1') + ] def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/avto-magazin.recipe b/recipes/avto-magazin.recipe index adaf74546e..b4d0f81e2e 100644 --- a/recipes/avto-magazin.recipe +++ b/recipes/avto-magazin.recipe @@ -4,44 +4,44 @@ __copyright__ = '2010, BlonG' avto-magazin.si ''' from calibre.web.feeds.news import BasicNewsRecipe + + class Dnevnik(BasicNewsRecipe): - title = u'Avto Magazin' - __author__ = u'BlonG' - description = u'Za avtomobilisti\xc4\x8dne frike, poznavalce in nedeljske \xc5\xa1oferje.' - oldest_article = 7 - max_articles_per_feed = 20 - labguage = 'sl' - no_stylesheets = True - use_embedded_content = False - language = 'sl' + title = u'Avto Magazin' + __author__ = u'BlonG' + description = u'Za avtomobilisti\xc4\x8dne frike, poznavalce in nedeljske \xc5\xa1oferje.' + oldest_article = 7 + max_articles_per_feed = 20 + labguage = 'sl' + no_stylesheets = True + use_embedded_content = False + language = 'sl' - conversion_options = {'linearize_tables' : True} + conversion_options = {'linearize_tables': True} + cover_url = 'https://sites.google.com/site/javno2010/home/avto_magazin_cover.jpg' - cover_url = 'https://sites.google.com/site/javno2010/home/avto_magazin_cover.jpg' + extra_css = ''' + h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} + p{font-family:Arial,Helvetica,sans-serif;font-size:small;} + body{font-family:Helvetica,Arial,sans-serif;font-size:small;} + ''' - extra_css = ''' - h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} - h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} - p{font-family:Arial,Helvetica,sans-serif;font-size:small;} - body{font-family:Helvetica,Arial,sans-serif;font-size:small;} - ''' + keep_only_tags = [ + dict(name='div', attrs={'id': '_iprom_inStream'}), + # dict(name='div', attrs={'class':'entry-content'}), + ] - keep_only_tags = [ - dict(name='div', attrs={'id':'_iprom_inStream'}), -# dict(name='div', attrs={'class':'entry-content'}), - ] + remove_tags = [ + dict(name='div', attrs={'id': 'voteConfirmation'}), + dict(name='div', attrs={'id': 'InsideVote'}), + dict(name='div', attrs={'class': 'Zone234'}), + dict(name='div', attrs={'class': 'Comments'}), + dict(name='div', attrs={'class': 'sorodneNovice'}), + dict(name='div', attrs={'id': 'footer'}), + ] - remove_tags = [ - dict(name='div', attrs={'id':'voteConfirmation'}), - dict(name='div', attrs={'id':'InsideVote'}), - dict(name='div', attrs={'class':'Zone234'}), - dict(name='div', attrs={'class':'Comments'}), - dict(name='div', attrs={'class':'sorodneNovice'}), - dict(name='div', attrs={'id':'footer'}), - ] - - - feeds = [ - (u'Novice', u'http://www.avto-magazin.si/rss/') - ] + feeds = [ + (u'Novice', u'http://www.avto-magazin.si/rss/') + ] diff --git a/recipes/axxon_magazine.recipe b/recipes/axxon_magazine.recipe index 93cb5cd03b..1ecf01e276 100644 --- a/recipes/axxon_magazine.recipe +++ b/recipes/axxon_magazine.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' ''' axxon.com.ar @@ -6,35 +6,33 @@ axxon.com.ar from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe + class Axxon_news(BasicNewsRecipe): - title = 'Revista Axxon' - __author__ = 'Darko Miletic' - description = 'Axxon, Ciencia Ficcion en Bits' - publisher = 'Revista Axxon - Ciencia Ficcion' - category = 'SF, Argentina' - oldest_article = 31 - delay = 1 + title = 'Revista Axxon' + __author__ = 'Darko Miletic' + description = 'Axxon, Ciencia Ficcion en Bits' + publisher = 'Revista Axxon - Ciencia Ficcion' + category = 'SF, Argentina' + oldest_article = 31 + delay = 1 max_articles_per_feed = 100 - no_stylesheets = False - use_embedded_content = False - language = 'es_AR' - encoding = 'utf-8' - publication_type = 'magazine' - INDEX = 'http://axxon.com.ar/rev/' - extra_css = ' body{font-family: Verdana,Arial,sans-serif} .editorial{font-family: serif} .posttitle{font-family: "Trebuchet MS","Lucida Grande",Verdana,Arial,sans-serif} .cuento{font-family: "Times New Roman", serif} .biografia{color: red; font-weight: bold; font-family: Verdana,Geneva,Arial,Helvetica,sans-serif} ' + no_stylesheets = False + use_embedded_content = False + language = 'es_AR' + encoding = 'utf-8' + publication_type = 'magazine' + INDEX = 'http://axxon.com.ar/rev/' + extra_css = ' body{font-family: Verdana,Arial,sans-serif} .editorial{font-family: serif} .posttitle{font-family: "Trebuchet MS","Lucida Grande",Verdana,Arial,sans-serif} .cuento{font-family: "Times New Roman", serif} .biografia{color: red; font-weight: bold; font-family: Verdana,Geneva,Arial,Helvetica,sans-serif} ' # noqa conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } - - keep_only_tags = [dict(name='div', attrs={'class':'post'})] - remove_tags = [dict(name=['object','link','iframe','embed','img'])] - remove_tags_after = [dict(attrs={'class':['editorial','correo','biografia','articulo']})] - remove_attributes = ['width','height','font','border','align'] + keep_only_tags = [dict(name='div', attrs={'class': 'post'})] + remove_tags = [dict(name=['object', 'link', 'iframe', 'embed', 'img'])] + remove_tags_after = [ + dict(attrs={'class': ['editorial', 'correo', 'biografia', 'articulo']})] + remove_attributes = ['width', 'height', 'font', 'border', 'align'] def parse_index(self): articles = [] @@ -44,21 +42,16 @@ class Axxon_news(BasicNewsRecipe): description = '' title_prefix = '' feed_link = item.find('a') - if feed_link and feed_link.has_key('href') and feed_link['href'].startswith('?p='): - url = self.INDEX + feed_link['href'] + if feed_link and feed_link.has_key('href') and feed_link['href'].startswith('?p='): # noqa + url = self.INDEX + feed_link['href'] title = title_prefix + self.tag_to_string(feed_link) - date = strftime(self.timefmt) + date = strftime(self.timefmt) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) + 'title': title, 'date': date, 'url': url, 'description': description + }) return [(soup.head.title.string, articles)] - def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] return self.adeify_images(soup) - diff --git a/recipes/axxon_news.recipe b/recipes/axxon_news.recipe index 0d3cea5a22..1dcb1a2337 100644 --- a/recipes/axxon_news.recipe +++ b/recipes/axxon_news.recipe @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009, Darko Miletic ' ''' axxon.com.ar @@ -8,55 +8,50 @@ axxon.com.ar from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag + class Axxon_news(BasicNewsRecipe): - title = 'Axxon noticias' - __author__ = 'Darko Miletic' - description = 'Axxon, Ciencia Ficcion en Bits' - publisher = 'Axxon' - category = 'news, SF, Argentina, science, movies' - oldest_article = 7 + title = 'Axxon noticias' + __author__ = 'Darko Miletic' + description = 'Axxon, Ciencia Ficcion en Bits' + publisher = 'Axxon' + category = 'news, SF, Argentina, science, movies' + oldest_article = 7 max_articles_per_feed = 100 - no_stylesheets = False - use_embedded_content = False + no_stylesheets = False + use_embedded_content = False language = 'es_AR' - lang = 'es-AR' + lang = 'es-AR' conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : lang - , 'pretty_print' : True - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True + } + keep_only_tags = [dict(name='div', attrs={'class': 'post'})] - keep_only_tags = [dict(name='div', attrs={'class':'post'})] + remove_tags = [dict(name=['object', 'link', 'iframe', 'embed'])] - remove_tags = [dict(name=['object','link','iframe','embed'])] - - feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')] - - remove_attributes = ['style','width','height','font','border','align'] + feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')] + remove_attributes = ['style', 'width', 'height', 'font', 'border', 'align'] def adeify_images2(cls, soup): for item in soup.findAll('img'): - for attrib in ['height','width','border','align','style']: - if item.has_key(attrib): - del item[attrib] + for attrib in ['height', 'width', 'border', 'align', 'style']: + if item.has_key(attrib): # noqa + del item[attrib] oldParent = item.parent if oldParent.name == 'a': - oldParent.name == 'p' + oldParent.name == 'p' myIndex = oldParent.contents.index(item) - brtag = Tag(soup,'br') - oldParent.insert(myIndex+1,brtag) + brtag = Tag(soup, 'br') + oldParent.insert(myIndex + 1, brtag) return soup def preprocess_html(self, soup): soup.html['xml:lang'] = self.lang - soup.html['lang'] = self.lang - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - soup.html.insert(0,mlang) + soup.html['lang'] = self.lang + mlang = Tag(soup, 'meta', [ + ("http-equiv", "Content-Language"), ("content", self.lang)]) + soup.html.insert(0, mlang) return self.adeify_images2(soup) - diff --git a/recipes/azstarnet.recipe b/recipes/azstarnet.recipe index 3ab10a9b6f..153d0a56e3 100644 --- a/recipes/azstarnet.recipe +++ b/recipes/azstarnet.recipe @@ -1,5 +1,5 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2009-2010, Darko Miletic ' ''' azstarnet.com @@ -7,53 +7,47 @@ azstarnet.com import urllib from calibre.web.feeds.news import BasicNewsRecipe + class Azstarnet(BasicNewsRecipe): - title = 'Arizona Daily Star' - __author__ = 'Darko Miletic' - description = 'news from Arizona' - language = 'en' - publisher = 'azstarnet.com' - category = 'news, politics, Arizona, USA' - oldest_article = 3 + title = 'Arizona Daily Star' + __author__ = 'Darko Miletic' + description = 'news from Arizona' + language = 'en' + publisher = 'azstarnet.com' + category = 'news, politics, Arizona, USA' + oldest_article = 3 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - masthead_url = 'http://azstarnet.com/content/tncms/live/global/resources/images/logo.gif' - needs_subscription = True + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://azstarnet.com/content/tncms/live/global/resources/images/logo.gif' + needs_subscription = True conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } - + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } def get_browser(self): br = BasicNewsRecipe.get_browser(self) br.open('http://azstarnet.com/') if self.username is not None and self.password is not None: - data = urllib.urlencode({ 'm':'login' - ,'u':self.username - ,'p':self.password - ,'z':'http://azstarnet.com/' - }) - br.open('http://azstarnet.com/app/registration/proxy.php',data) + data = urllib.urlencode({'m': 'login', 'u': self.username, 'p': self.password, 'z': 'http://azstarnet.com/' + }) + br.open('http://azstarnet.com/app/registration/proxy.php', data) return br - remove_tags = [dict(name=['object','link','iframe','base','img'])] - + remove_tags = [dict(name=['object', 'link', 'iframe', 'base', 'img'])] feeds = [ - (u'Local News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/local&l=25&s=start_time&sd=desc') - ,(u'National News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/national&l=25&s=start_time&sd=desc') - ,(u'World News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/world&l=25&s=start_time&sd=desc') - ,(u'Sports' , u'http://azstarnet.com/search/?f=rss&t=article&c=sports&l=25&s=start_time&sd=desc') - ,(u'Opinion' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/opinion&l=25&s=start_time&sd=desc') - ,(u'Movies' , u'http://azstarnet.com/search/?f=rss&t=article&c=entertainment/movies&l=25&s=start_time&sd=desc') - ,(u'Food' , u'http://azstarnet.com/search/?f=rss&t=article&c=lifestyles/food-and-cooking&l=25&s=start_time&sd=desc') - ] + + (u'Local News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/local&l=25&s=start_time&sd=desc'), + (u'National News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/national&l=25&s=start_time&sd=desc'), + (u'World News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/world&l=25&s=start_time&sd=desc'), + (u'Sports', u'http://azstarnet.com/search/?f=rss&t=article&c=sports&l=25&s=start_time&sd=desc'), + (u'Opinion', u'http://azstarnet.com/search/?f=rss&t=article&c=news/opinion&l=25&s=start_time&sd=desc'), + (u'Movies', u'http://azstarnet.com/search/?f=rss&t=article&c=entertainment/movies&l=25&s=start_time&sd=desc'), + (u'Food', u'http://azstarnet.com/search/?f=rss&t=article&c=lifestyles/food-and-cooking&l=25&s=start_time&sd=desc') + ] def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -62,4 +56,3 @@ class Azstarnet(BasicNewsRecipe): def print_version(self, url): return url + '?print=1' - diff --git a/recipes/b365realitatea.recipe b/recipes/b365realitatea.recipe index 80a1ee225b..a0f08d9ed2 100644 --- a/recipes/b365realitatea.recipe +++ b/recipes/b365realitatea.recipe @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' b365.realitatea.net @@ -8,45 +8,40 @@ b365.realitatea.net from calibre.web.feeds.news import BasicNewsRecipe + class b365Realitatea(BasicNewsRecipe): - title = u'b365 Realitatea' - __author__ = u'Silviu Cotoar\u0103' - publisher = u'b365 Realitatea' - description = u'b365 Realitatea' - oldest_article = 5 - language = 'ro' + title = u'b365 Realitatea' + __author__ = u'Silviu Cotoar\u0103' + publisher = u'b365 Realitatea' + description = u'b365 Realitatea' + oldest_article = 5 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Romania,Bucuresti' - encoding = 'utf-8' - cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Romania,Bucuresti' + encoding = 'utf-8' + cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='div', attrs={'class':'newsArticle'}) - ] + dict(name='div', attrs={'class': 'newsArticle'}) + ] remove_tags = [ - dict(name='div', attrs={'class':'date'}) - , dict(name='dic', attrs={'class':'addthis_toolbox addthis_default_style'}) - , dict(name='div', attrs={'class':'related_posts'}) - , dict(name='div', attrs={'id':'RelevantiWidget'}) - ] + dict(name='div', attrs={'class': 'date'}), dict(name='dic', attrs={'class': 'addthis_toolbox addthis_default_style'}), dict( + name='div', attrs={'class': 'related_posts'}), dict(name='div', attrs={'id': 'RelevantiWidget'}) + ] remove_tags_after = [ - dict(name='div', attrs={'id':'RelevantiWidget'}) - ] - feeds = [ + dict(name='div', attrs={'id': 'RelevantiWidget'}) + ] + feeds = [ (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/') - ] + ] def preprocess_html(self, soup): return self.adeify_images(soup) - diff --git a/recipes/b92.recipe b/recipes/b92.recipe index c4520b37fc..0cd0f76257 100644 --- a/recipes/b92.recipe +++ b/recipes/b92.recipe @@ -1,5 +1,5 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2008-2012, Darko Miletic ' ''' b92.net @@ -7,63 +7,63 @@ b92.net import re from calibre.web.feeds.news import BasicNewsRecipe + class B92(BasicNewsRecipe): - title = 'B92' - __author__ = 'Darko Miletic' - description = 'Najnovije vesti iz Srbije, regiona i sveta, aktuelne teme iz sveta politike, ekonomije, drustva, foto galerija, kolumne' - publisher = 'B92' - category = 'news, politics, Serbia' - oldest_article = 2 + title = 'B92' + __author__ = 'Darko Miletic' + description = 'Najnovije vesti iz Srbije, regiona i sveta, aktuelne teme iz sveta politike, ekonomije, drustva, foto galerija, kolumne' + publisher = 'B92' + category = 'news, politics, Serbia' + oldest_article = 2 max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - encoding = 'cp1250' - language = 'sr' - publication_type = 'newsportal' - masthead_url = 'http://b92s.net/v4/img/new-logo.png' - extra_css = """ + no_stylesheets = True + use_embedded_content = False + encoding = 'cp1250' + language = 'sr' + publication_type = 'newsportal' + masthead_url = 'http://b92s.net/v4/img/new-logo.png' + extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} - body{font-family: Arial,Helvetica,sans1,sans-serif} + body{font-family: Arial,Helvetica,sans1,sans-serif} .article-info2,.article-info1{text-transform: uppercase; font-size: small} img{display: block} .sms{font-weight: bold} """ - - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher': publisher - , 'language' : language - , 'linearize_tables' : True - } - - preprocess_regexps = [ - (re.compile(u'\u0110'), lambda match: u'\u00D0'), - (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: 'something') - ] - - keep_only_tags = [dict(attrs={'class':['article-info1','article-text']})] - remove_attributes = ['width','height','align','hspace','vspace','border','lang','xmlns:fb'] - remove_tags = [ - dict(name=['embed','link','base','meta','iframe']) - ,dict(attrs={'id':'social'}) - ] - feeds = [ - (u'Vesti' , u'http://www.b92.net/info/rss/vesti.xml' ) - ,(u'Biz' , u'http://www.b92.net/info/rss/biz.xml' ) - ,(u'Sport' , u'http://www.b92.net/info/rss/sport.xml' ) - ,(u'Zivot' , u'http://www.b92.net/info/rss/zivot.xml' ) - ,(u'Kultura' , u'http://www.b92.net/info/rss/kultura.xml' ) - ,(u'Automobili' , u'http://www.b92.net/info/rss/automobili.xml') - ,(u'Tehnopolis' , u'http://www.b92.net/info/rss/tehnopolis.xml') - ] + conversion_options = { + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True + } + + preprocess_regexps = [ + (re.compile(u'\u0110'), lambda match: u'\u00D0'), + (re.compile(r'', re.DOTALL | re.IGNORECASE), + lambda match: 'something') + ] + + keep_only_tags = [dict(attrs={'class': ['article-info1', 'article-text']})] + remove_attributes = ['width', 'height', 'align', + 'hspace', 'vspace', 'border', 'lang', 'xmlns:fb'] + remove_tags = [ + dict(name=['embed', 'link', 'base', 'meta', 'iframe']), dict( + attrs={'id': 'social'}) + ] + + feeds = [ + + (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml'), + (u'Biz', u'http://www.b92.net/info/rss/biz.xml'), + (u'Sport', u'http://www.b92.net/info/rss/sport.xml'), + (u'Zivot', u'http://www.b92.net/info/rss/zivot.xml'), + (u'Kultura', u'http://www.b92.net/info/rss/kultura.xml'), + (u'Automobili', u'http://www.b92.net/info/rss/automobili.xml'), + (u'Tehnopolis', u'http://www.b92.net/info/rss/tehnopolis.xml') + ] def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for alink in soup.findAll('a'): if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) + tstr = alink.string + alink.replaceWith(tstr) return soup diff --git a/recipes/ba_herald.recipe b/recipes/ba_herald.recipe index 939879ccaa..47169535c7 100644 --- a/recipes/ba_herald.recipe +++ b/recipes/ba_herald.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2012, Darko Miletic ' ''' www.buenosairesherald.com @@ -7,22 +7,23 @@ www.buenosairesherald.com from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe + class BuenosAiresHerald(BasicNewsRecipe): - title = 'Buenos Aires Herald' - __author__ = 'Darko Miletic' - description = 'A world of information in a few words' - publisher = 'Editorial Nefir S.A.' - category = 'news, politics, Argentina' - oldest_article = 2 + title = 'Buenos Aires Herald' + __author__ = 'Darko Miletic' + description = 'A world of information in a few words' + publisher = 'Editorial Nefir S.A.' + category = 'news, politics, Argentina' + oldest_article = 2 max_articles_per_feed = 200 - no_stylesheets = True - encoding = 'utf8' - use_embedded_content = False - language = 'en_AR' - remove_empty_feeds = True - publication_type = 'newspaper' - masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg' - INDEX = 'http://www.buenosairesherald.com' + no_stylesheets = True + encoding = 'utf8' + use_embedded_content = False + language = 'en_AR' + remove_empty_feeds = True + publication_type = 'newspaper' + masthead_url = 'http://www.buenosairesherald.com/img/logo.jpg' + INDEX = 'http://www.buenosairesherald.com' extra_css = """ body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em; display:block} @@ -31,50 +32,44 @@ class BuenosAiresHerald(BasicNewsRecipe): """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } - - remove_tags = [dict(name=['meta','link','iframe'])] - keep_only_tags = [dict(attrs={'class':'nota_texto p'})] + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } + remove_tags = [dict(name=['meta', 'link', 'iframe'])] + keep_only_tags = [dict(attrs={'class': 'nota_texto p'})] feeds = [ - (u'Argentina' , u'http://www.buenosairesherald.com/argentina' ) - ,(u'World' , u'http://www.buenosairesherald.com/world' ) - ,(u'Latin America' , u'http://www.buenosairesherald.com/latin-america' ) - ,(u'Entertainment' , u'http://www.buenosairesherald.com/entertainment' ) - ,(u'Sports' , u'http://www.buenosairesherald.com/sports' ) - ] + + (u'Argentina', u'http://www.buenosairesherald.com/argentina'), + (u'World', u'http://www.buenosairesherald.com/world'), + (u'Latin America', u'http://www.buenosairesherald.com/latin-america'), + (u'Entertainment', u'http://www.buenosairesherald.com/entertainment'), + (u'Sports', u'http://www.buenosairesherald.com/sports') + ] def print_version(self, url): artidraw = url.rpartition('/article/')[2] artid = artidraw.partition('/')[0] return 'http://www.buenosairesherald.com/articles/print.aspx?ix=' + artid - def parse_index(self): totalfeeds = [] lfeeds = self.get_feeds() for feedobj in lfeeds: feedtitle, feedurl = feedobj - self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + self.report_progress(0, ('Fetching feed') + ' %s...' % + (feedtitle if feedtitle else feedurl)) articles = [] soup = self.index_to_soup(feedurl) - for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}): + for item in soup.findAll('div', attrs={'class': 'nota_texto_seccion'}): description = self.tag_to_string(item.h2) atag = item.h2.find('a') - if atag and atag.has_key('href'): - url = self.INDEX + atag['href'] - title = description - date = strftime(self.timefmt) + if atag and atag.has_key('href'): # noqa + url = self.INDEX + atag['href'] + title = description + date = strftime(self.timefmt) articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':description - }) + 'title': title, 'date': date, 'url': url, 'description': description + }) totalfeeds.append((feedtitle, articles)) return totalfeeds diff --git a/recipes/babyonline.recipe b/recipes/babyonline.recipe index fe1376cabf..8eec58f506 100644 --- a/recipes/babyonline.recipe +++ b/recipes/babyonline.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = u'2011, Silviu Cotoar\u0103' ''' babyonline.ro @@ -9,51 +9,49 @@ babyonline.ro from calibre.web.feeds.news import BasicNewsRecipe + class BabyOnline(BasicNewsRecipe): - title = u'Baby Online' - __author__ = u'Silviu Cotoar\u0103' - description = u'De la p\u0103rinte la p\u0103rinte' - publisher = u'Baby Online' - oldest_article = 50 - language = 'ro' + title = u'Baby Online' + __author__ = u'Silviu Cotoar\u0103' + description = u'De la p\u0103rinte la p\u0103rinte' + publisher = u'Baby Online' + oldest_article = 50 + language = 'ro' max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - category = 'Ziare,Reviste,Copii,Mame' - encoding = 'utf-8' - cover_url = 'http://www.babyonline.ro/images/default/logo.gif' + no_stylesheets = True + use_embedded_content = False + category = 'Ziare,Reviste,Copii,Mame' + encoding = 'utf-8' + cover_url = 'http://www.babyonline.ro/images/default/logo.gif' conversion_options = { - 'comments' : description - ,'tags' : category - ,'language' : language - ,'publisher' : publisher - } + 'comments': description, 'tags': category, 'language': language, 'publisher': publisher + } keep_only_tags = [ - dict(name='div', attrs={'id':'article_container'}) - ] + dict(name='div', attrs={'id': 'article_container'}) + ] remove_tags = [ - dict(name='div', attrs={'id':'bar_nav'}), - dict(name='div', attrs={'id':'service_send'}), - dict(name='div', attrs={'id':'other_videos'}), - dict(name='div', attrs={'class':'dot_line_yellow'}), - dict(name='a', attrs={'class':'print'}), - dict(name='a', attrs={'class':'email'}), - dict(name='a', attrs={'class':'YM'}), - dict(name='a', attrs={'class':'comment'}), - dict(name='div', attrs={'class':'tombstone_cross'}), - dict(name='span', attrs={'class':'liketext'}) - ] + dict(name='div', attrs={'id': 'bar_nav'}), + dict(name='div', attrs={'id': 'service_send'}), + dict(name='div', attrs={'id': 'other_videos'}), + dict(name='div', attrs={'class': 'dot_line_yellow'}), + dict(name='a', attrs={'class': 'print'}), + dict(name='a', attrs={'class': 'email'}), + dict(name='a', attrs={'class': 'YM'}), + dict(name='a', attrs={'class': 'comment'}), + dict(name='div', attrs={'class': 'tombstone_cross'}), + dict(name='span', attrs={'class': 'liketext'}) + ] remove_tags_after = [ - dict(name='div', attrs={'id':'service_send'}) - ] + dict(name='div', attrs={'id': 'service_send'}) + ] - feeds = [ - (u'Feeds', u'http://www.babyonline.ro/rss_homepage.xml') - ] + feeds = [ + (u'Feeds', u'http://www.babyonline.ro/rss_homepage.xml') + ] def preprocess_html(self, soup): return self.adeify_images(soup) diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe index 3ccf6be88c..892282d8f5 100644 --- a/recipes/badania_net.recipe +++ b/recipes/badania_net.recipe @@ -1,5 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re + + class BadaniaNet(BasicNewsRecipe): title = u'badania.net' __author__ = 'fenuks' @@ -11,9 +13,17 @@ class BadaniaNet(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True - preprocess_regexps = [(re.compile(r"

    Tekst sponsoruje

    ", re.IGNORECASE), lambda m: ''),] + preprocess_regexps = [ + (re.compile(r"

    Tekst sponsoruje

    ", re.IGNORECASE), lambda m: ''), ] remove_empty_feeds = True use_embedded_content = False remove_tags = [] keep_only_tags = [dict(name='article')] - feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] \ No newline at end of file + feeds = [ + (u'Psychologia', u'http://badania.net/category/psychologia/feed/'), + (u'Technologie', u'http://badania.net/category/technologie/feed/'), + (u'Biologia', u'http://badania.net/category/biologia/feed/'), + + (u'Chemia', u'http://badania.net/category/chemia/feed/'), + (u'Zdrowie', u'http://badania.net/category/zdrowie/'), + (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] diff --git a/recipes/balkaninsight.recipe b/recipes/balkaninsight.recipe index 9d7396a9aa..fab28069a9 100644 --- a/recipes/balkaninsight.recipe +++ b/recipes/balkaninsight.recipe @@ -1,4 +1,4 @@ -__license__ = 'GPL v3' +__license__ = 'GPL v3' __copyright__ = '2010, Darko Miletic ' ''' balkaninsight.com @@ -7,21 +7,22 @@ balkaninsight.com import re from calibre.web.feeds.news import BasicNewsRecipe + class BalkanInsight(BasicNewsRecipe): - title = 'Balkan Insight' - __author__ = 'Darko Miletic' - description = 'Get exclusive news and in depth information on business, politics, events and lifestyle in the Balkans. Free and exclusive premium content.' - publisher = 'BalkanInsight.com' - category = 'news, politics, Balcans' - oldest_article = 2 + title = 'Balkan Insight' + __author__ = 'Darko Miletic' + description = 'Get exclusive news and in depth information on business, politics, events and lifestyle in the Balkans. Free and exclusive premium content.' + publisher = 'BalkanInsight.com' + category = 'news, politics, Balcans' + oldest_article = 2 max_articles_per_feed = 100 - no_stylesheets = False - use_embedded_content = False - encoding = 'utf-8' - masthead_url = 'http://www.balkaninsight.com/templates/balkaninsight/images/aindex_02.jpg' - language = 'en' - publication_type = 'newsportal' - remove_empty_feeds = True + no_stylesheets = False + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://www.balkaninsight.com/templates/balkaninsight/images/aindex_02.jpg' + language = 'en' + publication_type = 'newsportal' + remove_empty_feeds = True extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,Verdana,Helvetica,sans1,sans-serif} @@ -31,30 +32,28 @@ class BalkanInsight(BasicNewsRecipe): .main_news_img{font-size: small} """ conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + 'comment': description, 'tags': category, 'publisher': publisher, 'language': language + } preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [dict(name='div', attrs={'id':'article'})] + keep_only_tags = [dict(name='div', attrs={'id': 'article'})] remove_tags = [ - dict(name=['object','link','iframe']) - ] + dict(name=['object', 'link', 'iframe']) + ] - feeds = [ - (u'Albania' , u'http://www.balkaninsight.com/?tpl=653&tpid=144' ) - ,(u'Bosnia' , u'http://www.balkaninsight.com/?tpl=653&tpid=145' ) - ,(u'Bulgaria' , u'http://www.balkaninsight.com/?tpl=653&tpid=146' ) - ,(u'Croatia' , u'http://www.balkaninsight.com/?tpl=653&tpid=147' ) - ,(u'Kosovo' , u'http://www.balkaninsight.com/?tpl=653&tpid=148' ) - ,(u'Macedonia' , u'http://www.balkaninsight.com/?tpl=653&tpid=149' ) - ,(u'Montenegro' , u'http://www.balkaninsight.com/?tpl=653&tpid=150' ) - ,(u'Romania' , u'http://www.balkaninsight.com/?tpl=653&tpid=151' ) - ,(u'Serbia' , u'http://www.balkaninsight.com/?tpl=653&tpid=152' ) - ] + feeds = [ + + (u'Albania', u'http://www.balkaninsight.com/?tpl=653&tpid=144'), + (u'Bosnia', u'http://www.balkaninsight.com/?tpl=653&tpid=145'), + (u'Bulgaria', u'http://www.balkaninsight.com/?tpl=653&tpid=146'), + (u'Croatia', u'http://www.balkaninsight.com/?tpl=653&tpid=147'), + (u'Kosovo', u'http://www.balkaninsight.com/?tpl=653&tpid=148'), + (u'Macedonia', u'http://www.balkaninsight.com/?tpl=653&tpid=149'), + (u'Montenegro', u'http://www.balkaninsight.com/?tpl=653&tpid=150'), + (u'Romania', u'http://www.balkaninsight.com/?tpl=653&tpid=151'), + (u'Serbia', u'http://www.balkaninsight.com/?tpl=653&tpid=152') + ] def preprocess_html(self, soup): for item in soup.findAll(style=True): diff --git a/recipes/baltimore_sun.recipe b/recipes/baltimore_sun.recipe index 34c7d9115e..c49349289c 100644 --- a/recipes/baltimore_sun.recipe +++ b/recipes/baltimore_sun.recipe @@ -4,32 +4,35 @@ __copyright__ = '2009, Kovid Goyal ' __copyright__ = '2012 Josh Hall' __docformat__ = 'restructuredtext en' -import urllib, re +import urllib +import re from calibre.web.feeds.news import BasicNewsRecipe + class BaltimoreSun(BasicNewsRecipe): - title = 'The Baltimore Sun' + title = 'The Baltimore Sun' __author__ = 'Kovid Goyal' description = 'Complete local news and blogs from Baltimore' - language = 'en' - version = 2.5 + language = 'en' + version = 2.5 compress_news_images = True compress_news_images_auto_size = 8 oldest_article = 1 max_articles_per_feed = 100 use_embedded_content = False - no_stylesheets = True - remove_javascript = True - remove_empty_feeds = True + no_stylesheets = True + remove_javascript = True + remove_empty_feeds = True ignore_duplicate_articles = {'title'} keep_only_tags = [ - dict(name=['div', 'section'], attrs={'class':["trb_article_title","trb_article_leadart", 'trb_bylines', 'trb_article_dateline', 'trb_mainContent']}), + dict(name=['div', 'section'], attrs={'class': [ + "trb_article_title", "trb_article_leadart", 'trb_bylines', 'trb_article_dateline', 'trb_mainContent']}), ] remove_tags = [ dict(name=['meta', 'link']), - dict(name=['div', 'aside'], attrs={'class':lambda x: x and set(x.split()).intersection({ + dict(name=['div', 'aside'], attrs={'class': lambda x: x and set(x.split()).intersection({ 'trb_gptAd', 'trb_panelmod_container', 'trb_socialize', 'trb_taboola', 'trb_embed_related'})}), ] @@ -51,7 +54,8 @@ class BaltimoreSun(BasicNewsRecipe): # (u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'), (u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'), # (u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'), - (u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'), + (u'Local Politics', + u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'), (u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'), # (u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'), (u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'), @@ -60,7 +64,8 @@ class BaltimoreSun(BasicNewsRecipe): # Sports## (u'Top Sports', u'http://feeds.feedburner.com/baltimoresun/sports/rss2'), (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'), - (u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'), + (u'Ravens/Football', + u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'), # (u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'), # (u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'), # (u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'), @@ -74,8 +79,10 @@ class BaltimoreSun(BasicNewsRecipe): (u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'), (u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'), (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'), - (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), - (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), + (u'Music & Nightlife', + u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), + (u'Restaurants & Food', + u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'), # Life ## @@ -113,18 +120,25 @@ class BaltimoreSun(BasicNewsRecipe): (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'), # News Blogs ## - (u'Baltimore Crime Beat', u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), + (u'Baltimore Crime Beat', + u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), (u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'), - (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), - (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), - (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), - (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), + (u'Maryland Politics', + u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), + (u'Maryland Weather', + u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), + (u'Second Opinion', + u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), + (u'Sun Investigates', + u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), (u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'), # Business Blogs ## (u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'), - (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), - (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), + (u'Consuming Interests', + u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), + (u'The Real Estate Wonk', + u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), # Entertainment Blogs ## (u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'), @@ -135,7 +149,8 @@ class BaltimoreSun(BasicNewsRecipe): # Life Blogs ## # (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), - (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), + (u'Baltimore Insider', + u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), (u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), # (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), @@ -143,15 +158,18 @@ class BaltimoreSun(BasicNewsRecipe): (u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'), # Sports Blogs ## - (u'Baltimore Sports Blitz', u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), + (u'Baltimore Sports Blitz', + u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), # (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'), (u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'), - (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), + (u'Ravens Insider', + u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), # (u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'), - (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), + (u'The Schmuck Stops Here', + u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), # (u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'), # (u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'), -] + ] def get_article_url(self, article): ans = None @@ -162,7 +180,8 @@ class BaltimoreSun(BasicNewsRecipe): except: pass if ans is None: - ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) + ans = article.get('feedburner_origlink', + article.get('guid', article.get('link'))) if ans is not None: return ans.replace('?track=rss', '') diff --git a/recipes/banat_news.recipe b/recipes/banat_news.recipe index 4c183693a3..d8b2592b89 100644 --- a/recipes/banat_news.recipe +++ b/recipes/banat_news.recipe @@ -6,63 +6,66 @@ www.philstar.com import time from calibre.web.feeds.recipes import BasicNewsRecipe -class BanatNews(BasicNewsRecipe): - title = 'Banat News' - custom_title = "Banat News - " + time.strftime('%d %b %Y %I:%M %p') - __author__ = 'jde' - __date__ = '31 May 2012' - __version__ = '1.0' - description = 'Banat News is a daily Cebuano-language newspaper based in Cebu, Philippines - philstar.com is a Philippine news and entertainment portal for the Filipino global community. It is the online presence of the STAR Group of Publications, a leading publisher of newspapers and magazines in the Philippines.' - language = 'ceb' - publisher = 'The Philippine STAR' - category = 'news, Philippines' - tags = 'news, Philippines' - cover_url = 'http://www.philstar.com/images/logo_Banat.jpg' - masthead_url = 'http://www.philstar.com/images/logo_Banat.jpg' - oldest_article = 1.5 #days - max_articles_per_feed = 25 - simultaneous_downloads = 10 - publication_type = 'newspaper' - timefmt = ' [%a, %d %b %Y %I:%M %p]' - no_stylesheets = True - use_embedded_content = False - encoding = None - recursions = 0 - needs_subscription = False - remove_javascript = True - remove_empty_feeds = True - auto_cleanup = False - remove_tags = [dict(name='img', attrs={'id':'Image1'}) #Logo - ,dict(name='span', attrs={'id':'ControlArticle1_LabelHeader'}) #Section (Headlines, Nation, Metro, ...) - ,dict(name='a', attrs={'id':'ControlArticle1_FormView1_hlComments'}) #Comments - ,dict(name='img', attrs={'src':'images/post-comments.jpg'}) #View Comments - ,dict(name='a', attrs={'id':'ControlArticle1_FormView1_ControlPhotoAndCaption1_hlImageCaption'}) #Zoom - ] - conversion_options = { 'title' : custom_title, - 'comments' : description, - 'tags' : tags, - 'language' : language, - 'publisher' : publisher, - 'authors' : publisher, - 'smarten_punctuation' : True - } +class BanatNews(BasicNewsRecipe): + title = 'Banat News' + custom_title = "Banat News - " + time.strftime('%d %b %Y %I:%M %p') + __author__ = 'jde' + __date__ = '31 May 2012' + __version__ = '1.0' + description = 'Banat News is a daily Cebuano-language newspaper based in Cebu, Philippines - philstar.com is a Philippine news and entertainment portal for the Filipino global community. It is the online presence of the STAR Group of Publications, a leading publisher of newspapers and magazines in the Philippines.' # noqa + language = 'ceb' + publisher = 'The Philippine STAR' + category = 'news, Philippines' + tags = 'news, Philippines' + cover_url = 'http://www.philstar.com/images/logo_Banat.jpg' + masthead_url = 'http://www.philstar.com/images/logo_Banat.jpg' + oldest_article = 1.5 # days + max_articles_per_feed = 25 + simultaneous_downloads = 10 + publication_type = 'newspaper' + timefmt = ' [%a, %d %b %Y %I:%M %p]' + no_stylesheets = True + use_embedded_content = False + encoding = None + recursions = 0 + needs_subscription = False + remove_javascript = True + remove_empty_feeds = True + auto_cleanup = False + + remove_tags = [dict(name='img', attrs={'id': 'Image1'}) # Logo + # Section (Headlines, Nation, Metro, ...) + # Comments + # View Comments + # Zoom + , dict(name='span', attrs={'id': 'ControlArticle1_LabelHeader'}), dict(name='a', attrs={'id': 'ControlArticle1_FormView1_hlComments'}), dict(name='img', attrs={'src': 'images/post-comments.jpg'}), dict(name='a', attrs={'id': 'ControlArticle1_FormView1_ControlPhotoAndCaption1_hlImageCaption'}) # noqa + ] + conversion_options = {'title': custom_title, + 'comments': description, + 'tags': tags, + 'language': language, + 'publisher': publisher, + 'authors': publisher, + 'smarten_punctuation': True + } feeds = [ - ('Balita' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=101' ) - ,('Opinyon' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=102' ) - ,('Kalingawan' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=104' ) - ,('Showbiz' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=62' ) - ,('Palaro' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=103' ) - ,('Imong Kapalaran' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=105' ) - ] + + ('Balita' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=101'), + ('Opinyon' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=102'), + ('Kalingawan' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=104'), + ('Showbiz' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=62'), + ('Palaro' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=103'), + ('Imong Kapalaran' , 'http://rss.philstar.com/Rss.aspx?publicationSubCategoryId=105') + ] # process the printer friendly version of article def print_version(self, url): - return url.replace('/Article', '/ArticlePrinterFriendly') + return url.replace('/Article', '/ArticlePrinterFriendly') -# obtain title from printer friendly version of article; avoiding add_toc_thumbnail changing title when article has image +# obtain title from printer friendly version of article; avoiding +# add_toc_thumbnail changing title when article has image def populate_article_metadata(self, article, soup, first): - article.title = soup.find('span', {'id': 'ControlArticle1_FormView1_ArticleHeaderLabel'}).contents[0].strip() - - + article.title = soup.find( + 'span', {'id': 'ControlArticle1_FormView1_ArticleHeaderLabel'}).contents[0].strip() diff --git a/recipes/bangkok_biz.recipe b/recipes/bangkok_biz.recipe index b2a2ee42f9..f0ec772e45 100644 --- a/recipes/bangkok_biz.recipe +++ b/recipes/bangkok_biz.recipe @@ -1,25 +1,25 @@ from calibre.web.feeds.news import BasicNewsRecipe + class AdvancedUserRecipe1290689337(BasicNewsRecipe): __author__ = 'Anat R.' language = 'th' - title = u'Bangkok Biz News' + title = u'Bangkok Biz News' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True remove_javascript = True use_embedded_content = False - feeds = [(u'Headlines', - u'http://www.bangkokbiznews.com/home/services/rss/home.xml'), - (u'Politics', u'http://www.bangkokbiznews.com/home/services/rss/politics.xml'), - (u'Business', u'http://www.bangkokbiznews.com/home/services/rss/business.xml'), - (u'Finance', u' http://www.bangkokbiznews.com/home/services/rss/finance.xml'), - (u'Technology', u' http://www.bangkokbiznews.com/home/services/rss/it.xml')] - remove_tags_before = dict(name='div', attrs={'class':'box-Detailcontent'}) - remove_tags_after = dict(name='p', attrs={'class':'allTags'}) + feeds = [(u'Headlines', + u'http://www.bangkokbiznews.com/home/services/rss/home.xml'), + (u'Politics', u'http://www.bangkokbiznews.com/home/services/rss/politics.xml'), + (u'Business', u'http://www.bangkokbiznews.com/home/services/rss/business.xml'), + (u'Finance', u' http://www.bangkokbiznews.com/home/services/rss/finance.xml'), + (u'Technology', u' http://www.bangkokbiznews.com/home/services/rss/it.xml')] + remove_tags_before = dict(name='div', attrs={'class': 'box-Detailcontent'}) + remove_tags_after = dict(name='p', attrs={'class': 'allTags'}) remove_tags = [] - remove_tags.append(dict(name = 'div', attrs = {'id': 'content-tools'})) - remove_tags.append(dict(name = 'p', attrs = {'class':'allTags'})) - remove_tags.append(dict(name = 'div', attrs = {'id':'morePic'})) - remove_tags.append(dict(name = 'ul', attrs = {'class':'tabs-nav'})) - + remove_tags.append(dict(name='div', attrs={'id': 'content-tools'})) + remove_tags.append(dict(name='p', attrs={'class': 'allTags'})) + remove_tags.append(dict(name='div', attrs={'id': 'morePic'})) + remove_tags.append(dict(name='ul', attrs={'class': 'tabs-nav'})) diff --git a/recipes/bangkokpost.recipe b/recipes/bangkokpost.recipe index 44750d9a6c..8a17cb6d37 100644 --- a/recipes/bangkokpost.recipe +++ b/recipes/bangkokpost.recipe @@ -1,7 +1,8 @@ from calibre.web.feeds.news import BasicNewsRecipe + class BangkokPostRecipe(BasicNewsRecipe): - __license__ = 'GPL v3' + __license__ = 'GPL v3' __author__ = 'kwetal' language = 'en_TH' version = 1 @@ -20,26 +21,34 @@ class BangkokPostRecipe(BasicNewsRecipe): # Feeds from: http://www.bangkokpost.com/rss/ feeds = [] - feeds.append((u'Breaking News', u'http://www.bangkokpost.com/rss/data/breakingnews.xml')) - feeds.append((u'Top Stories', u'http://www.bangkokpost.com/rss/data/topstories.xml')) + feeds.append( + (u'Breaking News', u'http://www.bangkokpost.com/rss/data/breakingnews.xml')) + feeds.append( + (u'Top Stories', u'http://www.bangkokpost.com/rss/data/topstories.xml')) feeds.append((u'News', u'http://www.bangkokpost.com/rss/data/news.xml')) - feeds.append((u'Business', u'http://www.bangkokpost.com/rss/data/business.xml')) - feeds.append((u'Opinion', u'http://www.bangkokpost.com/rss/data/opinion.xml')) - feeds.append((u'Travel', u'http://www.bangkokpost.com/rss/data/travel.xml')) - feeds.append((u'Leisure', u'http://www.bangkokpost.com/rss/data/leisure.xml')) - feeds.append((u'Entertainment', u'http://www.bangkokpost.com/rss/data/entertainment.xml')) + feeds.append( + (u'Business', u'http://www.bangkokpost.com/rss/data/business.xml')) + feeds.append( + (u'Opinion', u'http://www.bangkokpost.com/rss/data/opinion.xml')) + feeds.append( + (u'Travel', u'http://www.bangkokpost.com/rss/data/travel.xml')) + feeds.append( + (u'Leisure', u'http://www.bangkokpost.com/rss/data/leisure.xml')) + feeds.append( + (u'Entertainment', u'http://www.bangkokpost.com/rss/data/entertainment.xml')) feeds.append((u'Auto', u'http://www.bangkokpost.com/rss/data/auto.xml')) feeds.append((u'Life', u'http://www.bangkokpost.com/rss/data/life.xml')) feeds.append((u'Tech', u'http://www.bangkokpost.com/rss/data/tect.xml')) keep_only_tags = [] - keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'entry'})) + keep_only_tags.append(dict(name='div', attrs={'class': 'entry'})) remove_tags = [] - remove_tags.append(dict(name = 'div', attrs = {'class': 'article-features'})) - remove_tags.append(dict(name = 'div', attrs = {'class': 'socialBookmark'})) - remove_tags.append(dict(name = 'div', attrs = {'id': 'main-sns'})) + remove_tags.append(dict(name='div', attrs={'class': 'article-features'})) + remove_tags.append(dict(name='div', attrs={'class': 'socialBookmark'})) + remove_tags.append(dict(name='div', attrs={'id': 'main-sns'})) # Their YouTube movies are displayed in an iframe, if you want those you will have to parse the articles by hand. - # Setting self.recursion to 1, which might resolve this, makes calibre downloading a lot of PDF files, which will cause a very, very very, long download time - remove_tags.append(dict(name = 'iframe')) - + # Setting self.recursion to 1, which might resolve this, makes calibre + # downloading a lot of PDF files, which will cause a very, very very, long + # download time + remove_tags.append(dict(name='iframe')) diff --git a/recipes/bankier_pl.recipe b/recipes/bankier_pl.recipe index 626c66ea41..02af81e64b 100644 --- a/recipes/bankier_pl.recipe +++ b/recipes/bankier_pl.recipe @@ -9,38 +9,38 @@ bankier.pl from calibre.web.feeds.news import BasicNewsRecipe + class bankier(BasicNewsRecipe): - title = u'Bankier.pl' + title = u'Bankier.pl' __author__ = 'teepel ' - language = 'pl' - description ='Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.' - masthead_url='http://www.bankier.pl/gfx/hd-mid-02.gif' - INDEX='http://bankier.pl/' - remove_empty_feeds= True + language = 'pl' + description = 'Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.' # noqa + masthead_url = 'http://www.bankier.pl/gfx/hd-mid-02.gif' + INDEX = 'http://bankier.pl/' + remove_empty_feeds = True oldest_article = 1 max_articles_per_feed = 100 - remove_javascript=True - no_stylesheets=True + remove_javascript = True + no_stylesheets = True simultaneous_downloads = 5 - keep_only_tags =[] - keep_only_tags.append(dict(name = 'div', attrs = {'align' : 'left'})) + keep_only_tags = [] + keep_only_tags.append(dict(name='div', attrs={'align': 'left'})) - remove_tags =[] - remove_tags.append(dict(name = 'table', attrs = {'cellspacing' : '2'})) - remove_tags.append(dict(name = 'div', attrs = {'align' : 'center'})) - remove_tags.append(dict(name = 'img', attrs = {'src' : '/gfx/hd-mid-02.gif'})) - #remove_tags.append(dict(name = 'a', attrs = {'target' : '_blank'})) - #remove_tags.append(dict(name = 'br', attrs = {'clear' : 'all'})) + remove_tags = [] + remove_tags.append(dict(name='table', attrs={'cellspacing': '2'})) + remove_tags.append(dict(name='div', attrs={'align': 'center'})) + remove_tags.append(dict(name='img', attrs={'src': '/gfx/hd-mid-02.gif'})) + + feeds = [ + (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), + (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), + (u'Firma', u'http://feeds.feedburner.com/bankier-firma'), + (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), + (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'), + (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), + ] - feeds = [ - (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'), - (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'), - (u'Firma', u'http://feeds.feedburner.com/bankier-firma'), - (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'), - (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'), - (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'), - ] def print_version(self, url): segment = url.split('.') urlPart = segment[2] diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 22bf676b77..72bcfe0f4c 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -13,17 +13,18 @@ from calibre.web.feeds.news import BasicNewsRecipe USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0' + class Barrons(BasicNewsRecipe): title = 'Barron\'s' max_articles_per_feed = 50 - needs_subscription = True + needs_subscription = True language = 'en' __author__ = 'Kovid Goyal' description = 'Weekly publication for investors from the publisher of the Wall Street Journal' - timefmt = ' [%a, %b %d, %Y]' - use_embedded_content = False + timefmt = ' [%a, %b %d, %Y]' + use_embedded_content = False no_stylesheets = True match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*'] conversion_options = {'linearize_tables': True} @@ -32,12 +33,16 @@ class Barrons(BasicNewsRecipe): oldest_article = 7 requires_version = (0, 9, 16) - keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})] + keep_only_tags = [dict(attrs={'class': lambda x: x and ( + x.startswith('sector one column') or x.startswith('sector two column'))})] remove_tags = [ - dict(name='div', attrs={'class':['sTools sTools-t', 'tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}), - dict(attrs={'class':['insetButton', 'insettipBox', 'insetClose']}), - dict(attrs={'data-module-name':['resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}), - dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), + dict(name='div', attrs={'class': [ + 'sTools sTools-t', 'tabContainer artTabbedNav', 'rssToolBox hidden', 'articleToolbox']}), + dict(attrs={'class': ['insetButton', 'insettipBox', 'insetClose']}), + dict(attrs={'data-module-name': [ + 'resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}), + dict(name='span', attrs={ + 'data-country-code': True, 'data-ticker-code': True}), ] def get_browser(self): @@ -88,7 +93,7 @@ class Barrons(BasicNewsRecipe): def preprocess_html(self, soup): # Remove thumbnail for zoomable images - for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}): + for div in soup.findAll('div', attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()}): img = div.find('img') if img is not None: img.extract() @@ -96,7 +101,8 @@ class Barrons(BasicNewsRecipe): return soup # Comment out the feeds you don't want retrieved. -# Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire +# Because these feeds are sorted alphabetically when converted to LRF, you +# may want to number them to put them in the order you desire feeds = [ ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'), diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe index b772b7c3b4..9e48d5a22f 100644 --- a/recipes/bash_org_pl.recipe +++ b/recipes/bash_org_pl.recipe @@ -1,54 +1,51 @@ from calibre.web.feeds.news import BasicNewsRecipe + class Bash_org_pl(BasicNewsRecipe): - title = u'Bash.org.pl' - __author__ = 'fenuks' - description = 'Bash.org.pl - zabawne cytaty z IRC' - category = 'funny quotations, humour' - language = 'pl' + title = u'Bash.org.pl' + __author__ = 'fenuks' + description = 'Bash.org.pl - zabawne cytaty z IRC' + category = 'funny quotations, humour' + language = 'pl' cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png' max_articles_per_feed = 50 - no_stylesheets= True - keep_only_tags= [dict(name='a', attrs={'class':'qid click'}), - dict(name='div', attrs={'class':'quote post-content post-body'})] - + no_stylesheets = True + keep_only_tags = [dict(name='a', attrs={'class': 'qid click'}), + dict(name='div', attrs={'class': 'quote post-content post-body'})] def latest_articles(self): articles = [] - soup=self.index_to_soup(u'http://bash.org.pl/latest/') - #date=soup.find('div', attrs={'class':'right'}).string - tags=soup.findAll('a', attrs={'class':'qid click'}) + soup = self.index_to_soup(u'http://bash.org.pl/latest/') + tags = soup.findAll('a', attrs={'class': 'qid click'}) for a in tags: - title=a.string - url='http://bash.org.pl' +a['href'] - articles.append({'title' : title, - 'url' : url, - 'date' : '', - 'description' : '' - }) + title = a.string + url = 'http://bash.org.pl' + a['href'] + articles.append({'title': title, + 'url': url, + 'date': '', + 'description': '' + }) return articles - def random_articles(self): - articles = [] - for i in range(self.max_articles_per_feed): - soup=self.index_to_soup(u'http://bash.org.pl/random/') - #date=soup.find('div', attrs={'class':'right'}).string - url=soup.find('a', attrs={'class':'qid click'}) - title='' - url='http://bash.org.pl/random/' - articles.append({'title' : title, - 'url' : url, - 'date' : '', - 'description' : '' - }) - return articles + articles = [] + for i in range(self.max_articles_per_feed): + soup = self.index_to_soup(u'http://bash.org.pl/random/') + url = soup.find('a', attrs={'class': 'qid click'}) + title = '' + url = 'http://bash.org.pl/random/' + articles.append({'title': title, + 'url': url, + 'date': '', + 'description': '' + }) + return articles def populate_article_metadata(self, article, soup, first): - article.title = soup.find(attrs={'class':'qid click'}).string + article.title = soup.find(attrs={'class': 'qid click'}).string def parse_index(self): - feeds = [] - feeds.append((u"Najnowsze", self.latest_articles())) - feeds.append((u"Losowe", self.random_articles())) - return feeds + feeds = [] + feeds.append((u"Najnowsze", self.latest_articles())) + feeds.append((u"Losowe", self.random_articles())) + return feeds diff --git a/recipes/bay_citizen.recipe b/recipes/bay_citizen.recipe index e6a6c2b63d..5c9aac6450 100644 --- a/recipes/bay_citizen.recipe +++ b/recipes/bay_citizen.recipe @@ -1,39 +1,39 @@ from calibre.web.feeds.news import BasicNewsRecipe + class TheBayCitizen(BasicNewsRecipe): - title = 'The Bay Citizen' - language = 'en' - __author__ = 'noah' - description = 'The Bay Citizen' - publisher = 'The Bay Citizen' - INDEX = u'http://www.baycitizen.org' - category = 'news' - oldest_article = 2 + title = 'The Bay Citizen' + language = 'en' + __author__ = 'noah' + description = 'The Bay Citizen' + publisher = 'The Bay Citizen' + INDEX = u'http://www.baycitizen.org' + category = 'news' + oldest_article = 2 max_articles_per_feed = 20 - no_stylesheets = True - masthead_url = 'http://media.baycitizen.org/images/layout/logo1.png' - feeds = [('Main Feed', 'http://www.baycitizen.org/feeds/stories/')] - keep_only_tags = [dict(name='div', attrs={'class':'story'})] - remove_tags = [ - dict(name='div', attrs={'class':'socialBar'}), - dict(name='div', attrs={'id':'text-resize'}), - dict(name='div', attrs={'class':'story relatedContent'}), - dict(name='div', attrs={'id':'comment_status_loading'}), - ] + no_stylesheets = True + masthead_url = 'http://media.baycitizen.org/images/layout/logo1.png' + feeds = [('Main Feed', 'http://www.baycitizen.org/feeds/stories/')] + keep_only_tags = [dict(name='div', attrs={'class': 'story'})] + remove_tags = [ + dict(name='div', attrs={'class': 'socialBar'}), + dict(name='div', attrs={'id': 'text-resize'}), + dict(name='div', attrs={'class': 'story relatedContent'}), + dict(name='div', attrs={'id': 'comment_status_loading'}), + ] def append_page(self, soup, appendtag, position): - pager = soup.find('a',attrs={'class':'stry-next'}) + pager = soup.find('a', attrs={'class': 'stry-next'}) if pager: - nexturl = self.INDEX + pager['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'class':'body'}) - for it in texttag.findAll(style=True): - del it['style'] - newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) - texttag.extract() - appendtag.insert(position,texttag) - + nexturl = self.INDEX + pager['href'] + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', attrs={'class': 'body'}) + for it in texttag.findAll(style=True): + del it['style'] + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) def preprocess_html(self, soup): for item in soup.findAll(style=True): diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe index 9b2d4854bb..c1f55457d8 100644 --- a/recipes/bbc.recipe +++ b/recipes/bbc.recipe @@ -1,16 +1,16 @@ ## -## Title: BBC News, Sport, and Blog Calibre Recipe -## Contact: mattst - jmstanfield@gmail.com +# Title: BBC News, Sport, and Blog Calibre Recipe +# Contact: mattst - jmstanfield@gmail.com ## -## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html -## Copyright: mattst - jmstanfield@gmail.com +# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html +# Copyright: mattst - jmstanfield@gmail.com ## -## Written: November 2011 -## Last Edited: 2011-11-19 +# Written: November 2011 +# Last Edited: 2011-11-19 ## -__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' -__copyright__ = 'mattst - jmstanfield@gmail.com' +__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html' +__copyright__ = 'mattst - jmstanfield@gmail.com' ''' @@ -23,6 +23,7 @@ import re # Import the BasicNewsRecipe class which this class extends. from calibre.web.feeds.recipes import BasicNewsRecipe + class BBCNewsSportBlog(BasicNewsRecipe): # @@ -61,76 +62,80 @@ class BBCNewsSportBlog(BasicNewsRecipe): # Select / de-select the feeds you want in your ebook. # feeds = [ - ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"), - ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"), - ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"), - #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"), - #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"), - #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"), - #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"), - #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"), - #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"), - #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"), - #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"), - #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"), - ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"), - ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"), - ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"), - ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"), - ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"), - ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"), - #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"), - #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"), - ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"), - ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"), - ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"), - #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"), - #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"), - ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"), - #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"), - #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"), - #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"), - ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"), - ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"), - #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"), - #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"), - #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"), - #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"), - #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"), - #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"), - #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"), - #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"), - #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"), - #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"), - #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"), - #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"), - #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"), - #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"), - #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"), - #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"), - #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"), - #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"), - #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"), - #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"), - #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"), - #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"), - #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"), - #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"), - #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"), - #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"), - #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"), - #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"), - #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"), - #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"), - #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"), - #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"), - #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"), - #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"), - #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"), - #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"), - #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"), - ] - + ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"), + ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"), + ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"), + # ("England", "http://feeds.bbci.co.uk/news/england/rss.xml"), + # ("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"), + # ("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"), + # ("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"), + # ("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"), + # ("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"), + # ("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"), + # ("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"), + # ("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"), + ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"), + ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"), + ("Science/Environment", + "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"), + ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"), + ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"), + ("Entertainment/Arts", + "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"), + # ("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"), + # ("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"), + ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"), + ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"), + ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"), + # ("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"), + # ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"), + ("Blog: Nick Robinson (Political Editor)", + "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"), + # ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"), + # ("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"), + # ("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"), + ("Blog: Rory Cellan-Jones (Technology correspondent)", + "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"), + ("Sport Front Page", + "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"), + # ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"), + # ("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"), + # ("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"), + # ("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"), + # ("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"), + # ("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"), + # ("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"), + # ("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"), + # ("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"), + # ("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"), + # ("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"), + # ("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"), + # ("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"), + # ("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"), + # ("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"), + # ("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"), + # ("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"), + # ("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"), + # ("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"), + # ("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"), + # ("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"), + # ("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"), + # ("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"), + # ("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"), + # ("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"), + # ("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"), + # ("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"), + # ("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"), + # ("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"), + # ("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"), + # ("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"), + # ("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"), + # ("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"), + # ("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"), + # ("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"), + # ("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"), + # ("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"), + ] # **** SELECT YOUR USER PREFERENCES **** @@ -179,15 +184,14 @@ class BBCNewsSportBlog(BasicNewsRecipe): # As with 'feeds' select/de-select by adding/removing the initial '#', # only one timefmt should be selected, here's a few to choose from. # - timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default) - #timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30] - #timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM] - #timefmt = ' [%d %b %Y]' # [14 Nov 2011] - #timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18.30] - #timefmt = ' [%Y-%m-%d]' # [2011-11-14] - #timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30] - - + # [Fri, 14 Nov 2011] (Calibre default) + timefmt = ' [%a, %d %b %Y]' + # timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30] + # timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM] + # timefmt = ' [%d %b %Y]' # [14 Nov 2011] + # timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18.30] + # timefmt = ' [%Y-%m-%d]' # [2011-11-14] + # timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30] # # **** IMPORTANT **** @@ -201,8 +205,6 @@ class BBCNewsSportBlog(BasicNewsRecipe): # **** IMPORTANT **** # - - # Author of this recipe. __author__ = 'mattst' @@ -225,7 +227,8 @@ class BBCNewsSportBlog(BasicNewsRecipe): # with None is working fine, so stick with that for robustness. encoding = None - # Sets whether a feed has full articles embedded in it. The BBC feeds do not. + # Sets whether a feed has full articles embedded in it. The BBC feeds do + # not. use_embedded_content = False # Removes empty feeds - why keep them!? @@ -253,7 +256,7 @@ class BBCNewsSportBlog(BasicNewsRecipe): } ''' - conversion_options = { 'smarten_punctuation' : True } + conversion_options = {'smarten_punctuation': True} # Specify extra CSS - overrides ALL other CSS (IE. Added last). extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \ @@ -276,22 +279,22 @@ class BBCNewsSportBlog(BasicNewsRecipe): h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }' # Remove various tag attributes to improve the look of the ebook pages. - remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan', - 'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ] + remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan', + 'valign', 'vspace', 'hspace', 'alt', 'width', 'height'] # Remove the (admittedly rarely used) line breaks, "
    ", which sometimes # cause a section of the ebook to start in an unsightly fashion or, more # frequently, a "
    " will muck up the formatting of a correspondant's byline. # "
    " and "
    " are far more frequently used on the table formatted # style of pages, and really spoil the look of the ebook pages. - preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), - (re.compile(r'', re.IGNORECASE), lambda m: '')] - + preprocess_regexps = [(re.compile(r'', re.IGNORECASE), lambda m: ''), + (re.compile(r'', re.IGNORECASE), lambda m: '')] # Create regular expressions for tag keeping and removal to make the matches more # robust against minor changes and errors in the HTML, Eg. double spaces, leading # and trailing spaces, missing hyphens, and such like. - # Python regular expression ('re' class) page: http://docs.python.org/library/re.html + # Python regular expression ('re' class) page: + # http://docs.python.org/library/re.html # *************************************** # Regular expressions for keep_only_tags: @@ -313,7 +316,8 @@ class BBCNewsSportBlog(BasicNewsRecipe): # (travel), and in some sport pages. These alternative pages are table based (which is # why I think they are an out-of-date design) and account for -I'm guesstimaking- less # than 1% of all articles. They use a table class 'storycontent' to hold the article - # and like blq_content (above) have required lots of extra removal by remove_tags. + # and like blq_content (above) have required lots of extra removal by + # remove_tags. story_content_reg_exp = '^.*story[_ -]*content.*$' # Keep the sections of the HTML which match the list below. The HTML page created by @@ -323,11 +327,14 @@ class BBCNewsSportBlog(BasicNewsRecipe): # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at # all). If they are the other way around in keep_only_tags then blq_content_reg_exp # will end up being discarded. - keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}), - dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ] + keep_only_tags = [dict(name='table', attrs={'class': re.compile(story_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class': re.compile( + blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id': re.compile( + blq_content_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'class': re.compile( + storybody_reg_exp, re.IGNORECASE)}), + dict(name='div', attrs={'id': re.compile(storybody_reg_exp, re.IGNORECASE)})] # ************************************ # Regular expressions for remove_tags: @@ -365,7 +372,8 @@ class BBCNewsSportBlog(BasicNewsRecipe): audio_reg_exp = '^.*audio.*$' # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'. - # This class is used to embed a photo slideshow. See also 'slideshow' below. + # This class is used to embed a photo slideshow. See also 'slideshow' + # below. picture_gallery_reg_exp = '^.*picture.*$' # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'. @@ -394,7 +402,8 @@ class BBCNewsSportBlog(BasicNewsRecipe): hidden_reg_exp = '^.*hidden.*$' # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'. - # Used on the site to display text about registered users entering comments. + # Used on the site to display text about registered users entering + # comments. comment_reg_exp = '^.*comment.*$' # Regular expression to remove form and variant tags, Eg. 'comment-form'. @@ -402,31 +411,32 @@ class BBCNewsSportBlog(BasicNewsRecipe): # for entering comments about an article. form_reg_exp = '^.*form.*$' - # Extra things to remove due to the addition of 'blq_content' in keep_only_tags. + # Extra things to remove due to the addition of 'blq_content' in + # keep_only_tags. - #
    Used on sports pages for 'email' and 'print'. + #
    Used on sports pages for 'email' and 'print'. story_actions_reg_exp = '^.*story[_ -]*actions.*$' - #
    Used on sports pages instead of 'share-help' (for + #
    Used on sports pages instead of 'share-help' (for # social networking links). bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$' - #
    + #
    # NOTE: Don't remove class="content-group" that is needed. # Used on sports pages to link to 'similar stories'. secondary_content_reg_exp = '^.*secondary[_ -]*content.*$' - #