Perform PEP8 compliance checks on the entire codebase

Some bits of PEP 8 are turned off via setup.cfg
This commit is contained in:
Kovid Goyal 2016-07-29 11:04:04 +05:30
parent 643977ffa6
commit 567040ee1e
1881 changed files with 49336 additions and 46525 deletions

View File

@ -1,16 +1,16 @@
#!/usr/bin/env python2
##
## Title: Diario 10minutos.com.uy News and Sports Calibre Recipe
## Contact: Carlos Alves - <carlos@carlosalves.info>
# Title: Diario 10minutos.com.uy News and Sports Calibre Recipe
# Contact: Carlos Alves - <carlos@carlosalves.info>
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: Carlos Alves - <carlos@carlosalves.info>
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: Carlos Alves - <carlos@carlosalves.info>
##
## Written: September 2013
## Last Edited: 2016-01-11
# Written: September 2013
# Last Edited: 2016-01-11
##
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = '2016, Carlos Alves <carlos@carlosalves.info>'
'''
10minutos.com.uy
@ -18,29 +18,30 @@ __author__ = '2016, Carlos Alves <carlos@carlosalves.info>'
from calibre.web.feeds.news import BasicNewsRecipe
class General(BasicNewsRecipe):
title = '10minutos'
__author__ = 'Carlos Alves'
description = 'Noticias de Salto - Uruguay'
title = '10minutos'
__author__ = 'Carlos Alves'
description = 'Noticias de Salto - Uruguay'
tags = 'news, sports'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = 'utf8'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
oldest_article = 2
max_articles_per_feed = 100
keep_only_tags = [dict(name='div', attrs={'class':'post-content'})]
keep_only_tags = [dict(name='div', attrs={'class': 'post-content'})]
remove_tags = [
dict(name='div', attrs={'class':['hr', 'titlebar', 'navigation']}),
dict(name='div', attrs={'class':'sharedaddy sd-sharing-enabled'}),
dict(name='p', attrs={'class':'post-meta'}),
dict(name=['object','link'])
]
dict(name='div', attrs={'class': ['hr', 'titlebar', 'navigation']}),
dict(name='div', attrs={'class': 'sharedaddy sd-sharing-enabled'}),
dict(name='p', attrs={'class': 'post-meta'}),
dict(name=['object', 'link'])
]
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
@ -49,8 +50,8 @@ class General(BasicNewsRecipe):
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Articulos', u'http://10minutos.com.uy/?feed=rss2')
]
(u'Articulos', u'http://10minutos.com.uy/?feed=rss2')
]
def get_cover_url(self):
return 'http://10minutos.com.uy/a/img/logo.png'

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python2
##
## Last Edited: 2016-01-11 Carlos Alves <carlos@carlosalves.info>
# Last Edited: 2016-01-11 Carlos Alves <carlos@carlosalves.info>
##
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
180.com.uy
@ -11,31 +11,32 @@ __author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
from calibre.web.feeds.news import BasicNewsRecipe
class Noticias(BasicNewsRecipe):
title = '180.com.uy'
__author__ = 'Gustavo Azambuja'
description = 'Noticias de Uruguay'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
title = '180.com.uy'
__author__ = 'Gustavo Azambuja'
description = 'Noticias de Uruguay'
language = 'es_UY'
timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
recursion = 5
encoding = 'utf-8'
remove_javascript = True
no_stylesheets = True
oldest_article = 2
oldest_article = 2
max_articles_per_feed = 100
remove_tags_after = dict(name='article')
remove_tags_after = dict(name='article')
keep_only_tags = [
dict(name='h3', attrs={'class':'title'}),
dict(name='div', attrs={'class':'copete'}),
dict(name='article', attrs={'class':'texto'})
]
dict(name='h3', attrs={'class': 'title'}),
dict(name='div', attrs={'class': 'copete'}),
dict(name='article', attrs={'class': 'texto'})
]
remove_tags = [
dict(name=['object','link'])
]
dict(name=['object', 'link'])
]
remove_attributes = ['width','height', 'style', 'font', 'color']
remove_attributes = ['width', 'height', 'style', 'font', 'color']
extra_css = '''
h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
@ -44,15 +45,13 @@ class Noticias(BasicNewsRecipe):
p {font-family:Arial,Helvetica,sans-serif;}
'''
feeds = [
(u'Titulares', u'http://www.180.com.uy/feed.php')
]
(u'Titulares', u'http://www.180.com.uy/feed.php')
]
def get_cover_url(self):
pass
pass
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup

View File

@ -22,7 +22,7 @@ class E1843(BasicNewsRecipe):
encoding = 'utf-8'
keep_only_tags = [
dict(name='h1', attrs={'class':'title'}),
dict(name='h1', attrs={'class': 'title'}),
classes('field-name-field-rubric-summary article-header__overlay-main-image meta-info__author article__body'),
]
@ -54,7 +54,8 @@ class E1843(BasicNewsRecipe):
r = div.find(**classes('article-rubric'))
if r is not None:
desc = self.tag_to_string(r)
articles.append({'title':title, 'url':url, 'description':desc})
articles.append(
{'title': title, 'url': url, 'description': desc})
if current_section and articles:
ans.append((current_section, articles))

View File

@ -1,8 +1,8 @@
__license__ = 'GPL v3'
__author__ = 'Luis Hernandez'
__license__ = 'GPL v3'
__author__ = 'Luis Hernandez'
__copyright__ = 'Luis Hernandez<tolyluis@gmail.com>'
__version__ = 'v0.85'
__date__ = '31 January 2011'
__version__ = 'v0.85'
__date__ = '31 January 2011'
'''
www.20minutos.es
@ -10,46 +10,39 @@ www.20minutos.es
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1294946868(BasicNewsRecipe):
title = u'20 Minutos new'
publisher = u'Grupo 20 Minutos'
title = u'20 Minutos new'
publisher = u'Grupo 20 Minutos'
__author__ = 'Luis Hernandez'
description = 'Free spanish newspaper'
cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
__author__ = 'Luis Hernandez'
description = 'Free spanish newspaper'
cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
oldest_article = 2
max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
no_stylesheets = True
use_embedded_content = False
encoding = 'ISO-8859-1'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
remove_empty_feeds = True
encoding = 'ISO-8859-1'
language = 'es'
timefmt = '[%a, %d %b, %Y]'
remove_empty_feeds = True
keep_only_tags = [
dict(name='div', attrs={'id':['content','vinetas',]})
,dict(name='div', attrs={'class':['boxed','description','lead','article-content','cuerpo estirar']})
,dict(name='span', attrs={'class':['photo-bar']})
,dict(name='ul', attrs={'class':['article-author']})
]
keep_only_tags = [
dict(name='div', attrs={'id': ['content', 'vinetas', ]}), dict(name='div', attrs={'class': ['boxed', 'description', 'lead', 'article-content', 'cuerpo estirar']}), dict(name='span', attrs={'class': ['photo-bar']}), dict(name='ul', attrs={'class': ['article-author']}) # noqa
]
remove_tags_before = dict(name='ul' , attrs={'class':['servicios-sub']})
remove_tags_after = dict(name='div' , attrs={'class':['related-news','col']})
remove_tags_before = dict(name='ul', attrs={'class': ['servicios-sub']})
remove_tags_after = dict(
name='div', attrs={'class': ['related-news', 'col']})
remove_tags = [
dict(name='ol', attrs={'class':['navigation',]})
,dict(name='span', attrs={'class':['action']})
,dict(name='div', attrs={'class':['twitter comments-list hidden','related-news','col','photo-gallery','photo-gallery side-art-block','calendario','article-comment','postto estirar','otras_vinetas estirar','kment','user-actions']})
,dict(name='div', attrs={'id':['twitter-destacados','eco-tabs','inner','vineta_calendario','vinetistas clearfix','otras_vinetas estirar','MIN1','main','SUP1','INT']})
,dict(name='ul', attrs={'class':['article-user-actions','stripped-list']})
,dict(name='ul', attrs={'id':['site-links']})
,dict(name='li', attrs={'class':['puntuacion','enviar','compartir']})
]
dict(name='ol', attrs={'class': ['navigation', ]}), dict(name='span', attrs={'class': ['action']}), dict(name='div', attrs={'class': ['twitter comments-list hidden', 'related-news', 'col', 'photo-gallery', 'photo-gallery side-art-block', 'calendario', 'article-comment', 'postto estirar', 'otras_vinetas estirar', 'kment', 'user-actions']}), dict( name='div', attrs={'id': ['twitter-destacados', 'eco-tabs', 'inner', 'vineta_calendario', 'vinetistas clearfix', 'otras_vinetas estirar', 'MIN1', 'main', 'SUP1', 'INT']}), dict(name='ul', attrs={'class': ['article-user-actions', 'stripped-list']}), dict(name='ul', attrs={'id': ['site-links']}), dict(name='li', attrs={'class': ['puntuacion', 'enviar', 'compartir']}) # noqa
]
extra_css = """
p{text-align: justify; font-size: 100%}
@ -57,23 +50,25 @@ class AdvancedUserRecipe1294946868(BasicNewsRecipe):
h3{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
"""
preprocess_regexps = [(re.compile(r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]
preprocess_regexps = [(re.compile(
r'<a href="http://estaticos.*?[0-999]px;" target="_blank">', re.DOTALL), lambda m: '')]
feeds = [
(u'Portada' , u'http://www.20minutos.es/rss/')
,(u'Nacional' , u'http://www.20minutos.es/rss/nacional/')
,(u'Internacional' , u'http://www.20minutos.es/rss/internacional/')
,(u'Economia' , u'http://www.20minutos.es/rss/economia/')
,(u'Deportes' , u'http://www.20minutos.es/rss/deportes/')
,(u'Tecnologia' , u'http://www.20minutos.es/rss/tecnologia/')
,(u'Gente - TV' , u'http://www.20minutos.es/rss/gente-television/')
,(u'Motor' , u'http://www.20minutos.es/rss/motor/')
,(u'Salud' , u'http://www.20minutos.es/rss/belleza-y-salud/')
,(u'Viajes' , u'http://www.20minutos.es/rss/viajes/')
,(u'Vivienda' , u'http://www.20minutos.es/rss/vivienda/')
,(u'Empleo' , u'http://www.20minutos.es/rss/empleo/')
,(u'Cine' , u'http://www.20minutos.es/rss/cine/')
,(u'Musica' , u'http://www.20minutos.es/rss/musica/')
,(u'Vinetas' , u'http://www.20minutos.es/rss/vinetas/')
,(u'Comunidad20' , u'http://www.20minutos.es/rss/zona20/')
]
(u'Portada', u'http://www.20minutos.es/rss/'),
(u'Nacional', u'http://www.20minutos.es/rss/nacional/'),
(u'Internacional', u'http://www.20minutos.es/rss/internacional/'),
(u'Economia', u'http://www.20minutos.es/rss/economia/'),
(u'Deportes', u'http://www.20minutos.es/rss/deportes/'),
(u'Tecnologia', u'http://www.20minutos.es/rss/tecnologia/'),
(u'Gente - TV', u'http://www.20minutos.es/rss/gente-television/'),
(u'Motor', u'http://www.20minutos.es/rss/motor/'),
(u'Salud', u'http://www.20minutos.es/rss/belleza-y-salud/'),
(u'Viajes', u'http://www.20minutos.es/rss/viajes/'),
(u'Vivienda', u'http://www.20minutos.es/rss/vivienda/'),
(u'Empleo', u'http://www.20minutos.es/rss/empleo/'),
(u'Cine', u'http://www.20minutos.es/rss/cine/'),
(u'Musica', u'http://www.20minutos.es/rss/musica/'),
(u'Vinetas', u'http://www.20minutos.es/rss/vinetas/'),
(u'Comunidad20', u'http://www.20minutos.es/rss/zona20/')
]

View File

@ -1,33 +1,34 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011 Aurélien Chabot <contact@aurelienchabot.fr>'
'''
20minutes.fr
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Minutes(BasicNewsRecipe):
title = '20 minutes'
__author__ = u'Aurélien Chabot'
description = 'Actualités'
encoding = 'utf-8'
publisher = '20minutes.fr'
category = 'Actualités, France, Monde'
language = 'fr'
title = '20 minutes'
__author__ = u'Aurélien Chabot'
description = 'Actualités'
encoding = 'utf-8'
publisher = '20minutes.fr'
category = 'Actualités, France, Monde'
language = 'fr'
use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 15
no_stylesheets = True
remove_empty_feeds = True
use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 15
no_stylesheets = True
remove_empty_feeds = True
keep_only_tags = [
dict(name='h1'),
dict(attrs={'class':lambda x: x and 'lt-content' in x.split()}),
dict(attrs={'class': lambda x: x and 'lt-content' in x.split()}),
]
remove_tags = [
dict(attrs={'class':lambda x:x and 'content-related' in x.split()}),
dict(attrs={'class': lambda x: x and 'content-related' in x.split()}),
]
remove_tags_after = dict(id='ob_holder')

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.20minutos.es
@ -6,45 +6,44 @@ www.20minutos.es
from calibre.web.feeds.news import BasicNewsRecipe
class t20Minutos(BasicNewsRecipe):
title = '20 Minutos'
__author__ = 'Darko Miletic'
description = 'Diario de informacion general y local mas leido de Espania, noticias de ultima hora de Espania, el mundo, local, deportes, noticias curiosas y mas'
publisher = '20 Minutos Online SL'
category = 'news, politics, Spain'
oldest_article = 2
title = '20 Minutos'
__author__ = 'Darko Miletic'
description = 'Diario de informacion general y local mas leido de Espania, noticias de ultima hora de Espania, el mundo, local, deportes, noticias curiosas y mas' # noqa
publisher = '20 Minutos Online SL'
category = 'news, politics, Spain'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = True
language = 'es'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://estaticos.20minutos.es/css4/img/ui/logo-301x54.png'
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = True
language = 'es'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://estaticos.20minutos.es/css4/img/ui/logo-301x54.png'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags = [dict(attrs={'class':'mf-viral'})]
remove_attributes=['border']
remove_tags = [dict(attrs={'class': 'mf-viral'})]
remove_attributes = ['border']
feeds = [
(u'Principal' , u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss')
,(u'Cine' , u'http://20minutos.feedsportal.com/c/32489/f/478285/index.rss')
,(u'Internacional' , u'http://20minutos.feedsportal.com/c/32489/f/492689/index.rss')
,(u'Deportes' , u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss')
,(u'Nacional' , u'http://20minutos.feedsportal.com/c/32489/f/492688/index.rss')
,(u'Economia' , u'http://20minutos.feedsportal.com/c/32489/f/492690/index.rss')
,(u'Tecnologia' , u'http://20minutos.feedsportal.com/c/32489/f/478292/index.rss')
]
(u'Principal', u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss'),
(u'Cine', u'http://20minutos.feedsportal.com/c/32489/f/478285/index.rss'),
(u'Internacional', u'http://20minutos.feedsportal.com/c/32489/f/492689/index.rss'),
(u'Deportes', u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss'),
(u'Nacional', u'http://20minutos.feedsportal.com/c/32489/f/492688/index.rss'),
(u'Economia', u'http://20minutos.feedsportal.com/c/32489/f/492690/index.rss'),
(u'Tecnologia', u'http://20minutos.feedsportal.com/c/32489/f/478292/index.rss')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
@ -52,17 +51,16 @@ class t20Minutos(BasicNewsRecipe):
for item in soup.findAll('a'):
limg = item.find('img')
if item.string is not None:
str = item.string
item.replaceWith(str)
str = item.string
item.replaceWith(str)
else:
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
if limg:
item.name = 'div'
item.attrs = []
else:
str = self.tag_to_string(item)
item.replaceWith(str)
for item in soup.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
if not item.has_key('alt'): # noqa
item['alt'] = 'image'
return soup

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -11,51 +11,50 @@ import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Cro24Sata(BasicNewsRecipe):
title = '24 Sata - Hr'
__author__ = 'Darko Miletic'
description = "News Portal from Croatia"
publisher = '24sata.hr'
category = 'news, politics, Croatia'
oldest_article = 2
title = '24 Sata - Hr'
__author__ = 'Darko Miletic'
description = "News Portal from Croatia"
publisher = '24sata.hr'
category = 'news, politics, Croatia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'hr'
lang = 'hr-HR'
lang = 'hr-HR'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}' # noqa
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': lang, 'pretty_print': True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [
dict(name=['object','link','embed'])
,dict(name='table', attrs={'class':'enumbox'})
]
dict(name=['object', 'link', 'embed']), dict(
name='table', attrs={'class': 'enumbox'})
]
feeds = [(u'Najnovije Vijesti', u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')]
feeds = [(u'Najnovije Vijesti',
u'http://www.24sata.hr/index.php?cmd=show_rss&action=novo')]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
soup.html['lang'] = self.lang
mlang = Tag(soup, 'meta', [
("http-equiv", "Content-Language"), ("content", self.lang)])
mcharset = Tag(soup, 'meta', [
("http-equiv", "Content-Type"), ("content", "text/html; charset=UTF-8")])
soup.head.insert(0, mlang)
soup.head.insert(1, mcharset)
for item in soup.findAll(style=True):
del item['style']
return soup
def print_version(self, url):
return url + '&action=ispis'

View File

@ -1,6 +1,6 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -10,40 +10,38 @@ __copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class Ser24Sata(BasicNewsRecipe):
title = '24 Sata - Sr'
__author__ = 'Darko Miletic'
description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia'
oldest_article = 2
title = '24 Sata - Sr'
__author__ = 'Darko Miletic'
description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
publication_type = 'newsportal'
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
publication_type = 'newsportal'
extra_css = """
@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
body{font-family: serif1, serif}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [
(u'Vesti' , u'http://www.24sata.rs/rss/vesti.xml' ),
(u'Sport' , u'http://www.24sata.rs/rss/sport.xml' ),
(u'Šou' , u'http://www.24sata.rs/rss/sou.xml' ),
(u'Specijal', u'http://www.24sata.rs/rss/specijal.xml'),
(u'Novi Sad', u'http://www.24sata.rs/rss/ns.xml' )
]
(u'Vesti', u'http://www.24sata.rs/rss/vesti.xml'),
(u'Sport', u'http://www.24sata.rs/rss/sport.xml'),
(u'Šou', u'http://www.24sata.rs/rss/sou.xml'),
(u'Specijal', u'http://www.24sata.rs/rss/specijal.xml'),
(u'Novi Sad', u'http://www.24sata.rs/rss/ns.xml')
]
def print_version(self, url):
dpart, spart, apart = url.rpartition('/')

View File

@ -3,44 +3,63 @@
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1438446837(BasicNewsRecipe):
title = '3DNews: Daily Digital Digest'
title = '3DNews: Daily Digital Digest'
__author__ = 'bugmen00t'
description = 'Независимое российское онлайн-издание, посвященное цифровым технологиям'
publisher = '3DNews'
category = 'news'
description = 'Независимое российское онлайн-издание, посвященное цифровым технологиям'
publisher = '3DNews'
category = 'news'
cover_url = u'http://www.3dnews.ru/assets/images/logo.png'
language = 'ru'
auto_cleanup = True
language = 'ru'
auto_cleanup = True
oldest_article = 15
max_articles_per_feed = 60
feeds = [
('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Hardware', 'http://www.3dnews.ru/news/rss/'),
('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Software', 'http://www.3dnews.ru/software-news/rss/'),
('\u0423\u043c\u043d\u044b\u0435 \u0432\u0435\u0449\u0438', 'http://www.3dnews.ru/smart-things/rss/'),
('\u0410\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u0430', 'http://www.3dnews.ru/editorial/rss/'),
('\u041f\u0440\u043e\u0446\u0435\u0441\u0441\u043e\u0440\u044b \u0438 \u043f\u0430\u043c\u044f\u0442\u044c', 'http://www.3dnews.ru/cpu/rss/'),
('\u041c\u0430\u0442\u0435\u0440\u0438\u043d\u0441\u043a\u0438\u0435 \u043f\u043b\u0430\u0442\u044b', 'http://www.3dnews.ru/motherboard/rss/'),
feeds = [
('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Hardware',
'http://www.3dnews.ru/news/rss/'),
('\u041d\u043e\u0432\u043e\u0441\u0442\u0438 Software',
'http://www.3dnews.ru/software-news/rss/'),
('\u0423\u043c\u043d\u044b\u0435 \u0432\u0435\u0449\u0438',
'http://www.3dnews.ru/smart-things/rss/'),
('\u0410\u043d\u0430\u043b\u0438\u0442\u0438\u043a\u0430',
'http://www.3dnews.ru/editorial/rss/'),
('\u041f\u0440\u043e\u0446\u0435\u0441\u0441\u043e\u0440\u044b \u0438 \u043f\u0430\u043c\u044f\u0442\u044c',
'http://www.3dnews.ru/cpu/rss/'),
('\u041c\u0430\u0442\u0435\u0440\u0438\u043d\u0441\u043a\u0438\u0435 \u043f\u043b\u0430\u0442\u044b',
'http://www.3dnews.ru/motherboard/rss/'),
('\u041a\u043e\u0440\u043f\u0443\u0441\u0430, \u0411\u041f \u0438 \u043e\u0445\u043b\u0430\u0436\u0434\u0435\u043d\u0438\u0435',
'http://www.3dnews.ru/cooling/rss/'),
('\u0412\u0438\u0434\u0435\u043e\u043a\u0430\u0440\u0442\u044b', 'http://www.3dnews.ru/video/rss/'),
('\u041c\u043e\u043d\u0438\u0442\u043e\u0440\u044b \u0438 \u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0440\u044b', 'http://www.3dnews.ru/display/rss/'),
('\u041d\u0430\u043a\u043e\u043f\u0438\u0442\u0435\u043b\u0438', 'http://www.3dnews.ru/storage/rss/'),
('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0439 \u0430\u0432\u0442\u043e\u043c\u043e\u0431\u0438\u043b\u044c', 'http://www.3dnews.ru/auto/rss/'),
('\u0421\u043e\u0442\u043e\u0432\u0430\u044f \u0441\u0432\u044f\u0437\u044c', 'http://www.3dnews.ru/phone/rss/'),
('\u041f\u0435\u0440\u0438\u0444\u0435\u0440\u0438\u044f', 'http://www.3dnews.ru/peripheral/rss/'),
('\u041d\u043e\u0443\u0442\u0431\u0443\u043a\u0438 \u0438 \u041f\u041a', 'http://www.3dnews.ru/mobile/rss/'),
('\u041f\u043b\u0430\u043d\u0448\u0435\u0442\u044b', 'http://www.3dnews.ru/tablets/rss/'),
('\u0417\u0432\u0443\u043a \u0438 \u0430\u043a\u0443\u0441\u0442\u0438\u043a\u0430', 'http://www.3dnews.ru/multimedia/rss/'),
('\u0412\u0438\u0434\u0435\u043e\u043a\u0430\u0440\u0442\u044b',
'http://www.3dnews.ru/video/rss/'),
('\u041c\u043e\u043d\u0438\u0442\u043e\u0440\u044b \u0438 \u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0440\u044b',
'http://www.3dnews.ru/display/rss/'),
('\u041d\u0430\u043a\u043e\u043f\u0438\u0442\u0435\u043b\u0438',
'http://www.3dnews.ru/storage/rss/'),
('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0439 \u0430\u0432\u0442\u043e\u043c\u043e\u0431\u0438\u043b\u044c',
'http://www.3dnews.ru/auto/rss/'),
('\u0421\u043e\u0442\u043e\u0432\u0430\u044f \u0441\u0432\u044f\u0437\u044c',
'http://www.3dnews.ru/phone/rss/'),
('\u041f\u0435\u0440\u0438\u0444\u0435\u0440\u0438\u044f',
'http://www.3dnews.ru/peripheral/rss/'),
('\u041d\u043e\u0443\u0442\u0431\u0443\u043a\u0438 \u0438 \u041f\u041a',
'http://www.3dnews.ru/mobile/rss/'),
('\u041f\u043b\u0430\u043d\u0448\u0435\u0442\u044b',
'http://www.3dnews.ru/tablets/rss/'),
('\u0417\u0432\u0443\u043a \u0438 \u0430\u043a\u0443\u0441\u0442\u0438\u043a\u0430',
'http://www.3dnews.ru/multimedia/rss/'),
('\u0426\u0438\u0444\u0440\u043e\u0432\u043e\u0435 \u0444\u043e\u0442\u043e \u0438 \u0432\u0438\u0434\u0435\u043e',
'http://www.3dnews.ru/digital/rss/'),
('\u0421\u0435\u0442\u0438 \u0438 \u043a\u043e\u043c\u043c\u0443\u043d\u0438\u043a\u0430\u0446\u0438\u0438', 'http://www.3dnews.ru/communication/rss/'),
('\u0421\u0435\u0442\u0438 \u0438 \u043a\u043e\u043c\u043c\u0443\u043d\u0438\u043a\u0430\u0446\u0438\u0438',
'http://www.3dnews.ru/communication/rss/'),
('\u0418\u0433\u0440\u044b', 'http://www.3dnews.ru/games/rss/'),
('\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u043d\u043e\u0435 \u043e\u0431\u0435\u0441\u043f\u0435\u0447\u0435\u043d\u0438\u0435',
'http://www.3dnews.ru/software/rss/'),
('Off-\u0441\u044f\u043d\u043a\u0430', 'http://www.3dnews.ru/offsyanka/rss/'),
('\u041c\u0430\u0441\u0442\u0435\u0440\u0441\u043a\u0430\u044f', 'http://www.3dnews.ru/workshop/rss/'),
('Off-\u0441\u044f\u043d\u043a\u0430',
'http://www.3dnews.ru/offsyanka/rss/'),
('\u041c\u0430\u0441\u0442\u0435\u0440\u0441\u043a\u0430\u044f',
'http://www.3dnews.ru/workshop/rss/'),
('ServerNews', 'http://servernews.ru/rss'),
]

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
elargentino.com
@ -9,33 +9,26 @@ elargentino.com
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SieteDias(BasicNewsRecipe):
title = '7 dias'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, show, Argentina'
oldest_article = 7
title = '7 dias'
__author__ = 'Darko Miletic'
description = 'Revista Argentina'
publisher = 'ElArgentino.com'
category = 'news, politics, show, Argentina'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
language = 'es_AR'
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
lang = 'es-AR'
direction = 'ltr'
INDEX = 'http://www.elargentino.com/medios/125/7-Dias.html'
extra_css = ' .titulo{font-size: x-large; font-weight: bold} .volantaImp{font-size: small; font-weight: bold} '
html2lrf_options = [
'--comment' , description
, '--category' , category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
keep_only_tags = [dict(name='div', attrs={'class':'ContainerPop'})]
keep_only_tags = [dict(name='div', attrs={'class': 'ContainerPop'})]
remove_tags = [dict(name='link')]
@ -50,20 +43,23 @@ class SieteDias(BasicNewsRecipe):
for item in soup.findAll(style=True):
del item['style']
soup.html['lang'] = self.lang
soup.html['dir' ] = self.direction
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
soup.html['dir'] = self.direction
mlang = Tag(soup, 'meta', [
("http-equiv", "Content-Language"), ("content", self.lang)])
mcharset = Tag(soup, 'meta', [
("http-equiv", "Content-Type"), ("content", "text/html; charset=utf-8")])
soup.head.insert(0, mlang)
soup.head.insert(1, mcharset)
return soup
def get_cover_url(self):
cover_url = None
soup = self.index_to_soup(self.INDEX)
cover_item = soup.find('div',attrs={'class':'colder'})
cover_item = soup.find('div', attrs={'class': 'colder'})
if cover_item:
clean_url = self.image_url_processor(None,cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
clean_url = self.image_url_processor(
None, cover_item.div.img['src'])
cover_url = 'http://www.elargentino.com' + clean_url + '&height=600'
return cover_url
def image_url_processor(self, baseurl, url):

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
sapteseri.ro
@ -9,43 +9,40 @@ sapteseri.ro
from calibre.web.feeds.news import BasicNewsRecipe
class SapteSeri(BasicNewsRecipe):
title = u'Sapte Seri'
__author__ = u'Silviu Cotoar\u0103'
description = u'Sapte Seri'
publisher = u'Sapte Seri'
oldest_article = 5
language = 'ro'
title = u'Sapte Seri'
__author__ = u'Silviu Cotoar\u0103'
description = u'Sapte Seri'
publisher = u'Sapte Seri'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Oras,Distractie,Fun'
encoding = 'utf-8'
remove_empty_feeds = True
remove_javascript = True
cover_url = 'http://www.sapteseri.ro/Images/logo.jpg'
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Oras,Distractie,Fun'
encoding = 'utf-8'
remove_empty_feeds = True
remove_javascript = True
cover_url = 'http://www.sapteseri.ro/Images/logo.jpg'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
'comments': description, 'tags': category, 'language': language, 'publisher': publisher
}
keep_only_tags = [
dict(name='h1', attrs={'id':'title'})
, dict(name='div', attrs={'class':'mt10 mb10'})
, dict(name='div', attrs={'class':'mb20 mt10'})
, dict(name='div', attrs={'class':'mt5 mb20'})
]
dict(name='h1', attrs={'id': 'title'}), dict(name='div', attrs={'class': 'mt10 mb10'}), dict(
name='div', attrs={'class': 'mb20 mt10'}), dict(name='div', attrs={'class': 'mt5 mb20'})
]
remove_tags = [
dict(name='div', attrs={'id':['entityimgworking']})
]
dict(name='div', attrs={'id': ['entityimgworking']})
]
feeds = [
(u'Ce se intampla azi in Bucuresti', u'http://www.sapteseri.ro/ro/feed/ce-se-intampla-azi/bucuresti/')
]
feeds = [
(u'Ce se intampla azi in Bucuresti',
u'http://www.sapteseri.ro/ro/feed/ce-se-intampla-azi/bucuresti/')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -1,69 +1,70 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 01-05-2010'
description = 'Italian daily newspaper - 01-05-2010'
'''
http://www.ansa.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ansa(BasicNewsRecipe):
__author__ = 'Gabriele Marini'
description = 'Italian News Agency'
__author__ = 'Gabriele Marini'
description = 'Italian News Agency'
cover_url = 'http://www.ansa.it/web/images/logo_ansa_interna.gif'
title = u'Ansa'
publisher = 'Ansa'
category = 'News, politics, culture, economy, general interest'
cover_url = 'http://www.ansa.it/web/images/logo_ansa_interna.gif'
title = u'Ansa'
publisher = 'Ansa'
category = 'News, politics, culture, economy, general interest'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
max_articles_per_feed = 10
use_embedded_content = False
recursion = 10
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
conversion_options = {'linearize_tables':True}
no_stylesheets = True
conversion_options = {'linearize_tables': True}
remove_attributes = ['colspan']
keep_only_tags = [dict(name='div', attrs={'class':['path','header-content','corpo']}),
]
remove_tags = [
dict(name='div', attrs={'class':'tools-bar'}),
dict(name='div', attrs={'id':['rssdiv','blocco']})
]
feeds = [
(u'HomePage', u'http://www.ansa.it/web/ansait_web_rss_homepage.xml'),
(u'Top New', u'http://www.ansa.it/web/notizie/rubriche/topnews/topnews_rss.xml'),
(u'Cronaca', u'http://www.ansa.it/web/notizie/rubriche/cronaca/cronaca_rss.xml'),
(u'Mondo', u'http://www.ansa.it/web/notizie/rubriche/mondo/mondo_rss.xml'),
(u'Economia', u'http://www.ansa.it/web/notizie/rubriche/economia/economia_rss.xml'),
(u'Politica', u'http://www.ansa.it/web/notizie/rubriche/politica/politica_rss.xml'),
(u'Scienze', u'http://www.ansa.it/web/notizie/rubriche/scienza/scienza_rss.xml'),
(u'Cinema', u'http://www.ansa.it/web/notizie/rubriche/cinema/cinema_rss.xml'),
(u'Tecnologia e Internet', u'http://www.ansa.it/web/notizie/rubriche/tecnologia/tecnologia_rss.xml'),
(u'Spettacolo', u'http://www.ansa.it/web/notizie/rubriche/spettacolo/spettacolo_rss.xml'),
(u'Cultura e Tendenze', u'http://www.ansa.it/web/notizie/rubriche/cultura/cultura_rss.xml'),
(u'Sport', u'http://www.ansa.it/web/notizie/rubriche/altrisport/altrisport_rss.xml'),
(u'Calcio', u'http://www.ansa.it/web/notizie/rubriche/calcio/calcio_rss.xml'),
(u'Lazio', u'http://www.ansa.it/web/notizie/regioni/lazio/lazio_rss.xml'),
(u'Lombardia', u'http://www.ansa.it/web/notizie/regioni/lombardia/lombardia.shtml'),
(u'Veneto', u'http://www.ansa.it/web/notizie/regioni/veneto/veneto.shtml'),
(u'Campanioa', u'http://www.ansa.it/web/notizie/regioni/campania/campania.shtml'),
(u'Sicilia', u'http://www.ansa.it/web/notizie/regioni/sicilia/sicilia.shtml'),
(u'Toscana', u'http://www.ansa.it/web/notizie/regioni/toscana/toscana.shtml'),
(u'Trentino', u'http://www.ansa.it/web/notizie/regioni/trentino/trentino.shtml')
keep_only_tags = [dict(name='div', attrs={'class': ['path', 'header-content', 'corpo']}),
]
remove_tags = [
dict(name='div', attrs={'class': 'tools-bar'}),
dict(name='div', attrs={'id': ['rssdiv', 'blocco']})
]
feeds = [
(u'HomePage', u'http://www.ansa.it/web/ansait_web_rss_homepage.xml'),
(u'Top New', u'http://www.ansa.it/web/notizie/rubriche/topnews/topnews_rss.xml'),
(u'Cronaca', u'http://www.ansa.it/web/notizie/rubriche/cronaca/cronaca_rss.xml'),
(u'Mondo', u'http://www.ansa.it/web/notizie/rubriche/mondo/mondo_rss.xml'),
(u'Economia', u'http://www.ansa.it/web/notizie/rubriche/economia/economia_rss.xml'),
(u'Politica', u'http://www.ansa.it/web/notizie/rubriche/politica/politica_rss.xml'),
(u'Scienze', u'http://www.ansa.it/web/notizie/rubriche/scienza/scienza_rss.xml'),
(u'Cinema', u'http://www.ansa.it/web/notizie/rubriche/cinema/cinema_rss.xml'),
(u'Tecnologia e Internet',
u'http://www.ansa.it/web/notizie/rubriche/tecnologia/tecnologia_rss.xml'),
(u'Spettacolo', u'http://www.ansa.it/web/notizie/rubriche/spettacolo/spettacolo_rss.xml'),
(u'Cultura e Tendenze',
u'http://www.ansa.it/web/notizie/rubriche/cultura/cultura_rss.xml'),
(u'Sport', u'http://www.ansa.it/web/notizie/rubriche/altrisport/altrisport_rss.xml'),
(u'Calcio', u'http://www.ansa.it/web/notizie/rubriche/calcio/calcio_rss.xml'),
(u'Lazio', u'http://www.ansa.it/web/notizie/regioni/lazio/lazio_rss.xml'),
(u'Lombardia', u'http://www.ansa.it/web/notizie/regioni/lombardia/lombardia.shtml'),
(u'Veneto', u'http://www.ansa.it/web/notizie/regioni/veneto/veneto.shtml'),
(u'Campanioa', u'http://www.ansa.it/web/notizie/regioni/campania/campania.shtml'),
(u'Sicilia', u'http://www.ansa.it/web/notizie/regioni/sicilia/sicilia.shtml'),
(u'Toscana', u'http://www.ansa.it/web/notizie/regioni/toscana/toscana.shtml'),
(u'Trentino', u'http://www.ansa.it/web/notizie/regioni/trentino/trentino.shtml')
]
extra_css = '''
.path{font-style: italic; font-size: small}
.header-content h1{font-weight: bold; font-size: xx-large}

View File

@ -1,21 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re
class DrawAndCook(BasicNewsRecipe):
title = 'DrawAndCook'
__author__ = 'Starson17'
__version__ = 'v1.10'
__date__ = '13 March 2011'
description = 'Drawings of recipes!'
language = 'en'
publisher = 'Starson17'
category = 'news, food, recipes'
use_embedded_content= False
no_stylesheets = True
oldest_article = 24
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://farm5.static.flickr.com/4043/4471139063_4dafced67f_o.jpg'
title = 'DrawAndCook'
__author__ = 'Starson17'
__version__ = 'v1.10'
__date__ = '13 March 2011'
description = 'Drawings of recipes!'
language = 'en'
publisher = 'Starson17'
category = 'news, food, recipes'
use_embedded_content = False
no_stylesheets = True
oldest_article = 24
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://farm5.static.flickr.com/4043/4471139063_4dafced67f_o.jpg'
INDEX = 'http://www.theydrawandcook.com'
max_articles_per_feed = 30
@ -24,8 +25,8 @@ class DrawAndCook(BasicNewsRecipe):
def parse_index(self):
feeds = []
for title, url in [
("They Draw and Cook", "http://www.theydrawandcook.com/")
]:
("They Draw and Cook", "http://www.theydrawandcook.com/")
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
@ -38,22 +39,24 @@ class DrawAndCook(BasicNewsRecipe):
date = ''
current_articles = []
soup = self.index_to_soup(url)
featured_major_slider = soup.find(name='div', attrs={'id':'featured_major_slider'})
recipes = featured_major_slider.findAll('li', attrs={'data-id': re.compile(r'artwork_entry_\d+', re.DOTALL)})
featured_major_slider = soup.find(
name='div', attrs={'id': 'featured_major_slider'})
recipes = featured_major_slider.findAll(
'li', attrs={'data-id': re.compile(r'artwork_entry_\d+', re.DOTALL)})
for recipe in recipes:
page_url = self.INDEX + recipe.a['href']
print 'page_url is: ', page_url
title = recipe.find('strong').string
print 'title is: ', title
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':date})
current_articles.append(
{'title': title, 'url': page_url, 'description': '', 'date': date})
return current_articles
keep_only_tags = [dict(name='h1', attrs={'id':'page_title'})
,dict(name='section', attrs={'id':'artwork'})
]
keep_only_tags = [dict(name='h1', attrs={'id': 'page_title'}), dict(name='section', attrs={'id': 'artwork'})
]
remove_tags = [dict(name='article', attrs={'id':['recipe_actions', 'metadata']})
]
remove_tags = [dict(name='article', attrs={'id': ['recipe_actions', 'metadata']})
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
@ -61,5 +64,4 @@ class DrawAndCook(BasicNewsRecipe):
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
'''

View File

@ -2,9 +2,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
class ZiveRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = 'Abelturd'
language = 'sk'
version = 1
@ -25,21 +24,20 @@ class ZiveRecipe(BasicNewsRecipe):
cover_url = 'http://www.zive.sk/Client.Images/Logos/logo-zive-sk.gif'
feeds = []
feeds.append((u'V\u0161etky \u010dl\xe1nky', u'http://www.zive.sk/rss/sc-47/default.aspx'))
feeds.append((u'V\u0161etky \u010dl\xe1nky',
u'http://www.zive.sk/rss/sc-47/default.aspx'))
preprocess_regexps = [
(re.compile(r'<p><p><strong>Pokra.*ie</strong></p>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
(re.compile(r'<p><p><strong>Pokra.*ie</strong></p>', re.DOTALL | re.IGNORECASE),
lambda match: ''),
]
remove_tags = []
keep_only_tags = [dict(name='h1'), dict(name='span', attrs={'class':'arlist-data-info-author'}), dict(name='div', attrs={'class':'bbtext font-resizer-area'}),]
keep_only_tags = [dict(name='h1'), dict(name='span', attrs={
'class': 'arlist-data-info-author'}), dict(name='div', attrs={'class': 'bbtext font-resizer-area'}), ]
extra_css = '''
h1 {font-size:140%;font-family:georgia,serif; font-weight:bold}
h3 {font-size:115%;font-family:georgia,serif; font-weight:bold}
'''

View File

@ -1,71 +1,113 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe(BasicNewsRecipe):
title = u'Aachener Nachrichten'
__author__ = 'schuster' #AGE update 2012-11-28
oldest_article = 1
title = u'Aachener Nachrichten'
__author__ = 'schuster' # AGE update 2012-11-28
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
language = 'de'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
language = 'de'
# cover_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png'
masthead_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png'
masthead_url = 'http://www.aachener-nachrichten.de/img/logos/an_website_retina.png'
keep_only_tags = [
dict(name='article', attrs={'class':['single']})
]
keep_only_tags = [
dict(name='article', attrs={'class': ['single']})
]
remove_tags = [
dict(name='div', attrs={'class':["clearfix navi-wrapper"]}),
dict(name='div', attrs={'id':["article_actions"]}),
dict(name='style', attrs={'type':["text/css"]}),
dict(name='aside'),
dict(name='a', attrs={'class':["btn btn-action"]})
]
dict(name='div', attrs={'class': ["clearfix navi-wrapper"]}),
dict(name='div', attrs={'id': ["article_actions"]}),
dict(name='style', attrs={'type': ["text/css"]}),
dict(name='aside'),
dict(name='a', attrs={'class': ["btn btn-action"]})
]
feeds = [
(u'Lokales - Euregio', u'http://www.aachener-nachrichten.de/cmlink/euregio-rss-1.357285'),
(u'Lokales - Aachen', u'http://www.aachener-nachrichten.de/cmlink/aachen-rss-1.357286'),
(u'Lokales - Nordkreis', u'http://www.aachener-nachrichten.de/cmlink/nordkreis-rss-1.358150'),
(u'Lokales - Düren', u'http://www.aachener-nachrichten.de/cmlink/dueren-rss-1.358626'),
(u'Lokales - Eiffel', u'http://www.aachener-nachrichten.de/cmlink/eifel-rss-1.358978'),
(u'Lokales - Eschweiler', u'http://www.aachener-nachrichten.de/cmlink/eschweiler-rss-1.359332'),
(u'Lokales - Geilenkirchen', u'http://www.aachener-nachrichten.de/cmlink/geilenkirchen-rss-1.359643'),
(u'Lokales - Heinsberg', u'http://www.aachener-nachrichten.de/cmlink/heinsberg-rss-1.359724'),
(u'Lokales - Jülich', u'http://www.aachener-nachrichten.de/cmlink/juelich-rss-1.359725'),
(u'Lokales - Stolberg', u'http://www.aachener-nachrichten.de/cmlink/stolberg-rss-1.359726'),
(u'News - Politik', u'http://www.aachener-nachrichten.de/cmlink/politik-rss-1.359727'),
(u'News - Aus aller Welt', u'http://www.aachener-nachrichten.de/cmlink/ausallerwelt-rss-1.453282'),
(u'News - Wirtschaft', u'http://www.aachener-nachrichten.de/cmlink/wirtschaft-rss-1.359872'),
(u'News - Kultur', u'http://www.aachener-nachrichten.de/cmlink/kultur-rss-1.365018'),
(u'News - Kino', u'http://www.aachener-nachrichten.de/cmlink/kino-rss-1.365019'),
(u'News - Digital', u'http://www.aachener-nachrichten.de/cmlink/digital-rss-1.365020'),
(u'News - Wissenschaft', u'http://www.aachener-nachrichten.de/cmlink/wissenschaft-rss-1.365021'),
(u'News - Hochschule', u'http://www.aachener-nachrichten.de/cmlink/hochschule-rss-1.365022'),
(u'News - Auto', u'http://www.aachener-nachrichten.de/cmlink/auto-rss-1.365023'),
(u'News - Kurioses', u'http://www.aachener-nachrichten.de/cmlink/kurioses-rss-1.365067'),
(u'News - Musik', u'http://www.aachener-nachrichten.de/cmlink/musik-rss-1.365305'),
(u'News - Tagesthema', u'http://www.aachener-nachrichten.de/cmlink/tagesthema-rss-1.365519'),
(u'News - Newsticker', u'http://www.aachener-nachrichten.de/cmlink/newsticker-rss-1.451948'),
(u'Sport - Aktuell', u'http://www.aachener-nachrichten.de/cmlink/aktuell-rss-1.366716'),
(u'Sport - Fußball', u'http://www.aachener-nachrichten.de/cmlink/fussball-rss-1.367060'),
(u'Sport - Bundesliga', u'http://www.aachener-nachrichten.de/cmlink/bundesliga-rss-1.453367'),
(u'Sport - Alemannia Aachen', u'http://www.aachener-nachrichten.de/cmlink/alemanniaaachen-rss-1.366057'),
(u'Sport - Volleyball', u'http://www.aachener-nachrichten.de/cmlink/volleyball-rss-1.453370'),
(u'Sport - Chio', u'http://www.aachener-nachrichten.de/cmlink/chio-rss-1.453371'),
(u'Dossier - Kinderuni', u'http://www.aachener-nachrichten.de/cmlink/kinderuni-rss-1.453375'),
(u'Dossier - Karlspreis', u'http://www.aachener-nachrichten.de/cmlink/karlspreis-rss-1.453376'),
(u'Dossier - Ritterorden', u'http://www.aachener-nachrichten.de/cmlink/ritterorden-rss-1.453377'),
(u'Dossier - ZAB-Aachen', u'http://www.aachener-nachrichten.de/cmlink/zabaachen-rss-1.453380'),
(u'Dossier - Karneval', u'http://www.aachener-nachrichten.de/cmlink/karneval-rss-1.453384'),
(u'Ratgeber - Geld', u'http://www.aachener-nachrichten.de/cmlink/geld-rss-1.453385'),
(u'Ratgeber - Recht', u'http://www.aachener-nachrichten.de/cmlink/recht-rss-1.453386'),
(u'Ratgeber - Gesundheit', u'http://www.aachener-nachrichten.de/cmlink/gesundheit-rss-1.453387'),
(u'Ratgeber - Familie', u'http://www.aachener-nachrichten.de/cmlink/familie-rss-1.453388'),
(u'Ratgeber - Livestyle', u'http://www.aachener-nachrichten.de/cmlink/lifestyle-rss-1.453389'),
(u'Ratgeber - Reisen', u'http://www.aachener-nachrichten.de/cmlink/reisen-rss-1.453390'),
(u'Ratgeber - Bauen und Wohnen', u'http://www.aachener-nachrichten.de/cmlink/bauen-rss-1.453398'),
(u'Ratgeber - Bildung und Beruf', u'http://www.aachener-nachrichten.de/cmlink/bildung-rss-1.453400'),
]
(u'Lokales - Euregio',
u'http://www.aachener-nachrichten.de/cmlink/euregio-rss-1.357285'),
(u'Lokales - Aachen',
u'http://www.aachener-nachrichten.de/cmlink/aachen-rss-1.357286'),
(u'Lokales - Nordkreis',
u'http://www.aachener-nachrichten.de/cmlink/nordkreis-rss-1.358150'),
(u'Lokales - Düren',
u'http://www.aachener-nachrichten.de/cmlink/dueren-rss-1.358626'),
(u'Lokales - Eiffel',
u'http://www.aachener-nachrichten.de/cmlink/eifel-rss-1.358978'),
(u'Lokales - Eschweiler',
u'http://www.aachener-nachrichten.de/cmlink/eschweiler-rss-1.359332'),
(u'Lokales - Geilenkirchen',
u'http://www.aachener-nachrichten.de/cmlink/geilenkirchen-rss-1.359643'),
(u'Lokales - Heinsberg',
u'http://www.aachener-nachrichten.de/cmlink/heinsberg-rss-1.359724'),
(u'Lokales - Jülich',
u'http://www.aachener-nachrichten.de/cmlink/juelich-rss-1.359725'),
(u'Lokales - Stolberg',
u'http://www.aachener-nachrichten.de/cmlink/stolberg-rss-1.359726'),
(u'News - Politik',
u'http://www.aachener-nachrichten.de/cmlink/politik-rss-1.359727'),
(u'News - Aus aller Welt',
u'http://www.aachener-nachrichten.de/cmlink/ausallerwelt-rss-1.453282'),
(u'News - Wirtschaft',
u'http://www.aachener-nachrichten.de/cmlink/wirtschaft-rss-1.359872'),
(u'News - Kultur',
u'http://www.aachener-nachrichten.de/cmlink/kultur-rss-1.365018'),
(u'News - Kino', u'http://www.aachener-nachrichten.de/cmlink/kino-rss-1.365019'),
(u'News - Digital',
u'http://www.aachener-nachrichten.de/cmlink/digital-rss-1.365020'),
(u'News - Wissenschaft',
u'http://www.aachener-nachrichten.de/cmlink/wissenschaft-rss-1.365021'),
(u'News - Hochschule',
u'http://www.aachener-nachrichten.de/cmlink/hochschule-rss-1.365022'),
(u'News - Auto', u'http://www.aachener-nachrichten.de/cmlink/auto-rss-1.365023'),
(u'News - Kurioses',
u'http://www.aachener-nachrichten.de/cmlink/kurioses-rss-1.365067'),
(u'News - Musik',
u'http://www.aachener-nachrichten.de/cmlink/musik-rss-1.365305'),
(u'News - Tagesthema',
u'http://www.aachener-nachrichten.de/cmlink/tagesthema-rss-1.365519'),
(u'News - Newsticker',
u'http://www.aachener-nachrichten.de/cmlink/newsticker-rss-1.451948'),
(u'Sport - Aktuell',
u'http://www.aachener-nachrichten.de/cmlink/aktuell-rss-1.366716'),
(u'Sport - Fußball',
u'http://www.aachener-nachrichten.de/cmlink/fussball-rss-1.367060'),
(u'Sport - Bundesliga',
u'http://www.aachener-nachrichten.de/cmlink/bundesliga-rss-1.453367'),
(u'Sport - Alemannia Aachen',
u'http://www.aachener-nachrichten.de/cmlink/alemanniaaachen-rss-1.366057'),
(u'Sport - Volleyball',
u'http://www.aachener-nachrichten.de/cmlink/volleyball-rss-1.453370'),
(u'Sport - Chio',
u'http://www.aachener-nachrichten.de/cmlink/chio-rss-1.453371'),
(u'Dossier - Kinderuni',
u'http://www.aachener-nachrichten.de/cmlink/kinderuni-rss-1.453375'),
(u'Dossier - Karlspreis',
u'http://www.aachener-nachrichten.de/cmlink/karlspreis-rss-1.453376'),
(u'Dossier - Ritterorden',
u'http://www.aachener-nachrichten.de/cmlink/ritterorden-rss-1.453377'),
(u'Dossier - ZAB-Aachen',
u'http://www.aachener-nachrichten.de/cmlink/zabaachen-rss-1.453380'),
(u'Dossier - Karneval',
u'http://www.aachener-nachrichten.de/cmlink/karneval-rss-1.453384'),
(u'Ratgeber - Geld',
u'http://www.aachener-nachrichten.de/cmlink/geld-rss-1.453385'),
(u'Ratgeber - Recht',
u'http://www.aachener-nachrichten.de/cmlink/recht-rss-1.453386'),
(u'Ratgeber - Gesundheit',
u'http://www.aachener-nachrichten.de/cmlink/gesundheit-rss-1.453387'),
(u'Ratgeber - Familie',
u'http://www.aachener-nachrichten.de/cmlink/familie-rss-1.453388'),
(u'Ratgeber - Livestyle',
u'http://www.aachener-nachrichten.de/cmlink/lifestyle-rss-1.453389'),
(u'Ratgeber - Reisen',
u'http://www.aachener-nachrichten.de/cmlink/reisen-rss-1.453390'),
(u'Ratgeber - Bauen und Wohnen',
u'http://www.aachener-nachrichten.de/cmlink/bauen-rss-1.453398'),
(u'Ratgeber - Bildung und Beruf',
u'http://www.aachener-nachrichten.de/cmlink/bildung-rss-1.453400'),
]

View File

@ -1,43 +1,45 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ABCRecipe(BasicNewsRecipe):
title = u'ABC Linuxu'
oldest_article = 5
max_articles_per_feed = 3#5
__author__ = 'Funthomas'
language = 'cs'
title = u'ABC Linuxu'
oldest_article = 5
max_articles_per_feed = 3 # 5
__author__ = 'Funthomas'
language = 'cs'
feeds = [
#(u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'),
(u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'),
(u'Zprávičky','http://www.abclinuxu.cz/auto/zpravicky.rss')
]
feeds = [
# (u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'),
(u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'),
(u'Zprávičky', 'http://www.abclinuxu.cz/auto/zpravicky.rss')
]
remove_javascript = True
no_stylesheets = True
remove_attributes = ['width','height']
remove_javascript = True
no_stylesheets = True
remove_attributes = ['width', 'height']
remove_tags_before = dict(name='h1')
remove_tags = [
dict(attrs={'class':['meta-vypis','page_tools','cl_perex']}),
dict(attrs={'class':['cl_nadpis-link','komix-nav']})
]
remove_tags_before = dict(name='h1')
remove_tags = [
dict(attrs={'class': ['meta-vypis', 'page_tools', 'cl_perex']}),
dict(attrs={'class': ['cl_nadpis-link', 'komix-nav']})
]
remove_tags_after = [
dict(name='div',attrs={'class':['cl_perex','komix-nav']}),
dict(attrs={'class':['meta-vypis','page_tools']}),
dict(name='',attrs={'':''}),
]
remove_tags_after = [
dict(name='div', attrs={'class': ['cl_perex', 'komix-nav']}),
dict(attrs={'class': ['meta-vypis', 'page_tools']}),
dict(name='', attrs={'': ''}),
]
preprocess_regexps = [
(re.compile(r'</div>.*<p class="perex">', re.DOTALL),
lambda match: '</div><p class="perex">')
]
preprocess_regexps = [
(re.compile(r'</div>.*<p class="perex">', re.DOTALL),lambda match: '</div><p class="perex">')
]
def print_version(self, url):
return url + '?varianta=print&noDiz'
def print_version(self, url):
return url + '?varianta=print&noDiz'
extra_css = '''
extra_css = '''
h1 {font-size:130%; font-weight:bold}
h3 {font-size:111%; font-weight:bold}
'''

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
abc.net.au/news
@ -6,51 +6,50 @@ abc.net.au/news
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class ABCNews(BasicNewsRecipe):
title = 'ABC News'
__author__ = 'Pat Stapleton, Dean Cording'
description = 'News from Australia'
masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
title = 'ABC News'
__author__ = 'Pat Stapleton, Dean Cording'
description = 'News from Australia'
masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'ABC News'
category = 'News, Australia, World'
language = 'en_AU'
publication_type = 'newsportal'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
encoding = 'utf8'
publisher = 'ABC News'
category = 'News, Australia, World'
language = 'en_AU'
publication_type = 'newsportal'
# preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
# Remove annoying map links (inline-caption class is also used for some
# image captions! hence regex to match maps.google)
preprocess_regexps = [(re.compile(
r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': False
}
'comments': description, 'tags': category, 'language': language, 'publisher': publisher, 'linearize_tables': False
}
keep_only_tags = [dict(attrs={'class':['article section']})]
keep_only_tags = [dict(attrs={'class': ['article section']})]
remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
'inline-content story left', 'inline-content map left contracted', 'published',
'story-map', 'statepromo', 'topics', ]})]
remove_tags = [dict(attrs={'class': ['related', 'tags', 'tools', 'attached-content ready',
'inline-content story left', 'inline-content map left contracted', 'published',
'story-map', 'statepromo', 'topics', ]})]
remove_attributes = ['width','height']
remove_attributes = ['width', 'height']
feeds = [
('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
]
feeds = [
('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
('Science and Technology',
'http://www.abc.net.au/news/feed/2298/rss.xml'),
]

View File

@ -1,8 +1,8 @@
__license__ = 'GPL v3'
__author__ = 'Ricardo Jurado'
__license__ = 'GPL v3'
__author__ = 'Ricardo Jurado'
__copyright__ = 'Ricardo Jurado'
__version__ = 'v0.4'
__date__ = '11 February 2011'
__version__ = 'v0.4'
__date__ = '11 February 2011'
'''
http://www.abc.es/
@ -10,16 +10,17 @@ http://www.abc.es/
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1296604369(BasicNewsRecipe):
title = u'ABC.es'
masthead_url = 'http://www.abc.es/img/logo-abc.gif'
cover_url = 'http://www.abc.es/img/logo-abc.gif'
publisher = u'Grupo VOCENTO'
title = u'ABC.es'
masthead_url = 'http://www.abc.es/img/logo-abc.gif'
cover_url = 'http://www.abc.es/img/logo-abc.gif'
publisher = u'Grupo VOCENTO'
__author__ = 'Ricardo Jurado'
description = 'Noticias de Spain y el mundo'
category = 'News,Spain,National,International,Economy'
__author__ = 'Ricardo Jurado'
description = 'Noticias de Spain y el mundo'
category = 'News,Spain,National,International,Economy'
oldest_article = 2
max_articles_per_feed = 10
@ -38,20 +39,21 @@ class AdvancedUserRecipe1296604369(BasicNewsRecipe):
h1{font-family: sans-serif; font-size:150%; font-weight:bold; text-align: justify; }
"""
feeds = [
(u'PORTADA', u'http://www.abc.es/rss/feeds/abcPortada.xml')
,(u'ULTIMAS', u'http://www.abc.es/rss/feeds/abc_ultima.xml')
,(u'NACIONAL', u'http://www.abc.es/rss/feeds/abc_EspanaEspana.xml')
,(u'INTERNACIONAL', u'http://www.abc.es/rss/feeds/abc_Internacional.xml')
,(u'OPINION', u'http://www.abc.es/rss/feeds/abc_opinioncompleto.xml')
,(u'BLOGS ABC', u'http://www.abc.es/rss/feeds/blogs-abc.xml')
,(u'ECONOMIA', u'http://www.abc.es/rss/feeds/abc_Economia.xml')
,(u'CIENCIA Y TECNOLOGIA', u'http://www.abc.es/rss/feeds/abc_Ciencia_Tecnologia.xml')
,(u'CULTURA', u'http://www.abc.es/rss/feeds/abc_Cultura.xml')
,(u'LIBROS', u'http://www.abc.es/rss/feeds/abc_Libros.xml')
,(u'MEDIOS Y REDES', u'http://www.abc.es/rss/feeds/ABC_Medios_Redes.xml')
,(u'EVASION', u'http://www.abc.es/rss/feeds/abc_evasion.xml')
,(u'ESPECTACULOS', u'http://www.abc.es/rss/feeds/abc_Espectaculos.xml')
,(u'GENTE', u'http://www.abc.es/rss/feeds/abc_Gente.xml')
,(u'DEPORTES', u'http://www.abc.es/rss/feeds/abc_Deportes.xml')
]
feeds = [
(u'PORTADA', u'http://www.abc.es/rss/feeds/abcPortada.xml'),
(u'ULTIMAS', u'http://www.abc.es/rss/feeds/abc_ultima.xml'),
(u'NACIONAL', u'http://www.abc.es/rss/feeds/abc_EspanaEspana.xml'),
(u'INTERNACIONAL', u'http://www.abc.es/rss/feeds/abc_Internacional.xml'),
(u'OPINION', u'http://www.abc.es/rss/feeds/abc_opinioncompleto.xml'),
(u'BLOGS ABC', u'http://www.abc.es/rss/feeds/blogs-abc.xml'),
(u'ECONOMIA', u'http://www.abc.es/rss/feeds/abc_Economia.xml'),
(u'CIENCIA Y TECNOLOGIA', u'http://www.abc.es/rss/feeds/abc_Ciencia_Tecnologia.xml'),
(u'CULTURA', u'http://www.abc.es/rss/feeds/abc_Cultura.xml'),
(u'LIBROS', u'http://www.abc.es/rss/feeds/abc_Libros.xml'),
(u'MEDIOS Y REDES', u'http://www.abc.es/rss/feeds/ABC_Medios_Redes.xml'),
(u'EVASION', u'http://www.abc.es/rss/feeds/abc_evasion.xml'),
(u'ESPECTACULOS', u'http://www.abc.es/rss/feeds/abc_Espectaculos.xml'),
(u'GENTE', u'http://www.abc.es/rss/feeds/abc_Gente.xml'),
(u'DEPORTES', u'http://www.abc.es/rss/feeds/abc_Deportes.xml')
]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
abc.com.py
@ -6,46 +6,45 @@ abc.com.py
from calibre.web.feeds.news import BasicNewsRecipe
class ABC_py(BasicNewsRecipe):
title = 'ABC Color'
__author__ = 'Darko Miletic'
description = 'Noticias de Paraguay y el resto del mundo'
publisher = 'ABC'
category = 'news, politics, Paraguay'
oldest_article = 2
title = 'ABC Color'
__author__ = 'Darko Miletic'
description = 'Noticias de Paraguay y el resto del mundo'
publisher = 'ABC'
category = 'news, politics, Paraguay'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_PY'
remove_empty_feeds = True
masthead_url = 'http://www.abc.com.py/plantillas/img/abc-logo.png'
publication_type = 'newspaper'
extra_css = """
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es_PY'
remove_empty_feeds = True
masthead_url = 'http://www.abc.com.py/plantillas/img/abc-logo.png'
publication_type = 'newspaper'
extra_css = """
body{font-family: UnitSlabProMedium,"Times New Roman",serif }
img{margin-bottom: 0.4em; display: block;}
img{margin-bottom: 0.4em; display: block;}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
remove_tags = [
dict(name=['form','iframe','embed','object','link','base','table']),
dict(attrs={'class':['es-carousel-wrapper']}),
dict(attrs={'id':['tools','article-banner-1']})
]
keep_only_tags = [dict(attrs={'id':'article'})]
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags = [
dict(name=['form', 'iframe', 'embed',
'object', 'link', 'base', 'table']),
dict(attrs={'class': ['es-carousel-wrapper']}),
dict(attrs={'id': ['tools', 'article-banner-1']})
]
keep_only_tags = [dict(attrs={'id': 'article'})]
feeds = [
(u'Ultimo momento', u'http://www.abc.com.py/rss.xml' )
,(u'Nacionales' , u'http://www.abc.com.py/nacionales/rss.xml' )
,(u'Mundo' , u'http://www.abc.com.py/internacionales/rss.xml')
,(u'Deportes' , u'http://www.abc.com.py/deportes/rss.xml' )
,(u'Espectaculos' , u'http://www.abc.com.py/espectaculos/rss.xml' )
,(u'TecnoCiencia' , u'http://www.abc.com.py/ciencia/rss.xml' )
]
(u'Ultimo momento', u'http://www.abc.com.py/rss.xml'),
(u'Nacionales', u'http://www.abc.com.py/nacionales/rss.xml'),
(u'Mundo', u'http://www.abc.com.py/internacionales/rss.xml'),
(u'Deportes', u'http://www.abc.com.py/deportes/rss.xml'),
(u'Espectaculos', u'http://www.abc.com.py/espectaculos/rss.xml'),
(u'TecnoCiencia', u'http://www.abc.com.py/ciencia/rss.xml')
]

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.accountancyage.com
@ -8,26 +8,28 @@ www.accountancyage.com
from calibre.web.feeds.news import BasicNewsRecipe
class AccountancyAge(BasicNewsRecipe):
title = 'Accountancy Age'
__author__ = 'Darko Miletic'
description = 'business news'
publisher = 'accountancyage.com'
category = 'news, politics, finances'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
title = 'Accountancy Age'
__author__ = 'Darko Miletic'
description = 'business news'
publisher = 'accountancyage.com'
category = 'news, politics, finances'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
simultaneous_downloads = 1
encoding = 'utf-8'
lang = 'en'
encoding = 'utf-8'
lang = 'en'
language = 'en'
feeds = [(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')]
feeds = [
(u'All News', u'http://feeds.accountancyage.com/rss/latest/accountancyage/all')]
keep_only_tags = [
dict(name='h1'),
dict(attrs={'class':'article_content'}),
dict(attrs={'class': 'article_content'}),
]
def get_article_url(self, article):

View File

@ -2,26 +2,23 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1334868409(BasicNewsRecipe):
title = u'AÇIK BİLİM DERGİSİ'
description = ' Aylık çevrimiçi bilim dergisi'
__author__ = u'thomass'
title = u'AÇIK BİLİM DERGİSİ'
description = ' Aylık çevrimiçi bilim dergisi'
__author__ = u'thomass'
oldest_article = 30
max_articles_per_feed = 300
auto_cleanup = True
encoding = 'UTF-8'
publisher = 'açık bilim'
category = 'haber, bilim,TR,dergi'
language = 'tr'
encoding = 'UTF-8'
publisher = 'açık bilim'
category = 'haber, bilim,TR,dergi'
language = 'tr'
publication_type = 'magazine '
conversion_options = {
'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
}
'tags': category, 'language': language, 'publisher': publisher, 'linearize_tables': True
}
cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]
feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]

View File

@ -1,7 +1,7 @@
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2012'
'''
acrimed.org
@ -10,19 +10,21 @@ acrimed.org
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Acrimed(BasicNewsRecipe):
title = u'Acrimed'
__author__ = 'Gaëtan Lehmann'
oldest_article = 30
max_articles_per_feed = 100
auto_cleanup = True
auto_cleanup_keep = '//div[@class="crayon article-chapo-4112 chapo"]'
language = 'fr'
masthead_url = 'http://www.acrimed.org/IMG/siteon0.gif'
feeds = [(u'Acrimed', u'http://www.acrimed.org/spip.php?page=backend')]
preprocess_regexps = [
(re.compile(r'<title>(.*) - Acrimed \| Action Critique M.*dias</title>'), lambda m: '<title>' + m.group(1) + '</title>'),
class Acrimed(BasicNewsRecipe):
title = u'Acrimed'
__author__ = 'Gaëtan Lehmann'
oldest_article = 30
max_articles_per_feed = 100
auto_cleanup = True
auto_cleanup_keep = '//div[@class="crayon article-chapo-4112 chapo"]'
language = 'fr'
masthead_url = 'http://www.acrimed.org/IMG/siteon0.gif'
feeds = [(u'Acrimed', u'http://www.acrimed.org/spip.php?page=backend')]
preprocess_regexps = [
(re.compile(r'<title>(.*) - Acrimed \| Action Critique M.*dias</title>'),
lambda m: '<title>' + m.group(1) + '</title>'),
(re.compile(r'<h2>(.*) - Acrimed \| Action Critique M.*dias</h2>'), lambda m: '<h2>' + m.group(1) + '</h2>')]
extra_css = """

View File

@ -1,8 +1,9 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ADRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'nl'
country = 'NL'
@ -22,41 +23,54 @@ class ADRecipe(BasicNewsRecipe):
remove_javascript = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))
keep_only_tags.append(dict(name='div', attrs={'id': 'art_box2'}))
keep_only_tags.append(dict(name='p', attrs={'class': 'gen_footnote3'}))
remove_tags = []
remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))
remove_tags.append(dict(name='div', attrs={'class': 'gen_clear'}))
remove_tags.append(
dict(name='div', attrs={'class': re.compile(r'gen_spacer.*')}))
remove_attributes = ['style']
# feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
# feeds from
# http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
feeds = []
feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
feeds.append(
(u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
feeds.append(
(u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
feeds.append((u'Gezondheid & Wetenschap',
u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
feeds.append((u'Nederlands Voetbal',
u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
feeds.append((u'Buitenlands Voetbal',
u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
feeds.append((u'Champions League/Europa League',
u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
feeds.append(
(u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
feeds.append(
(u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
feeds.append(
(u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
feeds.append((u'Kunst & Literatuur',
u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
feeds.append(
(u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))
extra_css = '''
@ -71,7 +85,8 @@ class ADRecipe(BasicNewsRecipe):
def print_version(self, url):
    """Build the printer-friendly URL for an article URL.

    The URL is split on '/' and a fixed selection of its segments is
    reassembled in a different order, with a literal 'print' segment
    inserted.

    :param url: article URL; it must split into at least 14 parts.
    :return: the rearranged URL, always prefixed with 'http://'.
    :raises IndexError: if ``url`` has fewer than 14 '/'-separated parts.
    """
    parts = url.split('/')
    # Segment 10 is moved ahead of segment 7 and 'print' goes in front of
    # segments 8 and 9; segments 0, 1, 6, 11 and 12 are not used.
    picked = (parts[2], parts[3], parts[4], parts[5], parts[10], parts[7],
              'print', parts[8], parts[9], parts[13])
    return 'http://' + '/'.join(picked)

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
adevarul.ro
@ -9,51 +9,38 @@ adevarul.ro
from calibre.web.feeds.news import BasicNewsRecipe
class Adevarul(BasicNewsRecipe):
title = u'Adev\u0103rul'
language = 'ro'
__author__ = u'Silviu Cotoar\u0103'
description = u'\u0218tiri din Rom\u00e2nia'
publisher = 'Adevarul'
category = 'Ziare,Stiri,Romania'
oldest_article = 5
title = u'Adev\u0103rul'
language = 'ro'
__author__ = u'Silviu Cotoar\u0103'
description = u'\u0218tiri din Rom\u00e2nia'
publisher = 'Adevarul'
category = 'Ziare,Stiri,Romania'
oldest_article = 5
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
remove_javascript = True
cover_url = 'http://upload.wikimedia.org/wikipedia/en/d/d6/Logo_noul_adevarul.png'
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
remove_javascript = True
cover_url = 'http://upload.wikimedia.org/wikipedia/en/d/d6/Logo_noul_adevarul.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [ dict(name='div', attrs={'class':'article_header'})
,dict(name='div', attrs={'class':'bb-tu first-t bb-article-body'})
]
'comments': description, 'tags': category, 'language': language, 'publisher': publisher
}
keep_only_tags = [dict(name='div', attrs={'class': 'article_header'}), dict(name='div', attrs={'class': 'bb-tu first-t bb-article-body'})
]
remove_tags = [
dict(name='li', attrs={'class':'author'})
,dict(name='li', attrs={'class':'date'})
,dict(name='li', attrs={'class':'comments'})
,dict(name='div', attrs={'class':'bb-wg-article_related_attachements'})
,dict(name='div', attrs={'class':'bb-md bb-md-article_comments'})
,dict(name='form', attrs={'id':'bb-comment-create-form'})
,dict(name='div', attrs={'id':'mediatag'})
,dict(name='div', attrs={'id':'ft'})
,dict(name='div', attrs={'id':'comment_wrapper'})
]
dict(name='li', attrs={'class': 'author'}), dict(name='li', attrs={'class': 'date'}), dict(name='li', attrs={'class': 'comments'}), dict(name='div', attrs={'class': 'bb-wg-article_related_attachements'}), dict(name='div', attrs={'class': 'bb-md bb-md-article_comments'}), dict(name='form', attrs={'id': 'bb-comment-create-form'}), dict(name='div', attrs={'id': 'mediatag'}), dict(name='div', attrs={'id': 'ft'}), dict(name='div', attrs={'id': 'comment_wrapper'}) # noqa
]
remove_tags_after = [
dict(name='div', attrs={'id':'comment_wrapper'}),
]
dict(name='div', attrs={'id': 'comment_wrapper'}),
]
feeds = [ (u'\u0218tiri', u'http://www.adevarul.ro/rss/latest') ]
feeds = [(u'\u0218tiri', u'http://www.adevarul.ro/rss/latest')]
def preprocess_html(self, soup):
    """Return the result of passing ``soup`` through ``self.adeify_images``.

    :param soup: parsed article page handed to the recipe.
    :return: whatever ``self.adeify_images(soup)`` returns.
    """
    return self.adeify_images(soup)

View File

@ -1,8 +1,8 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 02-05-2010'
description = 'Italian daily newspaper - 02-05-2010'
'''
http://www.adnkronos.com/
@ -10,50 +10,49 @@ http://www.adnkronos.com/
from calibre.web.feeds.news import BasicNewsRecipe
class Adnkronos(BasicNewsRecipe):
__author__ = 'Gabriele Marini'
description = 'News agency'
cover_url = 'http://www.adnkronos.com/IGN6/img/popup_ign.jpg'
title = u'Adnkronos'
publisher = 'Adnkronos Group - ews agency'
category = 'News, politics, culture, economy, general interest'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
class Adnkronos(BasicNewsRecipe):
__author__ = 'Gabriele Marini'
description = 'News agency'
cover_url = 'http://www.adnkronos.com/IGN6/img/popup_ign.jpg'
title = u'Adnkronos'
publisher = 'Adnkronos Group - ews agency'
category = 'News, politics, culture, economy, general interest'
language = 'it'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 7
max_articles_per_feed = 80
use_embedded_content = False
recursion = 10
use_embedded_content = False
recursion = 10
remove_javascript = True
def get_article_url(self, article):
    """Return the article's 'id' entry, falling back to its 'guid' entry.

    :param article: feed entry exposing a dict-style ``get`` method.
    :return: the 'id' value if present, otherwise the 'guid' value,
        otherwise None.
    """
    return article.get('id', article.get('guid', None))
extra_css = ' .newsAbstract{font-style: italic} '
keep_only_tags = [dict(name='div', attrs={'class':['breadCrumbs','newsTop','newsText']})
]
remove_tags = [
dict(name='div', attrs={'class':['leogoo','leogoo2']})
]
feeds = [
(u'Prima Pagina', u'http://rss.adnkronos.com/RSS_PrimaPagina.xml'),
(u'Ultima Ora', u'http://rss.adnkronos.com/RSS_Ultimora.xml'),
(u'Politica', u'http://rss.adnkronos.com/RSS_Politica.xml'),
(u'Esteri', u'http://rss.adnkronos.com/RSS_Esteri.xml'),
(u'Cronoca', u'http://rss.adnkronos.com/RSS_Cronaca.xml'),
(u'Economia', u'http://rss.adnkronos.com/RSS_Economia.xml'),
(u'Finanza', u'http://rss.adnkronos.com/RSS_Finanza.xml'),
(u'CyberNews', u'http://rss.adnkronos.com/RSS_CyberNews.xml'),
(u'Spettacolo', u'http://rss.adnkronos.com/RSS_Spettacolo.xml'),
(u'Cultura', u'http://rss.adnkronos.com/RSS_Cultura.xml'),
(u'Sport', u'http://rss.adnkronos.com/RSS_Sport.xml'),
(u'Sostenibilita', u'http://rss.adnkronos.com/RSS_Sostenibilita.xml'),
(u'Salute', u'http://rss.adnkronos.com/RSS_Salute.xml')
keep_only_tags = [dict(name='div', attrs={'class': ['breadCrumbs', 'newsTop', 'newsText']})
]
remove_tags = [
dict(name='div', attrs={'class': ['leogoo', 'leogoo2']})
]
feeds = [
(u'Prima Pagina', u'http://rss.adnkronos.com/RSS_PrimaPagina.xml'),
(u'Ultima Ora', u'http://rss.adnkronos.com/RSS_Ultimora.xml'),
(u'Politica', u'http://rss.adnkronos.com/RSS_Politica.xml'),
(u'Esteri', u'http://rss.adnkronos.com/RSS_Esteri.xml'),
(u'Cronoca', u'http://rss.adnkronos.com/RSS_Cronaca.xml'),
(u'Economia', u'http://rss.adnkronos.com/RSS_Economia.xml'),
(u'Finanza', u'http://rss.adnkronos.com/RSS_Finanza.xml'),
(u'CyberNews', u'http://rss.adnkronos.com/RSS_CyberNews.xml'),
(u'Spettacolo', u'http://rss.adnkronos.com/RSS_Spettacolo.xml'),
(u'Cultura', u'http://rss.adnkronos.com/RSS_Cultura.xml'),
(u'Sport', u'http://rss.adnkronos.com/RSS_Sport.xml'),
(u'Sostenibilita', u'http://rss.adnkronos.com/RSS_Sostenibilita.xml'),
(u'Salute', u'http://rss.adnkronos.com/RSS_Salute.xml')
]

View File

@ -1,26 +1,26 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1336986047(BasicNewsRecipe):
title = u'Ads of the World'
title = u'Ads of the World'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = False
description = 'The best international advertising campaigns'
description = 'The best international advertising campaigns'
language = 'en'
__author__ = 'faber1971'
no_stylesheets = True
keep_only_tags = [
dict(name='div', attrs={'id':'primary'})
]
dict(name='div', attrs={'id': 'primary'})
]
remove_tags = [
dict(name='ul', attrs={'class':'links inline'})
,dict(name='div', attrs={'class':'form-item'})
,dict(name='div', attrs={'id':['options', 'comments']})
,dict(name='ul', attrs={'id':'nodePager'})
]
dict(name='ul', attrs={'class': 'links inline'}), dict(name='div', attrs={'class': 'form-item'}), dict(
name='div', attrs={'id': ['options', 'comments']}), dict(name='ul', attrs={'id': 'nodePager'})
]
reverse_article_order = True
masthead_url = 'http://bigcatgroup.co.uk/files/2011/01/05-ads-of-the-world.png'
feeds = [(u'Ads of the world', u'http://feeds.feedburner.com/adsoftheworld-latest')]
masthead_url = 'http://bigcatgroup.co.uk/files/2011/01/05-ads-of-the-world.png'
feeds = [
(u'Ads of the world', u'http://feeds.feedburner.com/adsoftheworld-latest')]

View File

@ -1,10 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Adventure_zone(BasicNewsRecipe):
title = u'Adventure Zone'
__author__ = 'fenuks'
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
category = 'games'
language = 'pl'
title = u'Adventure Zone'
__author__ = 'fenuks'
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.' # noqa
category = 'games'
language = 'pl'
BASEURL = 'http://www.adventure-zone.info/fusion/'
no_stylesheets = True
extra_css = '.image {float: left; margin-right: 5px;}'
@ -13,20 +15,20 @@ class Adventure_zone(BasicNewsRecipe):
cover_url = 'http://www.adventure-zone.info/inne/logoaz_2012.png'
remove_attributes = ['style']
use_embedded_content = False
keep_only_tags = [dict(attrs={'class':'content'})]
remove_tags = [dict(attrs={'class':'footer'})]
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
keep_only_tags = [dict(attrs={'class': 'content'})]
remove_tags = [dict(attrs={'class': 'footer'})]
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/rss/index.php')]
def skip_ad_pages(self, soup):
    """Fetch the linked page when this page's title matches a keyword stem.

    If the lower-cased page title contains one of the keyword stems, the
    links inside the element with class 'content' are scanned; for the
    first link whose <strong> text also contains a stem, the page at
    ``self.BASEURL + href`` is fetched with ``index_to_soup(..., raw=True)``
    and returned.

    :param soup: parsed page; ``soup.body`` must contain an element with
        class 'content' and ``soup.title.string`` must be a string.
    :return: the result of ``self.index_to_soup`` for the matching link,
        or None when the title or every link fails to match.
    """
    keywords = ('zapowied', 'recenzj', 'solucj', 'poradnik')
    links = soup.body.find(attrs={'class': 'content'}).findAll(name='a')
    title = soup.title.string.lower()
    if not any(stem in title for stem in keywords):
        return None
    for r in links:
        # Only links whose label is wrapped in <strong> with plain text
        # are candidates.
        if r.strong and r.strong.string:
            word = r.strong.string.lower()
            if any(stem in word for stem in keywords):
                return self.index_to_soup(self.BASEURL + r['href'], raw=True)
    return None
def preprocess_html(self, soup):
for link in soup.findAll('a', href=True):

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.adventuregamers.com
@ -6,21 +6,21 @@ www.adventuregamers.com
from calibre.web.feeds.news import BasicNewsRecipe
class AdventureGamers(BasicNewsRecipe):
title = u'Adventure Gamers'
language = 'en'
__author__ = 'Darko Miletic'
description = 'Adventure games portal'
publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology'
oldest_article = 10
#delay = 10
title = u'Adventure Gamers'
language = 'en'
__author__ = 'Darko Miletic'
description = 'Adventure games portal'
publisher = 'Adventure Gamers'
category = 'news, games, adventure, technology'
oldest_article = 10
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf8'
remove_javascript = True
use_embedded_content = False
INDEX = u'http://www.adventuregamers.com'
no_stylesheets = True
encoding = 'utf8'
remove_javascript = True
use_embedded_content = False
INDEX = u'http://www.adventuregamers.com'
extra_css = """
.pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
.pageheader_title,.page_title{font-size: xx-large; color: #394128}
@ -29,59 +29,54 @@ class AdventureGamers(BasicNewsRecipe):
.score_column_1{ padding-left: 10px; font-size: small; width: 50%}
.score_column_2{ padding-left: 10px; font-size: small; width: 50%}
.score_column_3{ padding-left: 10px; font-size: small; width: 50%}
.score_header{font-size: large; color: #50544A}
.score_header{font-size: large; color: #50544A}
img{margin-bottom: 1em;}
body{font-family: 'Open Sans',Helvetica,Arial,sans-serif}
body{font-family: 'Open Sans',Helvetica,Arial,sans-serif}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [dict(name='div', attrs={'class':'cleft_inn'})]
keep_only_tags = [dict(name='div', attrs={'class': 'cleft_inn'})]
remove_tags = [
dict(name=['object','link','embed','form','iframe','meta'])
,dict(name='a', attrs={'href':'http://www.adventuregamers.com/about/scoring'})
,dict(name='a', attrs={'href':'http://www.adventuregamers.com/about/policies'})
]
remove_tags_after = [dict(name='div', attrs={'class':'bodytext'})]
remove_attributes = ['width','height']
dict(name=['object', 'link', 'embed', 'form', 'iframe', 'meta']), dict(name='a', attrs={
'href': 'http://www.adventuregamers.com/about/scoring'}), dict(name='a', attrs={'href': 'http://www.adventuregamers.com/about/policies'})
]
remove_tags_after = [dict(name='div', attrs={'class': 'bodytext'})]
remove_attributes = ['width', 'height']
feeds = [(u'Articles', u'http://www.adventuregamers.com/rss/')]
def get_article_url(self, article):
    """Return the article URL, or None for video and hype-o-meter links.

    :param article: feed entry forwarded to
        ``BasicNewsRecipe.get_article_url``.
    :return: the URL from the base implementation, or None when it
        contains '/videos/' or '/hypeometer/'.
    """
    url = BasicNewsRecipe.get_article_url(self, article)
    # NOTE(review): assumes the base implementation returns a string; a
    # None result would raise TypeError on the `in` tests - confirm.
    if '/videos/' in url or '/hypeometer/' in url:
        return None
    return url
def append_page(self, soup, appendtag, position):
    """Pull the text of the following page(s) into ``appendtag``.

    Does nothing unless ``soup`` has both a 'pagination_big' div and a
    'next-page' link. Otherwise the next page is fetched, inline styles
    are stripped from its 'bodytext' div, the method recurses on that page,
    and the div is inserted into ``appendtag`` at ``position``. The pager
    div of ``soup`` is removed in the process.

    :param soup: parsed page currently being processed.
    :param appendtag: tag that receives the next page's 'bodytext' div.
    :param position: index passed to ``appendtag.insert``.
    """
    pager = soup.find('div', attrs={'class': 'pagination_big'})
    if not pager:
        return
    nextpage = soup.find('a', attrs={'class': 'next-page'})
    if not nextpage:
        return
    soup2 = self.index_to_soup(nextpage['href'])
    # NOTE(review): assumes every follow-up page has a 'bodytext' div; a
    # missing one would raise AttributeError below - confirm.
    texttag = soup2.find('div', attrs={'class': 'bodytext'})
    for it in texttag.findAll(style=True):
        del it['style']
    # Recurse before inserting, so pages after the next one are appended
    # to the end of the next page's text.
    self.append_page(soup2, texttag, len(texttag.contents))
    texttag.extract()
    pager.extract()
    appendtag.insert(position, texttag)
def preprocess_html(self, soup):
    """Clean an article page and merge its follow-up pages into it.

    Removes every inline ``style`` attribute and every 'floatright' div,
    appends following pages into ``soup.body`` via ``append_page``, drops
    any 'pagination_big' div still present, and returns the result of
    ``self.adeify_images``.

    :param soup: parsed article page.
    :return: whatever ``self.adeify_images(soup)`` returns.
    """
    for item in soup.findAll(style=True):
        del item['style']
    for item in soup.findAll('div', attrs={'class': 'floatright'}):
        item.extract()
    self.append_page(soup, soup.body, 3)
    # append_page only removes the pager when a next page exists, so look
    # for it again here.
    pager = soup.find('div', attrs={'class': 'pagination_big'})
    if pager:
        pager.extract()
    return self.adeify_images(soup)

View File

@ -1,20 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Aftenposten(BasicNewsRecipe):
title = u'Aftenposten'
__author__ = 'davotibarna'
description = 'Norske nyheter'
language = 'no'
oldest_article = 5
max_articles_per_feed = 100
recipe_disabled = ('The recipe to download Aftenposten has been '
'temporarily disabled at the publisher\'s request, while '
'they finalize their digital strategy.')
no_stylesheets = True
encoding = 'ISO-8859-1'
title = u'Aftenposten'
__author__ = 'davotibarna'
description = 'Norske nyheter'
language = 'no'
oldest_article = 5
max_articles_per_feed = 100
recipe_disabled = ('The recipe to download Aftenposten has been '
'temporarily disabled at the publisher\'s request, while '
'they finalize their digital strategy.')
no_stylesheets = True
encoding = 'ISO-8859-1'
feeds = [(u'Aftenposten', u'http://www.aftenposten.no/eksport/rss-1_0/')]
feeds = [(u'Aftenposten', u'http://www.aftenposten.no/eksport/rss-1_0/')]


def print_version(self, url):
    """Return the print URL for an article URL.

    Every occurrence of '#xtor=RSS-3' in ``url`` is replaced with
    '?service=print'; a URL without that marker is returned unchanged.

    :param url: article URL taken from the feed.
    :return: the URL with the marker swapped for the print query string.
    """
    return url.replace('#xtor=RSS-3', '?service=print')

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
boljevac.blogspot.com
@ -8,25 +8,23 @@ boljevac.blogspot.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AgroGerila(BasicNewsRecipe):
title = 'Agro Gerila'
__author__ = 'Darko Miletic'
description = 'Politicki nekorektan blog.'
oldest_article = 45
title = 'Agro Gerila'
__author__ = 'Darko Miletic'
description = 'Politicki nekorektan blog.'
oldest_article = 45
max_articles_per_feed = 100
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
publication_type = 'blog'
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } '
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
publication_type = 'blog'
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } ' # noqa
conversion_options = {
'comment' : description
, 'tags' : 'film, blog, srbija'
, 'publisher': 'Dry-Na-Nord'
, 'language' : language
}
'comment': description, 'tags': 'film, blog, srbija', 'publisher': 'Dry-Na-Nord', 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -36,5 +34,3 @@ class AgroGerila(BasicNewsRecipe):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010 - 2014, Darko Miletic <darko.miletic at gmail.com>'
'''
www.aif.ru
@ -6,35 +6,32 @@ www.aif.ru
from calibre.web.feeds.news import BasicNewsRecipe
class AIF_ru(BasicNewsRecipe):
title = 'Arguments & Facts - Russian'
__author__ = 'Darko Miletic'
description = 'News from Russia'
publisher = 'AIF'
category = 'news, politics, Russia'
oldest_article = 2
title = 'Arguments & Facts - Russian'
__author__ = 'Darko Miletic'
description = 'News from Russia'
publisher = 'AIF'
category = 'news, politics, Russia'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
language = 'ru'
publication_type = 'magazine'
masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png'
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
language = 'ru'
publication_type = 'magazine'
masthead_url = 'http://static3.aif.ru/glossy/index/i/logo.png'
extra_css = """
body{font-family: Verdana,Arial,Helvetica,sans1,sans-serif}
img{display: block}
"""
keep_only_tags = [
dict(name='h1', attrs={'class':'title'})
,dict(name='div', attrs={'class':'prew_tags'})
,dict(name='article', attrs={'class':lambda x: x and 'articl_body' in x.split()})
]
remove_tags = [
dict(name=['iframe','object','link','base','input','meta'])
,dict(name='div',attrs={'class':'in-topic'})
,dict(name='div', attrs={'class':lambda x: x and 'related_article' in x.split()})
,dict(name='div', attrs={'class':lambda x: x and 'articl_tag' in x.split()})
]
feeds = [(u'News', u'http://www.aif.ru/rss/all.php')]
keep_only_tags = [
dict(name='h1', attrs={'class': 'title'}), dict(name='div', attrs={'class': 'prew_tags'}), dict(
name='article', attrs={'class': lambda x: x and 'articl_body' in x.split()})
]
remove_tags = [
dict(name=['iframe', 'object', 'link', 'base', 'input', 'meta']), dict(name='div', attrs={'class': 'in-topic'}), dict(name='div', attrs={
'class': lambda x: x and 'related_article' in x.split()}), dict(name='div', attrs={'class': lambda x: x and 'articl_tag' in x.split()})
]
feeds = [(u'News', u'http://www.aif.ru/rss/all.php')]

View File

@ -1,5 +1,6 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AirForceTimes(BasicNewsRecipe):
title = 'Air Force Times'
__author__ = 'jde'
@ -12,7 +13,7 @@ class AirForceTimes(BasicNewsRecipe):
tags = 'news, U.S. Air Force'
cover_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg'
masthead_url = 'http://www.airforcetimes.com/images/logo_airforcetimes_alert.jpg'
oldest_article = 7 #days
oldest_article = 7 # days
max_articles_per_feed = 25
publication_type = 'newspaper'
no_stylesheets = True
@ -24,20 +25,14 @@ class AirForceTimes(BasicNewsRecipe):
remove_empty_feeds = True
auto_cleanup = True
feeds = [
('News', 'http://www.airforcetimes.com/rss_news.php'),
('Benefits', 'http://www.airforcetimes.com/rss_benefits.php'),
('Money', 'http://www.airforcetimes.com/rss_money.php'),
('Careers & Education', 'http://www.airforcetimes.com/rss_careers.php'),
('Community', 'http://www.airforcetimes.com/rss_community.php'),
('Off Duty', 'http://www.airforcetimes.com/rss_off_duty.php'),
('Entertainment', 'http://www.airforcetimes.com/rss_entertainment.php'),
('Guard & Reserve', 'http://www.airforcetimes.com/rss_guard.php'),
]
('News', 'http://www.airforcetimes.com/rss_news.php'),
('Benefits', 'http://www.airforcetimes.com/rss_benefits.php'),
('Money', 'http://www.airforcetimes.com/rss_money.php'),
('Careers & Education', 'http://www.airforcetimes.com/rss_careers.php'),
('Community', 'http://www.airforcetimes.com/rss_community.php'),
('Off Duty', 'http://www.airforcetimes.com/rss_off_duty.php'),
('Entertainment', 'http://www.airforcetimes.com/rss_entertainment.php'),
('Guard & Reserve', 'http://www.airforcetimes.com/rss_guard.php'),
]

View File

@ -1,15 +1,17 @@
#!/usr/bin/env python2
__license__ = 'Creative Commons Attribution 4.0 International License'
__author__ = 'John McDole'
__license__ = 'Creative Commons Attribution 4.0 International License'
__author__ = 'John McDole'
__copyright__ = ''
__version__ = '0.1'
__date__ = '2015/01/10'
__version__ = '0.1'
__date__ = '2015/01/10'
__docformat__ = 'restructuredtext en'
import datetime, re
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
now = datetime.datetime.now()
title = 'The AJC'
@ -24,72 +26,81 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
# The AJC lists identical articles in multiple feeds; this removes them based on their URL
# The AJC lists identical articles in multiple feeds; this removes them
# based on their URL
ignore_duplicate_articles = {'title', 'url'}
# And this says "Hey, AJC, different feeds should mean something!"
remove_empty_feeds = True
# Sets whether a feed has full articles embedded in it. The AJC feeds do not.
# Sets whether a feed has full articles embedded in it. The AJC feeds do
# not.
use_embedded_content = False
masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
# Pick your poison. Business seems to be mostly cross-linked articles. Premium and cross-linked
# articles will be dropped.
feeds = [
('Breaking News', 'http://www.ajc.com/list/rss/online/ajc-auto-list-iphone-topnews/aFKq/'),
('Metro and Georgia', 'http://www.ajc.com/list/rss/news/local/news-georgia-and-region/aCxP/'),
('Business', 'http://www.ajc.com/feeds/categories/business/'),
('Health', 'http://www.ajc.com/feeds/categories/health/'),
# ('Braves', 'http://www.ajc.com/list/rss/sports/baseball/atlanta-braves-news/aGpN/'),
# ('Falcons', 'http://www.ajc.com/list/rss/sports/football/falcons-news/aGK4/'),
# ('Georgia Tech Yellow Jackets', 'http://www.ajc.com/list/rss/sports/college/georgia-tech-headlines/aGK6/'),
]
feeds = [
('Breaking News', 'http://www.ajc.com/list/rss/online/ajc-auto-list-iphone-topnews/aFKq/'),
('Metro and Georgia',
'http://www.ajc.com/list/rss/news/local/news-georgia-and-region/aCxP/'),
('Business', 'http://www.ajc.com/feeds/categories/business/'),
('Health', 'http://www.ajc.com/feeds/categories/health/'),
# ('Braves', 'http://www.ajc.com/list/rss/sports/baseball/atlanta-braves-news/aGpN/'),
# ('Falcons', 'http://www.ajc.com/list/rss/sports/football/falcons-news/aGK4/'),
# ('Georgia Tech Yellow Jackets', 'http://www.ajc.com/list/rss/sports/college/georgia-tech-headlines/aGK6/'),
]
headline_reg_exp = '^.*cm-story-headline.*$'
story_body_reg_exp = '^.*cm-story-body.*$'
author_reg_exp = '^.*cm-story-author.*$'
keep_only_tags = [
dict(name='div', attrs={'class':re.compile(headline_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':'cm-story-meta'}),
dict(name='div', attrs={'class':re.compile(author_reg_exp, re.IGNORECASE)}),
dict(name='meta', attrs={'name':'description'}),
dict(name='div', attrs={'class':re.compile(story_body_reg_exp, re.IGNORECASE)}),
]
dict(name='div', attrs={'class': re.compile(
headline_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': 'cm-story-meta'}),
dict(name='div', attrs={'class': re.compile(
author_reg_exp, re.IGNORECASE)}),
dict(name='meta', attrs={'name': 'description'}),
dict(name='div', attrs={'class': re.compile(
story_body_reg_exp, re.IGNORECASE)}),
]
premium_reg_exp = '^.*cmPremiumContent.*$'
footer_reg_exp = '^.*cm-story-footer.*$'
remove_tags = [
dict(name='div', attrs={'class':re.compile(footer_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class':'cm-inline-related-group'})
]
dict(name='div', attrs={'class': re.compile(
footer_reg_exp, re.IGNORECASE)}),
dict(name='div', attrs={'class': 'cm-inline-related-group'})
]
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
.cm-story-headline h1 { text-align: center; font-size: 175%; font-weight: bold; } \
.cm-story-meta { font-size: 80%; } \
.cm-related-caption, .cmPhotoImageAttribution, img { display: block; font-size: 75%; font-style: italic; text-align: center; margin: 5px auto;} \
.cm-related-caption, .cmPhotoImageAttribution, img { display: block; font-size: 75%; font-style: italic; text-align: center; margin: 5px auto;} \
.cm-story-author { display: block; font-size: 80%; font-style: italic; }'
# I would love to remove these completely from the finished product, but I can't see how at the moment.
# Returning "None" from preprocess_html(soup) as suggested in mobileread forums leads to errors.
# Returning "None" from preprocess_html(soup) as suggested in mobileread
# forums leads to errors.
def preprocess_html(self, soup):
    '''
    Decide whether a downloaded page should be kept.

    Returns None when the page contains a div whose class matches
    self.premium_reg_exp (case-insensitively) or an anchor with the
    class 'cm-feed-story-more-link'; otherwise returns soup unchanged.
    '''
    # NOTE(review): the comment preceding this method says that returning
    # None "leads to errors", yet both branches below do exactly that -
    # confirm which behaviour is intended.
    premium = soup.find('div', attrs={'class': re.compile(
        self.premium_reg_exp, re.IGNORECASE)})
    if premium:
        return None
    crosslink = soup.find('a', attrs={'class': 'cm-feed-story-more-link'})
    if crosslink:
        return None
    return soup
def populate_article_metadata(self, article, soup, first):
for meta in soup.findAll('meta', attrs={'name':'description'}):
for meta in soup.findAll('meta', attrs={'name': 'description'}):
article.text_summary = meta['content']
article.summary = meta['content']
lead = soup.find('div', attrs={'class':'cm-story-photo'})
lead = soup.find('div', attrs={'class': 'cm-story-photo'})
if lead:
lead = lead.find('img')
else:
@ -98,10 +109,10 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
self.add_toc_thumbnail(article, lead['src'])
names = ''
comma = ''
for div in soup.findAll('div', attrs={'class':re.compile(self.author_reg_exp, re.IGNORECASE)}):
for div in soup.findAll('div', attrs={'class': re.compile(self.author_reg_exp, re.IGNORECASE)}):
div.extract()
for auth in div.findAll('a'):
if (auth.has_key('class') and auth['class'] == 'cm-source-image'):
if (auth.has_key('class') and auth['class'] == 'cm-source-image'): # noqa
continue
names = names + comma + auth.contents[0]
comma = ', '
@ -110,7 +121,6 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
tag = Tag(soup, 'div', [('class', 'cm-story-author')])
tag.append("by: ")
tag.append(names)
meta = soup.find('div', attrs={'class':'cm-story-meta'})
meta = soup.find('div', attrs={'class': 'cm-story-meta'})
meta_idx = meta.parent.contents.index(meta)
meta.parent.insert(meta_idx + 1, tag)

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura <miurahr@linux.com>'
'''
ajiajin.com/blog
@ -6,18 +6,17 @@ ajiajin.com/blog
from calibre.web.feeds.news import BasicNewsRecipe
class AjiajinBlog(BasicNewsRecipe):
    '''
    Recipe for the Ajiajin blog, built on BasicNewsRecipe.

    All content comes from the single FeedBurner feed listed in feeds.
    '''
    title = u'Ajiajin blog'
    __author__ = 'Hiroshi Miura'
    oldest_article = 5
    publication_type = 'blog'
    max_articles_per_feed = 100
    description = 'The next generation internet trends in Japan and Asia'
    publisher = ''
    category = 'internet, asia, japan'
    language = 'en'
    encoding = 'utf-8'
    feeds = [(u'blog', u'http://feeds.feedburner.com/Asiajin')]

View File

@ -2,46 +2,51 @@
from calibre.web.feeds.news import BasicNewsRecipe
class Aksiyon(BasicNewsRecipe):
    '''
    Recipe for the Turkish magazine Aksiyon, built on BasicNewsRecipe.

    Articles are taken from the per-section RSS feeds listed in feeds.
    '''
    title = u'Aksiyon Dergisi'
    __author__ = u'thomass'
    description = 'Haftalık haber dergisi '
    oldest_article = 13
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    publisher = 'Aksiyon'
    category = 'news, haberler,TR,gazete'
    language = 'tr'
    publication_type = 'magazine'
    auto_cleanup = True
    cover_img_url = 'http://www.aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
    masthead_url = 'http://aksiyon.com.tr/aksiyon/images/aksiyon/top-page/aksiyon_top_r2_c1.jpg'
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True

    # (section title, RSS URL) pairs, one per magazine section.
    feeds = [
        (u'KAPAK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=26'),
        (u'ANASAYFA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=0'),
        (u'EKONOMİ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=35'),
        (u'EKOANALİZ', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=284'),
        (u'YAZARLAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=17'),
        (u'KİTAPLIK', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=13'),
        (u'SİNEMA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=14'),
        (u'ARKA PENCERE',
         u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=27'),
        (u'DÜNYA', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=32'),
        (u'DOSYALAR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=34'),
        (u'KARAKUTU', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=11'),
        (u'KÜLTÜR & SANAT',
         u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=12'),
        (u'SPOR', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=38'),
        (u'BİLİŞİM - TEKNOLOJİ',
         u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=39'),
        (u'3. BOYUT', u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=172'),
        (u'HAYAT BİLGİSİ',
         u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'),
        # NOTE(review): same sectionId as 'HAYAT BİLGİSİ' above - possibly a
        # copy/paste slip; confirm the correct id for this section.
        (u'İŞ DÜNYASI',
         u'http://www.aksiyon.com.tr/aksiyon/rss?sectionId=283'),
    ]

    # Disabled: rewrite article URLs to the mobile detail page.
    # def print_version(self, url):
    #     return url.replace(
    #         'http://www.aksiyon.com.tr/aksiyon/newsDetail_getNewsById.action?load=detay&',
    #         'http://www.aksiyon.com.tr/aksiyon/mobile_detailn.action?')

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
akter.co.rs
@ -7,37 +7,35 @@ akter.co.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Akter(BasicNewsRecipe):
title = 'AKTER - Nedeljnik'
__author__ = 'Darko Miletic'
description = 'AKTER - nedeljni politicki magazin savremene Srbije'
publisher = 'Akter Media Group d.o.o.'
category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
oldest_article = 8
title = 'AKTER - Nedeljnik'
__author__ = 'Darko Miletic'
description = 'AKTER - nedeljni politicki magazin savremene Srbije'
publisher = 'Akter Media Group d.o.o.'
category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics' # noqa
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://www.akter.co.rs/gfx/logoneover.png'
language = 'sr'
publication_type = 'magazine'
remove_empty_feeds = True
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://www.akter.co.rs/gfx/logoneover.png'
language = 'sr'
publication_type = 'magazine'
remove_empty_feeds = True
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Tahoma,Geneva,sans1,sans-serif}
img{margin-bottom: 0.8em; display: block;}
img{margin-bottom: 0.8em; display: block;}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})]
feeds = [(u'Nedeljnik', u'http://akter.co.rs/rss/nedeljnik')]
keep_only_tags = [dict(name='div', attrs={'id': 'section_to_print'})]
feeds = [(u'Nedeljnik', u'http://akter.co.rs/rss/nedeljnik')]
def print_version(self, url):
dpart, spart, apart = url.rpartition('/')
@ -45,10 +43,9 @@ class Akter(BasicNewsRecipe):
def get_cover_url(self):
    '''
    Return the cover image URL for the latest issue, or None.

    Loads http://www.akter.co.rs/weekly.html through self.index_to_soup,
    looks for the first img inside the div with class 'lastissue' and
    prefixes its src attribute with the site root. None is returned when
    either the div or the img is missing.
    '''
    soup = self.index_to_soup('http://www.akter.co.rs/weekly.html')
    divt = soup.find('div', attrs={'class': 'lastissue'})
    if not divt:
        return None
    imgt = divt.find('img')
    if not imgt:
        return None
    # The src is concatenated as-is, so it is presumably site-relative.
    return 'http://www.akter.co.rs' + imgt['src']

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
akter.co.rs
@ -7,37 +7,34 @@ akter.co.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Akter(BasicNewsRecipe):
title = 'AKTER - Dnevnik'
__author__ = 'Darko Miletic'
description = 'AKTER - Najnovije vesti iz Srbije'
publisher = 'Akter Media Group d.o.o.'
category = 'vesti, online vesti, najnovije vesti, politika, sport, ekonomija, biznis, finansije, berza, kultura, zivot, putovanja, auto, automobili, tehnologija, politicki magazin, dogadjaji, desavanja, lifestyle, zdravlje, zdravstvo, vest, novine, nedeljnik, srbija, novi sad, vojvodina, svet, drustvo, zabava, republika srpska, beograd, intervju, komentar, reportaza, arhiva vesti, news, serbia, politics'
oldest_article = 8
title = 'AKTER - Dnevnik'
__author__ = 'Darko Miletic'
description = 'AKTER - Najnovije vesti iz Srbije'
publisher = 'Akter Media Group d.o.o.'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://www.akter.co.rs/gfx/logodnover.png'
language = 'sr'
publication_type = 'magazine'
remove_empty_feeds = True
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://www.akter.co.rs/gfx/logodnover.png'
language = 'sr'
publication_type = 'magazine'
remove_empty_feeds = True
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
body{font-family: Tahoma,Geneva,sans1,sans-serif}
img{margin-bottom: 0.8em; display: block;}
img{margin-bottom: 0.8em; display: block;}
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
'comment': description, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':'section_to_print'})]
feeds = [(u'Vesti', u'http://akter.co.rs/rss/dnevni')]
keep_only_tags = [dict(name='div', attrs={'id': 'section_to_print'})]
feeds = [(u'Vesti', u'http://akter.co.rs/rss/dnevni')]
def print_version(self, url):
dpart, spart, apart = url.rpartition('/')

View File

@ -3,8 +3,9 @@ from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class aktualneRecipe(BasicNewsRecipe):
__author__ = 'bubak'
__author__ = 'bubak'
title = u'aktualne.cz'
publisher = u'Centrum holdings'
description = 'aktuálně.cz'
@ -13,13 +14,13 @@ class aktualneRecipe(BasicNewsRecipe):
encoding = 'utf-8'
feeds = [
(u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
(u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'),
(u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'),
(u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'),
(u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'),
(u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
]
(u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
(u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'),
(u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'),
(u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'),
(u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'),
(u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
]
language = 'cs'
cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png'
@ -27,29 +28,31 @@ class aktualneRecipe(BasicNewsRecipe):
no_stylesheets = True
remove_attributes = []
remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']})
remove_tags_before = dict(name='h1', attrs={'class': ['titulek-clanku']})
filter_regexps = [r'img.aktualne.centrum.cz']
remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}),
dict(name='div', attrs={'class':['box1', 'svazane-tagy']}),
dict(name='div', attrs={'class':'itemcomment id0'}),
dict(name='div', attrs={'class':'hlavicka'}),
dict(name='div', attrs={'class':'hlavni-menu'}),
dict(name='div', attrs={'class':'top-standard-brand-obal'}),
dict(name='div', attrs={'class':'breadcrumb'}),
dict(name='div', attrs={'id':'start-standard'}),
dict(name='div', attrs={'id':'forum'}),
dict(name='span', attrs={'class':'akce'}),
dict(name='span', attrs={'class':'odrazka vetsi'}),
dict(name='div', attrs={'class':'boxP'}),
dict(name='div', attrs={'class':'box2'})]
remove_tags = [dict(name='div', attrs={'id': ['social-bookmark']}),
dict(name='div', attrs={'class': ['box1', 'svazane-tagy']}),
dict(name='div', attrs={'class': 'itemcomment id0'}),
dict(name='div', attrs={'class': 'hlavicka'}),
dict(name='div', attrs={'class': 'hlavni-menu'}),
dict(name='div', attrs={
'class': 'top-standard-brand-obal'}),
dict(name='div', attrs={'class': 'breadcrumb'}),
dict(name='div', attrs={'id': 'start-standard'}),
dict(name='div', attrs={'id': 'forum'}),
dict(name='span', attrs={'class': 'akce'}),
dict(name='span', attrs={'class': 'odrazka vetsi'}),
dict(name='div', attrs={'class': 'boxP'}),
dict(name='div', attrs={'class': 'box2'})]
preprocess_regexps = [
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*',
re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*',
re.DOTALL | re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL | re.IGNORECASE), lambda match: '</body>')]
keep_only_tags = []
visited_urls = {}
def get_article_url(self, article):
url = BasicNewsRecipe.get_article_url(self, article)
if url in self.visited_urls:

View File

@ -1,66 +1,76 @@
# coding=utf-8
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011-2016, Hassan Williamson <haz at hazrpg.co.uk>'
'''
ahram.org.eg
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class AlAhram(BasicNewsRecipe):
    '''
    Recipe for the Arabic edition of Al-Ahram, built on BasicNewsRecipe.

    Articles are taken from the per-category RSS feeds listed in feeds.
    '''
    title = u'Al-Ahram (الأهرام)'
    __author__ = 'Hassan Williamson'
    description = 'The Arabic version of the Al-Ahram newspaper.'
    language = 'ar'
    encoding = 'utf8'
    cover_url = 'http://www.ahram.org.eg/Media/News/2015/3/14/2015-635619650946000713-600.jpg'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Al-Ahram'
    category = 'News'
    publication_type = 'newsportal'

    # Sets right-to-left direction for the body text.
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .bbtitle{ font-weight: bold; font-size: 2em; } .bbsubtitle{ font-size: 1.3em; } #WriterImage{ height: 10px; } '  # noqa

    keep_only_tags = [
        dict(name='div', attrs={'class': ['bbcolright']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['bbnav', 'bbsp']}),
        dict(name='div', attrs={'id': ['AddThisButton']}),
        dict(name='a', attrs={'class': ['twitter-share-button']}),
        dict(name='div', attrs={'id': ['ReaderCount']}),
    ]

    remove_attributes = [
        'width', 'height', 'style'
    ]

    # (section title, RSS URL) pairs, one per newspaper category.
    feeds = [
        (u'الأولى', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=25'),
        (u'الصفحة الثانية',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=74'),
        (u'مصر', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=27'),
        (u'المشهد السياسي',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=60'),
        (u'المحافظات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=29'),
        (u'الوطن العربي',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=31'),
        (u'العالم', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=26'),
        (u'تقارير المراسلين',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=2'),
        (u'تحقيقات', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=3'),
        (u'قضايا واراء',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=4'),
        (u'اقتصاد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=5'),
        (u'رياضة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=6'),
        (u'حوادث', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=38'),
        (u'دنيا الثقافة',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=7'),
        (u'المراة والطفل',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=8'),
        (u'يوم جديد', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=9'),
        (u'الكتاب', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=10'),
        (u'الاعمدة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=11'),
        (u'أراء حرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=59'),
        (u'ملفات الاهرام',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=12'),
        (u'بريد الاهرام',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=15'),
        (u'برلمان الثورة',
         'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=61'),
        (u'الاخيرة', 'http://www.ahram.org.eg/archive/RssXml.aspx?CategoryID=16'),
    ]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -6,57 +6,62 @@ english.aljazeera.net
'''
from calibre.web.feeds.news import BasicNewsRecipe
def has_cls(x):
    '''
    Build a tag-matching spec for elements carrying the CSS class x.

    Returns a dict with a single 'attrs' key. Its 'class' entry is a
    predicate that is truthy only when the class attribute value is
    non-empty and x is one of its whitespace-separated tokens, so a class
    that merely contains x as a substring does not match.
    '''
    return dict(attrs={'class': lambda cls: cls and x in cls.split()})
class AlJazeera(BasicNewsRecipe):
title = 'Al Jazeera in English'
__author__ = 'Darko Miletic'
description = 'News from Middle East'
language = 'en'
publisher = 'Al Jazeera'
category = 'news, politics, middle east'
delay = 1
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
title = 'Al Jazeera in English'
__author__ = 'Darko Miletic'
description = 'News from Middle East'
language = 'en'
publisher = 'Al Jazeera'
category = 'news, politics, middle east'
delay = 1
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
extra_css = """
body{font-family: Arial,sans-serif}
#ctl00_cphBody_dvSummary{font-weight: bold}
#dvArticleDate{font-size: small; color: #999999}
"""
conversion_options = {
'comment' : description , 'tags' : category ,
'publisher' : publisher , 'language' : language
'comment': description, 'tags': category,
'publisher': publisher, 'language': language
}
keep_only_tags = [
dict(id='main-story'),
]
remove_tags = [
has_cls('MoreOnTheStory'), has_cls('ArticleBottomToolbar'), dict(smtitle="ShowMore"),
dict(name=['object','link','table','meta','base','iframe','embed']),
has_cls('MoreOnTheStory'), has_cls(
'ArticleBottomToolbar'), dict(smtitle="ShowMore"),
dict(name=['object', 'link', 'table',
'meta', 'base', 'iframe', 'embed']),
]
feeds = [(u'Al Jazeera English', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989')]
feeds = [(u'Al Jazeera English',
u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989')]
def get_article_url(self, article):
    '''
    Return the article link with the doubled slash after the host removed.

    The link is read from article via article.get('link'). When the entry
    has no link, None is returned instead of raising AttributeError on
    None.replace.
    '''
    artlurl = article.get('link', None)
    if artlurl is None:
        return None
    return artlurl.replace('http://english.aljazeera.net//', 'http://english.aljazeera.net/')
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(face=True):
del item['face']
td = soup.find('td',attrs={'class':'DetailedSummary'})
td = soup.find('td', attrs={'class': 'DetailedSummary'})
if td:
td.name = 'div'
spn = soup.find('span',attrs={'id':'DetailedTitle'})
spn = soup.find('span', attrs={'id': 'DetailedTitle'})
if spn:
spn.name='h1'
for itm in soup.findAll('span', attrs={'id':['dvArticleDate','ctl00_cphBody_lblDate']}):
spn.name = 'h1'
for itm in soup.findAll('span', attrs={'id': ['dvArticleDate', 'ctl00_cphBody_lblDate']}):
itm.name = 'div'
for alink in soup.findAll('a'):
if alink.string is not None:

View File

@ -1,79 +1,85 @@
# coding=utf-8
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2016, Hassan Williamson <haz at hazrpg.co.uk>'
'''
almasryalyoum.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class AlMasryAlyoum(BasicNewsRecipe):
    '''
    Recipe for the Arabic edition of Al-Masry Alyoum, built on
    BasicNewsRecipe.

    Articles are taken from the per-section RSS feeds listed in feeds.
    '''
    title = u'Al-Masry Alyoum (المصري اليوم)'
    __author__ = 'Hassan Williamson'
    description = 'The Arabic version of the Al-Masry Alyoum (Egypt Independent) newspaper.'
    language = 'ar'
    encoding = 'utf8'
    cover_url = 'http://www.almasryalyoum.com/content/images/header_logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Al-Masry Alyoum'
    category = 'News'
    publication_type = 'newsportal'

    # Sets right-to-left direction for the body text.
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif; direction: rtl; } .tit_2{ font-weight: bold; font-size: 2em; } .pinfo{ font-size: 1.3em; } .articleimg img{ max-width: 100%; } .imgauther{ display: block; font-size: 0.7em; } .caption{ font-size: 0.7em; } '  # noqa

    keep_only_tags = [
        dict(name='div', attrs={'class': ['article']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['share_buttons_container']}),
        dict(name='div', attrs={'class': ['min_related']}),
        dict(name='div', attrs={'id': ['feedback']}),
        dict(name='div', attrs={'class': ['news_SMSBox']}),
        dict(name='div', attrs={'class': ['tags']}),
        dict(name='div', attrs={'class': ['ads', 'y_logo_news']}),
        dict(name='div', attrs={'class': ['ads']}),
        dict(name='div', attrs={'class': ['option']}),
        dict(name='div', attrs={'class': ['seealso']}),
        dict(name='div', attrs={'id': ['comments']}),
    ]

    remove_attributes = [
        'width', 'height', 'style'
    ]

    # (section title, RSS URL) pairs, one per site section.
    feeds = [
        (u'أخر الأخبار', 'http://www.almasryalyoum.com/rss/RssFeeds'),
        (u'الصفحة الرئيسية',
         'http://www.almasryalyoum.com/rss/RssFeeds?homePage=true'),
        (u'أقلام وآراء', 'http://www.almasryalyoum.com/rss/RssFeeds?typeId=2&homePage=false'),
        (u'أخبار مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=3'),
        (u'رياضة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=8'),
        (u'اقتصاد', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=4'),
        (u'حوادث', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=7'),
        (u'فنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=10'),
        (u'منوعاتنون', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=12'),
        (u'ثقافة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=6'),
        (u'علوم وتكنولوجيا',
         'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=9'),
        (u'تحقيقات وحوارات',
         'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=5'),
        (u'المرأة', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=69'),
        (u'رأي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=2'),
        (u'وسط الناس', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=13'),
        (u'مركز المصري للدراسات و المعلومات',
         'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=56'),
        (u'مطبخ', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=81'),
        (u'برلمان مصر', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=78'),
        (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=54'),
        (u'تحليلات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=60'),
        (u'عروض نقدية', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=61'),
        (u'دراسات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=62'),
        (u'كتاب المصري اليوم',
         'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=65'),
        (u'فعاليات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=66'),
        (u'إسلامي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=75'),
        (u'مطبخي', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=76'),
        (u'مسلسلاتيطبخي',
         'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=77'),
        (u'رمضان زمان', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=82'),
        (u'تقارير', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=85'),
        (u'سيارات', 'http://www.almasryalyoum.com/rss/RssFeeds?sectionId=86'),
    ]

View File

@ -1,14 +1,18 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string, inspect, datetime, re
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class AlMonitor(BasicNewsRecipe):
title = u'Al Monitor'
__author__ = u'spswerling'
@ -26,39 +30,39 @@ class AlMonitor(BasicNewsRecipe):
recursions = 0
compress_news_images = True
compress_news_images_max_size = 7
scale_news_images = (150,200) # (kindle touch: 600x800)
scale_news_images = (150, 200) # (kindle touch: 600x800)
useHighResImages = False
oldest_article = 1.5
max_articles_per_section = 15
sections = [
(u'egypt',u'http://www.al-monitor.com/pulse/egypt-pulse'),
(u'gulf',u'http://www.al-monitor.com/pulse/gulf-pulse'),
(u'iran',u'http://www.al-monitor.com/pulse/iran-pulse'),
(u'iraq',u'http://www.al-monitor.com/pulse/iraq-pulse'),
(u'israel',u'http://www.al-monitor.com/pulse/israel-pulse'),
(u'lebanon',u'http://www.al-monitor.com/pulse/lebanon-pulse'),
(u'palistine',u'http://www.al-monitor.com/pulse/palistine-pulse'),
(u'syria',u'http://www.al-monitor.com/pulse/syria-pulse'),
(u'turkey',u'http://www.al-monitor.com/pulse/turkey-pulse'),
]
(u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
(u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
(u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
(u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
(u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
(u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
(u'palistine', u'http://www.al-monitor.com/pulse/palistine-pulse'),
(u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
(u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
]
# util for creating remove_tags and keep_tags style regex matchers
def tag_matcher(elt, attr, rgx_str):
    """Build a ``remove_tags``-style matcher dict.

    Returns ``dict(name=elt, attrs={attr: <regex>})`` where the regex is
    ``rgx_str`` compiled case-insensitively.  The function takes no ``self``
    because it is called as a plain function while the class body is being
    evaluated (see the ``remove_tags`` list that follows it).
    """
    return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})
remove_tags = [
dict(attrs={'id':[
'header',
'pulsebanner',
'relatedarticles',
'sidecolumn',
'disqus',
'footer',
'footer2',
'footer3',
'mobile-extras',
]}),
dict(attrs={'id': [
'header',
'pulsebanner',
'relatedarticles',
'sidecolumn',
'disqus',
'footer',
'footer2',
'footer3',
'mobile-extras',
]}),
tag_matcher('hr', 'id', 'spacer'),
tag_matcher('a', 'title', 'print this article'),
tag_matcher('div', 'class', 'extras'),
@ -118,12 +122,12 @@ class AlMonitor(BasicNewsRecipe):
if len(self.articles[section]) >= self.max_articles_per_section:
return
self.articles[section].append(
dict(title=title,
url=full_url,
date='',
description='',
author='',
content=''))
dict(title=title,
url=full_url,
date='',
description='',
author='',
content=''))
def preprocess_raw_html(self, raw_html, url):
reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
@ -136,7 +140,7 @@ class AlMonitor(BasicNewsRecipe):
return super(self.__class__, self).preprocess_raw_html(raw_html, url)
def populate_article_metadata(self, article, soup, first):
summary_node = soup.find('div', {'id':'summary'})
summary_node = soup.find('div', {'id': 'summary'})
if summary_node:
summary = self.text(summary_node)
self._p('Summary: ' + summary)
@ -167,7 +171,7 @@ class AlMonitor(BasicNewsRecipe):
def date_from_string(self, datestring):
try:
# eg: Posted September 17, 2014
dt = datetime.datetime.strptime(datestring,"Posted %B %d, %Y")
dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y")
except:
dt = None
@ -192,14 +196,14 @@ class AlMonitor(BasicNewsRecipe):
return abs_url
def text(self, n):
    """Return the text of node ``n`` with surrounding whitespace removed.

    The text is obtained through ``self.tag_to_string``.
    """
    return self.tag_to_string(n).strip()
def _dbg_soup_node(self, node):
s = ' cls: ' + str(node.get('class')).strip() + \
' id: ' + str(node.get('id')).strip() + \
' role: ' + str(node.get('role')).strip() + \
' txt: ' + self.text(node)
' id: ' + str(node.get('id')).strip() + \
' role: ' + str(node.get('role')).strip() + \
' txt: ' + self.text(node)
return s
def _p(self, msg):

View File

@ -1,10 +1,11 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2012, Peter Grungi <p dot grungi at gmail dot com>'
from calibre.web.feeds.news import BasicNewsRecipe
class AlbertMohlersBlog(BasicNewsRecipe):
title = u'Albert Mohler\'s Blog'
title = u'Albert Mohler\'s Blog'
__author__ = 'Peter Grungi'
language = 'en'
oldest_article = 90
@ -15,4 +16,5 @@ class AlbertMohlersBlog(BasicNewsRecipe):
language = 'en'
author = 'Albert Mohler'
feeds = [(u'Albert Mohler\'s Blog', u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]
feeds = [(u'Albert Mohler\'s Blog',
u'http://feeds.feedburner.com/AlbertMohlersBlog?format=xml')]

View File

@ -2,16 +2,16 @@ __license__ = 'GPL v3'
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AlejaKomiksu(BasicNewsRecipe):
title = u'Aleja Komiksu'
__author__ = 'fenuks'
description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.'
category = 'comics'
#publication_type = ''
language = 'pl'
#encoding = ''
title = u'Aleja Komiksu'
__author__ = 'fenuks'
description = u'Serwis poświęcony komiksom. Najnowsze wieści, recenzje, artykuły, wywiady, galerie, komiksy online, konkursy, linki, baza komiksów online.'
category = 'comics'
language = 'pl'
extra_css = 'ul {list-style-type: none;} .gfx_news {float: right;}'
preprocess_regexps = [(re.compile(ur'((<li class="no_img_b">(Do poczytania)|(Nowości):</li>)|(<p class="head2">Komentarze</p>)).*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>')]
preprocess_regexps = [(re.compile(ur'((<li class="no_img_b">(Do poczytania)|(Nowości):</li>)|(<p class="head2">Komentarze</p>)).*</body>',
re.DOTALL | re.IGNORECASE), lambda match: '</body>')]
cover_url = 'http://www.alejakomiksu.com/gfx/build/logo.png'
masthead_url = 'http://www.alejakomiksu.com/gfx/build/logo.png'
use_embedded_content = False
@ -23,15 +23,13 @@ class AlejaKomiksu(BasicNewsRecipe):
remove_attributes = ['style', 'font']
ignore_duplicate_articles = {'title', 'url'}
keep_only_tags = [dict(attrs={'class':'cont_tresc'})]
#remove_tags = [dict()]
#remove_tags_before = dict()
keep_only_tags = [dict(attrs={'class': 'cont_tresc'})]
feeds = [(u'Wiadomości', 'http://www.alejakomiksu.com/rss.php5')]
def skip_ad_pages(self, soup):
    """Replace a review ('recenzje') teaser page with the page it links to.

    Looks for the element with class ``rodzaj``; when its anchor text is
    'recenzje' and the page contains a text node matching 'recenzuje', the
    raw content of that text node's parent link is fetched with
    ``self.index_to_soup(..., raw=True)`` and returned.  Returns ``None``
    in every other case.
    """
    tag = soup.find(attrs={'class': 'rodzaj'})
    # Guard against a marker element without an <a>, or an anchor without a
    # single string child, instead of failing with AttributeError.
    if not tag or tag.a is None or tag.a.string is None:
        return None
    if tag.a.string.lower().strip() != 'recenzje':
        return None
    link = soup.find(text=re.compile('recenzuje'))
    if link:
        return self.index_to_soup(link.parent['href'], raw=True)
    return None

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -8,19 +8,20 @@ www.alo.rs
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class Alo_Novine(BasicNewsRecipe):
title = 'Alo!'
__author__ = 'Darko Miletic'
description = "News Portal from Serbia"
publisher = 'Alo novine d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2
title = 'Alo!'
__author__ = 'Darko Miletic'
description = "News Portal from Serbia"
publisher = 'Alo novine d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
delay = 4
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
extra_css = """
@font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
.article_description,body{font-family: Arial,Helvetica,sans1,sans-serif}
@ -30,25 +31,23 @@ class Alo_Novine(BasicNewsRecipe):
img{margin-bottom: 0.8em} """
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [dict(name=['object','link','embed'])]
remove_attributes = ['height','width']
remove_tags = [dict(name=['object', 'link', 'embed'])]
remove_attributes = ['height', 'width']
feeds = [
(u'Najnovije Vijesti', u'http://www.alo.rs/rss/danasnje_vesti')
,(u'Politika' , u'http://www.alo.rs/rss/politika')
,(u'Vesti' , u'http://www.alo.rs/rss/vesti')
,(u'Sport' , u'http://www.alo.rs/rss/sport')
,(u'Ljudi' , u'http://www.alo.rs/rss/ljudi')
,(u'Saveti' , u'http://www.alo.rs/rss/saveti')
]
(u'Najnovije Vijesti', u'http://www.alo.rs/rss/danasnje_vesti'),
(u'Politika', u'http://www.alo.rs/rss/politika'),
(u'Vesti', u'http://www.alo.rs/rss/vesti'),
(u'Sport', u'http://www.alo.rs/rss/sport'),
(u'Ljudi', u'http://www.alo.rs/rss/ljudi'),
(u'Saveti', u'http://www.alo.rs/rss/saveti')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
@ -61,5 +60,4 @@ class Alo_Novine(BasicNewsRecipe):
return 'http://www.alo.rs/resources/templates/tools/print.php?id=' + artid
def image_url_processor(self, baseurl, url):
    """Return ``url`` with every ``alo.rs//`` collapsed to ``alo.rs/``.

    ``baseurl`` is accepted for interface compatibility and is not used.
    """
    return url.replace('alo.rs//', 'alo.rs/')

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Rasmus Lauritsen <rasmus at lauritsen.info>'
'''
aoh.dk
@ -6,38 +6,35 @@ aoh.dk
from calibre.web.feeds.news import BasicNewsRecipe
class aoh_dk(BasicNewsRecipe):
    # Recipe for aoh.dk ("Alt om Herning"), fed from a single RSS feed.
    title = 'Alt om Herning'
    __author__ = 'Rasmus Lauritsen'
    description = 'Nyheder fra Herning om omegn'
    publisher = 'Mediehuset Herning Folkeblad'
    category = 'news, local, Denmark'
    oldest_article = 14
    max_articles_per_feed = 50
    no_stylesheets = True
    delay = 1
    encoding = 'utf8'
    use_embedded_content = False
    language = 'da'

    extra_css = """ body{font-family: Verdana,Arial,sans-serif }
img{margin-bottom: 0.4em}
.txtContent,.stamp{font-size: small}
"""

    # Metadata handed to the conversion pipeline; reuses the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    feeds = [(u'All news', u'http://aoh.dk/rss.xml')]

    # Keep only the headline and the article body span.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='span', attrs={'class': ['frontpage_body']}),
    ]

    remove_tags = [
        dict(name=['object', 'link'])
    ]

View File

@ -1,34 +1,35 @@
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class Alternet(BasicNewsRecipe):
title = u'Alternet'
__author__= 'rty'
title = u'Alternet'
__author__ = 'rty'
oldest_article = 7
max_articles_per_feed = 100
publisher = 'alternet.org'
category = 'News, Magazine'
description = 'News magazine and online community'
feeds = [
feeds = [
(u'Front Page', u'http://feeds.feedblitz.com/alternet')
]
]
remove_attributes = ['width', 'align','cellspacing']
remove_attributes = ['width', 'align', 'cellspacing']
remove_javascript = True
use_embedded_content = True
use_embedded_content = True
no_stylesheets = True
language = 'en'
encoding = 'UTF-8'
encoding = 'UTF-8'
temp_files = []
articles_are_obfuscated = True
def get_article_url(self, article):
    """Return the article's ``link`` entry, or ``None`` when it has none."""
    return article.get('link', None)
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
response = br.follow_link(url_regex = r'/printversion/[0-9]+', nr = 0)
response = br.follow_link(url_regex=r'/printversion/[0-9]+', nr=0)
html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html)

View File

@ -1,6 +1,6 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2013, Eddie Lau'
__Date__ = ''
@ -12,7 +12,9 @@ Change Log:
from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -21,10 +23,11 @@ from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
class AppleDaily(BasicNewsRecipe):
title = u'AM730'
__author__ = 'Eddie Lau'
publisher = 'AM730'
title = u'AM730'
__author__ = 'Eddie Lau'
publisher = 'AM730'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = False
@ -35,46 +38,46 @@ class AppleDaily(BasicNewsRecipe):
use_embedded_content = False
no_stylesheets = True
description = 'http://www.am730.com.hk'
category = 'Chinese, News, Hong Kong'
category = 'Chinese, News, Hong Kong'
masthead_url = 'http://www.am730.com.hk/images/logo.jpg'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}'
keep_only_tags = [dict(name='h2', attrs={'class':'printTopic'}),
dict(name='div', attrs={'id':'article_content'}),
dict(name='div', attrs={'id':'slider'})]
remove_tags = [dict(name='img', attrs={'src':'images/am730_article_logo.jpg'}),
dict(name='img', attrs={'src':'images/am_endmark.gif'})]
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 20px; margin-bottom: 20px; max-height:70%;} div[id=articleHeader] {font-size:200%; text-align:left; font-weight:bold;} li {font-size:50%; margin-left:auto; margin-right:auto;}' # noqa
keep_only_tags = [dict(name='h2', attrs={'class': 'printTopic'}),
dict(name='div', attrs={'id': 'article_content'}),
dict(name='div', attrs={'id': 'slider'})]
remove_tags = [dict(name='img', attrs={'src': 'images/am730_article_logo.jpg'}),
dict(name='img', attrs={'src': 'images/am_endmark.gif'})]
def get_dtlocal(self):
    """Return the current UTC time shifted by +8h and then -6h (net +2h).

    The +8h converts UTC to local Hong Kong time; the -6h moves the date
    rollover to 6am HKT, when all of the day's news is available.
    """
    dt_utc = datetime.datetime.utcnow()
    # Explicit hours instead of float fractions of a day (8.0 / 24, 6.0 / 24).
    return dt_utc + datetime.timedelta(hours=8) - datetime.timedelta(hours=6)
def get_fetchdate(self):
    """Return the issue date as ``YYYYMMDD``.

    A non-empty module-level ``__Date__`` override is returned as-is;
    otherwise the date is taken from ``self.get_dtlocal()``.
    """
    if __Date__ != '':
        return __Date__
    return self.get_dtlocal().strftime("%Y%m%d")
def get_fetchformatteddate(self):
    """Return the issue date as ``YYYY-MM-DD``.

    A non-empty module-level ``__Date__`` override (``YYYYMMDD``) is sliced
    and re-joined with dashes; otherwise the date is taken from
    ``self.get_dtlocal()``.
    """
    if __Date__ != '':
        return __Date__[0:4] + '-' + __Date__[4:6] + '-' + __Date__[6:8]
    return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
    """Return the four-digit year of the issue date as a string.

    Taken from the first four characters of a non-empty ``__Date__``
    override, else from ``self.get_dtlocal()``.
    """
    if __Date__ != '':
        return __Date__[0:4]
    return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
    """Return the two-digit month of the issue date as a string.

    Taken from characters 4-5 of a non-empty ``__Date__`` override, else
    from ``self.get_dtlocal()``.
    """
    if __Date__ != '':
        return __Date__[4:6]
    return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
    """Return the two-digit day of the issue date as a string.

    Taken from characters 6-7 of a non-empty ``__Date__`` override, else
    from ``self.get_dtlocal()``.
    """
    if __Date__ != '':
        return __Date__[6:8]
    return self.get_dtlocal().strftime("%d")
@ -85,7 +88,9 @@ class AppleDaily(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.am730.com.hk')
cover = 'http://www.am730.com.hk/' + soup.find(attrs={'id':'mini_news_img'}).find('img').get('src', False)
cover = 'http://www.am730.com.hk/' + \
soup.find(attrs={'id': 'mini_news_img'}).find(
'img').get('src', False)
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
@ -97,7 +102,7 @@ class AppleDaily(BasicNewsRecipe):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
self.add_toc_thumbnail(article, picdiv['src'])
def parse_index(self):
feeds = []
@ -123,7 +128,8 @@ class AppleDaily(BasicNewsRecipe):
mi.publisher = __appname__
mi.author_sort = __appname__
if self.publication_type:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.publication_type = 'periodical:' + \
self.publication_type + ':' + self.short_title()
mi.timestamp = nowf()
article_titles, aseen = [], set()
for f in feeds:
@ -136,15 +142,15 @@ class AppleDaily(BasicNewsRecipe):
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
'\n\n'.join(article_titles))
'\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
@ -153,12 +159,14 @@ class AppleDaily(BasicNewsRecipe):
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref = Guide.Reference(os.path.basename(
self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest = [os.path.join(dir, 'feed_%d' % i)
for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
@ -167,7 +175,7 @@ class AppleDaily(BasicNewsRecipe):
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
@ -189,12 +197,11 @@ class AppleDaily(BasicNewsRecipe):
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
adir = 'feed_%d/article_%d/' % (num, j)
auth = a.author
if not auth:
auth = None
@ -204,16 +211,18 @@ class AppleDaily(BasicNewsRecipe):
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
entries.append('%sindex.html' % adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
parent.add_item('%sindex.html' % adir, None,
a.title if a.title else _(
'Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(
self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
@ -226,12 +235,14 @@ class AppleDaily(BasicNewsRecipe):
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
prefix = '/'.join('..'for i in range(2 *
len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(
doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
with open(last, 'wb') as fi:
fi.write(unicode(soup).encode('utf-8'))
@ -240,7 +251,7 @@ class AppleDaily(BasicNewsRecipe):
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
entries.append('feed_%d/index.html' % i)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
@ -251,11 +262,11 @@ class AppleDaily(BasicNewsRecipe):
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
entries.append('feed_%d/index.html' % 0)
feed_index(0, toc)
for i, p in enumerate(entries):
@ -265,5 +276,3 @@ class AppleDaily(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008-2015, Darko Miletic <darko.miletic at gmail.com>'
'''
ambito.com
@ -6,46 +6,46 @@ ambito.com
from calibre.web.feeds.news import BasicNewsRecipe
class Ambito(BasicNewsRecipe):
    # Recipe for ambito.com, built from the per-section RSS feeds below.
    title = 'Ambito.com'
    __author__ = 'Darko Miletic'
    description = 'Ambito.com con noticias del Diario Ambito Financiero de Buenos Aires'
    publisher = 'Editorial Nefir S.A.'
    category = 'news, politics, economy, finances, Argentina'
    oldest_article = 2
    no_stylesheets = True
    encoding = 'cp1252'
    masthead_url = 'http://www.ambito.com/img/logo.jpg'
    use_embedded_content = False
    remove_empty_feeds = True
    language = 'es_AR'
    publication_type = 'newsportal'
    extra_css = """
body{font-family: "Trebuchet MS",Verdana,sans-serif}
.volanta{font-size: small}
.t2_portada{font-size: xx-large; font-family: Georgia,serif; color: #026698}
"""

    # Metadata handed to the conversion pipeline; reuses the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [
        dict(attrs={'id': ['tituloDespliegue', 'imgDesp', 'textoDespliegue']})]
    # 'link' was listed twice in the original; one entry is sufficient.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta'])]

    # Section names in the query strings are percent-encoded (e.g. %ED, %F3).
    feeds = [
        (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp'),
        (u'Economia', u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa'),
        (u'Politica', u'http://www.ambito.com/rss/noticias.asp?S=Pol%EDtica'),
        (u'Informacion General', u'http://www.ambito.com/rss/noticias.asp?S=Informaci%F3n%20General'),
        (u'Campo', u'http://www.ambito.com/rss/noticias.asp?S=Agro'),
        (u'Internacionales', u'http://www.ambito.com/rss/noticias.asp?S=Internacionales'),
        (u'Deportes', u'http://www.ambito.com/rss/noticias.asp?S=Deportes'),
        (u'Espectaculos', u'http://www.ambito.com/rss/noticias.asp?S=Espect%E1culos'),
        (u'Tecnologia', u'http://www.ambito.com/rss/noticias.asp?S=Tecnolog%EDa'),
        (u'Ambito Nacional', u'http://www.ambito.com/rss/noticias.asp?S=Ambito%20Nacional')
    ]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
ambito.com/diario
@ -8,22 +8,23 @@ import time
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Ambito_Financiero(BasicNewsRecipe):
title = 'Ambito Financiero'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'Editorial Nefir S.A.'
category = 'news, politics, economy, Argentina'
no_stylesheets = True
encoding = 'cp1252'
masthead_url = 'http://www.ambito.com/diario/img/logo_af.gif'
publication_type = 'newspaper'
needs_subscription = 'optional'
use_embedded_content = False
language = 'es_AR'
PREFIX = 'http://www.ambito.com'
INDEX = PREFIX + '/diario/index.asp'
LOGIN = PREFIX + '/diario/login/entrada.asp'
title = 'Ambito Financiero'
__author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas'
publisher = 'Editorial Nefir S.A.'
category = 'news, politics, economy, Argentina'
no_stylesheets = True
encoding = 'cp1252'
masthead_url = 'http://www.ambito.com/diario/img/logo_af.gif'
publication_type = 'newspaper'
needs_subscription = 'optional'
use_embedded_content = False
language = 'es_AR'
PREFIX = 'http://www.ambito.com'
INDEX = PREFIX + '/diario/index.asp'
LOGIN = PREFIX + '/diario/login/entrada.asp'
extra_css = """
body{font-family: "Trebuchet MS",Verdana,sans-serif}
.volanta{font-size: small}
@ -31,14 +32,12 @@ class Ambito_Financiero(BasicNewsRecipe):
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link','embed','iframe','meta','link','table','img'])]
keep_only_tags = [dict(name='div', attrs={'align': 'justify'})]
remove_tags = [dict(name=['object', 'link', 'embed',
'iframe', 'meta', 'link', 'table', 'img'])]
remove_attributes = ['align']
def get_browser(self):
@ -53,7 +52,7 @@ class Ambito_Financiero(BasicNewsRecipe):
return br
def print_version(self, url):
    """Map an article URL to its print URL.

    Replaces ``/diario/noticia.asp?`` with ``/noticias/imprimir.asp?``;
    URLs without that path are returned unchanged.
    """
    return url.replace('/diario/noticia.asp?', '/noticias/imprimir.asp?')
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
@ -61,27 +60,24 @@ class Ambito_Financiero(BasicNewsRecipe):
for item in soup.findAll('a'):
str = item.string
if str is None:
str = self.tag_to_string(item)
str = self.tag_to_string(item)
item.replaceWith(str)
return soup
def parse_index(self):
    """Build the single-feed article index from the ``INDEX`` page.

    Sets ``self.cover_url`` when an ``img.fotodespliegue`` is present, then
    collects every headline link (classes ``t0_portada``, ``t2_portada``,
    ``bajada``), skipping URLs that were already seen.

    Returns a one-element list ``[(self.title, articles)]`` where each
    article is a dict with ``title``, ``date``, ``url`` and ``description``.
    """
    soup = self.index_to_soup(self.INDEX)
    cover_item = soup.find('img', attrs={'class': 'fotodespliegue'})
    if cover_item:
        self.cover_url = self.PREFIX + cover_item['src']
    articles = []
    # A set gives O(1) duplicate checks; `articles` keeps the page order.
    seen_urls = set()
    link_classes = ['t0_portada', 't2_portada', 'bajada']
    for feed_link in soup.findAll('a', attrs={'class': link_classes}):
        url = self.PREFIX + feed_link['href']
        if url in seen_urls:
            continue
        seen_urls.add(url)
        articles.append({
            'title': self.tag_to_string(feed_link),
            # The index carries no per-article date; stamp with "now" (UTC).
            'date': strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
            'url': url,
            'description': u''
        })
    return [(self.title, articles)]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, Walt Anthony <workshop.northpole at gmail.com>'
'''
www.americanthinker.com
@ -8,37 +8,34 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_xml_chars
from lxml import etree
class AmericanThinker(BasicNewsRecipe):
    # Recipe for americanthinker.com, built from two feedburner feeds.
    title = u'American Thinker'
    description = "American Thinker is a daily internet publication devoted to the thoughtful exploration of issues of importance to Americans."
    __author__ = 'Walt Anthony'
    publisher = 'Thomas Lifson'
    category = 'news, politics, USA'
    oldest_article = 7  # days
    max_articles_per_feed = 50
    summary_length = 150
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    remove_javascript = True
    auto_cleanup = True

    # Metadata handed to the conversion pipeline; reuses the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    def preprocess_raw_html(self, raw, url):
        """Drop elements that carry both the ``article_body`` and ``bottom`` classes.

        The raw HTML is parsed with html5lib into an lxml tree, every
        matching element is removed from its parent, and the tree is
        serialised back to a unicode string.
        """
        root = html5lib.parse(
            clean_xml_chars(raw), treebuilder='lxml',
            namespaceHTMLElements=False)
        # XPath equivalent of the CSS selector ".article_body.bottom".
        for x in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' article_body ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' bottom '))]'''):  # noqa
            x.getparent().remove(x)
        return etree.tostring(root, encoding=unicode)

    # Plain URL strings: the original wrapped each one in parentheses, which
    # does not create a tuple, so the list value is unchanged.
    feeds = [
        u'http://feeds.feedburner.com/americanthinker',
        u'http://feeds.feedburner.com/AmericanThinkerBlog',
    ]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
@ -7,20 +7,22 @@ spectator.org
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select
class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator'
__author__ = 'Kovid Goyal'
description = 'News from USA'
oldest_article = 7
title = 'The American Spectator'
__author__ = 'Kovid Goyal'
description = 'News from USA'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'en'
no_stylesheets = True
use_embedded_content = False
language = 'en'
auto_cleanup = True
encoding = 'utf-8'
def parse_index(self):
root = self.index_to_soup('http://spectator.org/issues/current', as_tree=True)
root = self.index_to_soup(
'http://spectator.org/issues/current', as_tree=True)
select = Select(root)
main = tuple(select('div#block-system-main'))[0]
feeds = []
@ -43,7 +45,8 @@ class TheAmericanSpectator(BasicNewsRecipe):
for x in select('div.views-field-field-short-summary', li):
desc = self.tag_to_string(x)
break
articles.append({'title':title, 'url':url, 'description':desc})
articles.append(
{'title': title, 'url': url, 'description': desc})
self.log('\t', title, 'at', url)
feeds.append((section_title, articles))
return feeds

View File

@ -1,12 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AnDrumaMor(BasicNewsRecipe):
    # Irish-language ('ga') recipe using the content embedded in a single feed.
    title = u'An Druma M\xf3r'
    __author__ = "David O'Callaghan"
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'ga'
    use_embedded_content = True

    feeds = [(u'Nuacht Laeth\xfail',
              u'http://feeds.feedburner.com/NuachtLneLaethilArAnDrumaMr')]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
@ -13,10 +13,10 @@ class anan(BasicNewsRecipe):
title = 'Anandtech'
description = 'comprehensive Hardware Tests'
__author__ = 'Oliver Niesner, Armin Geller' # 2014-02-27 AGE: update
use_embedded_content = False
use_embedded_content = False
language = 'en'
timefmt = ' [%d %b %Y]'
oldest_article = 7
oldest_article = 7
max_articles_per_feed = 40
no_stylesheets = True
remove_javascript = True
@ -26,17 +26,17 @@ class anan(BasicNewsRecipe):
masthead_url = 'http://www.anandtech.com/content/images/globals/printheader.png'
keep_only_tags = [
dict(name='section', attrs={'class':['main_cont']}),
]
remove_tags=[
dict(name='div', attrs={'class':['print',
'breadcrumb_area noprint',
'fl-rt noprint',
'blog_top_right',]})
]
dict(name='section', attrs={'class': ['main_cont']}),
]
remove_tags = [
dict(name='div', attrs={'class': ['print',
'breadcrumb_area noprint',
'fl-rt noprint',
'blog_top_right', ]})
]
feeds = [('Anandtech', 'http://www.anandtech.com/rss/')]
feeds = [('Anandtech', 'http://www.anandtech.com/rss/')]
def print_version(self, url):
    """Return the print-view address for an article URL.

    Every ``/show/`` path segment in *url* is swapped for ``/print/``;
    a URL without that segment is returned unchanged.
    """
    # 2013-09-07 AGE: the earlier rule rewrote "0Cshow0C" to "0Cprint0C".
    # 2014-02-27 AGE: update
    show_segment = "/show/"
    print_segment = "/print/"
    return url.replace(show_segment, print_segment)

View File

@ -1,38 +1,28 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1278347258(BasicNewsRecipe):
    """Recipe for the Anchorage Daily News RSS feeds."""

    title = u'Anchorage Daily News'
    __author__ = 'rty'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        (u'Alaska News', u'http://www.adn.com/rss-feeds/feed/all'),
        (u'Politics', u'http://www.adn.com/rss-feeds/feed/politics'),
    ]

    # The original literal was written with four opening quotes, which left
    # a stray leading apostrophe in the description text.
    description = "Alaska's Newspaper"
    publisher = 'http://www.adn.com'
    category = 'news, Alaska, Anchorage'
    # 'language' used to be assigned twice with the same value; once is enough.
    language = 'en'
    extra_css = '''
    p{font-weight: normal;text-align: justify}
    '''
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://media.adn.com/includes/assets/images/adn_logo.2.gif'

View File

@ -1,15 +1,17 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Android_com_pl(BasicNewsRecipe):
title = u'Android.com.pl'
__author__ = 'fenuks'
description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.'
category = 'Android, mobile'
language = 'pl'
title = u'Android.com.pl'
__author__ = 'fenuks'
description = u'Android.com.pl - to największe w Polsce centrum Android OS. Znajdziesz tu: nowości, forum, pomoc, recenzje, gry, aplikacje.'
category = 'Android, mobile'
language = 'pl'
use_embedded_content = True
cover_url = 'http://android.com.pl/wp-content/themes/android/images/logo.png'
oldest_article = 8
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
feeds = [(u'Android', u'http://android.com.pl/feed/')]
preprocess_regexps = [
(re.compile(ur'<p>.{,1}</p>', re.DOTALL), lambda match: '')]
feeds = [(u'Android', u'http://android.com.pl/feed/')]

View File

@ -3,43 +3,43 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1290663986(BasicNewsRecipe):
title = u'Animal Pol\u00EDtico'
publisher = u'Animal Pol\u00EDtico'
category = u'News, Mexico'
description = u'Noticias Pol\u00EDticas'
__author__ = 'leamsi'
masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png'
title = u'Animal Pol\u00EDtico'
publisher = u'Animal Pol\u00EDtico'
category = u'News, Mexico'
description = u'Noticias Pol\u00EDticas'
__author__ = 'leamsi'
masthead_url = 'http://www.animalpolitico.com/wp-content/themes/animal_mu/images/logo.png'
oldest_article = 1
max_articles_per_feed = 100
language = 'es_MX'
#feeds = [(u'Animal Politico', u'http://www.animalpolitico.com/feed/')]
language = 'es_MX'
remove_tags_before = dict(name='div', id='main')
remove_tags = [dict(name='div', attrs={'class':'fb-like-button'})]
keep_only_tags = [dict(name='h1', attrs={'class':'entry-title'}),
dict(name='div', attrs={'class':'entry-content'})]
remove_tags = [dict(name='div', attrs={'class': 'fb-like-button'})]
keep_only_tags = [dict(name='h1', attrs={'class': 'entry-title'}),
dict(name='div', attrs={'class': 'entry-content'})]
remove_javascript = True
INDEX = 'http://www.animalpolitico.com/'
def generic_parse(self, soup):
articles = []
for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): #soup.findAll('li', 'hentry'):
article_url = entry.a['href'] + '?print=yes'
article_title= entry.find('h3', 'entry-title')
article_title= self.tag_to_string(article_title)
# soup.findAll('li', 'hentry'):
for entry in soup.findAll(lambda tag: tag.name == 'li' and tag.has_key('class') and tag['class'].find('hentry') != -1): # noqa
article_url = entry.a['href'] + '?print=yes'
article_title = entry.find('h3', 'entry-title')
article_title = self.tag_to_string(article_title)
article_date = entry.find('span', 'the-time')
article_date = self.tag_to_string(article_date)
article_desc = self.tag_to_string(entry.find('p'))
#print 'Article:',article_title, article_date,article_url
#print entry['class']
# print 'Article:',article_title, article_date,article_url
# print entry['class']
articles.append({'title' : article_title,
'date' : article_date,
'description' : article_desc,
'url' : article_url})
articles.append({'title': article_title,
'date': article_date,
'description': article_desc,
'url': article_url})
# Avoid including the multimedia stuff.
if entry['class'].find('last') != -1:
break
@ -48,56 +48,57 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
def plumaje_parse(self, soup):
    """Collect the article entries of the 'Plumaje' blog listing.

    Looks up the first ``<ul>`` whose class contains ``bloglist-fecha`` and
    builds one dict (title, date, description, url) per ``<li>`` inside it.
    The print view of each article is requested via ``?print=yes``.
    """
    def is_blog_list(tag):
        return tag.name == 'ul' and tag.has_key('class') and tag['class'].find('bloglist-fecha') != -1  # noqa

    collected = []
    for item in soup.find(is_blog_list).findAll('li'):
        heading = item.p
        link = heading.a['href'] + '?print=yes'
        # The node right after the heading paragraph carries the date text.
        date_node = heading.nextSibling
        headline = self.tag_to_string(heading)
        stamp = self.tag_to_string(date_node).replace(u'Last Updated: ', '')
        summary = self.tag_to_string(item.find('h4'))

        collected.append({
            'title': headline,
            'date': stamp,
            'description': summary,
            'url': link,
        })

    return collected
def boca_parse(self, soup):
    """Collect the article entries of the 'La Boca del Lobo' category page.

    Every ``<div>`` whose class contains ``hentry`` yields one dict (title,
    date, description, url). Collection stops after the entry whose class
    also contains ``last``.
    """
    def is_entry(tag):
        return tag.name == 'div' and tag.has_key('class') and tag['class'].find('hentry') != -1  # noqa

    collected = []
    for item in soup.findAll(is_entry):
        heading = item.find('h2', 'entry-title')
        link = heading.a['href'] + '?print=yes'
        headline = self.tag_to_string(heading)
        stamp = self.tag_to_string(item.find('span', 'entry-date'))
        summary = self.tag_to_string(item.find('div', 'entry-content'))

        collected.append({
            'title': headline,
            'date': stamp,
            'description': summary,
            'url': link,
        })

        # Avoid including the multimedia stuff.
        if item['class'].find('last') != -1:
            break

    return collected
def parse_index(self):
gobierno_soup = self.index_to_soup(self.INDEX+'gobierno/')
congreso_soup = self.index_to_soup(self.INDEX+'congreso/')
seguridad_soup = self.index_to_soup(self.INDEX+'seguridad/')
comunidad_soup = self.index_to_soup(self.INDEX+'comunidad/')
plumaje_soup = self.index_to_soup(self.INDEX+'plumaje/')
la_boca_del_lobo_soup = self.index_to_soup(self.INDEX+'category/la-boca-del-lobo/')
gobierno_soup = self.index_to_soup(self.INDEX + 'gobierno/')
congreso_soup = self.index_to_soup(self.INDEX + 'congreso/')
seguridad_soup = self.index_to_soup(self.INDEX + 'seguridad/')
comunidad_soup = self.index_to_soup(self.INDEX + 'comunidad/')
plumaje_soup = self.index_to_soup(self.INDEX + 'plumaje/')
la_boca_del_lobo_soup = self.index_to_soup(
self.INDEX + 'category/la-boca-del-lobo/')
gobierno_articles = self.generic_parse(gobierno_soup)
congreso_articles = self.generic_parse(congreso_soup)
@ -106,6 +107,5 @@ class AdvancedUserRecipe1290663986(BasicNewsRecipe):
plumaje_articles = self.plumaje_parse(plumaje_soup)
la_boca_del_lobo_articles = self.boca_parse(la_boca_del_lobo_soup)
return [ (u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles),
(u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ]
return [(u'Gobierno', gobierno_articles), (u'Congreso', congreso_articles), (u'Seguridad', seguridad_articles),
(u'Comunidad', comunidad_articles), (u'Plumaje', plumaje_articles), (u'La Boca del Lobo', la_boca_del_lobo_articles), ]

View File

@ -1,6 +1,7 @@
#-*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class AntywebRecipe(BasicNewsRecipe):
encoding = 'utf-8'
__license__ = 'GPL v3'
@ -10,38 +11,40 @@ class AntywebRecipe(BasicNewsRecipe):
title = u'Antyweb'
category = u'News'
description = u'Blog o internecie i nowych technologiach'
cover_url=''
remove_empty_feeds= True
cover_url = ''
remove_empty_feeds = True
auto_cleanup = False
no_stylesheets=True
no_stylesheets = True
use_embedded_content = False
oldest_article = 7
max_articles_per_feed = 100
remove_javascript = True
simultaneous_downloads = 10
ignore_duplicate_articles = {'title', 'url'} # zignoruj zduplikowane artykuły o takich samych tytułach LUB adresach
scale_news_images =True
conversion_options = { 'tags' : u'news, aplikacje mobilne, Android, iOS, Windows Phone ',
'smarten_punctuation' : True,
'publisher' : 'AntyWeb'
} # opcje konwersji.
# zignoruj zduplikowane artykuły o takich samych tytułach LUB adresach
ignore_duplicate_articles = {'title', 'url'}
scale_news_images = True
conversion_options = {'tags': u'news, aplikacje mobilne, Android, iOS, Windows Phone ',
'smarten_punctuation': True,
'publisher': 'AntyWeb'
} # opcje konwersji.
keep_only_tags=[]
keep_only_tags.append(dict(name = 'h1'))
keep_only_tags.append(dict(name = 'article', attrs = {'class' : 'article'}))
remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ac-footer group'}))
keep_only_tags = []
keep_only_tags.append(dict(name='h1'))
keep_only_tags.append(dict(name='article', attrs={'class': 'article'}))
remove_tags = []
remove_tags.append(dict(name='div', attrs={'class': 'ac-footer group'}))
feeds = [
(u'News', 'http://feeds.feedburner.com/antyweb'),
(u'Felietony', 'http://feeds.feedburner.com/AntywebFelietony'),
(u'Apple', 'http://feeds.feedburner.com/AntywebApple'),
(u'Gry', 'http://feeds.feedburner.com/AntywebGry'),
(u'Mobile', 'http://feeds.feedburner.com/AntywebMobile'),
(u'Startups', 'http://feeds.feedburner.com/AntywebStartups'),
(u'Google', 'http://feeds.feedburner.com/AntywebGoogle'),
(u'Microsoft', 'http://feeds.feedburner.com/AntywebMicrosoft')
]
feeds = [
(u'News', 'http://feeds.feedburner.com/antyweb'),
(u'Felietony', 'http://feeds.feedburner.com/AntywebFelietony'),
(u'Apple', 'http://feeds.feedburner.com/AntywebApple'),
(u'Gry', 'http://feeds.feedburner.com/AntywebGry'),
(u'Mobile', 'http://feeds.feedburner.com/AntywebMobile'),
(u'Startups', 'http://feeds.feedburner.com/AntywebStartups'),
(u'Google', 'http://feeds.feedburner.com/AntywebGoogle'),
(u'Microsoft', 'http://feeds.feedburner.com/AntywebMicrosoft')
]
def preprocess_html(self, soup):
for alink in soup.findAll('a'):
if alink.string is not None:

View File

@ -6,21 +6,23 @@ class AssociatedPress(BasicNewsRecipe):
title = u'Associated Press'
description = 'Global news'
__author__ = 'Krittika Goyal'
use_embedded_content = False
use_embedded_content = False
language = 'en'
no_stylesheets = True
conversion_options = {
'linearize_tables' : True
'linearize_tables': True
}
keep_only_tags = {'name':'table', 'attrs':{'class':lambda x: x and 'ap-story-table' in x.split()}}
keep_only_tags = {'name': 'table', 'attrs': {
'class': lambda x: x and 'ap-story-table' in x.split()}}
remove_tags = [
{'class':['ap-mediabox-table']},
{'name':'img', 'src':lambda x: x and '//analytics.' in x},
{'class': ['ap-mediabox-table']},
{'name': 'img', 'src': lambda x: x and '//analytics.' in x},
]
def parse_index(self):
feeds = []
fronts = ('HOME', 'US', 'WORLD', 'BUSINESS', 'TECHNOLOGY', 'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE', 'POLITICS')
fronts = ('HOME', 'US', 'WORLD', 'BUSINESS', 'TECHNOLOGY',
'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'SCIENCE', 'POLITICS')
for front in fronts:
feeds.append([front.capitalize(), self.parse_section(front)])
feeds[0][0] = 'Top Stories'
@ -28,19 +30,20 @@ class AssociatedPress(BasicNewsRecipe):
def parse_section(self, front):
self.log('Processing section:', front)
soup = self.index_to_soup('http://hosted.ap.org/dynamic/fronts/%s?SITE=AP' % front)
soup = self.index_to_soup(
'http://hosted.ap.org/dynamic/fronts/%s?SITE=AP' % front)
articles = []
for x in soup.findAll('p', attrs={'class':['ap-newsbriefitem-p', 'ap-topheadlineitem-p']}):
for x in soup.findAll('p', attrs={'class': ['ap-newsbriefitem-p', 'ap-topheadlineitem-p']}):
a = x.find('a', href=True)
title = self.tag_to_string(a)
url = "http://hosted.ap.org" + a['href']
p = x.find(attrs={'class':'topheadlinebody'})
p = x.find(attrs={'class': 'topheadlinebody'})
desc = ''
if p is not None:
desc = self.tag_to_string(p)
self.log('\tFound article:', title, '\n\t\t', desc)
articles.append({'title':title, 'url':url})
articles.append({'title': title, 'url': url})
self.log('\n\n')

View File

@ -1,8 +1,8 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 14-05-2010'
description = 'Italian daily newspaper - 14-05-2010'
'''
http://www.apcom.NET/
@ -10,39 +10,38 @@ http://www.apcom.NET/
from calibre.web.feeds.news import BasicNewsRecipe
class Apcom(BasicNewsRecipe):
    """Recipe for the Italian news agency Apcom (apcom.net) RSS feeds."""

    __author__ = 'Marini Gabriele'
    description = 'Italian daily newspaper'

    cover_url = 'http://www.apcom.net/img/logoAP.gif'
    title = u'Apcom'
    publisher = 'TM News S.p.A.'
    category = 'News, politics, culture, economy, general interest'

    language = 'it'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 7
    max_articles_per_feed = 50
    use_embedded_content = False
    # NOTE(review): other recipes spell this attribute 'recursions'; this
    # spelling is kept as-is so download behaviour does not change - confirm
    # whether it is intended.
    recursion = 100

    no_stylesheets = True
    conversion_options = {'linearize_tables': True}
    remove_javascript = True

    keep_only_tags = [
        dict(name='div', attrs={'id': 'ag_center'})
    ]

    feeds = [
        # The trailing space that used to sit inside this URL was removed.
        (u'Globale', u'http://www.apcom.net/rss/globale.xml'),
        (u'Politica', u'http://www.apcom.net/rss/politica.xml'),
        (u'Cronaca', u'http://www.apcom.net/rss/cronaca.xml'),
        # Section title was misspelled 'Econimia'.
        (u'Economia', u'http://www.apcom.net/rss/economia.xml'),
        (u'Esteri', u'http://www.apcom.net/rss/esteri.xml'),
        (u'Cultura', u'http://www.apcom.net/rss/cultura.xml'),
        (u'Sport', u'http://www.apcom.net/rss/sport.xml')
    ]

View File

@ -1,28 +1,30 @@
from calibre.web.feeds.news import BasicNewsRecipe
class APOD(BasicNewsRecipe):
title = u'Astronomy Picture of the Day'
__author__ = 'Starson17'
title = u'Astronomy Picture of the Day'
__author__ = 'Starson17'
description = 'Astronomy Pictures'
language = 'en'
use_embedded_content = False
no_stylesheets = True
cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg'
use_embedded_content = False
no_stylesheets = True
cover_url = 'http://apod.nasa.gov/apod/image/1003/m78_torregrosa.jpg'
remove_javascript = True
recursions = 0
oldest_article = 14
oldest_article = 14
remove_attributes = ['onmouseover', 'onmouseout']
feeds = [
(u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss')
]
(u'Astronomy Picture of the Day', u'http://apod.nasa.gov/apod.rss')
]
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
'''
def postprocess_html(self, soup, first_fetch):
center_tags = soup.findAll(['center'])
p_tags = soup.findAll(['p'])
@ -35,4 +37,3 @@ class APOD(BasicNewsRecipe):
for tag in last2_p:
tag.extract()
return soup

View File

@ -9,18 +9,19 @@ appfunds.blogspot.com
from calibre.web.feeds.news import BasicNewsRecipe
class app_funds(BasicNewsRecipe):
    """Recipe for the Polish investing blog APP Funds."""

    # Identification
    title = u'APP Funds'
    __author__ = 'teepel <teepel44@gmail.com>'
    description = 'Blog inwestora dla inwestorów i oszczędzających'
    language = 'pl'
    INDEX = 'http://appfunds.blogspot.com'

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    remove_empty_feeds = True

    # Clean-up
    auto_cleanup = True
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        (u'blog', u'http://feeds.feedburner.com/blogspot/etVI'),
    ]

View File

@ -1,12 +1,14 @@
# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2013-2015, Eddie Lau'
__Date__ = ''
from calibre import (__appname__, force_unicode, strftime)
from calibre.utils.date import now as nowf
import os, datetime, re
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -15,10 +17,11 @@ from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
class AppleDaily(BasicNewsRecipe):
title = u'蘋果日報 (香港)'
__author__ = 'Eddie Lau'
publisher = '蘋果日報'
title = u'蘋果日報 (香港)'
__author__ = 'Eddie Lau'
publisher = '蘋果日報'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = False
@ -26,48 +29,48 @@ class AppleDaily(BasicNewsRecipe):
encoding = 'utf-8'
auto_cleanup = False
remove_javascript = True
use_embedded_content = False
use_embedded_content = False
no_stylesheets = True
description = 'http://hkm.appledaily.com/'
category = 'Chinese, News, Hong Kong'
category = 'Chinese, News, Hong Kong'
masthead_url = 'http://upload.wikimedia.org/wikipedia/zh/c/cf/AppleDailyLogo1.png'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}'
keep_only_tags = [dict(name='div', attrs={'id':'content-article'})]
remove_tags = [dict(name='div', attrs={'class':'prev-next-btn'}),
dict(name='p', attrs={'class':'next'})]
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} h1 {font-size:200%; text-align:left; font-weight:bold;} p[class=video-caption] {font-size:50%; margin-left:auto; margin-right:auto;}' # noqa
keep_only_tags = [dict(name='div', attrs={'id': 'content-article'})]
remove_tags = [dict(name='div', attrs={'class': 'prev-next-btn'}),
dict(name='p', attrs={'class': 'next'})]
def get_dtlocal(self):
    """Return the reference datetime used to pick the issue date.

    The current UTC time is shifted forward by 8/24 of a day and back by
    6/24 of a day.
    """
    # convert UTC to local hk time - at HKT 6am, all news are available
    forward_shift = datetime.timedelta(8.0 / 24)
    backward_shift = datetime.timedelta(6.0 / 24)
    utc_now = datetime.datetime.utcnow()
    return utc_now + forward_shift - backward_shift
def get_fetchdate(self):
    """Return the issue date as a ``YYYYMMDD`` string.

    A non-empty module-level ``__Date__`` is returned as-is; otherwise the
    date is derived from ``get_dtlocal()``.
    """
    if __Date__ == '':
        return self.get_dtlocal().strftime("%Y%m%d")
    return __Date__
def get_fetchformatteddate(self):
    """Return the issue date as a ``YYYY-MM-DD`` string.

    A non-empty module-level ``__Date__`` is split by position into year,
    month and day; otherwise the date is derived from ``get_dtlocal()``.
    """
    if __Date__ == '':
        return self.get_dtlocal().strftime("%Y-%m-%d")
    year, month, day = __Date__[0:4], __Date__[4:6], __Date__[6:8]
    return year + '-' + month + '-' + day
def get_fetchyear(self):
    """Return the four-character year of the issue date.

    Taken from the first four characters of a non-empty module-level
    ``__Date__``, otherwise from ``get_dtlocal()``.
    """
    if __Date__ == '':
        return self.get_dtlocal().strftime("%Y")
    return __Date__[0:4]
def get_fetchmonth(self):
    """Return the two-character month of the issue date.

    Taken from characters 4-5 of a non-empty module-level ``__Date__``,
    otherwise from ``get_dtlocal()``.
    """
    if __Date__ == '':
        return self.get_dtlocal().strftime("%m")
    return __Date__[4:6]
def get_fetchday(self):
    """Return the two-character day of the issue date.

    Taken from characters 6-7 of a non-empty module-level ``__Date__``,
    otherwise from ``get_dtlocal()``.
    """
    if __Date__ == '':
        return self.get_dtlocal().strftime("%d")
    return __Date__[6:8]
@ -78,7 +81,7 @@ class AppleDaily(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://hkm.appledaily.com/')
cover = soup.find(attrs={'class':'top-news'}).get('src', False)
cover = soup.find(attrs={'class': 'top-news'}).get('src', False)
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
@ -90,12 +93,12 @@ class AppleDaily(BasicNewsRecipe):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
self.add_toc_thumbnail(article, picdiv['src'])
def parse_index(self):
feeds = []
soup = self.index_to_soup('http://hkm.appledaily.com/')
ul = soup.find(attrs={'class':'menu'})
ul = soup.find(attrs={'class': 'menu'})
sectionList = []
for li in ul.findAll('li'):
relativea = li.find('a', href=True).get('href', False)
@ -111,13 +114,14 @@ class AppleDaily(BasicNewsRecipe):
def parse_section(self, url):
    """Return the article list of one section page.

    The page at *url* is fetched, the element with class ``list`` is
    located, and every ``<li>`` that has both a link and a title text
    contributes one ``{'title', 'url'}`` dict. Items lacking either part
    are skipped.
    """
    soup = self.index_to_soup(url)
    ul = soup.find(attrs={'class': 'list'})
    current_articles = []
    for li in ul.findAll('li'):
        a = li.find('a', href=True)
        if a is None:
            # No link: nothing to download for this item.
            continue
        # The title is looked up only after the link check, so an item that
        # is skipped anyway can no longer fail on a missing title.
        title = li.find('p', text=True)
        if title is None:
            continue
        # The link was found with href=True, so the attribute is present.
        current_articles.append(
            {'title': title.strip(), 'url': 'http://hkm.appledaily.com/' + a['href']})
    return current_articles
@ -131,7 +135,8 @@ class AppleDaily(BasicNewsRecipe):
mi.publisher = __appname__
mi.author_sort = __appname__
if self.publication_type:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
mi.publication_type = 'periodical:' + \
self.publication_type + ':' + self.short_title()
mi.timestamp = nowf()
article_titles, aseen = [], set()
for f in feeds:
@ -144,15 +149,16 @@ class AppleDaily(BasicNewsRecipe):
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
'\n\n'.join(article_titles))
'\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# mi.pubdate = nowf()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
@ -161,12 +167,14 @@ class AppleDaily(BasicNewsRecipe):
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref = Guide.Reference(os.path.basename(
self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest = [os.path.join(dir, 'feed_%d' % i)
for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
@ -175,7 +183,7 @@ class AppleDaily(BasicNewsRecipe):
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
@ -197,12 +205,11 @@ class AppleDaily(BasicNewsRecipe):
self.play_order_counter = 0
self.play_order_map = {}
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(num, j)
adir = 'feed_%d/article_%d/' % (num, j)
auth = a.author
if not auth:
auth = None
@ -212,16 +219,18 @@ class AppleDaily(BasicNewsRecipe):
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
entries.append('%sindex.html' % adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
parent.add_item('%sindex.html' % adir, None,
a.title if a.title else _(
'Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(
self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
@ -234,12 +243,14 @@ class AppleDaily(BasicNewsRecipe):
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
prefix = '/'.join('..'for i in range(2 *
len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(
doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
with open(last, 'wb') as fi:
fi.write(unicode(soup).encode('utf-8'))
@ -248,7 +259,7 @@ class AppleDaily(BasicNewsRecipe):
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
entries.append('feed_%d/index.html' % i)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
@ -259,11 +270,11 @@ class AppleDaily(BasicNewsRecipe):
desc = getattr(f, 'description', None)
if not desc:
desc = None
feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
f.title, play_order=po, description=desc, author=auth))
feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
f.title, play_order=po, description=desc, author=auth))
else:
entries.append('feed_%d/index.html'%0)
entries.append('feed_%d/index.html' % 0)
feed_index(0, toc)
for i, p in enumerate(entries):
@ -273,5 +284,3 @@ class AppleDaily(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)

View File

@ -34,12 +34,12 @@ class AppledailyTW(BasicNewsRecipe):
{'name': 'hr'}
]
conversion_options = {
'title' : title,
'comments' : description,
'tags' : category,
'language' : language,
'publisher' : publisher,
'authors' : publisher,
'title': title,
'comments': description,
'tags': category,
'language': language,
'publisher': publisher,
'authors': publisher,
'linearize_tables': True
}
feeds = [
@ -105,5 +105,6 @@ class AppledailyTW(BasicNewsRecipe):
def preprocess_raw_html(self, raw_html, url):
    """Clean the raw article markup before it is parsed.

    Two rewrites are applied to *raw_html*:

    * anchors whose content includes ``<br><br>`` are removed;
    * a ``<title>`` of the form ``headline | rest`` is cut down to just the
      headline.

    *url* is accepted but not used. Returns the rewritten markup.
    """
    raw_html = re.sub(r'<a href=".*?<br><br>.*?<\/a>', '', raw_html)
    # The replacement must be a raw string: in a normal literal '\1' is the
    # control character U+0001 rather than a reference to group 1, and the
    # '\/' escape would be copied into the output verbatim.
    raw_html = re.sub(
        r'<title>(.*?)[\s]+\|.*<\/title>', r'<title>\1</title>', raw_html)
    return raw_html

View File

@ -2,21 +2,22 @@
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1395137685(BasicNewsRecipe):
    """Recipe for the Polish blog Applefobia."""

    # Identification
    title = u'Applefobia'
    __author__ = 'koliberek'
    language = 'pl'

    # Download behaviour
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    reverse_article_order = True

    # Clean-up
    auto_cleanup = True
    remove_javascript = True

    conversion_options = dict([
        ('tags', u'newsy, Apple, humor'),
        ('smarten_punctuation', True),
        ('authors', 'Ogrodnik January'),
        ('publisher', 'Blogspot.pl'),
    ])

    feeds = [
        (u'Aktualne', u'http://applefobia.blogspot.com/feeds/posts/default'),
    ]

View File

@ -1,22 +1,20 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AmericanProspect(BasicNewsRecipe):
    """Recipe for the American Prospect articles feed."""

    # Identification
    title = u'American Prospect'
    __author__ = u'Michael Heinz, a.peter'
    version = 2
    language = 'en'

    # Download limits
    oldest_article = 30
    max_articles_per_feed = 100
    recursions = 0

    # Clean-up ('no_stylesheets' was listed twice with the same value)
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    auto_cleanup = True

    feeds = [
        (u'Articles', u'feed://www.prospect.org/articles_rss.jsp'),
    ]

View File

@ -1,19 +1,25 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = 'Ruben Pollan <meskio@sindominio.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for AraInfo.org; one feed per site category.
class AdvancedUserRecipe1335656316(BasicNewsRecipe):
    # Identification
    title = u'AraInfo.org'
    __author__ = 'Ruben Pollan'
    description = 'Regional newspaper from Aragon'
    language = 'es'
    cover_url = u'http://arainfo.org/wordpress/wp-content/uploads/2011/10/logo-web_alta.jpg'

    # Fetch limits and clean-up
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # (section title, feed URL) pairs
    feeds = [
        (u'Movimientos', u'http://arainfo.org/category/movimientos/feed/'),
        (u'Econom\xeda', u'http://arainfo.org/category/economia/feed/'),
        (u'Ecolog\xeda', u'http://arainfo.org/category/ecologia/feed/'),
        (u'Culturas', u'http://arainfo.org/category/culturas/feed/'),
        (u'Altavoz', u'http://arainfo.org/category/altavoz/feed/'),
    ]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.arabianbusiness.com
@ -6,21 +6,21 @@ www.arabianbusiness.com
from calibre.web.feeds.news import BasicNewsRecipe
class Arabian_Business(BasicNewsRecipe):
title = 'Arabian Business'
__author__ = 'Darko Miletic'
description = 'Comprehensive Guide to Middle East Business & Gulf Industry News including,Banking & Finance,Construction,Energy,Media & Marketing,Real Estate,Transportation,Travel,Technology,Politics,Healthcare,Lifestyle,Jobs & UAE guide.Top Gulf & Dubai Business News.'
publisher = 'Arabian Business Publishing Ltd.'
category = 'ArabianBusiness.com,Arab Business News,Middle East Business News,Middle East Business,Arab Media News,Industry Events,Middle East Industry News,Arab Business Industry,Dubai Business News,Financial News,UAE Business News,Middle East Press Releases,Gulf News,Arab News,GCC Business News,Banking Finance,Media Marketing,Construction,Oil Gas,Retail,Transportation,Travel Hospitality,Photos,Videos,Life Style,Fashion,United Arab Emirates,UAE,Dubai,Sharjah,Abu Dhabi,Qatar,KSA,Saudi Arabia,Bahrain,Kuwait,Oman,Europe,South Asia,America,Asia,news'
oldest_article = 2
title = 'Arabian Business'
__author__ = 'Darko Miletic'
description = 'Comprehensive Guide to Middle East Business & Gulf Industry News including,Banking & Finance,Construction,Energy,Media & Marketing,Real Estate,Transportation,Travel,Technology,Politics,Healthcare,Lifestyle,Jobs & UAE guide.Top Gulf & Dubai Business News.' # noqa
publisher = 'Arabian Business Publishing Ltd.'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.arabianbusiness.com/skins/ab.main/gfx/arabianbusiness_logo_sm.gif'
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en'
remove_empty_feeds = True
publication_type = 'newsportal'
masthead_url = 'http://www.arabianbusiness.com/skins/ab.main/gfx/arabianbusiness_logo_sm.gif'
extra_css = """
body{font-family: Georgia,serif }
img{margin-bottom: 0.4em; margin-top: 0.4em; display:block}
@ -29,49 +29,46 @@ class Arabian_Business(BasicNewsRecipe):
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'publisher': publisher, 'language': language
}
remove_tags_before=dict(attrs={'id':'article-title'})
remove_tags_before = dict(attrs={'id': 'article-title'})
remove_tags = [
dict(name=['meta','link','base','iframe','embed','object'])
,dict(attrs={'class':'printfooter'})
]
remove_attributes=['lang']
dict(name=['meta', 'link', 'base', 'iframe', 'embed', 'object']), dict(
attrs={'class': 'printfooter'})
]
remove_attributes = ['lang']
feeds = [
(u'Africa' , u'http://www.arabianbusiness.com/world/Africa/?service=rss' )
,(u'Americas' , u'http://www.arabianbusiness.com/world/americas/?service=rss' )
,(u'Asia Pacific' , u'http://www.arabianbusiness.com/world/asia-pacific/?service=rss' )
,(u'Europe' , u'http://www.arabianbusiness.com/world/europe/?service=rss' )
,(u'Middle East' , u'http://www.arabianbusiness.com/world/middle-east/?service=rss' )
,(u'South Asia' , u'http://www.arabianbusiness.com/world/south-asia/?service=rss' )
,(u'Banking & Finance', u'http://www.arabianbusiness.com/industries/banking-finance/?service=rss' )
,(u'Construction' , u'http://www.arabianbusiness.com/industries/construction/?service=rss' )
,(u'Education' , u'http://www.arabianbusiness.com/industries/education/?service=rss' )
,(u'Energy' , u'http://www.arabianbusiness.com/industries/energy/?service=rss' )
,(u'Healthcare' , u'http://www.arabianbusiness.com/industries/healthcare/?service=rss' )
,(u'Media' , u'http://www.arabianbusiness.com/industries/media/?service=rss' )
,(u'Real Estate' , u'http://www.arabianbusiness.com/industries/real-estate/?service=rss' )
,(u'Retail' , u'http://www.arabianbusiness.com/industries/retail/?service=rss' )
,(u'Technology' , u'http://www.arabianbusiness.com/industries/technology/?service=rss' )
,(u'Transport' , u'http://www.arabianbusiness.com/industries/transport/?service=rss' )
,(u'Travel' , u'http://www.arabianbusiness.com/industries/travel-hospitality/?service=rss')
,(u'Equities' , u'http://www.arabianbusiness.com/markets/equities/?service=rss' )
,(u'Commodities' , u'http://www.arabianbusiness.com/markets/commodities/?service=rss' )
,(u'Currencies' , u'http://www.arabianbusiness.com/markets/currencies/?service=rss' )
,(u'Market Data' , u'http://www.arabianbusiness.com/markets/market-data/?service=rss' )
,(u'Comment' , u'http://www.arabianbusiness.com/opinion/comment/?service=rss' )
,(u'Think Tank' , u'http://www.arabianbusiness.com/opinion/think-tank/?service=rss' )
,(u'Arts' , u'http://www.arabianbusiness.com/lifestyle/arts/?service=rss' )
,(u'Cars' , u'http://www.arabianbusiness.com/lifestyle/cars/?service=rss' )
,(u'Food' , u'http://www.arabianbusiness.com/lifestyle/food/?service=rss' )
,(u'Sport' , u'http://www.arabianbusiness.com/lifestyle/sport/?service=rss' )
]
(u'Africa', u'http://www.arabianbusiness.com/world/Africa/?service=rss'),
(u'Americas', u'http://www.arabianbusiness.com/world/americas/?service=rss'),
(u'Asia Pacific', u'http://www.arabianbusiness.com/world/asia-pacific/?service=rss'),
(u'Europe', u'http://www.arabianbusiness.com/world/europe/?service=rss'),
(u'Middle East', u'http://www.arabianbusiness.com/world/middle-east/?service=rss'),
(u'South Asia', u'http://www.arabianbusiness.com/world/south-asia/?service=rss'),
(u'Banking & Finance', u'http://www.arabianbusiness.com/industries/banking-finance/?service=rss'),
(u'Construction', u'http://www.arabianbusiness.com/industries/construction/?service=rss'),
(u'Education', u'http://www.arabianbusiness.com/industries/education/?service=rss'),
(u'Energy', u'http://www.arabianbusiness.com/industries/energy/?service=rss'),
(u'Healthcare', u'http://www.arabianbusiness.com/industries/healthcare/?service=rss'),
(u'Media', u'http://www.arabianbusiness.com/industries/media/?service=rss'),
(u'Real Estate', u'http://www.arabianbusiness.com/industries/real-estate/?service=rss'),
(u'Retail', u'http://www.arabianbusiness.com/industries/retail/?service=rss'),
(u'Technology', u'http://www.arabianbusiness.com/industries/technology/?service=rss'),
(u'Transport', u'http://www.arabianbusiness.com/industries/transport/?service=rss'),
(u'Travel', u'http://www.arabianbusiness.com/industries/travel-hospitality/?service=rss'),
(u'Equities', u'http://www.arabianbusiness.com/markets/equities/?service=rss'),
(u'Commodities', u'http://www.arabianbusiness.com/markets/commodities/?service=rss'),
(u'Currencies', u'http://www.arabianbusiness.com/markets/currencies/?service=rss'),
(u'Market Data', u'http://www.arabianbusiness.com/markets/market-data/?service=rss'),
(u'Comment', u'http://www.arabianbusiness.com/opinion/comment/?service=rss'),
(u'Think Tank', u'http://www.arabianbusiness.com/opinion/think-tank/?service=rss'),
(u'Arts', u'http://www.arabianbusiness.com/lifestyle/arts/?service=rss'),
(u'Cars', u'http://www.arabianbusiness.com/lifestyle/cars/?service=rss'),
(u'Food', u'http://www.arabianbusiness.com/lifestyle/food/?service=rss'),
(u'Sport', u'http://www.arabianbusiness.com/lifestyle/sport/?service=rss')
]
def print_version(self, url):
    """Return the printer-friendly URL for the article at *url*."""
    printer_query = '?service=printer&page='
    return url + printer_query
@ -81,6 +78,6 @@ class Arabian_Business(BasicNewsRecipe):
del item['style']
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
tstr = alink.string
alink.replaceWith(tstr)
return soup

View File

@ -1,34 +1,32 @@
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for the Swedish newspaper Arbetaren (single RSS feed).
class Arbetaren_SE(BasicNewsRecipe):
    # Identification / metadata
    title = u'Arbetaren'
    __author__ = 'Joakim Lindskog'
    description = 'Nyheter fr\xc3\xa5n Arbetaren'
    publisher = 'Arbetaren'
    category = 'news, politics, socialism, Sweden'
    language = 'sv'
    encoding = 'utf-8'

    # Fetch behaviour
    oldest_article = 7
    max_articles_per_feed = 100
    delay = 1
    use_embedded_content = False
    no_stylesheets = True

    # Metadata forwarded to the conversion pipeline; values are the class
    # attributes defined above.
    conversion_options = dict(
        comment=description,
        tags=category,
        publisher=publisher,
        language=language,
    )

    # Article extraction: everything is scoped to the div with id "article",
    # and trimming stops after the paragraph with id "byline".
    _article_div = {'id': 'article'}
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    remove_tags_before = dict(name='div', attrs={'id': 'article'})
    remove_tags_after = dict(name='p', attrs={'id': 'byline'})
    del _article_div

    remove_tags = [
        {'name': ['object', 'link', 'base']},
        {'name': 'p', 'attrs': {'class': 'print'}},
        {'name': 'a', 'attrs': {'class': 'addthis_button_compact'}},
        {'name': 'script'},
    ]

    feeds = [
        (u'Nyheter', u'http://www.arbetaren.se/rss/arbetaren.rss?rev=123'),
    ]

View File

@ -3,16 +3,17 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class Arcadia_BBS(BasicNewsRecipe):
title = u'Arcadia'
__author__ = 'Masahiro Hasegawa'
language = 'ja'
encoding = 'utf8'
filter_regexps = [r'ad\.jp\.ap\.valuecommerce.com',]
filter_regexps = [r'ad\.jp\.ap\.valuecommerce.com', ]
timefmt = '[%Y/%m/%d]'
remove_tags_before = dict(name='a', attrs={'name':'kiji'})
remove_tags_before = dict(name='a', attrs={'name': 'kiji'})
sid_list = [] #some story id
sid_list = [] # some story id
def parse_index(self):
result = []
@ -21,15 +22,12 @@ class Arcadia_BBS(BasicNewsRecipe):
soup = self.index_to_soup(
'http://www.mai-net.net/bbs/sst/sst.php?act=dump&all=%d'
% sid)
sec = soup.findAll('a', attrs={'href':re.compile(r'.*?kiji')})
sec = soup.findAll('a', attrs={'href': re.compile(r'.*?kiji')})
for s in sec[:-2]:
s_result.append(dict(title=s.string,
url="http://www.mai-net.net" + s['href'],
date=s.parent.parent.parent.findAll('td')[3].string[:-6],
description='', content=''))
url="http://www.mai-net.net" + s['href'],
date=s.parent.parent.parent.findAll('td')[
3].string[:-6],
description='', content=''))
result.append((s_result[0]['title'], s_result))
return result

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
@ -10,30 +10,29 @@ import os
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryDirectory
class Arcamax(BasicNewsRecipe):
title = 'Arcamax'
__author__ = 'Kovid Goyal'
description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en'
use_embedded_content= False
no_stylesheets = True
remove_javascript = True
cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'
title = 'Arcamax'
__author__ = 'Kovid Goyal'
description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en'
use_embedded_content = False
no_stylesheets = True
remove_javascript = True
cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'
# ###### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
# ###### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ##
num_comics_to_get = 7
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED
# STRIPS
conversion_options = {'linearize_tables' : True
, 'comment' : description
, 'tags' : category
, 'language' : language
}
conversion_options = {'linearize_tables': True, 'comment': description, 'tags': category, 'language': language
}
keep_only_tags = [
dict(name='header', attrs={'class':'fn-content-header bluelabel'}),
dict(name='figure', attrs={'class':['comic']}),
keep_only_tags = [
dict(name='header', attrs={'class': 'fn-content-header bluelabel'}),
dict(name='figure', attrs={'class': ['comic']}),
]
def parse_index(self):
@ -93,18 +92,22 @@ class Arcamax(BasicNewsRecipe):
num -= 1
raw = self.index_to_soup(url, raw=True)
self.panel_counter += 1
path = os.path.join(self.panel_tdir, '%d.html' % self.panel_counter)
path = os.path.join(self.panel_tdir, '%d.html' %
self.panel_counter)
with open(path, 'wb') as f:
f.write(raw)
soup = self.index_to_soup(raw)
a = soup.find(name='a', attrs={'class':['prev']})
a = soup.find(name='a', attrs={'class': ['prev']})
prev_page_url = 'http://www.arcamax.com' + a['href']
title = self.tag_to_string(soup.find('title')).partition('|')[0].strip()
title = self.tag_to_string(
soup.find('title')).partition('|')[0].strip()
if 'for' not in title.split():
title = title + ' for today'
date = self.tag_to_string(soup.find(name='span', attrs={'class':['cur']}))
date = self.tag_to_string(
soup.find(name='span', attrs={'class': ['cur']}))
self.log('\tFound:', title, 'at:', url)
current_articles.append({'title': title, 'url':'file://' + path , 'description':'', 'date': date})
current_articles.append(
{'title': title, 'url': 'file://' + path, 'description': '', 'date': date})
if self.test and len(current_articles) >= self.test[1]:
break
url = prev_page_url

View File

@ -1,33 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for Archeowieści; logging in is optional.
class Archeowiesci(BasicNewsRecipe):
    # Identification
    title = u'Archeowieści'
    __author__ = 'fenuks'
    description = u'Z pasją o przeszłości'
    category = 'archeology'
    language = 'pl'
    cover_url = 'http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'

    # Fetch behaviour
    needs_subscription = 'optional'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags = [
        {'name': 'span',
         'attrs': {'class': ['post-ratings', 'post-ratings-loading']}},
    ]
    feeds = [(u'Archeowieści', u'http://archeowiesci.pl/feed/')]

    def parse_feeds(self):
        """Parse the feeds, dropping 'subskrypcja' articles when no username is set."""
        parsed = BasicNewsRecipe.parse_feeds(self)
        if self.username is not None:
            # A username was supplied: keep every article.
            return parsed
        for feed in parsed:
            # Collect first, then remove, so the list is never mutated while
            # it is being scanned.
            flagged = [entry for entry in feed.articles
                       if 'subskrypcja' in entry.title]
            for entry in flagged:
                feed.articles.remove(entry)
        return parsed

    def get_browser(self):
        """Return the browser, submitting the site's login form when credentials are set."""
        browser = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            return browser
        browser.open('http://archeowiesci.pl/wp-login.php')
        browser.select_form(name='loginform')
        browser['log'] = self.username
        browser['pwd'] = self.password
        browser.submit()
        return browser

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2013, Darko Miletic <darko.miletic at gmail.com>'
'''
@ -10,87 +10,85 @@ import time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
# Recipe for ARG Noticias. The "feeds" are HTML section pages, so the index
# is built by scraping them in parse_index() rather than by parsing RSS.
class ArgNoticias(BasicNewsRecipe):
    title = 'ARG Noticias'
    __author__ = 'Darko Miletic'
    description = 'Ultimas noticias de Argentina'
    publisher = 'ARG Noticias'
    category = 'news, politics, Argentina'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    masthead_url = 'http://www.argnoticias.com/images/arg-logo-footer.png'
    language = 'es_AR'
    publication_type = 'newsportal'
    # Prefix prepended to the hrefs scraped from section pages.
    INDEX = 'http://www.argnoticias.com'
    extra_css = ''

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': ['itemHeader', 'itemBody', 'itemAuthorBlock']})]
    remove_tags = [
        dict(name=['object', 'link', 'base', 'iframe']),
        dict(name='div', attrs={
            'class': ['b2jsocial_parent', 'itemSocialSharing']})
    ]

    feeds = [
        (u'Politica', u'http://www.argnoticias.com/index.php/politica'),
        (u'Economia', u'http://www.argnoticias.com/index.php/economia'),
        (u'Sociedad', u'http://www.argnoticias.com/index.php/sociedad'),
        (u'Mundo', u'http://www.argnoticias.com/index.php/mundo'),
        (u'Deportes', u'http://www.argnoticias.com/index.php/deportes'),
        (u'Espectaculos', u'http://www.argnoticias.com/index.php/espectaculos'),
        (u'Tendencias', u'http://www.argnoticias.com/index.php/tendencias')
    ]

    def _article_from_item(self, item):
        """Build an article dict from one scraped element, or None if it has no title link."""
        atag = item.find('a', attrs={'class': 'moduleItemTitle'})
        if atag is None:
            # Not an article teaser; previously the first loop would have
            # raised here instead of skipping the element.
            return None
        ptag = item.find('div', attrs={'class': 'moduleItemIntrotext'})
        return {
            'title': self.tag_to_string(atag),
            # The page carries no usable date, so the fetch time is used.
            'date': strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
            'url': self.INDEX + atag['href'],
            'description': self.tag_to_string(ptag),
        }

    def parse_index(self):
        """Scrape every section page and return a list of (section title, articles).

        Articles are looked up first in ``div.Nota`` elements and then in
        ``li`` elements. A URL is listed only once across all sections; the
        first section that yields it keeps it.
        """
        totalfeeds = []
        seen_urls = set()  # O(1) duplicate check, shared by all sections
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed') + ' %s...' %
                                 (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            teaser_groups = (
                soup.findAll('div', attrs={'class': 'Nota'}),
                soup.findAll('li'),
            )
            for group in teaser_groups:
                for item in group:
                    article = self._article_from_item(item)
                    if article is None or article['url'] in seen_urls:
                        continue
                    seen_urls.add(article['url'])
                    articles.append(article)
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

View File

@ -1,68 +1,77 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, jolo'
'''
azrepublic.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
# Recipe for The Arizona Republic (azcentral.com) plus a few AP-hosted feeds.
class AdvancedUserRecipe1307301031(BasicNewsRecipe):
    # --- identification ---------------------------------------------------
    title = u'AZRepublic'
    __author__ = 'Jim Olo'
    language = 'en'
    description = (
        "The Arizona Republic is Arizona's leading provider of news and "
        "information, and has published a daily newspaper in Phoenix for "
        "more than 110 years"
    )
    publisher = 'AZRepublic/AZCentral'
    category = 'news, politics, USA, AZ, Arizona'
    masthead_url = 'http://freedom2t.com/wp-content/uploads/press_az_republic_v2.gif'
    cover_url = 'http://www.valleyleadership.org/Common/Img/2line4c_AZRepublic%20with%20azcentral%20logo.jpg'

    # --- fetch behaviour --------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # --- styling ----------------------------------------------------------
    # extra_css = '.headline {font-size: medium;} \n .fact { padding-top: 10pt }'
    extra_css = (
        ' body{ font-family: Verdana,Helvetica,Arial,sans-serif }'
        ' .headline {font-size: medium}'
        ' .introduction{font-weight: bold}'
        ' .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small}'
        ' .story-feature h2{text-align: center; text-transform: uppercase} '
    )

    remove_attributes = ['width', 'height', 'h2', 'subHeadline', 'style']

    # --- elements stripped from every article -----------------------------
    remove_tags = [
        {'name': 'div', 'attrs': {'id': ['slidingBillboard', 'top728x90', 'subindex-header', 'topSearch']}},
        {'name': 'div', 'attrs': {'id': ['simplesearch', 'azcLoginBox', 'azcLoginBoxInner', 'topNav']}},
        {'name': 'div', 'attrs': {'id': ['carsDrop', 'homesDrop', 'rentalsDrop', 'classifiedDrop']}},
        {'name': 'div', 'attrs': {'id': ['nav', 'mp', 'subnav', 'jobsDrop']}},
        {'name': 'h6', 'attrs': {'class': ['section-header']}},
        {'name': 'a', 'attrs': {'href': ['#comments']}},
        {'name': 'div', 'attrs': {'class': ['articletools clearfix', 'floatRight']}},
        {'name': 'div', 'attrs': {'id': ['fbFrame', 'ob', 'storyComments', 'storyGoogleAdBox']}},
        {'name': 'div', 'attrs': {'id': ['storyTopHomes', 'openRight', 'footerwrap', 'copyright']}},
        {'name': 'div', 'attrs': {'id': ['blogsHed', 'blog_comments', 'blogByline', 'blogTopics']}},
        {'name': 'div', 'attrs': {'id': ['membersRightMain', 'dealsfooter', 'azrTopHed', 'azrRightCol']}},
        {'name': 'div', 'attrs': {'id': ['ttdHeader', 'ttdTimeWeather']}},
        {'name': 'div', 'attrs': {'id': ['membersRightMain', 'deals-header-wrap']}},
        {'name': 'div', 'attrs': {'id': ['todoTopSearchBar', 'byline clearfix', 'subdex-topnav']}},
        {'name': 'h1', 'attrs': {'id': ['SEOtext']}},
        {'name': 'table', 'attrs': {'class': ['ap-mediabox-table']}},
        {'name': 'p', 'attrs': {'class': ['ap_para']}},
        {'name': 'span', 'attrs': {'class': ['source-org vcard', 'org fn']}},
        {'name': 'a', 'attrs': {'href': ['http://hosted2.ap.org/APDEFAULT/privacy']}},
        {'name': 'a', 'attrs': {'href': ['http://hosted2.ap.org/APDEFAULT/terms']}},
        {'name': 'div', 'attrs': {'id': ['onespot_nextclick']}},
    ]

    # --- (section title, feed URL) pairs ----------------------------------
    feeds = [
        (u'FrontPage', u'http://www.azcentral.com/rss/feeds/republicfront.xml'),
        (u'TopUS-News', u'http://hosted.ap.org/lineups/USHEADS.rss?SITE=AZPHG&SECTION=HOME'),
        (u'WorldNews', u'http://hosted.ap.org/lineups/WORLDHEADS.rss?SITE=AZPHG&SECTION=HOME'),
        (u'TopBusiness', u'http://hosted.ap.org/lineups/BUSINESSHEADS.rss?SITE=AZPHG&SECTION=HOME'),
        (u'Entertainment', u'http://hosted.ap.org/lineups/ENTERTAINMENT.rss?SITE=AZPHG&SECTION=HOME'),
        (u'ArizonaNews', u'http://www.azcentral.com/rss/feeds/news.xml'),
        (u'Gilbert', u'http://www.azcentral.com/rss/feeds/gilbert.xml'),
        (u'Chandler', u'http://www.azcentral.com/rss/feeds/chandler.xml'),
        (u'DiningReviews', u'http://www.azcentral.com/rss/feeds/diningreviews.xml'),
        (u'AZBusiness', u'http://www.azcentral.com/rss/feeds/business.xml'),
        (u'ArizonaDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog'),
        (u'GroceryDeals', u'http://www.azcentral.com/members/Blog%7E/RealDealsblog/tag/2646'),
    ]

View File

@ -1,42 +1,39 @@
from calibre.web.feeds.news import BasicNewsRecipe
# Recipe for Army Times; purely declarative, one RSS feed per site section.
class ArmyTimes(BasicNewsRecipe):
    # --- identification ---------------------------------------------------
    title = 'Army Times'
    __author__ = 'jde'
    __date__ = '16 May 2012'
    __version__ = '1.0'
    description = 'News of the U.S. Army'
    publisher = 'ArmyTimes.com'
    language = 'en'
    publication_type = 'newspaper'
    category = 'news, U.S. Army'
    tags = 'news, U.S. Army'

    # The same logo image serves as both cover and masthead.
    cover_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg'
    masthead_url = 'http://www.armytimes.com/images/logo_armytimes_alert.jpg'

    # --- fetch behaviour --------------------------------------------------
    oldest_article = 7  # days
    max_articles_per_feed = 25
    recursions = 0
    encoding = None
    needs_subscription = False
    use_embedded_content = False

    # --- clean-up ---------------------------------------------------------
    auto_cleanup = True
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True

    # --- (section title, feed URL) pairs ----------------------------------
    feeds = [
        ('News', 'http://www.armytimes.com/rss_news.php'),
        ('Benefits', 'http://www.armytimes.com/rss_benefits.php'),
        ('Money', 'http://www.armytimes.com/rss_money.php'),
        ('Careers & Education', 'http://www.armytimes.com/rss_careers.php'),
        ('Community', 'http://www.armytimes.com/rss_community.php'),
        ('Off Duty', 'http://www.armytimes.com/rss_off_duty.php'),
        ('Entertainment', 'http://www.armytimes.com/rss_entertainment.php'),
        ('Guard & Reserve', 'http://www.armytimes.com/rss_guard.php'),
    ]

View File

@ -7,10 +7,11 @@ __description__ = 'Get some fresh news from Arrêt sur images'
from calibre.web.feeds.recipes import BasicNewsRecipe
class Asi(BasicNewsRecipe):
title = 'Arrêt sur images'
__author__ = 'François D. (aka franek)'
title = 'Arrêt sur images'
__author__ = 'François D. (aka franek)'
description = 'Global news in french from news site "Arrêt sur images"'
oldest_article = 7.0
@ -26,15 +27,16 @@ class Asi(BasicNewsRecipe):
no_stylesheets = True
remove_javascript = True
feeds = [
feeds = [
('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
]
conversion_options = { 'smarten_punctuation' : True }
conversion_options = {'smarten_punctuation': True}
remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'), dict(name='div', attrs={'class':'bloc-chroniqueur-2'}), dict(id='footercontainer')]
remove_tags = [dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'), dict(id='col_right'),
dict(name='div', attrs={'class': 'bloc-chroniqueur-2'}), dict(id='footercontainer')]
def print_version(self, url):
    '''
    Return the printer-friendly address of an article.

    Every occurrence of ``contenu.php`` in ``url`` is replaced with
    ``contenu-imprimable.php``; a URL that does not contain that substring
    is returned unchanged.
    '''
    return url.replace('contenu.php', 'contenu-imprimable.php')
@ -51,4 +53,3 @@ class Asi(BasicNewsRecipe):
br['password'] = self.password
br.submit()
return br

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
@ -7,20 +7,21 @@ arstechnica.com
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class ArsTechnica(BasicNewsRecipe):
title = u'Ars Technica'
language = 'en'
__author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks'
description = 'Ars Technica: Serving the technologist for 1.2 decades'
publisher = 'Conde Nast Publications'
category = 'news, IT, technology'
oldest_article = 5
title = u'Ars Technica'
language = 'en'
__author__ = 'Darko Miletic, Sujata Raman, Alexis Rohou, Tom Sparks'
description = 'Ars Technica: Serving the technologist for 1.2 decades'
publisher = 'Conde Nast Publications'
category = 'news, IT, technology'
oldest_article = 5
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
remove_empty_feeds = True
publication_type = 'newsportal'
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
remove_empty_feeds = True
publication_type = 'newsportal'
extra_css = '''
body {font-family: Arial,sans-serif}
.heading{font-family: "Times New Roman",serif}
@ -31,56 +32,48 @@ class ArsTechnica(BasicNewsRecipe):
'''
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
'comments': description, 'tags': category, 'language': language, 'publisher': publisher
}
keep_only_tags = [
dict(attrs={'class':'standalone'})
,dict(attrs={'id':'article-guts'})
]
dict(attrs={'class': 'standalone'}), dict(attrs={'id': 'article-guts'})
]
remove_tags = [
dict(name=['object','link','embed','iframe','meta'])
,dict(attrs={'class':'corner-info'})
,dict(attrs={'id': 'article-footer-wrap'})
,dict(attrs={'class': 'article-expander'})
,dict(name='nav',attrs={'class': 'subheading'})
]
dict(name=['object', 'link', 'embed', 'iframe', 'meta']), dict(attrs={'class': 'corner-info'}), dict(attrs={
'id': 'article-footer-wrap'}), dict(attrs={'class': 'article-expander'}), dict(name='nav', attrs={'class': 'subheading'})
]
remove_attributes = ['lang']
feeds = [
(u'Ars Features (All our long-form feature articles)' , u'http://feeds.arstechnica.com/arstechnica/features')
, (u'Technology Lab (Information Technology)' , u'http://feeds.arstechnica.com/arstechnica/technology-lab')
,(u'Gear & Gadgets' , u'http://feeds.arstechnica.com/arstechnica/gadgets')
,(u'Ministry of Innovation (Business of Technology)' , u'http://feeds.arstechnica.com/arstechnica/business')
,(u'Risk Assessment (Security & Hacktivism)' , u'http://feeds.arstechnica.com/arstechnica/security')
,(u'Law & Disorder (Civilizations & Discontents)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy')
,(u'Infinite Loop (Apple Ecosystem)' , u'http://feeds.arstechnica.com/arstechnica/apple')
,(u'Opposable Thumbs (Gaming & Entertainment)' , u'http://feeds.arstechnica.com/arstechnica/gaming')
,(u'Scientific Method (Science & Exploration)' , u'http://feeds.arstechnica.com/arstechnica/science')
,(u'Multiverse (Exploratoins & Meditations on Sci-Fi)' , u'http://feeds.arstechnica.com/arstechnica/multiverse')
,(u'Cars Technica (All Things Automotive)' , u'http://feeds.arstechnica.com/arstechnica/cars')
,(u'Staff Blogs (From the Minds of Ars)' , u'http://feeds.arstechnica.com/arstechnica/staff-blogs')
]
(u'Ars Features (All our long-form feature articles)', u'http://feeds.arstechnica.com/arstechnica/features'),
(u'Technology Lab (Information Technology)', u'http://feeds.arstechnica.com/arstechnica/technology-lab'),
(u'Gear & Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets'),
(u'Ministry of Innovation (Business of Technology)', u'http://feeds.arstechnica.com/arstechnica/business'),
(u'Risk Assessment (Security & Hacktivism)', u'http://feeds.arstechnica.com/arstechnica/security'),
(u'Law & Disorder (Civilizations & Discontents)', u'http://feeds.arstechnica.com/arstechnica/tech-policy'),
(u'Infinite Loop (Apple Ecosystem)', u'http://feeds.arstechnica.com/arstechnica/apple'),
(u'Opposable Thumbs (Gaming & Entertainment)', u'http://feeds.arstechnica.com/arstechnica/gaming'),
(u'Scientific Method (Science & Exploration)', u'http://feeds.arstechnica.com/arstechnica/science'),
(u'Multiverse (Exploratoins & Meditations on Sci-Fi)', u'http://feeds.arstechnica.com/arstechnica/multiverse'),
(u'Cars Technica (All Things Automotive)', u'http://feeds.arstechnica.com/arstechnica/cars'),
(u'Staff Blogs (From the Minds of Ars)', u'http://feeds.arstechnica.com/arstechnica/staff-blogs')
]
def append_page(self, soup, appendtag, position):
    '''
    Pull the follow-up pages of a paginated article into ``appendtag``.

    ``soup`` is searched for a pager (class ``numbers``) holding a ``next``
    element.  When one exists, the page its parent links to is fetched, its
    ``article-guts`` element is filled with that page's own follow-ups
    (recursive call) and is then inserted into ``appendtag`` at
    ``position``.  The pager is removed from ``soup`` once the next page has
    been merged.  Nothing is returned; the trees are modified in place.
    '''
    pager = soup.find(attrs={'class': 'numbers'})
    if not pager:
        return
    nexttag = pager.find(attrs={'class': 'next'})
    if not nexttag:
        return

    # The link target is read from the element wrapping the "next" marker.
    nurl = nexttag.parent['href']
    # NOTE(review): the second argument presumably requests the raw markup,
    # since the result is handed to BeautifulSoup for parsing -- confirm.
    rawc = self.index_to_soup(nurl, True)
    soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)
    texttag = soup2.find(attrs={'id': 'article-guts'})
    if texttag is None:
        # The follow-up page has no article body; there is nothing to merge
        # and dereferencing ``texttag.contents`` would raise.
        return

    # Further pages are appended after everything already in this one.
    newpos = len(texttag.contents)
    self.append_page(soup2, texttag, newpos)
    texttag.extract()
    pager.extract()
    appendtag.insert(position, texttag)
def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3)
@ -102,4 +95,4 @@ class ArsTechnica(BasicNewsRecipe):
return soup
def preprocess_raw_html(self, raw, url):
    '''
    Replace everything up to the closing ``</head>`` with an empty head.

    The returned markup is ``<html><head>`` followed by ``raw`` from its
    first ``</head>`` onwards.  When ``raw`` contains no ``</head>`` it is
    returned unchanged: ``str.find`` yields -1 in that case and slicing
    from -1 would keep only the last character of the page.

    ``url`` is accepted for interface compatibility and is not used.
    '''
    head_end = raw.find('</head>')
    if head_end < 0:
        return raw
    return '<html><head>' + raw[head_end:]

View File

@ -1,20 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class HindustanTimes(BasicNewsRecipe):
    '''
    Recipe for the Spanish-language site "Asco de vida".

    NOTE(review): the class is named ``HindustanTimes`` although the title
    and the feed both refer to Asco de vida -- presumably a leftover from
    the recipe this one was copied from.  The name is kept so the public
    identifier does not change.
    '''
    title = u'Asco de vida'
    language = 'es'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    # encoding = 'cp1252'
    use_embedded_content = False

    no_stylesheets = True
    # Given as a one-element list, the same form the other recipes use for
    # keep_only_tags, rather than as a bare dict.
    keep_only_tags = [dict(name='div', attrs={'class': 'box story'})]

    feeds = [
        ('News', 'http://feeds2.feedburner.com/AscoDeVida'),
    ]

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009, Bruce <bruce at dotdoh.com>'
'''
asiaone.com
@ -8,23 +8,25 @@ asiaone.com
from calibre.web.feeds.news import BasicNewsRecipe
class AsiaOne(BasicNewsRecipe):
    '''Recipe for AsiaOne, fed from its Singapore and Asia RSS channels.'''

    title = u'AsiaOne'
    oldest_article = 2
    max_articles_per_feed = 100
    __author__ = 'Bruce'
    description = 'News from Singapore Press Holdings Portal'
    no_stylesheets = False
    language = 'en_SG'
    remove_javascript = True

    # Only the headline and the article/author containers are kept; the
    # footer span is dropped.
    remove_tags = [dict(name='span', attrs={'class': 'footer'})]
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'headline'}),
        dict(name='div', attrs={'class': ['article-content', 'person-info row']}),
    ]

    feeds = [
        ('Singapore', 'http://asiaone.feedsportal.com/c/34151/f/618415/index.rss'),
        ('Asia', 'http://asiaone.feedsportal.com/c/34151/f/618416/index.rss'),
    ]

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.asianreviewofbooks.com
@ -7,20 +7,21 @@ www.asianreviewofbooks.com
from calibre.web.feeds.news import BasicNewsRecipe
class AsianReviewOfBooks(BasicNewsRecipe):
title = 'The Asian Review of Books'
__author__ = 'Darko Miletic'
description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.'
publisher = 'The Asian Review of Books'
category = 'literature, books, reviews, Asia'
oldest_article = 30
title = 'The Asian Review of Books'
__author__ = 'Darko Miletic'
description = 'In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication.' # noqa
publisher = 'The Asian Review of Books'
category = 'literature, books, reviews, Asia'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = 'en_CN'
publication_type = 'magazine'
masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
language = 'en_CN'
publication_type = 'magazine'
masthead_url = 'http://www.asianreviewofbooks.com/new/images/mob_arb.png'
extra_css = """
body{font-family: serif}
.big {font-size: xx-large}
@ -31,21 +32,16 @@ class AsianReviewOfBooks(BasicNewsRecipe):
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
remove_tags = [dict(name=['object','script','iframe','embed'])]
remove_tags = [dict(name=['object', 'script', 'iframe', 'embed'])]
remove_attributes = ['style', 'onclick']
feeds = [(u'Articles' , u'http://www.asianreviewofbooks.com/new/rss.php')]
feeds = [(u'Articles', u'http://www.asianreviewofbooks.com/new/rss.php')]
def print_version(self, url):
    '''
    Map an article link onto the ``getarticle.php`` address.

    The text after the last ``?ID=`` in ``url`` is taken as the article id
    and appended, together with ``&stats=web``, to everything before the
    marker.  A URL without ``?ID=`` is returned unchanged, because
    ``rpartition`` would otherwise put the whole URL into the id slot.
    '''
    root, sep, artid = url.rpartition('?ID=')
    if not sep:
        return url
    return root + 'getarticle.php?articleID=' + artid + '&stats=web'
def preprocess_raw_html(self, raw, url):
    '''
    Wrap the fetched markup in a minimal HTML document.

    ``raw`` is placed verbatim inside ``<body>`` of a document whose head
    only carries a placeholder title.  ``url`` is not used.
    '''
    prefix = '<html><head><title>title</title></head><body>'
    suffix = '</body></html>'
    return prefix + raw + suffix

View File

@ -1,18 +1,19 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AstroNEWS(BasicNewsRecipe):
    '''Recipe for the Polish astronomy news service AstroNEWS (astronet.pl).'''

    title = u'AstroNEWS'
    __author__ = 'fenuks'
    description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.'  # noqa
    category = 'astronomy, science'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    # extra_css= 'table {text-align: left;}'
    no_stylesheets = True
    cover_url = 'http://news.astronet.pl/img/logo_news.jpg'
    remove_attributes = ['width', 'align']
    remove_tags = [dict(name='hr')]
    feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

    def print_version(self, url):
        '''
        Return the ``print.cgi`` address of an article.

        Every ``astronet.pl/`` in ``url`` becomes ``astronet.pl/print.cgi?``,
        so the remainder of the path ends up as the query string.
        '''
        return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')

View File

@ -1,11 +1,12 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Astroflesz(BasicNewsRecipe):
title = u'Astroflesz'
oldest_article = 7
__author__ = 'fenuks'
description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne'
description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' # noqa
category = 'astronomy'
language = 'pl'
cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png'
@ -16,12 +17,13 @@ class Astroflesz(BasicNewsRecipe):
remove_empty_feeds = True
remove_attributes = ['style']
keep_only_tags = [dict(id="k2Container")]
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
remove_tags_after = dict(name='div', attrs={'class': 'itemLinks'})
remove_tags = [dict(name='div', attrs={
'class': ['itemLinks', 'itemToolbar', 'itemRatingBlock']})]
feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')]
def postprocess_html(self, soup, first_fetch):
t = soup.find(attrs={'class':'itemIntroText'})
t = soup.find(attrs={'class': 'itemIntroText'})
if t:
for i in t.findAll('img'):
i['style'] = 'float: left; margin-right: 5px;'

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.athensnews.gr
@ -6,21 +6,22 @@ www.athensnews.gr
from calibre.web.feeds.news import BasicNewsRecipe
class AthensNews(BasicNewsRecipe):
title = 'Athens News'
__author__ = 'Darko Miletic'
description = 'Greece in English since 1952'
publisher = 'NEP Publishing Company SA'
category = 'news, politics, Greece, Athens'
oldest_article = 1
title = 'Athens News'
__author__ = 'Darko Miletic'
description = 'Greece in English since 1952'
publisher = 'NEP Publishing Company SA'
category = 'news, politics, Greece, Athens'
oldest_article = 1
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en_GR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.athensnews.gr/sites/athensnews/themes/athensnewsv3/images/logo.jpg'
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'en_GR'
remove_empty_feeds = True
publication_type = 'newspaper'
masthead_url = 'http://www.athensnews.gr/sites/athensnews/themes/athensnewsv3/images/logo.jpg'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif }
img{margin-bottom: 0.4em; display:block}
@ -30,36 +31,32 @@ class AthensNews(BasicNewsRecipe):
"""
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language, 'linearize_tables': True
}
remove_tags = [
dict(name=['meta','link'])
]
keep_only_tags=[
dict(name='span',attrs={'class':'big'})
,dict(name='td', attrs={'class':['articlepubdate','text']})
]
remove_attributes=['lang']
dict(name=['meta', 'link'])
]
keep_only_tags = [
dict(name='span', attrs={'class': 'big'}), dict(
name='td', attrs={'class': ['articlepubdate', 'text']})
]
remove_attributes = ['lang']
feeds = [
(u'News' , u'http://www.athensnews.gr/category/1/feed' )
,(u'Politics' , u'http://www.athensnews.gr/category/8/feed' )
,(u'Business' , u'http://www.athensnews.gr/category/2/feed' )
,(u'Economy' , u'http://www.athensnews.gr/category/11/feed')
,(u'Community' , u'http://www.athensnews.gr/category/5/feed' )
,(u'Arts' , u'http://www.athensnews.gr/category/3/feed' )
,(u'Living in Athens', u'http://www.athensnews.gr/category/7/feed' )
,(u'Sports' , u'http://www.athensnews.gr/category/4/feed' )
,(u'Travel' , u'http://www.athensnews.gr/category/6/feed' )
,(u'Letters' , u'http://www.athensnews.gr/category/44/feed')
,(u'Media' , u'http://www.athensnews.gr/multimedia/feed' )
]
(u'News', u'http://www.athensnews.gr/category/1/feed'),
(u'Politics', u'http://www.athensnews.gr/category/8/feed'),
(u'Business', u'http://www.athensnews.gr/category/2/feed'),
(u'Economy', u'http://www.athensnews.gr/category/11/feed'),
(u'Community', u'http://www.athensnews.gr/category/5/feed'),
(u'Arts', u'http://www.athensnews.gr/category/3/feed'),
(u'Living in Athens', u'http://www.athensnews.gr/category/7/feed'),
(u'Sports', u'http://www.athensnews.gr/category/4/feed'),
(u'Travel', u'http://www.athensnews.gr/category/6/feed'),
(u'Letters', u'http://www.athensnews.gr/category/44/feed'),
(u'Media', u'http://www.athensnews.gr/multimedia/feed')
]
def print_version(self, url):
    '''
    Return ``url`` with the ``action=print`` query parameter added.

    The parameter is introduced with ``?`` when ``url`` has no query string
    yet and with ``&`` when it already contains one, so an existing query
    is not corrupted by a second ``?``.
    '''
    separator = '&' if '?' in url else '?'
    return url + separator + 'action=print'

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
from __future__ import unicode_literals
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
theatlantic.com
@ -9,13 +9,15 @@ import html5lib
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    '''
    Build a tag specification matching any of the given CSS class names.

    ``classes`` is a string of class names separated by single spaces.  The
    result is ``{'attrs': {'class': predicate}}`` where ``predicate`` takes
    the value of an element's ``class`` attribute and returns the set of
    names it shares with ``classes`` (empty, hence falsy, when none match).
    A falsy attribute value (``None`` or ``''``) is returned as is.
    '''
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Short-circuit on a missing/empty attribute before splitting it.
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})
class TheAtlantic(BasicNewsRecipe):
title = 'The Atlantic'
title = 'The Atlantic'
__author__ = 'Kovid Goyal'
description = 'Current affairs and politics focussed on the US'
INDEX = 'http://www.theatlantic.com/magazine/'
@ -23,13 +25,14 @@ class TheAtlantic(BasicNewsRecipe):
encoding = 'utf-8'
keep_only_tags = [
classes('article-header article-body article-magazine metadata article-cover-content lead-img'),
classes(
'article-header article-body article-magazine metadata article-cover-content lead-img'),
]
remove_tags = [
remove_tags = [
{'name': ['meta', 'link', 'noscript']},
{'attrs':{'class':['offset-wrapper', 'ad-boxfeatures-wrapper']}},
{'attrs':{'class':lambda x: x and 'article-tools' in x}},
{'src':lambda x:x and 'spotxchange.com' in x},
{'attrs': {'class': ['offset-wrapper', 'ad-boxfeatures-wrapper']}},
{'attrs': {'class': lambda x: x and 'article-tools' in x}},
{'src': lambda x: x and 'spotxchange.com' in x},
]
remove_tags_after = classes('article-body')
@ -48,7 +51,7 @@ class TheAtlantic(BasicNewsRecipe):
return url + '?single_page=true'
def preprocess_html(self, soup):
    '''
    Copy each image's ``data-src`` value into its ``src`` attribute.

    Only ``img`` elements that carry a ``data-src`` attribute are touched.
    The same ``soup`` object is returned after being modified in place.
    '''
    for img in soup.findAll('img', attrs={'data-src': True}):
        img['src'] = img['data-src']
    return soup
@ -61,8 +64,8 @@ class TheAtlantic(BasicNewsRecipe):
self.cover_url = img['src']
current_section, current_articles = 'Cover Story', []
feeds = []
for div in soup.findAll('div', attrs={'class':lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
for h2 in div.findAll('h2', attrs={'class':True}):
for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'top-sections', 'bottom-sections'})}):
for h2 in div.findAll('h2', attrs={'class': True}):
if 'section-name' in h2['class'].split():
if current_articles:
feeds.append((current_section, current_articles))
@ -75,18 +78,22 @@ class TheAtlantic(BasicNewsRecipe):
url = a['href']
if url.startswith('/'):
url = 'http://www.theatlantic.com' + url
li = a.findParent('li', attrs={'class':lambda x: x and 'article' in x.split()})
li = a.findParent(
'li', attrs={'class': lambda x: x and 'article' in x.split()})
desc = ''
dek = li.find(attrs={'class':lambda x:x and 'dek' in x.split()})
dek = li.find(
attrs={'class': lambda x: x and 'dek' in x.split()})
if dek is not None:
desc += self.tag_to_string(dek)
byline = li.find(attrs={'class':lambda x:x and 'byline' in x.split()})
byline = li.find(
attrs={'class': lambda x: x and 'byline' in x.split()})
if byline is not None:
desc += ' -- ' + self.tag_to_string(byline)
self.log('\t', title, 'at', url)
if desc:
self.log('\t\t', desc)
current_articles.append({'title':title, 'url':url, 'description':desc})
current_articles.append(
{'title': title, 'url': url, 'description': desc})
if current_articles:
feeds.append((current_section, current_articles))
return feeds

View File

@ -3,20 +3,21 @@
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1421956712(BasicNewsRecipe):
title = 'TheAtlantic.com'
__author__ = 'ebrandon'
language = 'en'
description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine'
title = 'TheAtlantic.com'
__author__ = 'ebrandon'
language = 'en'
description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
auto_cleanup = True
ignore_duplicate_articles = {'title', 'url'}
def print_version(self, url):
    '''
    Return the print address of an article.

    Every ``/archive/`` path segment in ``url`` is replaced with
    ``/print/``; a URL without that segment is returned unchanged.
    '''
    return url.replace('/archive/', '/print/')
feeds = [
feeds = [
('Politics', 'http://feeds.feedburner.com/AtlanticPoliticsChannel'),
('International', 'http://feeds.feedburner.com/AtlanticInternational'),
('National', 'http://feeds.feedburner.com/AtlanticNational'),

View File

@ -2,14 +2,15 @@
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class AttacEspanaRecipe (BasicNewsRecipe):
__author__ = 'Marc Busqué <marc@lamarciana.com>'
__url__ = 'http://www.lamarciana.com'
__version__ = '1.0.2'
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2012, Marc Busqué <marc@lamarciana.com>'
title = u'attac.es'
description = u'La Asociación por la Tasación de las Transacciones Financieras y por la Ayuda a los Ciudadanos (ATTAC) es un movimiento internacional altermundialista que promueve el control democrático de los mercados financieros y las instituciones encargadas de su control mediante la reflexión política y la movilización social.'
description = u'La Asociación por la Tasación de las Transacciones Financieras y por la Ayuda a los Ciudadanos (ATTAC) es un movimiento internacional altermundialista que promueve el control democrático de los mercados financieros y las instituciones encargadas de su control mediante la reflexión política y la movilización social.' # noqa
url = 'http://www.attac.es'
language = 'es'
tags = 'contrainformación, información alternativa'
@ -27,5 +28,5 @@ class AttacEspanaRecipe (BasicNewsRecipe):
cover_url = u'http://www.attac.es/wp-content/themes/attacweb/images/attaces.jpg'
feeds = [
(u'Attac', u'http://www.attac.es/feed'),
]
(u'Attac', u'http://www.attac.es/feed'),
]

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'GabrieleMarini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Gabriele Marini'
__version__ = 'v1.02 Marini Gabriele '
__date__ = '14062010'
__license__ = 'GPL v3'
__author__ = 'GabrieleMarini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Gabriele Marini'
__version__ = 'v1.02 Marini Gabriele '
__date__ = '14062010'
__description__ = 'Italian daily newspaper'
'''
@ -11,53 +11,46 @@ http://www.corrieredellosport.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Auto(BasicNewsRecipe):
    '''Recipe for the Italian motoring site auto.it (news, road tests, new models).'''

    __author__ = 'Gabriele Marini'
    description = 'Auto and Formula 1'

    cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png'

    title = u'Auto'
    publisher = 'CONTE Editore'
    category = 'Sport'

    language = 'it'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 60
    max_articles_per_feed = 30
    use_embedded_content = False
    # NOTE(review): spelled ``recursion`` here while other recipes set
    # ``recursions`` -- confirm which name the base class actually reads.
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    # Metadata is passed on as command-line style options ...
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
        '--ignore-tables',
    ]

    # ... and as newline-separated key="value" pairs.
    html2epub_options = '\n'.join([
        'publisher="' + publisher + '"',
        'comments="' + description + '"',
        'tags="' + category + '"',
        'linearize_tables=True',
    ])

    # Containers that make up an article page; everything else is dropped.
    keep_only_tags = [
        dict(name='h2', attrs={'class': ['tit_Article y_Txt']}),
        dict(name='h2', attrs={'class': ['tit_Article']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_new ']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_as ']}),
        dict(name='table', attrs={'class': ['table_A']}),
        dict(name='div', attrs={'class': ['txt_Article txtBox_cms']}),
        dict(name='testoscheda'),
    ]

    feeds = [
        (u'Tutte le News', u'http://www.auto.it/rss/articoli.xml'),
        (u'Prove su Strada', u'http://www.auto.it/rss/prove+6.xml'),
        (u'Novit\xe0', u'http://www.auto.it/rss/novita+3.xml'),
    ]

View File

@ -1,16 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AutoBlog(BasicNewsRecipe):
    '''Recipe for the autoblog.com RSS feed.'''

    title = u'Auto Blog'
    __author__ = 'Welovelucy'
    language = 'en'
    description = 'Auto industry news'

    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'AutoBlog', u'http://www.autoblog.com/rss.xml')]

    def print_version(self, url):
        '''
        Return the print address of an article: ``url`` with ``print/``
        appended.

        NOTE(review): this assumes ``url`` already ends with a slash --
        confirm against the links the feed delivers.
        '''
        return url + 'print/'

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'GabrieleMarini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Gabriele Marini'
__version__ = 'v1.02 Marini Gabriele '
__date__ = '10, January 2010'
__license__ = 'GPL v3'
__author__ = 'GabrieleMarini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Gabriele Marini'
__version__ = 'v1.02 Marini Gabriele '
__date__ = '10, January 2010'
__description__ = 'Italian daily newspaper'
'''
@ -11,80 +11,80 @@ http://www.corrieredellosport.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class AutoPR(BasicNewsRecipe):
    '''
    Road-test ("Prove su Strada") recipe for auto.it.

    ``parse_index`` turns every item of the road-test RSS channel into a
    feed of its own, whose articles are the sub-pages of that test as
    produced by ``create_links_append``.
    '''

    __author__ = 'Gabriele Marini'
    description = 'Auto and Formula 1'

    cover_url = 'http://www.auto.it/res/imgs/logo_Auto.png'

    title = u'Auto Prove'
    publisher = 'CONTE Editore'
    category = 'Sport'

    language = 'it'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 60
    max_articles_per_feed = 20
    use_embedded_content = False
    # NOTE(review): spelled ``recursion`` here while other recipes set
    # ``recursions`` -- confirm which name the base class actually reads.
    recursion = 100

    remove_javascript = True
    no_stylesheets = True

    # html2lrf_options = [
    #            '--comment', description
    #            , '--category', category
    #            , '--publisher', publisher
    #            , '--ignore-tables'
    #            ]

    # html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'

    # Containers that make up an article page; everything else is dropped.
    keep_only_tags = [
        dict(name='h2', attrs={'class': ['tit_Article y_Txt']}),
        dict(name='h2', attrs={'class': ['tit_Article']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_new ']}),
        dict(name='div', attrs={'class': ['box_Img newsdet_as ']}),
        dict(name='table', attrs={'class': ['table_A']}),
        dict(name='div', attrs={'class': ['txt_Article txtBox_cms']}),
        dict(name='testoscheda'),
    ]

    def parse_index(self):
        '''
        Build the feed list from the road-test RSS channel.

        Returns a list of ``(feed_title, articles)`` tuples, one per RSS
        item, where ``feed_title`` is the item's title and ``articles`` is
        the list returned by ``create_links_append`` for that item.
        '''
        feeds = []
        sources = [
            ("Prove su Strada", "http://www.auto.it/rss/prove+6.xml")
        ]
        # The section label of each source is not used: feeds are named
        # after the individual RSS items instead.
        for _section, url in sources:
            channel = self.index_to_soup(url).find('channel')
            if channel is None:
                # No <channel> element in the response; nothing to list.
                continue
            for article in channel.findAllNext('item'):
                item_title = self.tag_to_string(article.title)
                date = self.tag_to_string(article.pubDate)
                description = self.tag_to_string(article.description)
                link = self.tag_to_string(article.guid)
                articles = self.create_links_append(link, date, description)
                if articles:
                    feeds.append((item_title, articles))
        return feeds

    def create_links_append(self, link, date, description):
        '''
        Return the article entries for one road test.

        The first entry ("Generale") points at ``link`` itself and carries
        the item's ``description`` and ``date``.  The remaining entries
        address the sub-pages, obtained by replacing ``scheda`` in ``link``
        with the sub-page name; they have an empty date.
        '''
        # (entry title, replacement for 'scheda' in the link, description)
        # NOTE(review): the Design entry is described as 'scheda' while all
        # other entries repeat their title -- possibly a slip; left as is.
        subpages = (
            ('Design', 'design', 'scheda'),
            ('Interni', 'interni', 'Interni'),
            ('Tecnica', 'tecnica', 'Tecnica'),
            ('Su Strada', 'su_strada', 'Su Strada'),
            ('Pagella', 'pagella', 'Pagella'),
            ('Rilevamenti', 'telemetria', 'Rilevamenti'),
        )
        current_articles = [
            {'title': 'Generale', 'url': link, 'description': description, 'date': date},
        ]
        for entry_title, page_name, entry_description in subpages:
            current_articles.append({
                'title': entry_title,
                'url': link.replace('scheda', page_name),
                'description': entry_description,
                'date': '',
            })
        return current_articles

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
auto-bild.ro
@ -9,47 +9,42 @@ auto-bild.ro
from calibre.web.feeds.news import BasicNewsRecipe
class AutoBild(BasicNewsRecipe):
    """Recipe for Auto Bild (auto-bild.ro); language code ``ro``."""

    title = u'Auto Bild'
    __author__ = u'Silviu Cotoar\u0103'
    description = 'Auto'
    publisher = 'Auto Bild'
    oldest_article = 50
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,Auto'
    encoding = 'utf-8'
    cover_url = 'http://www.auto-bild.ro/images/autobild.gif'

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': 'box_2 articol clearfix'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['detail']}),
        dict(name='a', attrs={'id': ['zoom_link']}),
        dict(name='div', attrs={'class': ['icons clearfix']}),
        dict(name='div', attrs={'class': ['pub_articol clearfix']}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': ['pub_articol clearfix']}),
    ]

    feeds = [
        (u'Feeds', u'http://www.auto-bild.ro/rss/toate'),
    ]

    def preprocess_html(self, soup):
        """Return ``soup`` after passing it through ``adeify_images``."""
        return self.adeify_images(soup)

View File

@ -1,27 +1,28 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class autogids(BasicNewsRecipe):
    """Recipe for the Automatiseringgids IT news feed; language code ``nl``."""

    title = u'Automatiseringgids IT'
    oldest_article = 7
    __author__ = 'DrMerry'
    description = 'IT-nieuws van Automatiseringgids'
    language = 'nl'
    publisher = 'AutomatiseringGids'
    category = 'Nieuws, IT, Nederlandstalig'
    simultaneous_downloads = 5
    timefmt = ' [%a, %d %B, %Y]'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    publication_type = 'newspaper'
    encoding = 'utf-8'
    cover_url = 'http://www.automatiseringgids.nl/binaries/content/gallery/ag/marketing/ag-avatar-100x50.jpg'

    keep_only_tags = [
        dict(name='div', attrs={'class': ['content']}),
    ]

    # Every match of the pattern is replaced with the empty string.  The
    # pattern is split over two literals only for line length; the compiled
    # expression is unchanged.
    preprocess_regexps = [
        (
            re.compile(
                r'(<h3>Reacties</h3>|<h2>Zie ook:</h2>'
                r'|<div style=".*</div>|<a[^>]*>|</a>)',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda match: '',
        ),
    ]

    feeds = [
        (u'Actueel', u'http://www.automatiseringgids.nl/rss.aspx'),
    ]

View File

@ -9,22 +9,25 @@ www.autosport.com
from calibre.web.feeds.news import BasicNewsRecipe
class autosport(BasicNewsRecipe):
    """Recipe for the Autosport all-news RSS feed; language code ``en_GB``."""

    title = u'Autosport'
    __author__ = 'MrStefan <mrstefaan@gmail.com>'
    language = 'en_GB'
    # Adjacent literals are concatenated at compile time, so the resulting
    # string is the same single sentence pair as before.
    description = (
        u'Daily Formula 1 and motorsport news from the leading weekly motor '
        u'racing magazine. The authority on Formula 1, F1, MotoGP, GP2, '
        u'Champ Car, Le Mans...'
    )
    masthead_url = 'http://cdn.images.autosport.com/asdotcom.gif'
    remove_empty_feeds = True
    oldest_article = 1
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True

    # Headline, author cell, date cell and every paragraph, in that order.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'news_headline'}),
        dict(name='td', attrs={'class': 'news_article_author'}),
        dict(name='td', attrs={'class': 'news_article_date'}),
        dict(name='p'),
    ]

    feeds = [
        (u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml'),
    ]

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
avantaje.ro
@ -9,49 +9,41 @@ avantaje.ro
from calibre.web.feeds.news import BasicNewsRecipe
class Avantaje(BasicNewsRecipe):
    """Recipe for Avantaje (avantaje.ro); language code ``ro``."""

    title = u'Avantaje'
    __author__ = u'Silviu Cotoar\u0103'
    description = u''
    publisher = u'Avantaje'
    oldest_article = 25
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Reviste,Stiri'
    encoding = 'utf-8'
    cover_url = 'http://www.avantaje.ro/images/default/logo.gif'

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [
        dict(name='div', attrs={'id': 'articol'}),
        dict(name='div', attrs={'class': 'gallery clearfix'}),
        dict(name='div', attrs={'align': 'justify'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': ['color_sanatate_box']}),
        dict(name='div', attrs={'class': ['nav']}),
        dict(name='div', attrs={'class': ['voteaza_art']}),
        dict(name='div', attrs={'class': ['bookmark']}),
        dict(name='div', attrs={'class': ['links clearfix']}),
        dict(name='div', attrs={'class': ['title']}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': ['title']}),
    ]

    feeds = [
        (u'Feeds', u'http://feeds.feedburner.com/Avantaje'),
    ]

    def preprocess_html(self, soup):
        """Return ``soup`` after passing it through ``adeify_images``."""
        return self.adeify_images(soup)

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
aventurilapescuit.ro
@ -9,43 +9,41 @@ aventurilapescuit.ro
from calibre.web.feeds.news import BasicNewsRecipe
class AventuriLaPescuit(BasicNewsRecipe):
    """Recipe for Aventuri La Pescuit (aventurilapescuit.ro); language code ``ro``."""

    title = u'Aventuri La Pescuit'
    __author__ = u'Silviu Cotoar\u0103'
    description = 'Aventuri La Pescuit'
    publisher = 'Aventuri La Pescuit'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Pescuit,Hobby'
    encoding = 'utf-8'
    cover_url = 'http://www.aventurilapescuit.ro/images/logo.gif'

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [
        dict(name='div', attrs={'id': 'Article'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['right option']}),
        dict(name='iframe', attrs={'scrolling': ['no']}),
    ]

    remove_tags_after = [
        dict(name='iframe', attrs={'scrolling': ['no']}),
    ]

    feeds = [
        (u'Feeds', u'http://www.aventurilapescuit.ro/sections/rssread/1'),
    ]

    def preprocess_html(self, soup):
        """Return ``soup`` after passing it through ``adeify_images``."""
        return self.adeify_images(soup)

View File

@ -4,44 +4,44 @@ __copyright__ = '2010, BlonG'
avto-magazin.si
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Dnevnik(BasicNewsRecipe):
    """Recipe for Avto Magazin (avto-magazin.si); language code ``sl``."""

    title = u'Avto Magazin'
    __author__ = u'BlonG'
    # The original literal embedded the raw UTF-8 bytes of the Slovenian
    # letters (\xc4\x8d and \xc5\xa1) inside a unicode string, which yields
    # four unrelated code points instead of two letters.  The proper code
    # points are U+010D (c with caron) and U+0161 (s with caron).
    description = u'Za avtomobilisti\u010dne frike, poznavalce in nedeljske \u0161oferje.'
    oldest_article = 7
    max_articles_per_feed = 20
    no_stylesheets = True
    use_embedded_content = False
    # A misspelt duplicate of this attribute ("labguage") used to sit above;
    # it was dropped because only the correctly spelt name is meaningful.
    language = 'sl'

    conversion_options = {'linearize_tables': True}

    cover_url = 'https://sites.google.com/site/javno2010/home/avto_magazin_cover.jpg'

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    keep_only_tags = [
        dict(name='div', attrs={'id': '_iprom_inStream'}),
        # dict(name='div', attrs={'class':'entry-content'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': 'voteConfirmation'}),
        dict(name='div', attrs={'id': 'InsideVote'}),
        dict(name='div', attrs={'class': 'Zone234'}),
        dict(name='div', attrs={'class': 'Comments'}),
        dict(name='div', attrs={'class': 'sorodneNovice'}),
        dict(name='div', attrs={'id': 'footer'}),
    ]

    feeds = [
        (u'Novice', u'http://www.avto-magazin.si/rss/'),
    ]

View File

@ -1,4 +1,4 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
axxon.com.ar
@ -6,35 +6,33 @@ axxon.com.ar
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Axxon_news(BasicNewsRecipe):
title = 'Revista Axxon'
__author__ = 'Darko Miletic'
description = 'Axxon, Ciencia Ficcion en Bits'
publisher = 'Revista Axxon - Ciencia Ficcion'
category = 'SF, Argentina'
oldest_article = 31
delay = 1
title = 'Revista Axxon'
__author__ = 'Darko Miletic'
description = 'Axxon, Ciencia Ficcion en Bits'
publisher = 'Revista Axxon - Ciencia Ficcion'
category = 'SF, Argentina'
oldest_article = 31
delay = 1
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
language = 'es_AR'
encoding = 'utf-8'
publication_type = 'magazine'
INDEX = 'http://axxon.com.ar/rev/'
extra_css = ' body{font-family: Verdana,Arial,sans-serif} .editorial{font-family: serif} .posttitle{font-family: "Trebuchet MS","Lucida Grande",Verdana,Arial,sans-serif} .cuento{font-family: "Times New Roman", serif} .biografia{color: red; font-weight: bold; font-family: Verdana,Geneva,Arial,Helvetica,sans-serif} '
no_stylesheets = False
use_embedded_content = False
language = 'es_AR'
encoding = 'utf-8'
publication_type = 'magazine'
INDEX = 'http://axxon.com.ar/rev/'
extra_css = ' body{font-family: Verdana,Arial,sans-serif} .editorial{font-family: serif} .posttitle{font-family: "Trebuchet MS","Lucida Grande",Verdana,Arial,sans-serif} .cuento{font-family: "Times New Roman", serif} .biografia{color: red; font-weight: bold; font-family: Verdana,Geneva,Arial,Helvetica,sans-serif} ' # noqa
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
keep_only_tags = [dict(name='div', attrs={'class':'post'})]
remove_tags = [dict(name=['object','link','iframe','embed','img'])]
remove_tags_after = [dict(attrs={'class':['editorial','correo','biografia','articulo']})]
remove_attributes = ['width','height','font','border','align']
keep_only_tags = [dict(name='div', attrs={'class': 'post'})]
remove_tags = [dict(name=['object', 'link', 'iframe', 'embed', 'img'])]
remove_tags_after = [
dict(attrs={'class': ['editorial', 'correo', 'biografia', 'articulo']})]
remove_attributes = ['width', 'height', 'font', 'border', 'align']
def parse_index(self):
articles = []
@ -44,21 +42,16 @@ class Axxon_news(BasicNewsRecipe):
description = ''
title_prefix = ''
feed_link = item.find('a')
if feed_link and feed_link.has_key('href') and feed_link['href'].startswith('?p='):
url = self.INDEX + feed_link['href']
if feed_link and feed_link.has_key('href') and feed_link['href'].startswith('?p='): # noqa
url = self.INDEX + feed_link['href']
title = title_prefix + self.tag_to_string(feed_link)
date = strftime(self.timefmt)
date = strftime(self.timefmt)
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
'title': title, 'date': date, 'url': url, 'description': description
})
return [(soup.head.title.string, articles)]
def preprocess_html(self, soup):
    """Drop every inline ``style`` attribute, then run ``adeify_images``.

    :param soup: parsed document exposing ``findAll``.
    :return: whatever ``self.adeify_images(soup)`` returns.
    """
    styled_tags = soup.findAll(style=True)
    for tag in styled_tags:
        del tag['style']
    cleaned = self.adeify_images(soup)
    return cleaned

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python2
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
axxon.com.ar
@ -8,55 +8,50 @@ axxon.com.ar
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Axxon_news(BasicNewsRecipe):
    """Recipe for the Axxon noticias feed (axxon.com.ar)."""

    title = 'Axxon noticias'
    __author__ = 'Darko Miletic'
    description = 'Axxon, Ciencia Ficcion en Bits'
    publisher = 'Axxon'
    category = 'news, SF, Argentina, science, movies'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = False
    use_embedded_content = False
    language = 'es_AR'

    # Hyphenated form of the language code; written into the HTML by
    # preprocess_html and passed to the converter below.
    lang = 'es-AR'

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': lang,
        'pretty_print': True,
    }

    keep_only_tags = [dict(name='div', attrs={'class': 'post'})]

    remove_tags = [dict(name=['object', 'link', 'iframe', 'embed'])]

    feeds = [(u'Noticias', u'http://axxon.com.ar/noticias/feed/')]

    remove_attributes = ['style', 'width', 'height', 'font', 'border', 'align']

    def adeify_images2(cls, soup):
        """Strip layout attributes from images and add a ``<br>`` after each.

        If an image's parent is an ``<a>`` element, the parent is renamed to
        ``<p>``.  The first parameter is named ``cls`` but the method is
        invoked on an instance (see ``preprocess_html``); the name is kept
        so the signature stays unchanged.

        :param soup: parsed document.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll('img'):
            for attrib in ['height', 'width', 'border', 'align', 'style']:
                if item.has_key(attrib):  # noqa
                    del item[attrib]
            oldParent = item.parent
            if oldParent.name == 'a':
                # Was "oldParent.name == 'p'": a comparison whose result was
                # thrown away, so the rename never happened.
                oldParent.name = 'p'
            myIndex = oldParent.contents.index(item)
            brtag = Tag(soup, 'br')
            oldParent.insert(myIndex + 1, brtag)
        return soup

    def preprocess_html(self, soup):
        """Tag the document with ``self.lang`` and clean up its images.

        Sets ``xml:lang`` and ``lang`` on the ``<html>`` element, inserts a
        ``Content-Language`` meta tag as its first child, then returns the
        result of ``adeify_images2``.
        """
        soup.html['xml:lang'] = self.lang
        soup.html['lang'] = self.lang
        mlang = Tag(soup, 'meta', [
            ("http-equiv", "Content-Language"),
            ("content", self.lang),
        ])
        soup.html.insert(0, mlang)
        return self.adeify_images2(soup)

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
azstarnet.com
@ -7,53 +7,47 @@ azstarnet.com
import urllib
from calibre.web.feeds.news import BasicNewsRecipe
class Azstarnet(BasicNewsRecipe):
title = 'Arizona Daily Star'
__author__ = 'Darko Miletic'
description = 'news from Arizona'
language = 'en'
publisher = 'azstarnet.com'
category = 'news, politics, Arizona, USA'
oldest_article = 3
title = 'Arizona Daily Star'
__author__ = 'Darko Miletic'
description = 'news from Arizona'
language = 'en'
publisher = 'azstarnet.com'
category = 'news, politics, Arizona, USA'
oldest_article = 3
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://azstarnet.com/content/tncms/live/global/resources/images/logo.gif'
needs_subscription = True
no_stylesheets = True
use_embedded_content = False
encoding = 'utf-8'
masthead_url = 'http://azstarnet.com/content/tncms/live/global/resources/images/logo.gif'
needs_subscription = True
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
}
def get_browser(self):
    """Return a browser that has visited azstarnet.com and, if possible, logged in.

    The front page is always opened.  When both ``self.username`` and
    ``self.password`` are set, the login fields are url-encoded and sent
    to the registration proxy.
    """
    br = BasicNewsRecipe.get_browser(self)
    br.open('http://azstarnet.com/')
    if self.username is None or self.password is None:
        return br
    login_fields = {
        'm': 'login',
        'u': self.username,
        'p': self.password,
        'z': 'http://azstarnet.com/',
    }
    data = urllib.urlencode(login_fields)
    br.open('http://azstarnet.com/app/registration/proxy.php', data)
    return br
remove_tags = [dict(name=['object','link','iframe','base','img'])]
remove_tags = [dict(name=['object', 'link', 'iframe', 'base', 'img'])]
feeds = [
(u'Local News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/local&l=25&s=start_time&sd=desc')
,(u'National News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/national&l=25&s=start_time&sd=desc')
,(u'World News' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/world&l=25&s=start_time&sd=desc')
,(u'Sports' , u'http://azstarnet.com/search/?f=rss&t=article&c=sports&l=25&s=start_time&sd=desc')
,(u'Opinion' , u'http://azstarnet.com/search/?f=rss&t=article&c=news/opinion&l=25&s=start_time&sd=desc')
,(u'Movies' , u'http://azstarnet.com/search/?f=rss&t=article&c=entertainment/movies&l=25&s=start_time&sd=desc')
,(u'Food' , u'http://azstarnet.com/search/?f=rss&t=article&c=lifestyles/food-and-cooking&l=25&s=start_time&sd=desc')
]
(u'Local News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/local&l=25&s=start_time&sd=desc'),
(u'National News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/national&l=25&s=start_time&sd=desc'),
(u'World News', u'http://azstarnet.com/search/?f=rss&t=article&c=news/world&l=25&s=start_time&sd=desc'),
(u'Sports', u'http://azstarnet.com/search/?f=rss&t=article&c=sports&l=25&s=start_time&sd=desc'),
(u'Opinion', u'http://azstarnet.com/search/?f=rss&t=article&c=news/opinion&l=25&s=start_time&sd=desc'),
(u'Movies', u'http://azstarnet.com/search/?f=rss&t=article&c=entertainment/movies&l=25&s=start_time&sd=desc'),
(u'Food', u'http://azstarnet.com/search/?f=rss&t=article&c=lifestyles/food-and-cooking&l=25&s=start_time&sd=desc')
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
@ -62,4 +56,3 @@ class Azstarnet(BasicNewsRecipe):
def print_version(self, url):
    """Return ``url`` with the ``?print=1`` query string appended."""
    print_suffix = '?print=1'
    return url + print_suffix

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
b365.realitatea.net
@ -8,45 +8,40 @@ b365.realitatea.net
from calibre.web.feeds.news import BasicNewsRecipe
class b365Realitatea(BasicNewsRecipe):
    """Recipe for b365 Realitatea (b365.realitatea.net); language code ``ro``."""

    title = u'b365 Realitatea'
    __author__ = u'Silviu Cotoar\u0103'
    publisher = u'b365 Realitatea'
    description = u'b365 Realitatea'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Romania,Bucuresti'
    encoding = 'utf-8'
    cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png'

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': 'newsArticle'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'date'}),
        # Was name='dic', which is not an HTML element, so this entry could
        # never match; 'div' is what the neighbouring entries use.
        dict(name='div', attrs={'class': 'addthis_toolbox addthis_default_style'}),
        dict(name='div', attrs={'class': 'related_posts'}),
        dict(name='div', attrs={'id': 'RelevantiWidget'}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id': 'RelevantiWidget'}),
    ]

    feeds = [
        (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/'),
    ]

    def preprocess_html(self, soup):
        """Return ``soup`` after passing it through ``adeify_images``."""
        return self.adeify_images(soup)

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
@ -7,63 +7,63 @@ b92.net
import re
from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe):
    """Recipe for the B92 RSS feeds (b92.net); language code ``sr``."""

    title = 'B92'
    __author__ = 'Darko Miletic'
    description = 'Najnovije vesti iz Srbije, regiona i sveta, aktuelne teme iz sveta politike, ekonomije, drustva, foto galerija, kolumne'
    publisher = 'B92'
    category = 'news, politics, Serbia'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1250'
    language = 'sr'
    publication_type = 'newsportal'
    masthead_url = 'http://b92s.net/v4/img/new-logo.png'
    extra_css = """
        @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
        body{font-family: Arial,Helvetica,sans1,sans-serif}
        .article-info2,.article-info1{text-transform: uppercase; font-size: small}
        img{display: block}
        .sms{font-weight: bold}
    """

    # Conversion metadata reuses the class attributes defined above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    # First rule maps U+0110 to U+00D0; second rule swaps everything from
    # <html up to the first <body> for a minimal fixed head.
    preprocess_regexps = [
        (
            re.compile(u'\u0110'),
            lambda match: u'\u00D0',
        ),
        (
            re.compile(r'<html.*?<body>', re.DOTALL | re.IGNORECASE),
            lambda match: '<html><head><title>something</title></head><body>',
        ),
    ]

    keep_only_tags = [
        dict(attrs={'class': ['article-info1', 'article-text']}),
    ]
    remove_attributes = [
        'width',
        'height',
        'align',
        'hspace',
        'vspace',
        'border',
        'lang',
        'xmlns:fb',
    ]
    remove_tags = [
        dict(name=['embed', 'link', 'base', 'meta', 'iframe']),
        dict(attrs={'id': 'social'}),
    ]

    feeds = [
        (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml'),
        (u'Biz', u'http://www.b92.net/info/rss/biz.xml'),
        (u'Sport', u'http://www.b92.net/info/rss/sport.xml'),
        (u'Zivot', u'http://www.b92.net/info/rss/zivot.xml'),
        (u'Kultura', u'http://www.b92.net/info/rss/kultura.xml'),
        (u'Automobili', u'http://www.b92.net/info/rss/automobili.xml'),
        (u'Tehnopolis', u'http://www.b92.net/info/rss/tehnopolis.xml'),
    ]

    def preprocess_html(self, soup):
        """Remove inline styles and replace simple links with their text.

        Every ``style`` attribute is deleted.  Each ``<a>`` element whose
        ``string`` is not ``None`` is replaced by that string; other links
        are left untouched.

        :return: the same ``soup`` object, modified in place.
        """
        for styled in soup.findAll(style=True):
            del styled['style']
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup

Some files were not shown because too many files have changed in this diff Show More