commit 1028383a93
Author: Sengian
Date:   2012-01-25 22:34:50 +01:00

724 changed files with 304073 additions and 156411 deletions


@@ -2,6 +2,7 @@
 .check-cache.pickle
 src/calibre/plugins
 resources/images.qrc
+src/calibre/ebooks/oeb/display/test/*.js
 src/calibre/manual/.build/
 src/calibre/manual/cli/
 src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
 resources/builtin_recipes.xml
 resources/builtin_recipes.zip
 resources/template-functions.json
+resources/display/*.js
 setup/installer/windows/calibre/build.log
 src/calibre/translations/.errors
 src/cssutils/.svn/

(File diff suppressed because it is too large.)

(File diff suppressed because it is too large.)


@@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Dean Cording'
+__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
 '''
 abc.net.au/news
 '''
@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class ABCNews(BasicNewsRecipe):
     title = 'ABC News'
-    __author__ = 'Dean Cording'
+    __author__ = 'Pat Stapleton, Dean Cording'
     description = 'News from Australia'
     masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
     cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
     category = 'News, Australia, World'
     language = 'en_AU'
     publication_type = 'newsportal'
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    # Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
+    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
     conversion_options = {
         'comments'  : description
         ,'tags'     : category
@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
         ,'linearize_tables': False
     }
 
-    keep_only_tags = dict(id='article')
-    remove_tags = [dict(attrs={'class':['related', 'tags']}),
-                   dict(id='statepromo')
-                  ]
+    keep_only_tags = [dict(attrs={'class':['article section']})]
+    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
+                   'inline-content story left', 'inline-content map left contracted', 'published',
+                   'story-map', 'statepromo', 'topics', ]})]
     remove_attributes = ['width','height']
 
     feeds = [
-        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
-        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
-        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
-        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
-        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
-        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
-        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
-        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
-        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
-        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
+        ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
+        ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
+        ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
+        ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
+        ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
+        ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
+        ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
+        ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
+        ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
+        ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
    ]
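
A note on the preprocess_regexps change above: calibre applies each (regex, substitution) pair to the raw HTML of every article before it is parsed. A minimal standalone sketch of what the new rule does; the sample HTML here is invented for illustration:

import re

# The (pattern, substitution) pair this commit adds, tried outside calibre.
pattern = re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL)

# Invented sample: a caption-styled link pointing at maps.google.com.
sample = ('<p>Flood levels peaked overnight.</p>'
          '<a class="inline-caption" href="http://maps.google.com/?q=-27.5,153.0">Map</a>')

cleaned = pattern.sub('', sample)
print(cleaned)  # -> '<p>Flood levels peaked overnight.</p>'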


@@ -1,19 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
 
 class Adventure_zone(BasicNewsRecipe):
     title = u'Adventure Zone'
     __author__ = 'fenuks'
     description = 'Adventure zone - adventure games from A to Z'
     category = 'games'
     language = 'pl'
-    oldest_article = 15
-    max_articles_per_feed = 100
     no_stylesheets = True
+    oldest_article = 20
+    max_articles_per_feed = 100
+    use_embedded_content = False
+    preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
     remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
-    remove_tags_after = dict(name='td', attrs={'class':'main-body middle-border'})
+    remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
+    remove_tags_after = dict(id='comments')
     extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
     feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
 
+    def parse_feeds(self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
+        tag = soup.find(name='channel')
+        titles = []
+        for r in tag.findAll(name='image'):
+            r.extract()
+        art = tag.findAll(name='item')
+        for i in art:
+            titles.append(i.title.string)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                article.title = titles[feed.articles.index(article)]
+        return feeds
+
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover = soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
     def skip_ad_pages(self, soup):
-        skip_tag = soup.body.findAll(name='a')
-        if skip_tag is not None:
-            for r in skip_tag:
-                if 'articles.php?' in r['href']:
-                    if r.strong is not None:
-                        word = r.strong.string
-                        if ('zapowied' or 'recenzj') in word:
-                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
-                        else:
-                            None
-
-    def print_version(self, url):
-        return url.replace('news.php?readmore', 'print.php?type=N&item_id')
+        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
+        skip_tag = skip_tag.findAll(name='a')
+        for r in skip_tag:
+            if r.strong:
+                word = r.strong.string
+                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
+                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
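
The parse_feeds override added above re-reads the raw RSS and copies the <item> titles onto the parsed articles by position. A standalone sketch of that pairing logic, with plain objects standing in for calibre's Feed/Article classes (all names here are illustrative only):

# Titles scraped from the raw RSS <item> elements, in document order.
rss_titles = ['Zapowiedz: Nowa przygodowka', 'Recenzja: Stary klasyk']

class Article(object):
    """Stand-in for calibre's article object; only 'title' matters here."""
    def __init__(self, title):
        self.title = title

articles = [Article('truncated title 1'), Article('truncated title 2')]

# Positional pairing, as in the recipe: article i gets RSS title i.
# Note this silently mis-titles articles if the two lists ever diverge.
for i, article in enumerate(articles):
    article.title = rss_titles[i]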


@@ -0,0 +1,50 @@
+__license__   = 'GPL v3'
+__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
+'''
+abc.net.au/news
+'''
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class TheDailyNewsEG(BasicNewsRecipe):
+    title = u'al-masry al-youm'
+    __author__ = 'Omm Mishmishah'
+    description = 'Independent News from Egypt'
+    masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
+    cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
+
+    auto_cleanup = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    #delay = 1
+    use_embedded_content = False
+    encoding = 'utf8'
+    publisher = 'Independent News Egypt'
+    category = 'News, Egypt, World'
+    language = 'en_EG'
+    publication_type = 'newsportal'
+    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    # Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
+    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': False
+    }
+
+    keep_only_tags = [dict(attrs={'class':['article section']})]
+    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
+                   'inline-content story left', 'inline-content map left contracted', 'published',
+                   'story-map', 'statepromo', 'topics', ]})]
+    remove_attributes = ['width','height']
+
+    feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
+             (u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
+             (u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
+             (u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
+    ]


@@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
         self.temp_files[-1].write(html)
         self.temp_files[-1].close()
         return self.temp_files[-1].name
+
+    conversion_options = {'linearize_tables': True}


@@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
     language = 'en'
     no_stylesheets = True
     max_articles_per_feed = 15
-    html2lrf_options = ['--force-page-break-before-tag="chapter"']
 
     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in


@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class AstroNEWS(BasicNewsRecipe):
     title = u'AstroNEWS'
     __author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    auto_cleanup = True
+    #extra_css = 'table {text-align: left;}'
+    no_stylesheets = True
     cover_url = 'http://news.astronet.pl/img/logo_news.jpg'
-    # no_stylesheets = True
+    remove_tags = [dict(name='hr')]
     feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
 
     def print_version(self, url):
         return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(align=True):
+            del item['align']
+        return soup
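
The preprocess_html hook added here strips align attributes from the parsed document before conversion. The same idea as a standalone BeautifulSoup snippet; the sample markup is invented, and in a recipe calibre hands you the soup instead of you parsing it:

from bs4 import BeautifulSoup

html = '<table align="center"><tr><td align="right">42</td></tr></table>'
soup = BeautifulSoup(html, 'html.parser')

# findAll(align=True) selects every tag that has an 'align' attribute.
for item in soup.findAll(align=True):
    del item['align']

print(soup)  # no align attributes remain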


@@ -1,61 +1,648 @@
-__license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+##
+## Title:       BBC News, Sport, and Blog Calibre Recipe
+## Contact:     mattst - jmstanfield@gmail.com
+##
+## License:     GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+## Copyright:   mattst - jmstanfield@gmail.com
+##
+## Written:     November 2011
+## Last Edited: 2011-11-19
+##
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+__copyright__ = 'mattst - jmstanfield@gmail.com'
+
 '''
-news.bbc.co.uk
+BBC News, Sport, and Blog Calibre Recipe
 '''
+
+# Import the regular expressions module.
 import re
+
+# Import the BasicNewsRecipe class which this class extends.
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
-class BBC(BasicNewsRecipe):
+class BBCNewsSportBlog(BasicNewsRecipe):
-    title = 'BBC News'
-    __author__ = 'Darko Miletic, Starson17'
-    description = 'News from UK. '
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    #delay = 1
-    use_embedded_content = False
-    encoding = 'utf8'
-    publisher = 'BBC'
-    category = 'news, UK, world'
-    language = 'en_GB'
-    publication_type = 'newsportal'
-    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-    conversion_options = {
-        'comments'  : description
-        ,'tags'     : category
-        ,'language' : language
-        ,'publisher': publisher
-        ,'linearize_tables': True
-    }
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['layout-block-a layout-block']})
-        ,dict(attrs={'class':['story-body','storybody']})
-    ]
-    remove_tags = [
-        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
-            'story-feature wide ', 'story-feature narrow']}),
-        dict(id=['hypertab', 'comment-form']),
-    ]
-    remove_attributes = ['width','height']
+
+    #
+    #    **** IMPORTANT USERS READ ME ****
+    #
+    #  First select the feeds you want then scroll down below the feeds list
+    #  and select the values you want for the other user preferences, like
+    #  oldest_article and such like.
+    #
+    #
+    #  Select the BBC rss feeds which you want in your ebook.
+    #  Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'.
+    #
+    #  Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
+    #  Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
+    #
+    # There are 68 feeds below which constitute the bulk of the available rss
+    # feeds on the BBC web site. These include 5 blogs by editors and
+    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
+    # Wales, Scotland Business), and 7 Welsh language feeds.
+    #
+    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
+    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
+    # you may get some 'empty feeds' which will not then be included in the ebook.
+    #
+    # The 15 feeds currently selected below are simply my default ones.
+    #
+    # Note: With all 68 feeds selected, oldest_article set to 2,
+    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
+    # the ebook creation took 29 minutes on my speedy 100 mbps net connection
+    # and fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
+    # More realistically with 15 feeds selected, oldest_article set to 1.5,
+    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
+    # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
+    #
+    # Select / de-select the feeds you want in your ebook.
+    #
     feeds = [
-        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
-        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
-        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
-        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
-        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
-        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
-        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
-        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
-        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
-        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
-        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
-        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
-        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
+        ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
+        ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
+        ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
+        #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
+        #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
+        #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
+        #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
+        #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
+        #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
+        #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
+        #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
+        #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
+        ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
+        ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
+        ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
+        ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
+        ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
+        ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
+        #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
+        #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
+        ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
+        ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
+        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
+        #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
+        #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
+        ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
+        #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
+        #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
+        #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
+        ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
+        ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
+        #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
+        #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
+        #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
+        #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
+        #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
+        #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
+        #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
+        #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
+        #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
+        #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
+        #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
+        #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
+        #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
+        #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
+        #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
+        #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
+        #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
+        #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
+        #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
+        #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
+        #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
+        #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
+        #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
+        #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
+        #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
+        #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
+        #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
+        #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
+        #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
+        #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
+        #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
+        #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
+        #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
+        #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
+        #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
+        #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
+        #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
     ]
+
+    # **** SELECT YOUR USER PREFERENCES ****
+
+    # Title to use for the ebook.
+    #
+    title = 'BBC News'
+
+    # A brief description for the ebook.
+    #
+    description = u'BBC web site ebook created using rss feeds.'
+
+    # The max number of articles which may be downloaded from each feed.
+    # I've never seen more than about 70 articles in a single feed in the
+    # BBC feeds.
+    #
+    max_articles_per_feed = 100
+
+    # The max age of articles which may be downloaded from each feed. This is
+    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
+    # half days). My default of 1.5 days is the last 36 hours, the point at
+    # which I've decided 'news' becomes 'old news', but be warned this is not
+    # so good for the blogs, technology, magazine, etc., and sports feeds.
+    # You may wish to extend this to 2-5 but watch out, ebook creation time will
+    # increase as well. Setting this to 30 will get everything (AFAICT) as long
+    # as max_articles_per_feed remains set high (except for 'Click' which is
+    # v. low volume and its currently oldest article is 4th Feb 2011).
+    #
+    oldest_article = 1.5
+
+    # Number of simultaneous downloads. 20 is consistently working fine on the
+    # BBC News feeds with no problems. Speeds things up from the default of 5.
+    # If you have a lot of feeds and/or have increased oldest_article above 2
+    # then you may wish to try increasing simultaneous_downloads to 25-30,
+    # Or, of course, if you are in a hurry. [I've not tried beyond 20.]
+    #
+    simultaneous_downloads = 20
+
+    # Timeout for fetching files from the server in seconds. The default of
+    # 120 seconds seems somewhat excessive.
+    #
+    timeout = 30
+
+    # The format string for the date shown on the ebook's first page.
+    # List of all values: http://docs.python.org/library/time.html
+    # Default in news.py has a leading space so that's mirrored here.
+    # As with 'feeds' select/de-select by adding/removing the initial '#',
+    # only one timefmt should be selected, here's a few to choose from.
+    #
+    timefmt = ' [%a, %d %b %Y]'            # [Fri, 14 Nov 2011] (Calibre default)
+    #timefmt = ' [%a, %d %b %Y %H:%M]'     # [Fri, 14 Nov 2011 18:30]
+    #timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]
+    #timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
+    #timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18.30]
+    #timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
+    #timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]
+
+    #
+    # **** IMPORTANT ****
+    #
+    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
+    #
+    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
+    #
+    # I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
+    #
+    # **** IMPORTANT ****
+    #
+
+    # Author of this recipe.
+    __author__ = 'mattst'
+
+    # Specify English as the language of the RSS feeds (ISO-639 code).
+    language = 'en_GB'
+
+    # Set tags.
+    tags = 'news, sport, blog'
+
+    # Set publisher and publication type.
+    publisher = 'BBC'
+    publication_type = 'newspaper'
+
+    # Disable stylesheets from site.
+    no_stylesheets = True
+
+    # Specifies an override encoding for sites that have an incorrect charset
+    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
+    # use 'utf8', which works fine (so use that if necessary) but auto-detecting
+    # with None is working fine, so stick with that for robustness.
+    encoding = None
+
+    # Sets whether a feed has full articles embedded in it. The BBC feeds do not.
+    use_embedded_content = False
+
+    # Removes empty feeds - why keep them!?
+    remove_empty_feeds = True
+
+    # Create a custom title which fits nicely in the Kindle title list.
+    # Requires "import time" above class declaration, and replacing
+    # title with custom_title in conversion_options (right column only).
+    # Example of string below: "BBC News - 14 Nov 2011"
+    #
+    # custom_title = "BBC News - " + time.strftime('%d %b %Y')
+
+    '''
+    # Conversion options for advanced users, but don't forget to comment out the
+    # current conversion_options below. Avoid setting 'linearize_tables' as that
+    # plays havoc with the 'old style' table based pages.
+    #
+    conversion_options = { 'title'       : title,
+                           'comments'    : description,
+                           'tags'        : tags,
+                           'language'    : language,
+                           'publisher'   : publisher,
+                           'authors'     : publisher,
+                           'smarten_punctuation' : True
+                         }
+    '''
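
To use the custom_title option described in the comments above, the recipe would change along these lines. This is a sketch of the author's own suggestion, not part of the committed code:

import time  # placed above the class declaration, as the comment says

custom_title = 'BBC News - ' + time.strftime('%d %b %Y')

# ...and in conversion_options, 'title' maps to custom_title instead:
conversion_options = { 'title'               : custom_title,
                       'smarten_punctuation' : True }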
+
+    conversion_options = { 'smarten_punctuation' : True }
+
+    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
+                 .introduction, .first { font-weight: bold; } \
+                 .cross-head { font-weight: bold; font-size: 125%; } \
+                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
+                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
+                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
+                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
+                 text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
+                 .story-date, .published { font-size: 80%; } \
+                 table { width: 100%; } \
+                 td img { display: block; margin: 5px auto; } \
+                 ul { padding-top: 10px; } \
+                 ol { padding-top: 10px; } \
+                 li { padding-top: 5px; padding-bottom: 5px; } \
+                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
+                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
+                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
+                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
+
+    # Remove various tag attributes to improve the look of the ebook pages.
+    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
+    # cause a section of the ebook to start in an unsightly fashion or, more
+    # frequently, a "<br />" will muck up the formatting of a correspondent's byline.
+    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
+    # style of pages, and really spoil the look of the ebook pages.
+    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
+
+    # Create regular expressions for tag keeping and removal to make the matches more
+    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
+    # and trailing spaces, missing hyphens, and such like.
+    # Python regular expression ('re' class) page: http://docs.python.org/library/re.html
+
+    # ***************************************
+    # Regular expressions for keep_only_tags:
+    # ***************************************
+
+    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
+    # page which contains the main text of the article. Match storybody variants: 'storybody',
+    # 'story-body', 'story body', 'storybody ', etc.
+    storybody_reg_exp = '^.*story[_ -]*body.*$'
+
+    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
+    # and published date. This is one level above the usual news pages which have the title
+    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
+    # resulting in a lot of extra things to be removed by remove_tags.
+    blq_content_reg_exp = '^.*blq[_ -]*content.*$'
+
+    # The BBC has an alternative page design structure, which I suspect is an out-of-date
+    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
+    # (travel), and in some sport pages. These alternative pages are table based (which is
+    # why I think they are an out-of-date design) and account for - I'm guesstimating - less
+    # than 1% of all articles. They use a table class 'storycontent' to hold the article
+    # and like blq_content (above) have required lots of extra removal by remove_tags.
+    story_content_reg_exp = '^.*story[_ -]*content.*$'
+
+    # Keep the sections of the HTML which match the list below. The HTML page created by
+    # Calibre will fill <body> with those sections which are matched. Note that the
+    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
+    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
+    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
+    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
+    # will end up being discarded.
+    keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]
+
+    # ************************************
+    # Regular expressions for remove_tags:
+    # ************************************
+
+    # Regular expression to remove share-help and variant tags. The share-help class
+    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
+    # twitter, email. Removed to avoid page clutter.
+    share_help_reg_exp = '^.*share[_ -]*help.*$'
+
+    # Regular expression to remove embedded-hyper and variant tags. This class is used to
+    # display links to other BBC News articles on the same/similar subject.
+    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
+
+    # Regular expression to remove hypertabs and variant tags. This class is used to
+    # display a tab bar at the top of an article which allows the user to switch to
+    # an article (viewed on the same page) providing further info., 'in depth' analysis,
+    # an editorial, a correspondent's blog entry, and such like. The ability to handle
+    # a tab bar of this nature is currently beyond the scope of this recipe and
+    # possibly of Calibre itself (not sure about that - TO DO - check!).
+    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
+
+    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
+    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
+    # This class is used to add additional info. boxes, or small lists, outside of
+    # the main story. TO DO: Work out a way to incorporate these neatly.
+    story_feature_reg_exp = '^.*story[_ -]*feature.*$'
+
+    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
+    # 'videoInStoryC'. This class is used to embed video.
+    video_reg_exp = '^.*video.*$'
+
+    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
+    # This class is used to embed audio.
+    audio_reg_exp = '^.*audio.*$'
+
+    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
+    # This class is used to embed a photo slideshow. See also 'slideshow' below.
+    picture_gallery_reg_exp = '^.*picture.*$'
+
+    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
+    # This class is used to embed a slideshow (not necessarily photo) but both
+    # 'slideshow' and 'pictureGallery' are used for slideshows.
+    slideshow_reg_exp = '^.*slide[_ -]*show.*$'
+
+    # Regular expression to remove social-links and variant tags. This class is used to
+    # display links to a BBC bloggers main page, used in various columnists' blogs
+    # (Eg. Nick Robinson, Robert Peston).
+    social_links_reg_exp = '^.*social[_ -]*links.*$'
+
+    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
+    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
+    # removed by 'story-feature' removal (as they are usually within them), but
+    # not always. The quotation removed is always (AFAICT) in the article text
+    # as well but a 2nd copy is placed in a quote tag to draw attention to it.
+    # The quote class tags may or may not appear in div's.
+    quote_reg_exp = '^.*quote.*$'
+
+    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
+    # The purpose of these is unclear, they seem to be an internal link to a
+    # section within the article, but the text of the link (Eg. 'Continue reading
+    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
+    # The hidden class tags may or may not appear in div's.
+    hidden_reg_exp = '^.*hidden.*$'
+
+    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
+    # Used on the site to display text about registered users entering comments.
+    comment_reg_exp = '^.*comment.*$'
+
+    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
+    # Used on the site to allow registered BBC users to fill in forms, typically
+    # for entering comments about an article.
+    form_reg_exp = '^.*form.*$'
+
+    # Extra things to remove due to the addition of 'blq_content' in keep_only_tags.
+
+    # <div class="story-actions"> Used on sports pages for 'email' and 'print'.
+    story_actions_reg_exp = '^.*story[_ -]*actions.*$'
+
+    # <div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
+    # social networking links).
+    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
+
+    # <div id="secondary-content" class="content-group">
+    # NOTE: Don't remove class="content-group" - that is needed.
+    # Used on sports pages to link to 'similar stories'.
+    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
+
+    # <div id="featured-content" class="content-group">
+    # NOTE: Don't remove class="content-group" - that is needed.
+    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
+    featured_content_reg_exp = '^.*featured[_ -]*content.*$'
+
+    # <div id="navigation">
+    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
+    # Used sometimes instead of "featured-content" above.
+    navigation_reg_exp = '^.*navigation.*$'
+
+    # <a class="skip" href="#blq-container-inner">Skip to top</a>
+    # Used on sports pages to link to the top of the page.
+    skip_reg_exp = '^.*skip.*$'
+
+    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
+    # which are the alternative table design based pages. The purpose of some of these
+    # is not entirely clear from the pages (which are a total mess!).
+
+    # Remove mapping based tags, Eg. <map id="world_map">
+    # The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
+    map_reg_exp = '^.*map.*$'
+
+    # Remove social bookmarking variation, called 'socialBookMarks'.
+    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
+
+    # Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
+    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
+
+    # Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
+    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
+    # under the assumption that it can appear alone as well.
+    sharesb_reg_exp = '^.*sharesb.*$'
+
+    # Remove class 'o'. The worst named user created css class of all time. The creator
+    # should immediately be fired. I've seen it used to hold nothing at all but with
+    # 20 or so empty lines in it. Also to hold a single link to another article.
+    # Whatever it was designed to do it is not wanted by this recipe. Exact match only.
+    o_reg_exp = '^o$'
+
+    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
+    # use two reg expressions to make removing this (and variants) robust.
+    promo_top_reg_exp = '^.*promotopbg.*$'
+    promo_bottom_reg_exp = '^.*promobottombg.*$'
+
+    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
+    # risk of matching those letters in something needed, unless I see a variation
+    # of 'nlp' used at a later date.
+    nlp_reg_exp = '^nlp$'
+
+    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
+    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
+    # matching those letters in something needed.
+    mva_or_mvb_reg_exp = '^mv[ab]$'
+
+    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
+    mvtb_reg_exp = '^mvtb$'
+
+    # Remove 'blq-toplink', class to provide a link to the top of the page.
+    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
+
+    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
+    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
+    # use two reg expressions to make removing this (and variants) robust.
+    prods_services_01_reg_exp = '^.*servicev4.*$'
+    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
+
+    # Remove - what I think is - some kind of navigation tools helper class, though I am
+    # not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
+    # frequently and it is not wanted. Have decided to use two reg expressions to make
+    # removing this (and variants) robust.
+    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
+    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
+
+    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
+    # need removing - I have no clue what it does other than it contains links.
+    # Whatever it is - it is not part of the article and is not wanted.
+    puffbox_reg_exp = '^.*puffbox.*$'
+
+    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
+    sibtbg_reg_exp = '^.*sibtbg.*$'
+
+    # Remove 'storyextra' - links to relevant articles and external sites.
+    storyextra_reg_exp = '^.*story[_ -]*extra.*$'
+
+    remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
+                    dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
+                    dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
+                    dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
+                  ]
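
All of the keep/remove rules above rely on BeautifulSoup accepting a compiled regular expression wherever an attribute value is expected. A minimal demonstration of that matching behaviour, with invented markup:

import re
from bs4 import BeautifulSoup

html = ('<div class="story-body">kept</div>'
        '<div class="share-help tools">removed</div>')
soup = BeautifulSoup(html, 'html.parser')

# The same pattern style the recipe uses for 'storybody' variants.
storybody = re.compile('^.*story[_ -]*body.*$', re.IGNORECASE)

print(soup.find('div', attrs={'class': storybody}).string)  # -> kept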
+
+    # Uses url to create and return the 'printer friendly' version of the url.
+    # In other words the 'print this page' address of the page.
+    #
+    # There are 3 types of urls used in the BBC site's rss feeds. There is just
+    # 1 type for the standard news while there are 2 used for sports feed urls.
+    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
+    # there is a major story of interest to 'everyone'. So even if no BBC sports
+    # feeds are added to 'feeds' the logic of this method is still needed to avoid
+    # blank / missing / empty articles which have an index title and then no body.
+
+    def print_version(self, url):
+
+        # Handle sports page urls type 01:
+        if (url.find("go/rss/-/sport1/") != -1):
+            temp_url = url.replace("go/rss/-/", "")
+
+        # Handle sports page urls type 02:
+        elif (url.find("go/rss/int/news/-/sport1/") != -1):
+            temp_url = url.replace("go/rss/int/news/-/", "")
+
+        # Handle regular news page urls:
+        else:
+            temp_url = url.replace("go/rss/int/news/-/", "")
+
+        # Always add "?print=true" to the end of the url.
+        print_url = temp_url + "?print=true"
+
+        return print_url
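
For illustration, the URL rewriting in print_version behaves like this. The example URL below is made up, shaped the way the comments describe a sports feed link:

url = 'http://newsrss.bbc.co.uk/go/rss/-/sport1/hi/football/0000000.stm'

# Sports type 01: drop the rss redirect prefix, then request the print view.
print_url = url.replace('go/rss/-/', '') + '?print=true'
# -> 'http://newsrss.bbc.co.uk/sport1/hi/football/0000000.stm?print=true'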
+
+    # Remove articles in feeds based on a string in the article title or url.
+    #
+    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
+    # thread, in post with title: "Remove articles from feed", see url:
+    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
+    # Many thanks and all credit to Starson17.
+    #
+    # Starson17's code has obviously been altered to suit my requirements.
+
+    def parse_feeds(self):
+
+        # Call parent's method.
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        # Loop through all feeds.
+        for feed in feeds:
+
+            # Loop through all articles in feed.
+            for article in feed.articles[:]:
+
+                # Match key words and remove article if there's a match.
+
+                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
+                # as a title prefix. Just match upper case 'VIDEO', so that
+                # articles like 'Video game banned' won't be matched and removed.
+                if 'VIDEO' in article.title:
+                    feed.articles.remove(article)
+
+                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
+                # as a title prefix. Just match upper case 'AUDIO', so that
+                # articles like 'Hi-Def audio...' won't be matched and removed.
+                elif 'AUDIO' in article.title:
+                    feed.articles.remove(article)
+
+                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
+                # 'In pictures', and 'in pictures', somewhere in their title.
+                # Match any case of that phrase.
+                elif 'IN PICTURES' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # As above, but user contributed pictures. Match any case.
+                elif 'YOUR PICTURES' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # 'Sportsday Live' are articles which contain a constantly and
+                # dynamically updated 'running commentary' during a live sporting
+                # event. Match any case.
+                elif 'SPORTSDAY LIVE' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
+                # These are being matched below using 'Live - ' because removing all
+                # articles with 'live' in their titles would remove some articles
+                # that are in fact not live sports pages. Match any case.
+                elif 'LIVE - ' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
+                # the 'Quiz of the' part in anticipation of monthly and yearly
+                # variants. Match any case.
+                elif 'QUIZ OF THE' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # Remove articles with 'scorecards' in the url. These are BBC sports
+                # pages which just display a cricket scorecard. The pages have a mass
+                # of table and css entries to display the scorecards nicely. Probably
+                # could make them work with this recipe, but might take a whole day
+                # of work to sort out all the css - basically a formatting nightmare.
+                elif 'scorecards' in article.url:
+                    feed.articles.remove(article)
+
+        return feeds
+
+# End of class and file.
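
An aside on the title filtering above: the first two membership tests are deliberately case-sensitive, as the comments explain. A quick illustration outside calibre, with invented sample titles:

titles = ['VIDEO: Flooding hits Brisbane', 'Video game banned in court row']

# Upper-case 'VIDEO' matches only the video-only item, not the headline.
kept = [t for t in titles if 'VIDEO' not in t]
print(kept)  # -> ['Video game banned in court row']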


@@ -1,61 +1,44 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-import re
+
+'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
 
 class SportsIllustratedRecipe(BasicNewsRecipe) :
-    __author__ = 'ape'
-    __copyright__ = 'ape'
+    __author__ = 'a.peter'
+    __copyright__ = 'a.peter'
     __license__ = 'GPL v3'
     language = 'de'
-    description = 'Berliner Zeitung'
-    version = 2
+    description = 'Berliner Zeitung RSS'
+    version = 4
     title = u'Berliner Zeitung'
     timefmt = ' [%d.%m.%Y]'
+    #oldest_article = 7.0
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
     publication_type = 'newspaper'
-    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
-
-    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
-
-    def parse_index(self):
-        base = 'http://www.berlinonline.de'
-        answer = []
-        articles = {}
-        more = 1
-
-        soup = self.index_to_soup(self.INDEX)
-
-        # Get list of links to ressorts from index page
-        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
-        for ressort in ressort_list[0].findAll('a'):
-            feed_title = ressort.string
-            print 'Analyzing', feed_title
-            if not articles.has_key(feed_title):
-                articles[feed_title] = []
-                answer.append(feed_title)
-            # Load ressort page.
-            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
-            # find mainbar div which contains the list of all articles
-            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
-                # iterate over all articles
-                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
-                    # extract title of article
-                    if article_teaser.h3 != None:
-                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
-                        articles[feed_title].append(article)
-                    else:
-                        # Skip teasers for missing photos
-                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
-                            continue
-                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
-                        articles[feed_title].append(article)
-                        more += 1
-        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
-        return answer
+    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
+    remove_tags_after = [dict(id='article_text')]
+
+    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
+             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
+             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
+             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
+             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
+             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
+             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
+             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
+             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
+             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
+             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
+             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
+             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
+             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
+             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
 
     def get_masthead_url(self):
-        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
+        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
+
+    def print_version(self, url):
+        return url.replace('.html', ',view,printVersion.html')
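
The parse_index method removed here followed calibre's standard shape for index-driven recipes: return a list of (section, articles) pairs, where each article is a dict with title/url/date/description keys. A sketch of that return shape, with invented values, which the removed method built by scraping the berlinonline.de index pages:

def parse_index(self):
    # Minimal shape of what parse_index must return; values are placeholders.
    articles = [{'title': 'Beispielartikel', 'date': u'',
                 'url': 'http://www.example.com/artikel.html',
                 'description': u''}]
    return [('Politik', articles)]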

recipes/biamag.recipe (new file)

@@ -0,0 +1,38 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'BiaMag'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News from Turkey'
+    publisher = 'BiaMag'
+    category = 'news, politics, Turkey'
+    oldest_article = 15
+    max_articles_per_feed = 120
+    masthead_url = 'http://bianet.org/images/biamag_logo.gif'
+    language = 'tr'
+    no_stylesheets = True
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+
+    feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/biamag_en.recipe (new file)

@@ -0,0 +1,38 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'Bianet-English'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News Network from Turkey (English)'
+    publisher = 'Bianet'
+    category = 'news, politics, Turkey'
+    oldest_article = 7
+    max_articles_per_feed = 150
+    masthead_url = 'http://bianet.org/images/english_logo.gif'
+    language = 'en_TR'
+    no_stylesheets = True
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+
+    feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/bianet.recipe (new file)

@@ -0,0 +1,38 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'Bianet'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News from Turkey'
+    publisher = 'Bianet'
+    category = 'news, politics, Turkey'
+    oldest_article = 7
+    max_articles_per_feed = 120
+    masthead_url = 'http://bianet.org/images/bianet_logo.gif'
+    language = 'tr'
+    no_stylesheets = True
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+
+    feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]
+
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/biolog_pl.recipe (new file)

@@ -0,0 +1,19 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Biolog_pl(BasicNewsRecipe):
+    title = u'Biolog.pl'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_empty_feeds = True
+    __author__ = 'fenuks'
+    description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
+    category = 'biology'
+    language = 'pl'
+    cover_url = 'http://www.biolog.pl/naukowy,portal,biolog.png'
+    no_stylesheets = True
+    #keeps_only_tags = [dict(id='main')]
+    remove_tags_before = dict(id='main')
+    remove_tags_after = dict(name='a', attrs={'name':'komentarze'})
+    remove_tags = [dict(name='img', attrs={'alt':'Komentarze'})]
+    feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]


@@ -0,0 +1,50 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Birgun(BasicNewsRecipe):
+    title = u'Birgün Gazetesi'
+    __author__ = u'Osman Kaysan'
+    oldest_article = 7
+    max_articles_per_feed = 150
+    use_embedded_content = False
+    description = 'Birgun gazatesi haberleri, kose yazarlari'
+    publisher = 'Birgün'
+    category = 'news,haberler,turkce,gazete,birgun'
+    language = 'tr'
+    no_stylesheets = True
+    publication_type = 'newspaper'
+
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+
+    cover_img_url = 'http://www.birgun.net/i/birgun.png'
+    masthead_url = 'http://www.birgun.net/i/birgun.png'
+
+    remove_attributes = ['width','height']
+    remove_tags_before = dict(name='h2', attrs={'class':'storyHeadline'})
+    #remove_tags_after = dict(name='div', attrs={'class':'toollinks'})
+    remove_tags_after = dict(name='tr', attrs={'valign':'top'})
+    remove_tags = [ dict(name='div', attrs={'id':'byLine'}), dict(name='div', attrs={'class':'toollinks'})
+                  , dict(name='div', attrs={'class':'main-lead'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
+                  , dict(name='a', attrs={'class':'addthis_button'})]
+    remove_empty_feeds = True
+
+    feeds = [
+              ( u'Güncel', u'http://www.birgun.net/actuels.xml')
+             ,( u'Köşe Yazarları', u'http://www.birgun.net/writer.xml')
+             ,( u'Politika', u'http://www.birgun.net/politics.xml')
+             ,( u'Ekonomi', u'http://www.birgun.net/economic.xml')
+             ,( u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml')
+             ,( u'Dünya', u'http://www.birgun.net/worlds.xml')
+             ,( u'Yaşam', u'http://www.birgun.net/lifes.xml')
+            ]


@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'
masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: medium sans-serif;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

View File

@ -1,6 +1,6 @@
__license__ = 'GPL v3'
-__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
'''
blic.rs
'''
@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
    def print_version(self, url):
        return url + '/print'

-   def preprocess_html(self, soup):
-       for item in soup.findAll(style=True):
-           del item['style']
-       return soup
+   def get_cover_url(self):
+       soup = self.index_to_soup('http://www.blic.rs/')
+       alink = soup.find('a', attrs={'id':'blic_naslovna_print'})
+       if alink:
+           return 'http://www.blic.rs' + alink['href']
+       return None

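The get_cover_url() added above replaces a hardcoded cover with a scrape of the front page for the day's print-edition link. A hedged, generic sketch of that idiom (the site, anchor id, and fallback URL are placeholders):

def get_cover_url(self):
    # Look for the link whose id marks today's printed cover;
    # 'cover_link' and the URLs below are placeholders.
    soup = self.index_to_soup('http://example.com/')
    alink = soup.find('a', attrs={'id': 'cover_link'})
    if alink and alink.get('href'):
        return 'http://example.com' + alink['href']
    return 'http://example.com/static/masthead.png'  # fallback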
26
recipes/blues.recipe Normal file
View File

@ -0,0 +1,26 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BluesRSS(BasicNewsRecipe):
title = 'Blues News'
__author__ = 'Oskar Kunicki'
description ='Blues news from around the world'
publisher = 'BluesRSS.info'
category = 'news, blues, USA,UK'
oldest_article = 5
max_articles_per_feed = 100
language = 'en'
cover_url = 'http://bluesrss.info/cover.jpg'
masthead_url = 'http://bluesrss.info/cover.jpg'
no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]
feeds = [(u'News', u'http://bluesrss.info/feed/')]

View File

@ -10,30 +10,19 @@ http://www.buffalonews.com/RSS/
from calibre.web.feeds.news import BasicNewsRecipe

-class AdvancedUserRecipe1298680852(BasicNewsRecipe):
+class BuffaloNews(BasicNewsRecipe):
    title = u'Buffalo News'
    oldest_article = 2
    language = 'en'
-   __author__ = 'ChappyOnIce'
+   __author__ = 'ChappyOnIce, Krittika Goyal'
    max_articles_per_feed = 20
    encoding = 'utf-8'
    masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
-   remove_javascript = True
-   extra_css = 'body {text-align: justify;}\n \
-       p {text-indent: 20px;}'
-   keep_only_tags = [
-       dict(name='div', attrs={'class':['main-content-left']})
-   ]
-   remove_tags = [
-       dict(name='div', attrs={'id':['commentCount']}),
-       dict(name='div', attrs={'class':['story-list-links']})
-   ]
-   remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})
-   feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
+   auto_cleanup = True
+   remove_empty_feeds = True
+
+   feeds = [
+       (u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
        (u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
        (u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
        (u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
@ -56,3 +45,4 @@ class AdvancedUserRecipe1298680852(BasicNewsRecipe):
        (u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
        (u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
    ]

View File

@ -4,16 +4,16 @@
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
-catavencu.ro
+academiacatavencu.info
'''
from calibre.web.feeds.news import BasicNewsRecipe

-class Catavencu(BasicNewsRecipe):
+class AcademiaCatavencu(BasicNewsRecipe):
    title = u'Academia Ca\u0163avencu'
    __author__ = u'Silviu Cotoar\u0103'
    description = 'Tagma cum laude'
-   publisher = 'Catavencu'
+   publisher = u'Ca\u0163avencu'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
@ -21,7 +21,7 @@ class Catavencu(BasicNewsRecipe):
    use_embedded_content = False
    category = 'Ziare'
    encoding = 'utf-8'
-   cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
+   cover_url = 'http://www.academiacatavencu.info/images/logo.png'

    conversion_options = {
        'comments' : description
@ -31,22 +31,21 @@ class Catavencu(BasicNewsRecipe):
    }

    keep_only_tags = [
-       dict(name='ul', attrs={'class':'articles'})
+       dict(name='h1', attrs={'class':'art_title'}),
+       dict(name='div', attrs={'class':'art_text'})
    ]

    remove_tags = [
-       dict(name='div', attrs={'class':['tools']})
-       , dict(name='div', attrs={'class':['share']})
-       , dict(name='div', attrs={'class':['category']})
-       , dict(name='div', attrs={'id':['comments']})
+       dict(name='div', attrs={'class':['desp_m']})
+       , dict(name='div', attrs={'id':['tags']})
    ]

    remove_tags_after = [
-       dict(name='div', attrs={'id':'comments'})
+       dict(name='div', attrs={'class':['desp_m']})
    ]

    feeds = [
-       (u'Feeds', u'http://catavencu.ro/feed/rss')
+       (u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
    ]

    def preprocess_html(self, soup):

View File

@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
            del item['style']
        ad=soup.findAll('a')
        for r in ad:
-           if 'http://www.hustla.pl' in r['href']:
+           if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
                r.extract()
        gallery=soup.find('div', attrs={'class':'galleryFlash'})
        if gallery:

View File

@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
    remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
    no_stylesheets = True
-   preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+   preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                          (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                        ]

    def print_version(self, url):
        if url.find('news/article.php') >= 0:
@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
                title = self.tag_to_string(a)
                self.log('\tFound article: ', title, 'at', url)
                date = a.nextSibling
+               if re.search('cm', date):
+                   continue
                if (date is not None) and len(date)>2:
                    if not articles.has_key(date):
                        articles[date] = []
                    articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                    self.log('\t\tAppend to : ', date)

-       self.log('log articles', articles)
+       #self.log('log articles', articles)
        mostCurrent = sorted(articles).pop()
        self.title = 'CND ' + mostCurrent

72
recipes/cnd_weekly.recipe Normal file
View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class TheCND(BasicNewsRecipe):
title = 'CND Weekly'
__author__ = 'Derek Liang'
description = ''
INDEX = 'http://cnd.org'
language = 'zh'
conversion_options = {'linearize_tables':True}
remove_tags_before = dict(name='div', id='articleHead')
remove_tags_after = dict(id='copyright')
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]
def print_version(self, url):
    if url.find('news/article.php') >= 0:
        return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
    else:
        return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)

def parse_index(self):
    soup = self.index_to_soup(self.INDEX)
    feeds = []
    articles = {}
    for a in soup.findAll('a', attrs={'target':'_cnd'}):
        url = a['href']
        if url.find('article.php') < 0 :
            continue
        if url.startswith('/'):
            url = 'http://cnd.org'+url
        title = self.tag_to_string(a)
        date = a.nextSibling
        if not re.search('cm', date):
            continue
        self.log('\tFound article: ', title, 'at', url, '@', date)
        if (date is not None) and len(date)>2:
            if not articles.has_key(date):
                articles[date] = []
            articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
            self.log('\t\tAppend to : ', date)

    sorted_articles = sorted(articles)
    while sorted_articles:
        mostCurrent = sorted_articles.pop()
        self.title = 'CND ' + mostCurrent
        feeds.append((self.title, articles[mostCurrent]))
    return feeds

def populate_article_metadata(self, article, soup, first):
    header = soup.find('h3')
    self.log('header: ' + self.tag_to_string(header))
    pass

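The parse_index() above builds the structure calibre expects from an index-driven recipe: a list of (section_title, articles) tuples, where each article is a dict with at least 'title' and 'url' keys ('description' and 'date' may be empty strings). The recipe buckets links by the date string that follows them, then pops the lexically largest keys so the newest issues come first. A minimal sketch of the required return value (the story URL and date are placeholders):

def parse_index(self):
    articles = [{'title': 'Sample story',
                 'url': 'http://cnd.org/sample',  # placeholder
                 'description': '', 'date': ''}]
    return [('CND 12/01/20', articles)]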
View File

@ -0,0 +1,22 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Computerworld_pl(BasicNewsRecipe):
title = u'Computerworld.pl'
__author__ = 'fenuks'
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
def get_cover_url(self):
    soup = self.index_to_soup('http://www.computerworld.pl/')
    cover=soup.find(name='img', attrs={'class':'prawo'})
    self.cover_url=cover['src']
    return getattr(self, 'cover_url', self.cover_url)

View File

@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
#from calibre import __appname__
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Cosmopolitan UK'
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
preprocess_regexps = [
(re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'
keep_only_tags = [
dict(attrs={'class' : ['dateAuthor', 'publishDate']}),
dict(name='div',attrs ={'id' : ['main_content']})
]
remove_tags = [
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]
feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
def postprocess_html(self, soup, first):
    #process all the images
    for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
        iurl = tag['src']
        img = Image()
        img.open(iurl)
        if img < 0:
            raise RuntimeError('Out of memory')
        img.type = "GrayscaleType"
        img.save(iurl)
    return soup

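postprocess_html() runs after calibre has downloaded the article images locally, so each tag['src'] is a local file path that the ImageMagick wrapper can open and rewrite in place as grayscale. A hedged variant of the same loop that skips unreadable images rather than aborting the whole download (the try/except guard is an addition, not part of the recipe above):

def postprocess_html(self, soup, first):
    from calibre.utils.magick import Image
    for tag in soup.findAll('img', src=True):
        try:
            img = Image()
            img.open(tag['src'])        # local path at postprocess time
            img.type = 'GrayscaleType'  # convert in place to grayscale
            img.save(tag['src'])
        except Exception:
            continue  # leave the image untouched instead of failing
    return soup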
View File

@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    description = 'News as provide by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
-   # last updated 30/10/11
+   # last updated 26/12/11
    language = 'en_GB'

    cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
@ -13,30 +13,22 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

-   oldest_article = 2
-   max_articles_per_feed = 30
+   oldest_article = 1
+   max_articles_per_feed = 20
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
-   extra_css = '''
-       body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-   '''
-   keep_only_tags = [
-       dict(name='div',attrs={'id' : 'body-content'})
-   ]
-   remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
+   auto_cleanup = True

    remove_tags = [
-       dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
-       dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
-       dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
-       dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
+       dict(name='title'),
+       dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
    ]

    preprocess_regexps = [
-       (re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
-   preprocess_regexps = [
-       (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
+       (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]

    feeds = [
@ -53,5 +45,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
        ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
        # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
    ]
+   extra_css = '''
+       body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+       h1{ font-size:18px;}
+       img { display:block}
+   '''

View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DailyWritingTips(BasicNewsRecipe):
title = u'Daily Writing Tips'
language = 'en_GB'
__author__ = 'NotTaken'
oldest_article = 7 #days
max_articles_per_feed = 40
use_embedded_content = True
no_stylesheets = True
auto_cleanup = False
encoding = 'utf-8'
feeds = [
('Latest tips',
'http://feeds2.feedburner.com/DailyWritingTips'),
]

15
recipes/datasport.recipe Normal file
View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Italian soccer news website - v1.00 (17, December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324114272(BasicNewsRecipe):
title = u'Datasport'
language = 'it'
__author__ = 'faber1971'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]

View File

@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324913694(BasicNewsRecipe):
title = u'Derin Dusunce'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/')]

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Descopera(BasicNewsRecipe):
title = u'Descoperă.org'
__author__ = 'Marius Ignătescu'
description = 'Descoperă. Placerea de a cunoaște'
publisher = 'descopera.org'
category = 'science, technology, culture, history, earth'
language = 'ro'
oldest_article = 14
max_articles_per_feed = 100
encoding = 'utf8'
no_stylesheets = True
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['post']})]
remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
remove_attributes = ['width','height']
cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
feeds = [(u'Articles', u'http://www.descopera.org/feed/')]
def preprocess_html(self, soup):
    return self.adeify_images(soup)

View File

@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
            dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
            dict(name = 'div', attrs = {'class' : 'uniBox'}),
            dict(name = 'object', attrs = {}),
-           dict(name = 'h3', attrs = {})
+           dict(name = 'h3', attrs = {}),
+           dict(attrs={'class':'twitter-share-button'})
        ]

    preprocess_regexps = [
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
            (r'\s*</', lambda match: '</'),
        ]
    ]
+
+   def skip_ad_pages(self, soup):
+       if 'Advertisement' in soup.title:
+           nexturl = soup.find('a')['href']
+           return self.index_to_soup(nexturl, raw=True)

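The skip_ad_pages() hook added above handles interstitial ad pages: when the downloaded page's <title> contains 'Advertisement', the recipe follows the page's first link and returns its raw bytes, which calibre uses in place of the ad page; returning None keeps the page as downloaded. A generic sketch of the hook (the marker string and first-<a> heuristic are site-specific placeholders):

def skip_ad_pages(self, soup):
    title = soup.find('title')
    if title and 'Advertisement' in self.tag_to_string(title):
        # Follow the real article link instead of the interstitial.
        nexturl = soup.find('a')['href']
        return self.index_to_soup(nexturl, raw=True)
    # Falling through (returning None) keeps the original page.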
View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324736687(BasicNewsRecipe):
title = u'D\xfcnya Bizim'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 10
auto_cleanup = True
feeds = [(u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'), (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'), (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'), (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'), (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'), (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'), (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'), (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'), (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'), (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'), (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'), (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'), (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'), (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'), (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'), (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'), (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'), (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'), (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'), (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'), (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'), (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60')]

View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1321194347(BasicNewsRecipe):
title = u'D\xfcnya B\xfclteni'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 50
auto_cleanup = True
feeds = [(u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'), (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'), (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'), (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'), (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'), (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'), (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'), (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75')]

View File

@ -0,0 +1,58 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Dziennik_pl(BasicNewsRecipe):
title = u'Dziennik.pl'
__author__ = 'fenuks'
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
def append_page(self, soup, appendtag):
    tag=soup.find('a', attrs={'class':'page_next'})
    if tag:
        appendtag.find('div', attrs={'class':'article_paginator'}).extract()
        while tag:
            soup2= self.index_to_soup(tag['href'])
            tag=soup2.find('a', attrs={'class':'page_next'})
            if not tag:
                for r in appendtag.findAll('div', attrs={'class':'art_src'}):
                    r.extract()
            pagetext = soup2.find(name='div', attrs={'class':'article_body'})
            for dictionary in self.remove_tags:
                v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
                for delete in v:
                    delete.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        if appendtag.find('div', attrs={'class':'article_paginator'}):
            appendtag.find('div', attrs={'class':'article_paginator'}).extract()

def preprocess_html(self, soup):
    self.append_page(soup, soup.body)
    return soup

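append_page() above stitches a multi-page article into a single document: it follows each 'page_next' link, re-applies the recipe's remove_tags filters to the fetched page's article_body div, appends it to the first page, and finally deletes the paginator block. The core of the pattern in a minimal, hedged form (the class names are placeholders, not Dziennik's markup):

def append_page(self, soup, appendtag):
    # Follow 'next' links and merge each page's body into the first page.
    tag = soup.find('a', attrs={'class': 'next'})        # placeholder class
    while tag:
        soup2 = self.index_to_soup(tag['href'])
        pagetext = soup2.find('div', attrs={'class': 'article-body'})  # placeholder
        if pagetext is not None:
            appendtag.insert(len(appendtag.contents), pagetext)
        tag = soup2.find('a', attrs={'class': 'next'})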
View File

@ -0,0 +1,46 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u'Echo Online' # 2011-12-28 AGe
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-28 AGe
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'
oldest_article = 7
max_articles_per_feed = 50 # 2011-12-28 AGe
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]
def print_version(self, url):
    return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'
remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'

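The print_version() above first opens the article with open_novisit() and takes geturl(), resolving any server-side redirects before appending the site's print-layout query string; the cost is one extra HTTP request per article. The resolve-then-rewrite step in isolation (query string as used above):

def print_version(self, url):
    # Resolve redirects, then request the print layout of the final URL.
    final = self.browser.open_novisit(url).geturl()
    return final + '?_FRAME=33&_FORMAT=PRINT'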
View File

@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
'''
Fetch Edge.org conversations
'''
from calibre.web.feeds.news import BasicNewsRecipe
class EdgeConversationRSS(BasicNewsRecipe):
title = u'Edge.org Conversations'
__author__ = 'levien'
language = 'en'
description = '''Edge.org offers "open-minded, free ranging, intellectually
playful ... an unadorned pleasure in curiosity, a collective expression of
wonder at the living and inanimate world ... an ongoing and thrilling
colloquium.'''
oldest_article = 60
max_articles_per_feed = 100
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'class':'HomeLeftPannel IMGCTRL'}) ]
remove_tags = [
dict(name='div',attrs={'class':'Logo'})
]
feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]
def print_version(self, url):
    return url.replace('conversation/', 'conversation.php?cid=')

def parse_feeds(self):
    # Call parent's method.
    feeds = BasicNewsRecipe.parse_feeds(self)
    # Loop through all feeds.
    for feed in feeds:
        # Loop through all articles in feed.
        for article in feed.articles[:]:
            # Remove anything that is not a conversation, and remove PDF files as well...
            if not ('CONVERSATION' in article.title):
                feed.articles.remove(article)
            elif 'pdf' in article.url:
                feed.articles.remove(article)
    return feeds

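parse_feeds() above is a post-filter: it lets BasicNewsRecipe fetch and parse the feeds normally, then drops every article that is not a conversation or that points at a PDF. Note the feed.articles[:] slice, which iterates over a copy so that remove() on the live list cannot skip entries. Condensed, the same filter reads:

def parse_feeds(self):
    feeds = BasicNewsRecipe.parse_feeds(self)
    for feed in feeds:
        for article in feed.articles[:]:   # iterate over a copy
            if 'CONVERSATION' not in article.title or 'pdf' in article.url:
                feed.articles.remove(article)
    return feeds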
View File

@ -5,12 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.07'
-__date__ = '06, February 2011'
+__version__ = 'v0.08'
+__date__ = '13, November 2011'
'''
elperiodicodearagon.com
'''

-import re
from calibre.web.feeds.news import BasicNewsRecipe

@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
    description = u'Noticias desde Aragon'
    publisher = u'elperiodicodearagon.com'
    category = u'news, politics, Spain, Aragon'
-   oldest_article = 2
+   oldest_article = 1
    delay = 0
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
-   encoding = 'utf8'
+   encoding = 'iso-8859-1'
    remove_empty_feeds = True
    remove_javascript = True

@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
    }

    feeds = [
-       (u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
-       (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
-       (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
-       (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
-       (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
-       (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
-       (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
-       (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
-       (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
-       (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
+       (u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
+       (u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
+       (u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
+       (u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
+       (u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
+       (u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
+       (u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
+       (u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
+       (u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
+       (u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
+       (u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
+       (u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
+       (u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
+       (u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
+       (u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
+       (u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
    ]

-   extra_css = '''
-       h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
-       h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
-       h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
-       .columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
-       img{margin-bottom: 0.4em}
-   '''
    remove_attributes = ['height','width']

-   keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
+   keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]

-   # Remove all the clutter
-   remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
-       dict(name='span', attrs={'class':'MasInformacion '}),
-       dict(name='span', attrs={'class':'MasInformacion'}),
-       dict(name='div', attrs={'class':'Middle'}),
-       dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
-       dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
-       dict(name='div', attrs={'class':'MenuEquipo'}),
-       dict(name='div', attrs={'class':'TemasRelacionados'}),
-       dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
-       dict(name='div', attrs={'class':'Recorte'}),
-       dict(name='div', attrs={'id':'NoticiasenRecursos'}),
-       dict(name='div', attrs={'id':'NoticiaEnPapel'}),
-       dict(name='p', attrs={'class':'RecorteEnNoticias'}),
-       dict(name='div', attrs={'id':'Comparte'}),
-       dict(name='div', attrs={'id':'CajaComparte'}),
-       dict(name='a', attrs={'class':'EscribirComentario'}),
-       dict(name='a', attrs={'class':'AvisoComentario'}),
-       dict(name='div', attrs={'class':'CajaAvisoComentario'}),
-       dict(name='div', attrs={'class':'navegaNoticias'}),
-       dict(name='div', attrs={'class':'Mensaje'}),
-       dict(name='div', attrs={'id':'PaginadorDiCom'}),
-       dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
-       dict(name='div', attrs={'id':'CintilloComentario'}),
-       dict(name='div', attrs={'id':'EscribeComentario'}),
-       dict(name='div', attrs={'id':'FormularioComentario'}),
-       dict(name='div', attrs={'id':'FormularioNormas'})]

    # Fetch the print-edition front page (the format=1 image has higher resolution)
    def get_cover_url(self):
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
            return image['src'].rstrip('format=2') + 'format=1'
        return None

-   # Remove the whitespace between the story and the comments (lines 1 and 2)
-   # The index did not point correctly at the start of the story (line 3)
-   preprocess_regexps = [
-       (re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-       (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-       (re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
-   ]
-
-   # Replace the embedded YouTube video with an image
-   def preprocess_html(self, soup):
-       for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
-           if video_yt:
-               video_yt.name = 'img'
-               fuente = video_yt['src']
-               fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
-               video_yt['src'] = fuente2 + '/0.jpg'
-       return soup
+   # Use the mobile version of the site
+   def print_version(self, url):
+       return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')

View File

@ -0,0 +1,58 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2012.01.20. - V1.2
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class elet_es_irodalom(BasicNewsRecipe):
title = u'\u00c9let \u00e9s Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 30 # Maximum number of articles per feed to keep in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
needs_subscription = 'optional'
masthead_url = 'http://www.es.hu/images/logo.jpg'
timefmt = ' [%Y %b %d, %a]'
# Do not hard-code your credentials here; you supply them when you download the issue!
def get_browser(self):
    br = BasicNewsRecipe.get_browser()
    if self.username is not None and self.password is not None:
        br.open('http://www.es.hu/')
        br.select_form(name='userfrmlogin')
        br['cusername'] = self.username
        br['cpassword'] = self.password
        br.submit()
    return br
keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]
remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]
feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]

View File

@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''

+import re
+import time
from calibre.web.feeds.news import BasicNewsRecipe

class ElMundo(BasicNewsRecipe):
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'iso8859_15'
+   remove_javascript = True
+   remove_empty_feeds = True
    language = 'es'
    masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
    publication_type = 'newspaper'
    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif}
        .metadata_noticia{font-size: small}
+       .pestana_GDP{font-size: small; font-weight:bold}
        h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
        .hora{color: red}
        .update{color: gray}
@ -41,8 +45,11 @@ class ElMundo(BasicNewsRecipe):
    remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
    remove_attributes = ['lang','border']
    remove_tags = [
-       dict(name='div', attrs={'class':['herramientas','publicidad_google']})
-       ,dict(name='div', attrs={'id':'modulo_multimedia' })
+       dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
+       ,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
+       ,dict(name='div', attrs={'class':['num_comentarios estirar']})
+       ,dict(name='span', attrs={'class':['links_comentar']})
+       ,dict(name='div', attrs={'id':['comentar']})
        ,dict(name='ul', attrs={'class':'herramientas' })
        ,dict(name=['object','link','embed','iframe','base','meta'])
    ]
@ -50,13 +57,31 @@ class ElMundo(BasicNewsRecipe):
    feeds = [
        (u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
        ,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
-       ,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
-       ,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
+       ,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
+       ,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
        ,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
        ,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
-       ,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
-       ,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
-       ,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
+       ,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
+       ,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
+       ,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
+       ,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
+       ,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
+       ,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
+       ,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )
+       ,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
+       ,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
+       ,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
+       ,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
+       ,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
+       ,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
+       ,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
+       ,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
+       ,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
+       ,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
+       ,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
+       ,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
    ]

    def preprocess_html(self, soup):
@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):

    def get_article_url(self, article):
        return article.get('guid', None)

+   preprocess_regexps = [
+       # Show a still image in place of the embedded videos
+       (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
+       (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
+       (re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
+       # Hide the comment numbering: 1, 2, 3 ...
+       (re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
+       (re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
+   ]
+
+   # Fetch the cover image
+   def get_cover_url(self):
+       cover = None
+       st = time.localtime()
+       year = str(st.tm_year)
+       month = "%.2d" % st.tm_mon
+       day = "%.2d" % st.tm_mday
+       #http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg
+       cover = 'http://img.kiosko.net/' + year + '/' + month + '/' + day + '/es/elmundo.750.jpg'
+       br = BasicNewsRecipe.get_browser()
+       try:
+           br.open(cover)
+       except:
+           self.log("\nCover not available")
+           cover = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
+       return cover

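This recipe (and the Expansion recipe later in the commit) derives its cover from kiosko.net by formatting today's date into the image path, probing the URL, and falling back to a static masthead if it is missing; for example, on 25 January 2012 the concatenation above produces http://img.kiosko.net/2012/01/25/es/elmundo.750.jpg. The same construction written compactly with strftime (a sketch, not the recipe's code):

import time

def kiosko_cover(paper='elmundo'):
    # http://img.kiosko.net/YYYY/MM/DD/es/<paper>.750.jpg
    return time.strftime('http://img.kiosko.net/%Y/%m/%d/es/') + paper + '.750.jpg'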
16
recipes/emuzica_pl.recipe Normal file
View File

@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class eMuzyka(BasicNewsRecipe):
title = u'eMuzyka'
__author__ = 'fenuks'
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]

View File

@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
    use_embedded_content = False
    remove_javascript = True

-   needs_subscription = True
+   needs_subscription = 'optional'
    encoding= 'ISO-8859-1'

    remove_tags_before = dict(name='font', attrs={'class':'date'})
@ -75,10 +75,9 @@ class ESPN(BasicNewsRecipe):
        return soup

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
+       if self.username and self.password:
            br.set_handle_refresh(False)
            url = ('https://r.espn.go.com/members/v3_1/login')
            raw = br.open(url).read()
@ -100,7 +99,6 @@ class ESPN(BasicNewsRecipe):
        return article.get('guid', None)

    def print_version(self, url):
        if 'eticket' in url:
            return url.partition('&')[0].replace('story?', 'print?')
        match = re.search(r'story\?(id=\d+)', url)

View File

@ -1,35 +1,43 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Gerardo Diez' __copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>' __author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)' __version__ = 'v1.01'
__docformat__ = 'restructuredtext en' __date__ = '13, November 2011'
''' '''
expansion.es [url]http://www.expansion.com/[/url]
''' '''
import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
class Publico(BasicNewsRecipe):
title =u'Expansion.com' class expansion_spanish(BasicNewsRecipe):
__author__ ='Gerardo Diez' __author__ ='Gerardo Diez & desUBIKado'
publisher =u'Unidad Editorial Información Económica, S.L.' description ='Financial news from Spain'
category ='finances, catalunya' title =u'Expansion'
oldest_article =1 publisher =u'Unidad Editorial Internet, S.L.'
category ='news, finances, Spain'
oldest_article = 2
simultaneous_downloads = 10
max_articles_per_feed =100 max_articles_per_feed =100
simultaneous_downloads =10 timefmt = '[%a, %d %b, %Y]'
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png' encoding ='iso-8859-15'
timefmt ='[%A, %d %B, %Y]'
encoding ='latin'
language ='es' language ='es'
remove_javascript =True use_embedded_content = False
no_stylesheets =True remove_javascript = True
no_stylesheets = True
remove_empty_feeds = True
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']}) keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
remove_tags =[ remove_tags =[
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}), dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}), dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
dict(name='span', attrs={'class':['comentarios']}), dict(name='span', attrs={'class':['comentarios']}),
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}), dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
dict(name='div', attrs={'id':['comentarios_lectores_listado']}) dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
] ]
feeds =[ feeds =[
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'), (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'), (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'), (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'), (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'), (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'), (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'), (u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'), (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'), (u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'), (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'), (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'), (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'), (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'), (u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'), (u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'), (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'), (u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'), (u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'), (u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'), (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'), (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'), (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
(u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'), (u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'), (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
(u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'), (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'), (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
(u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'), (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'), (u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'), (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'), (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
(u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'), (u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml') (u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
] ]
# Obtener la imagen de portada
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#[url]http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg[/url]
cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
return cover
# Para que no salte la publicidad al recuperar la noticia, y que siempre se recupere
# la página web, mando la variable "t" con la hora "linux" o "epoch" actual
# haciendole creer al sitio web que justo se acaba de ver la publicidad
def print_version(self, url):
st = time.time()
segundos = str(int(st))
parametros = '.html?t=' + segundos
return url.replace('.html', parametros)
_processed_links = []
def get_article_url(self, article):
# Para obtener la url original del artículo a partir de la de "feedsportal"
link = article.get('link', None)
if link is None:
return article
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(0,len(a)):
link=link.replace(a[i],b[i])
link="http://"+link
# Eliminar artículos duplicados en otros feeds
if not (link in self._processed_links):
self._processed_links.append(link)
else:
link = None
return link
    # A little CSS to improve the presentation of the articles
    extra_css = '''
        .entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
        .fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
    '''

    # To show the poster image of embedded videos
    preprocess_regexps = [
        (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
    ]
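A standalone sketch of the feedsportal de-obfuscation performed in get_article_url above; the sample slug is invented for illustration. The replacement order matters: the escape '0A' stands for a literal '0' and must be decoded last, otherwise the zeros it produces could pair with the next character and be mis-read as another escape code.

    def decode_feedsportal(slug):
        codes = ['0B', '0C', '0D', '0E', '0F', '0G', '0N', '0L0S', '0A']
        chars = ['.',  '/',  '?',  '-',  '=',  '&',  '.com', 'www.', '0']
        for code, char in zip(codes, chars):
            slug = slug.replace(code, char)
        return 'http://' + slug

    # Hypothetical slug, decodes to http://www.expansion.com/empresas/noticia.html
    print decode_feedsportal('0L0Sexpansion0Bcom0Cempresas0Cnoticia0Bhtml')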
recipes/fhm_uk.recipe Normal file
@ -0,0 +1,30 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'FHM UK'
description = 'Good News for Men'
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
# last updated 27/12/11
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
remove_empty_feeds = True
no_stylesheets = True
#auto_cleanup = True
#articles_are_obfuscated = True
keep_only_tags = [
dict(name='h1'),
dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
dict(name='div',attrs={'id' : ['articleLeft']}),
dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody']}),
]
feeds = [
(u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
(u'Gaming',u'http://feed43.com/0755006465351035.xml'),
]
recipes/fisco_oggi.recipe Normal file
@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Website of the Italian Government Income Agency (about revenue, taxation, taxes) - v1.00 (17 December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324112023(BasicNewsRecipe):
title = u'Fisco Oggi'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
feeds = [
    (u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'),
    (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'),
    (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'),
    (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'),
    (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'),
    (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'),
    (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'),
    (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')
]
@ -1,57 +1,68 @@
-# -*- coding: utf-8 -*-
+import re
 from calibre.web.feeds.news import BasicNewsRecipe

-class Focus_pl(BasicNewsRecipe):
-    title = u'Focus.pl'
-    oldest_article = 15
-    max_articles_per_feed = 100
-    __author__ = 'fenuks'
+class FocusRecipe(BasicNewsRecipe):
+    __license__ = 'GPL v3'
+    __author__ = u'intromatyk <intromatyk@gmail.com>'
     language = 'pl'
-    description = 'polish scientific monthly magazine'
+    version = 1
+    title = u'Focus'
+    publisher = u'Gruner + Jahr Polska'
+    category = u'News'
+    description = u'Newspaper'
     category = 'magazine'
     cover_url = ''
     remove_empty_feeds = True
     no_stylesheets = True
-    remove_tags_before = dict(name='div', attrs={'class':'h2 h2f'})
-    remove_tags_after = dict(name='div', attrs={'class':'clear'})
+    oldest_article = 7
+    max_articles_per_feed = 100000
+    recursions = 0
+    remove_javascript = True
+    encoding = 'utf-8'
+    # Seems to work best, but YMMV
+    simultaneous_downloads = 5
+    r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
+    keep_only_tags = []
+    keep_only_tags.append(dict(name='div', attrs={'id':'cll'}))
+    remove_tags = []
+    remove_tags.append(dict(name='div', attrs={'class':'ulm noprint'}))
+    remove_tags.append(dict(name='div', attrs={'class':'txb'}))
+    remove_tags.append(dict(name='div', attrs={'class':'h2'}))
+    remove_tags.append(dict(name='ul', attrs={'class':'txu'}))
+    remove_tags.append(dict(name='div', attrs={'class':'ulc'}))
+    extra_css = '''
+        body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
+        h1{text-align: left;}
+        h2{font-size: medium; font-weight: bold;}
+        p.lead {font-weight: bold; text-align: left;}
+        .authordate {font-size: small; color: #696969;}
+        .fot{font-size: x-small; color: #666666;}
+    '''

-    feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
-             (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
-             (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
-             (u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
-             (u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
-             (u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
-             (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
-             (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
-             (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
-            ]
+    feeds = [
+        ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
+        ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
+        ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
+        ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
+        ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
+        ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
+        ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
+    ]

     def skip_ad_pages(self, soup):
-        tag = soup.find(name='a')
-        if tag:
-            new_soup = self.index_to_soup(tag['href'] + 'do-druku/1/', raw=True)
-            return new_soup
-        return None
+        if ('advertisement' in soup.find('title').string.lower()):
+            href = soup.find('a').get('href')
+            return self.index_to_soup(href, raw=True)
+        else:
+            return None

-    def append_page(self, appendtag):
-        tag = appendtag.find(name='div', attrs={'class':'arrows'})
-        if tag:
-            nexturl = 'http://www.focus.pl/' + tag.a['href']
-            for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
-                rem.extract()
-            while nexturl:
-                soup2 = self.index_to_soup(nexturl)
-                nexturl = None
-                pagetext = soup2.find(name='div', attrs={'class':'txt'})
-                tag = pagetext.find(name='div', attrs={'class':'arrows'})
-                for r in tag.findAll(name='a'):
-                    if u'Następne' in r.string:
-                        nexturl = 'http://www.focus.pl/' + r['href']
-                for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
-                    rem.extract()
-                pos = len(appendtag.contents)
-                appendtag.insert(pos, pagetext)

     def get_cover_url(self):
         soup = self.index_to_soup('http://www.focus.pl/magazyn/')
@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
         self.cover_url = 'http://www.focus.pl/' + tag.a['href']
         return getattr(self, 'cover_url', self.cover_url)

-    def preprocess_html(self, soup):
-        self.append_page(soup.body)
-        return soup
+    def print_version(self, url):
+        if url.count('focus.pl.feedsportal.com'):
+            u = url.find('focus0Bpl')
+            u = 'http://www.focus.pl/' + url[u + 11:]
+            u = u.replace('0C', '/')
+            u = u.replace('A', '')
+            u = u.replace('0E', '-')
+            u = u.replace('/nc/1//story01.htm', '/do-druku/1')
+        else:
+            u = url.replace('/nc/1', '/do-druku/1')
+        return u
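As an aside, skip_ad_pages is calibre's hook for interstitial ad pages: it is called with the soup of each downloaded page, and returning raw markup substitutes that page while returning None keeps it. A defensive variant of the new logic above, written as a drop-in method for a BasicNewsRecipe subclass; the extra None-guards are my addition, not part of the commit:

    def skip_ad_pages(self, soup):
        title = soup.find('title')
        if title is not None and 'advertisement' in (title.string or '').lower():
            target = soup.find('a')
            if target is not None and target.get('href'):
                # raw=True returns the bytes of the linked page instead of a soup
                return self.index_to_soup(target['href'], raw=True)
        return None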
@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
     __author__ = 'fluzao'
     description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                   u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
-    INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
+    # found this to be the easiest place to find the index page (13-Nov-2011).
+    # searching for the "Indice Geral" link
+    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
+    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
     language = 'pt'
     no_stylesheets = True
     max_articles_per_feed = 40
     remove_javascript = True
     needs_subscription = True
-    remove_tags_before = dict(name='b')
+    remove_tags_before = dict(name='p')
     remove_tags = [dict(name='td', attrs={'align':'center'})]
     remove_attributes = ['height','width']
-    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
     # fixes the problem with the section names
     section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
                     'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
                     'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
-                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
+                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
+                    'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
+                    'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
     # this solves the problem with truncated content in Kindle
     conversion_options = {'linearize_tables' : True}
     # this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
     # Indice e Comunicar Erros
-    preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
-                                      re.DOTALL|re.IGNORECASE), lambda match: r''),
-                          (re.compile(r'<BR><BR>Pr&oacute;ximo Texto:.*<!--/NOTICIA-->',
-                                      re.DOTALL|re.IGNORECASE), lambda match: r'')]
+    preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
+                                      re.DOTALL|re.IGNORECASE), lambda match: r'')]

     def get_browser(self):
@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
     def parse_index(self):
-        soup = self.index_to_soup(self.INDEX)
+        # Searching for the index page on the HOMEPAGE
+        hpsoup = self.index_to_soup(self.HOMEPAGE)
+        indexref = hpsoup.find('a', href=re.compile('^indices.*'))
+        self.log('--> tag containing the today s index: ', indexref)
+        INDEX = indexref['href']
+        INDEX = 'http://www1.folha.uol.com.br/fsp/' + INDEX
+        self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
+        # ... and taking the opportunity to get the cover image link
+        coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
+        if coverurl:
+            self.log('--> tag containing the today s cover: ', coverurl)
+            coverurl = coverurl.replace('htm', 'jpg')
+            coverurl = 'http://www1.folha.uol.com.br/fsp/images/' + coverurl
+            self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
+            self.cover_url = coverurl
+        #soup = self.index_to_soup(self.INDEX)
+        soup = self.index_to_soup(INDEX)
         feeds = []
         articles = []
         section_title = "Preambulo"
@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
                 self.log('--> new section title: ', section_title)
             if strpost.startswith('<a href'):
                 url = post['href']
+                # this bit is kept if they ever go back to the old format (pre Nov-2011)
                 if url.startswith('/fsp'):
                     url = 'http://www1.folha.uol.com.br' + url
+
+                if url.startswith('http://www1.folha.uol.com.br/fsp'):
+                    #url = 'http://www1.folha.uol.com.br'+url
                     title = self.tag_to_string(post)
                     self.log()
                     self.log('--> post: ', post)
@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
         # keeping the front page url
         minha_capa = feeds[0][1][1]['url']

-        # removing the 'Preambulo' section
+        # removing the first section (now called 'top')
         del feeds[0]

-        # creating the url for the cover image
-        coverurl = feeds[0][1][0]['url']
-        coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
-        coverurl = coverurl.replace('01.htm', '.jpg')
-        self.cover_url = coverurl

         # inserting the cover page as the first article (nicer for kindle users)
         feeds.insert(0, (u'primeira p\xe1gina', [{'title': u'Primeira p\xe1gina', 'url': minha_capa}]))
         return feeds
recipes/formulaas.recipe Normal file
@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
formula-as.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class FormulaAS(BasicNewsRecipe):
title = u'Formula AS'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Formula AS'
description = u'Formula AS'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'item padded'})
]
remove_tags = [
dict(name='ul', attrs={'class':'subtitle lower'})
]
remove_tags_after = [
dict(name='ul', attrs={'class':'subtitle lower'}),
dict(name='div', attrs={'class':'item-brief-options'})
]
feeds = [
(u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
     max_articles_per_feed = 100
     use_embedded_content = False
     no_stylesheets = True
+    cover_url = 'http://www.frazpc.pl/images/logo.png'
     feeds = [
         (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
         (u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
         dict(name='div', attrs={'class':'comments_box'})
     ]
+    remove_tags_after = dict(name='div', attrs={'class':'content'})
     preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]
     remove_attributes = [ 'width', 'height' ]
@ -0,0 +1,35 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
import string
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Agora S.A.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]
def get_article_url(self, article):
s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s
def print_version(self, url):
s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
no1 = s.group(2)
no2 = s.group(3)
return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)
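A worked, hypothetical example of the two URL rewrites above; the numeric IDs are invented for illustration:

    import re

    url = 'http://szczecin.gazeta.pl/szczecin/1,34602,10239145,przyklad.html'  # invented IDs
    m = re.search(r'/(\d*),(\d*),(\d*),.*\.html', url)
    print 'http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html' % (m.group(2), m.group(3))
    # -> http://szczecin.gazeta.pl/szczecin/2029020,34602,10239145.html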
@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
headers = soup.findAll('h2')
if len(headers) == 0:
break
for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
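The manual previousSibling walk that recovers the article date could also lean on BeautifulSoup 3's findPreviousSibling helper; an equivalent, self-contained sketch with invented markup:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<h3>Jan 5</h3><div><h2><a href="http://example.com/a">T</a></h2></div>')
    header = soup.find('h2')
    h3 = header.parent.findPreviousSibling('h3')  # nearest preceding sibling <h3>
    print h3.string                               # -> Jan 5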
@ -1,4 +1,3 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class GlasgowHerald(BasicNewsRecipe):
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
     language = 'en_GB'
     __author__ = 'Kovid Goyal'

-    keep_only_tags = [dict(attrs={'class':'article'})]
-    remove_tags = [
-        dict(id=['pic-nav']),
-        dict(attrs={'class':['comments-top']})
-    ]
+    use_embedded_content = False
+    no_stylesheets = True
+    auto_cleanup = True
+
+    #keep_only_tags = [dict(attrs={'class':'article'})]
+    #remove_tags = [
+        #dict(id=['pic-nav']),
+        #dict(attrs={'class':['comments-top']})
+    #]

     feeds = [
@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):
         u'http://www.heraldscotland.com/cmlink/1.768',),
     (u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]
@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
                     {'class':['articleTools', 'pagination', 'Ads', 'topad',
                               'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article, picdiv['src'])

     #Use the mobile version rather than the web version
     def print_version(self, url):
         return url.rpartition('?')[0] + '?service=mobile'
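A note on the hasattr guard in populate_article_metadata above: add_toc_thumbnail is a relatively recent addition to BasicNewsRecipe at the time of this commit, so checking for it before calling presumably keeps the recipe working on older calibre installs that lack the method.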
recipes/goal.recipe Normal file
@ -0,0 +1,13 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325677767(BasicNewsRecipe):
title = u'Goal'
oldest_article = 1
language = 'it'
max_articles_per_feed = 100
auto_cleanup = True
remove_tags_after = [dict(id='article_content')]
feeds = [(u'Goal', u'http://www.goal.com/it/feeds/news?fmt=rss')]
__author__ = 'faber1971'
description = 'Sports news from Italy'
@ -12,7 +12,6 @@ class GN(BasicNewsRecipe):
     EDITION = 0

     __author__ = 'Piotr Kontek'
-    title = u'Gość niedzielny'
     description = 'Weekly magazine'
     encoding = 'utf-8'
     no_stylesheets = True
@ -20,6 +19,8 @@ class GN(BasicNewsRecipe):
     remove_javascript = True
     temp_files = []
     simultaneous_downloads = 1
+    masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
+    title = u'Gość niedzielny'

     articles_are_obfuscated = True
@ -64,7 +65,6 @@ class GN(BasicNewsRecipe):
             if img != None:
                 a = img.parent
                 self.EDITION = a['href']
-                self.title = img['alt']
                 self.cover_url = 'http://www.gosc.pl' + img['src']
             if not first:
                 break
recipes/grantland.recipe Normal file
@ -0,0 +1,76 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = True
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 90
cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'
INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out second line if you don't want older articles
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]
remove_tags = [
{'name':['style','aside','nav','footer','script']},
{'name':'h1','text':'Grantland'},
{'id':['header','col-right']},
{'class':['connect_widget']},
{'name':'section','class':re.compile(r'\b(ad|module)\b')},
]
preprocess_regexps = [
# remove blog banners
(re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>', re.DOTALL|re.IGNORECASE), lambda m: ''),
]
def parse_index(self):
feeds = []
seen_urls = set([])
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []
page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
main = soup.find('div',id='col-main')
if main is None:
main = soup
for tag in main.findAll('a', href=re.compile(r'(story|post)/_/id/\d+')):
url = tag['href']
if url in seen_urls:
continue
title = tag.string
# blank title probably means <a href=".."><img /></a>. skip
if not title:
continue
self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url})
seen_urls.add(url)
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
recipes/gs24_pl.recipe Normal file
@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
import string
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1322322819(BasicNewsRecipe):
title = u'GS24.pl (Głos Szczeciński)'
description = u'Internetowy serwis Głosu Szczecińskiego'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Media Regionalne sp. z o.o.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
cover_url = "http://www.gs24.pl/images/top_logo.png"
feeds = [
# (u'Wszystko', u'http://www.gs24.pl/rss.xml'),
(u'Szczecin', u'http://www.gs24.pl/szczecin.xml'),
(u'Stargard', u'http://www.gs24.pl/stargard.xml'),
(u'Świnoujście', u'http://www.gs24.pl/swinoujscie.xml'),
(u'Goleniów', u'http://www.gs24.pl/goleniow.xml'),
(u'Gryfice', u'http://www.gs24.pl/gryfice.xml'),
(u'Kamień Pomorski', u'http://www.gs24.pl/kamienpomorski.xml'),
(u'Police', u'http://www.gs24.pl/police.xml'),
(u'Region', u'http://www.gs24.pl/region.xml'),
(u'Sport', u'http://www.gs24.pl/sport.xml'),
]
def get_article_url(self, article):
s = re.search("""/0L0S(gs24.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_", "0D" : "?", "0F" : "="}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s
def print_version(self, url):
return url + "&Template=printpicart"
@ -9,6 +9,7 @@ www.guardian.co.uk
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import date
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

 class Guardian(BasicNewsRecipe):
@ -16,9 +17,11 @@ class Guardian(BasicNewsRecipe):
     if date.today().weekday() == 6:
         base_url = "http://www.guardian.co.uk/theobserver"
         cover_pic = 'Observer digital edition'
+        masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
     else:
         base_url = "http://www.guardian.co.uk/theguardian"
         cover_pic = 'Guardian digital edition'
+        masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'

     __author__ = 'Seabound and Sujata Raman'
     language = 'en_GB'
@ -26,6 +29,7 @@ class Guardian(BasicNewsRecipe):
     oldest_article = 7
     max_articles_per_feed = 100
     remove_javascript = True
+    encoding = 'utf-8'

     # List of section titles to ignore
     # For example: ['Sport']
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
         dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
         dict(name='ul', attrs={'class':["pagination"]}),
         dict(name='ul', attrs={'id':["content-actions"]}),
+        # article history link
+        dict(name='a', attrs={'class':["rollover history-link"]}),
+        # "a version of this article ..." spiel
+        dict(name='div', attrs={'class':['section']}),
+        # "about this article" js dialog
+        dict(name='div', attrs={'class':["share-top",]}),
+        # author picture
+        dict(name='img', attrs={'class':["contributor-pic-small"]}),
+        # embedded videos/captions
+        dict(name='span', attrs={'class':['inline embed embed-media']}),
         #dict(name='img'),
     ]
     use_embedded_content = False
@ -65,8 +79,21 @@ class Guardian(BasicNewsRecipe):
             url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article, picdiv['src'])

     def preprocess_html(self, soup):
+        # multiple html sections in soup, useful stuff in the first
+        html = soup.find('html')
+        soup2 = BeautifulSoup()
+        soup2.insert(0, html)
+        soup = soup2

         for item in soup.findAll(style=True):
             del item['style']
@ -75,6 +102,17 @@ class Guardian(BasicNewsRecipe):
         for tag in soup.findAll(name=['ul','li']):
             tag.name = 'div'

+        # removes number next to rating stars
+        items_to_remove = []
+        rating_container = soup.find('div', attrs={'class': ['rating-container']})
+        if rating_container:
+            for item in rating_container:
+                if isinstance(item, Tag) and str(item.name) == 'span':
+                    items_to_remove.append(item)
+            for item in items_to_remove:
+                item.extract()
         return soup

     def find_sections(self):
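The soup2 dance at the top of preprocess_html handles pages that arrive as several concatenated HTML documents: re-rooting a fresh soup on the first <html> node drops everything after it. A standalone sketch using the BeautifulSoup 3 API bundled with calibre; the sample markup is invented:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    raw = '<html><body>article</body></html><html><body>cruft</body></html>'
    soup = BeautifulSoup(raw)
    first = soup.find('html')   # the first <html> document only
    clean = BeautifulSoup()
    clean.insert(0, first)
    print clean                 # keeps only the first document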
@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
 from urlparse import urlparse
 import re

-class HackerNews(BasicNewsRecipe):
-    title = 'Hacker News'
-    __author__ = 'Tom Scholl'
+class HNWithCommentsLink(BasicNewsRecipe):
+    title = 'HN With Comments Link'
+    __author__ = 'Tom Scholl & David Kerschner'
     description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
     publisher = 'Y Combinator'
     category = 'news, programming, it, technology'
@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
         body = body + comments
         return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'

+    def parse_feeds(self):
+        a = super(HNWithCommentsLink, self).parse_feeds()
+        self.hn_articles = a[0].articles
+        return a

     def get_obfuscated_article(self, url):
         if url.startswith('http://news.ycombinator.com'):
             content = self.get_hn_content(url)
@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
         else:
             content = self.get_readable_content(url)

+        article = 0
+        for a in self.hn_articles:
+            if a.url == url:
+                article = a
+        content = re.sub(r'</body>\s*</html>\s*$', '', content) + article.summary + '</body></html>'

         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
         self.temp_files[-1].write(content)
         self.temp_files[-1].close()
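The re.sub splice added above is a small, reusable pattern: strip a trailing </body></html>, append extra markup, and re-close the document. In isolation, with invented sample content:

    import re

    def append_before_close(content, extra):
        # remove the closing tags (if present), add the extra HTML, close again
        return re.sub(r'</body>\s*</html>\s*$', '', content) + extra + '</body></html>'

    print append_before_close('<html><body>story</body></html>',
                              '<p><a href="http://news.ycombinator.com">Comments</a></p>')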
recipes/haksoz.recipe Normal file
@ -0,0 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324739199(BasicNewsRecipe):
title = u'Haks\xf6z'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
language = 'tr'
__author__ = 'asalet_r'
feeds = [(u'Haks\xf6z', u'http://www.haksozhaber.net/rss/')]
@ -0,0 +1,58 @@
from calibre.web.feeds.news import BasicNewsRecipe
'''
Hamilton Spectator Calibre Recipe
'''
class HamiltonSpectator(BasicNewsRecipe):
title = u'Hamilton Spectator'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
__author__ = u'Eric Coolman'
publisher = u'thespec.com'
description = u'Ontario Canada Newspaper'
category = u'News, Ontario, Canada'
remove_javascript = True
use_embedded_content = False
no_stylesheets = True
language = 'en_CA'
encoding = 'utf-8'
feeds = [
(u'Top Stories',u'http://www.thespec.com/rss?query=/&assetType=Article'),
(u'All News',u'http://www.thespec.com/rss?query=/news&assetType=Article'),
(u'Local',u'http://www.thespec.com/rss?query=/local&assetType=Article'),
(u'Ontario',u'http://www.thespec.com/rss?query=/ontario&assetType=Article'),
(u'Canada',u'http://www.thespec.com/rss?query=/canada&assetType=Article'),
(u'World News',u'http://www.thespec.com/rss?query=/world&assetType=Article'),
(u'Business',u'http://www.thespec.com/rss?query=/business&assetType=Article'),
(u'Crime',u'http://www.thespec.com/rss?query=/crime&assetType=Article'),
(u'All Sports',u'http://www.thespec.com/rss?query=/sports&assetType=Article'),
(u'Ticats',u'http://www.thespec.com/rss?query=/sports/ticats&assetType=Article'),
(u'Bulldogs',u'http://www.thespec.com/rss?query=/sports/bulldogs&assetType=Article'),
(u'High School Sports',u'http://www.thespec.com/rss?query=/sports/highschools&assetType=Article'),
(u'Local Sports',u'http://www.thespec.com/rss?query=/sports/local&assetType=Article'),
(u"What's On", u'http://www.thespec.com/rss?query=/whatson&assetType=Article'),
(u'Arts and Entertainment',u'http://www.thespec.com/rss?query=/whatson/artsentertainment&assetType=Article'),
(u'Books',u'http://www.thespec.com/rss?query=/whatson/books&assetType=Article'),
(u'Movies',u'http://www.thespec.com/rss?query=/whatson/movies&assetType=Article'),
(u'Music',u'http://www.thespec.com/rss?query=/whatson/music&assetType=Article'),
(u'Restaurant Reviews',u'http://www.thespec.com/rss?query=/whatson/restaurants&assetType=Article'),
(u'Opinion',u'http://www.thespec.com/rss?query=/opinion&assetType=Article'),
(u'Opinion Columns',u'http://www.thespec.com/rss?query=/opinion/columns&assetType=Article'),
(u'Cartoons',u'http://www.thespec.com/rss?query=/opinion/cartoons&assetType=Article'),
(u'Letters',u'http://www.thespec.com/rss?query=/opinion/letters&assetType=Article'),
(u'Editorial',u'http://www.thespec.com/rss?query=/opinion/editorial&assetType=Article'),
(u'Community',u'http://www.thespec.com/rss?query=/community&assetType=Article'),
(u'Education',u'http://www.thespec.com/rss?query=/community/education&assetType=Article'),
(u'Faith',u'http://www.thespec.com/rss?query=/community/faith&assetType=Article'),
(u'Contests',u'http://www.thespec.com/rss?query=/community/contests&assetType=Article'),
(u'Living',u'http://www.thespec.com/rss?query=/living&assetType=Article'),
(u'Food',u'http://www.thespec.com/rss?query=/living/food&assetType=Article'),
(u'Health and Fitness',u'http://www.thespec.com/rss?query=/living/healthfitness&assetType=Article'),
(u'Your Home',u'http://www.thespec.com/rss?query=/living/home&assetType=Article'),
(u'Travel',u'http://www.thespec.com/rss?query=/living/travel&assetType=Article'),
(u'Family and Parenting',u'http://www.thespec.com/rss?query=/living/familyparenting&assetType=Article'),
(u'Style',u'http://www.thespec.com/rss?query=/living/style&assetType=Article')
]
@ -1,11 +1,11 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-import re

 class AdvancedUserRecipe(BasicNewsRecipe):
-    title = 'heise online'
+    title = 'Heise-online'
     description = 'News vom Heise-Verlag'
     __author__ = 'schuster'
+    masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
+    publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
     use_embedded_content = False
     language = 'de'
     oldest_article = 2
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
     remove_empty_feeds = True
     timeout = 5
     no_stylesheets = True
-    encoding = 'utf-8'

     remove_tags_after = dict(name='p', attrs={'class':'editor'})
-    remove_tags = [{'class':'navi_top_container'},
+    remove_tags = [dict(id='navi_top_container'),
         dict(id='navi_bottom'),
         dict(id='mitte_rechts'),
         dict(id='navigation'),
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
         dict(id='seiten_navi'),
         dict(id='adbottom'),
         dict(id='sitemap'),
-        dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
-    ]
+        dict(name='div', attrs={'id':'sitemap'}),
+        dict(name='ul', attrs={'class':'erste_zeile'}),
+        dict(name='ul', attrs={'class':'zweite_zeile'}),
+        dict(name='div', attrs={'class':'navi_top_container'})]

     feeds = [
         ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
-        ('iX', 'http://www.heise.de/ix/news/news.rdf'),
-        ('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
-        ('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
-        ('Security', 'http://www.heise.de/security/news/news-atom.xml'),
-        ('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
-        ('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
-        ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
+        ('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
         ('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
-        ('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
-        ('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'),
+        ('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
+        ('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
+        ('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
+        ('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
+        ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
+        ('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
+        ('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
+        ('iX', 'http://www.heise.de/ix/news/news.rdf'),
+        ('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
         ('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
         ('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
         ('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
-        ('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
+        ('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
         ('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]

     def print_version(self, url):
         return url + '?view=print'
@ -1,4 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import urllib, re

 class HindustanTimes(BasicNewsRecipe):
     title = u'Hindustan Times'
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
         'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
     ]
def get_article_url(self, article):
'''
HT uses a variant of the feedportal RSS ad display mechanism
'''
try:
s = article.summary
return urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
url = BasicNewsRecipe.get_article_url(self, article)
res = self.browser.open_novisit(url)
url = res.geturl().split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
'www.'}
for k, v in encoding.iteritems():
url = url.replace(k, v)
return url
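One caveat worth flagging in this decoder (my observation, not part of the commit): a plain dict is iterated in arbitrary order under Python 2, so a multi-character code can be corrupted if '0A' -> '0' happens to run first; for example '0AN' would collapse to '0N' and then to '.com'. Pinning the order with a list of pairs, as the Expansion recipe earlier in this commit does, avoids that:

    codes = [('0N', '.com'), ('0L', 'http://'), ('0S', 'www.'),
             ('0B', '.'), ('0C', '/'), ('0D', '?'), ('0E', '-'),
             ('0F', '='), ('0G', '&'), ('0A', '0')]  # '0A' (a literal '0') goes last

    def decode(u):
        for k, v in codes:
            u = u.replace(k, v)
        return u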
@ -4,56 +4,20 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, matek09, matek09@gmail.com'

 from calibre.web.feeds.news import BasicNewsRecipe
-import re

 class Histmag(BasicNewsRecipe):

     title = u'Histmag'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    cover_url = 'http://histmag.org/grafika/loga/histmag-logo-2-300px.png'
     __author__ = 'matek09'
     description = u"Artykuly historyczne i publicystyczne"
     encoding = 'utf-8'
-    #preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
     no_stylesheets = True
     language = 'pl'
     remove_javascript = True
-    #max_articles_per_feed = 1
-    remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
-    remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
-    #keep_only_tags =[]
-    #keep_only_tags.append(dict(name = 'h2'))
-    #keep_only_tags.append(dict(name = 'p'))
-    remove_tags =[]
-    remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
-    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
-    remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
-    preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),
-                          (re.compile(r'<span>'), lambda match: '<br><br><span>')]
-    extra_css = '''
-        .left {font-size: x-small}
-        .right {font-size: x-small}
-    '''
+    keep_only_tags = [dict(id='article')]
+    remove_tags = [dict(name='p', attrs={'class' : 'article-tags'})]

-    def find_articles(self, soup):
-        articles = []
-        for div in soup.findAll('div', attrs={'class' : 'text'}):
-            articles.append({
-                'title' : self.tag_to_string(div.h3.a),
-                'url' : 'http://www.histmag.org/' + div.h3.a['href'],
-                'date' : self.tag_to_string(div.next('p')).split('|')[0],
-                'description' : self.tag_to_string(div.next('p', podpis=False)),
-            })
-        return articles
-
-    def parse_index(self):
-        soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
-        feeds = []
-        feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
-        soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
-        feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
-        soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
-        feeds.append((u"Wydarzenia", self.find_articles(soup)))
-        return feeds
+
+    feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'),
+             (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'),
+             (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'),
+             (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'),
+             (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
     category = 'history'
     language = 'pl'
     oldest_article = 8
+    remove_empty_feeds = True
     max_articles_per_feed = 100
-    feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
+    feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
+             (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
+             (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
+             (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
+             (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
+             (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
+             (u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
+             (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
+             (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]
@ -1,44 +1,58 @@
-# -*- coding: utf-8 -*-
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
+################################################################################
+# Description: http://hvg.hu/ RSS channel
+# Author: Bigpapa (bigpapabig@hotmail.com)
+# Date: 2011.12.20. - V1.1
+################################################################################
+from calibre.web.feeds.news import BasicNewsRecipe

-class HVG(BasicNewsRecipe):
-    title = 'HVG.HU'
-    __author__ = u'István Papp'
-    description = u'Friss hírek a HVG-től'
-    timefmt = ' [%Y. %b. %d., %a.]'
-    oldest_article = 4
+class hvg(BasicNewsRecipe):
+    title = u'HVG'
+    __author__ = 'Bigpapa'
     language = 'hu'
-    max_articles_per_feed = 100
+    oldest_article = 5  # how many days old the oldest fetched article may be
+    max_articles_per_feed = 5  # maximum number of articles stored per feed in the e-book
     no_stylesheets = True
-    use_embedded_content = False
     encoding = 'utf8'
-    publisher = 'HVG Online'
-    category = u'news, hírek, hvg'
-    extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-    remove_tags_before = dict(id='pg-content')
-    remove_javascript = True
-    remove_empty_feeds = True
+    extra_css = ' h2 { font:bold 28px} '

+    remove_attributes = ['style', 'font', 'href']

-    feeds = [
-        (u'Itthon', u'http://hvg.hu/rss/itthon')
-        ,(u'Világ', u'http://hvg.hu/rss/vilag')
-        ,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
-        ,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
-        ,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
-        ,(u'Karrier', u'http://hvg.hu/rss/karrier')
-        ,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
-        ,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
-        ,(u'Kultúra', u'http://hvg.hu/rss/kultura')
-        ,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
-        ,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
-        ,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
-        ,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
-        ,(u'Sport', u'http://hvg.hu/rss/sport')
-    ]
+    keep_only_tags = [
+        dict(name='div', attrs={'id':['pg-content']})
+    ]

-    def print_version(self, url):
-        return url.replace('#rss', '/print')
+    remove_tags = [
+        dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
+        dict(name='table', attrs={'class':['banner2', 'monocle']}),
+        dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
+        dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
+        dict(name='h3', attrs={'class':['hthree']}),
+        dict(name='ul', attrs={'class':['defaultul']}),
+        dict(name='form', attrs={'id':['commentForm']}),
+        dict(name='h6', attrs={'class':['hthree']}),
+        dict(name='h6', attrs={'class':['more2']}),
+        dict(name='img', attrs={'class':['framed']}),
+        dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),
+    ]

+    feeds = [
+        # (u'\xd6sszes', 'http://hvg.hu/rss'),
+        (u'Itthon', 'http://hvg.hu/rss/itthon'),
+        (u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
+        (u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
+        (u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
+        (u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
+        (u'Karrier', 'http://hvg.hu/rss/karrier'),
+        (u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
+        (u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
+        (u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
+        (u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
+        (u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
+        (u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
+        (u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
+        (u'Sport', 'http://hvg.hu/rss/sport')
+    ]
BIN recipes/icons/biolog_pl.png (new file, binary, 1.2 KiB)
BIN recipes/icons/blues.png (new file, binary, 910 B)
BIN (icon, filename not shown, binary, 373 B)
BIN (icon, filename not shown, binary, 9.3 KiB)
BIN (icon, filename not shown, binary, 481 B)
BIN recipes/icons/formulaas.png (new file, binary, 687 B)
BIN recipes/icons/infra_pl.png (new file, binary, 1.5 KiB)
BIN (icon, filename not shown, binary, 1.2 KiB)
BIN (icon, filename not shown, binary, 2.1 KiB)
BIN recipes/icons/moneynews.png (new file, binary, 914 B)
BIN (icon, filename not shown, binary, 241 B)
BIN (icon, filename not shown, binary, 944 B)
BIN recipes/icons/rionegro.png (new file, binary, 817 B)
BIN recipes/icons/skylife.png (new file, binary, 3.3 KiB)
BIN recipes/icons/zaman.png (new file, binary, 999 B)
@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Almería)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Almería y el resto del mundo'
publisher = 'Ideal'
category = u'News, Politics, Spain, Almería'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = u'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = u'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = u' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/almeria/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/almeria/portada.xml' )
,(u'Local' , u'http://www.ideal.es/almeria/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/almeria/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/almeria/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/almeria/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/almeria/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/almeria/rss/feeds/costa.xml' )
,(u'Puerta Purchena' , u'http://www.ideal.es/almeria/rss/feeds/puerta_purchena.xml' )
,(u'Andalucía' , u'http://www.ideal.es/almeria/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/almeria/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/almeria/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/almeria/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/almeria/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/almeria/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/almeria/rss/feeds/contraportada.xml' )
]
@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Granada)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Granada y el resto del mundo'
publisher = 'Ideal'
category = 'News, Politics, Spain, Granada'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/granada/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/granada/portada.xml' )
,(u'Local' , u'http://www.ideal.es/granada/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/granada/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/granada/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/granada/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/granada/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/granada/rss/feeds/costa.xml' )
,(u'La Carrera' , u'http://www.ideal.es/granada/rss/feeds/la_carrera.xml' )
,(u'Puerta Real' , u'http://www.ideal.es/granada/rss/feeds/puerta_real.xml' )
,(u'Andalucía' , u'http://www.ideal.es/granada/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/granada/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/granada/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/granada/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/granada/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/granada/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/granada/rss/feeds/contraportada.xml' )
]
recipes/ideal_jaen.recipe Normal file
@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
__copyright__ = 'Josemi Liébana'
__version__ = 'v0.1'
__date__ = '5 January 2012'
'''
www.ideal.es
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Ideal(BasicNewsRecipe):
title = u'Ideal (Edición Jaén)'
__author__ = u'Josemi Liébana'
description = u'Noticias de Jaén y el resto del mundo'
publisher = 'Ideal'
category = u'News, Politics, Spain, Jaén'
publication_type = 'Newspaper'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [
dict(attrs={'id':'title'})
,dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']})
]
remove_tags = [dict(name='ul')]
remove_attributes = ['width','height']
feeds = [
(u'Última Hora' , u'http://www.ideal.es/jaen/rss/feeds/ultima.xml' )
,(u'Portada' , u'http://www.ideal.es/jaen/portada.xml' )
,(u'Local' , u'http://www.ideal.es/jaen/rss/feeds/granada.xml' )
,(u'Deportes' , u'http://www.ideal.es/jaen/rss/feeds/deportes.xml' )
,(u'Sociedad' , u'http://www.ideal.es/jaen/rss/feeds/sociedad.xml' )
,(u'Cultura' , u'http://www.ideal.es/jaen/rss/feeds/cultura.xml' )
,(u'Economía' , u'http://www.ideal.es/jaen/rss/feeds/economia.xml' )
,(u'Costa' , u'http://www.ideal.es/jaen/rss/feeds/costa.xml' )
,(u'Andalucía' , u'http://www.ideal.es/jaen/rss/feeds/andalucia.xml' )
,(u'España' , u'http://www.ideal.es/jaen/rss/feeds/espana.xml' )
,(u'Mundo' , u'http://www.ideal.es/jaen/rss/feeds/internacional.xml' )
,(u'Vivir' , u'http://www.ideal.es/jaen/rss/feeds/vivir.xml' )
,(u'Opinión' , u'http://www.ideal.es/jaen/rss/feeds/opinion.xml' )
,(u'Televisión' , u'http://www.ideal.es/jaen/rss/feeds/television.xml' )
,(u'Contraportada' , u'http://www.ideal.es/jaen/rss/feeds/contraportada.xml' )
]

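One detail worth flagging in the new Jaén edition: the Local feed still points at rss/feeds/granada.xml, which looks like a holdover from the Granada recipe this file was copied from. A few lines of Python are enough to sanity-check a feeds list like this outside calibre; the sketch below assumes the third-party feedparser package is installed and checks only a subset of the feeds:

    # quick sanity check for a recipe's feeds list; assumes the
    # third-party feedparser package (not part of calibre or this commit)
    import feedparser

    feeds = [
        (u'Portada',  u'http://www.ideal.es/jaen/portada.xml'),
        (u'Local',    u'http://www.ideal.es/jaen/rss/feeds/granada.xml'),
        (u'Deportes', u'http://www.ideal.es/jaen/rss/feeds/deportes.xml'),
    ]

    for name, url in feeds:
        d = feedparser.parse(url)
        if d.bozo and not d.entries:
            print '%-10s BROKEN (%s)' % (name, url)
        else:
            print '%-10s %d entries' % (name, len(d.entries))
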
View File

@ -1,63 +1,30 @@
-__license__   = 'GPL v3'
-__copyright__ = '2008, Derry FitzGerald'
-'''
-iht.com
-'''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ptempfile import PersistentTemporaryFile

-class InternationalHeraldTribune(BasicNewsRecipe):
-    title = u'The International Herald Tribune'
-    __author__ = 'Derry FitzGerald'
+class NYTimesGlobal(BasicNewsRecipe):
+    title = u'NY Times Global'
     language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    use_embedded_content = False

-    oldest_article = 1
-    max_articles_per_feed = 30
     no_stylesheets = True
+    auto_cleanup = True

-    remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
-                   dict(name=['form'])]
-    preprocess_regexps = [
-            (re.compile(r'<!-- webtrends.*', re.DOTALL),
-             lambda m:'</body></html>')
-            ]
-    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
-    remove_empty_feeds = True

     feeds = [
-            (u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
-            (u'Business', u'http://www.iht.com/rss/business.xml'),
-            (u'Americas', u'http://www.iht.com/rss/america.xml'),
-            (u'Europe', u'http://www.iht.com/rss/europe.xml'),
-            (u'Asia', u'http://www.iht.com/rss/asia.xml'),
-            (u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
-            (u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
-            (u'Technology', u'http://www.iht.com/rss/technology.xml'),
-            (u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
-            (u'Sports', u'http://www.iht.com/rss/sports.xml'),
-            (u'Culture', u'http://www.iht.com/rss/arts.xml'),
-            (u'Style and Design', u'http://www.iht.com/rss/style.xml'),
-            (u'Travel', u'http://www.iht.com/rss/travel.xml'),
-            (u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
-            (u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
-            (u'Properties', u'http://www.iht.com/rss/properties.xml')
-            ]
+    ('NYTimes',
+     'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
+    ('NYTimes global',
+     'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
+    ('World',
+     'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
+    ('U.S.',
+     'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
+    ('Business',
+     'http://feeds.nytimes.com/nyt/rss/Business'),
+    ('Sports',
+     'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
+    ('Technology',
+     'http://feeds.nytimes.com/nyt/rss/Technology'),
+    ]

-    temp_files = []
-    articles_are_obfuscated = True
-    masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'

-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        br.open(url)
-        response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
-        html = response1.read()
-        self.temp_files.append(PersistentTemporaryFile('_iht.html'))
-        self.temp_files[-1].write(html)
-        self.temp_files[-1].close()
-        return self.temp_files[-1].name

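The deleted International Herald Tribune recipe is a compact example of calibre's obfuscated-article machinery, which the auto_cleanup based replacement no longer needs: when articles_are_obfuscated is True, calibre routes every article URL through get_obfuscated_article() and parses the local file it returns. A minimal sketch of that contract, with placeholder title, feed and print-link regex:

    import re

    from calibre.ptempfile import PersistentTemporaryFile
    from calibre.web.feeds.news import BasicNewsRecipe

    class ObfuscatedSketch(BasicNewsRecipe):
        # minimal sketch of the hook the deleted IHT recipe used; the
        # title, feed and print-link regex are placeholders
        title = 'Obfuscated article sketch'
        articles_are_obfuscated = True
        temp_files = []
        feeds = [('News', 'http://example.com/rss.xml')]

        def get_obfuscated_article(self, url):
            # calibre calls this for every article URL and downloads
            # the returned local file instead of the original page
            br = self.get_browser()
            br.open(url)
            response = br.follow_link(url_regex=re.compile(r'.*print.*'))
            pt = PersistentTemporaryFile('_sketch.html')
            pt.write(response.read())
            pt.close()
            self.temp_files.append(pt)  # keep the file alive until conversion ends
            return pt.name
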
recipes/iktibas.recipe Normal file
View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324739406(BasicNewsRecipe):
title = u'\u0130ktibas'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'\u0130ktibas', u'http://www.iktibasdergisi.com/rss/rss.xml')]

View File

@ -1,9 +1,8 @@
 # adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
-import string, re
+import re
-from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString

 class TheIndependentNew(BasicNewsRecipe):

@ -40,7 +39,9 @@ class TheIndependentNew(BasicNewsRecipe):
     encoding = 'utf-8'
     remove_tags =[
         dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
-        dict(attrs={'class' : ['autoplay','openBiogPopup']})
+        dict(attrs={'class' : ['autoplay','openBiogPopup']}),
+        dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
+        dict(attrs={'style' : re.compile('.*')}),
         ]
     keep_only_tags =[dict(attrs={'id':'main'})]

@ -103,6 +104,12 @@ class TheIndependentNew(BasicNewsRecipe):
                 url = None
         return url

+    def populate_article_metadata(self, article, soup, first):
+        if first and hasattr(self, 'add_toc_thumbnail'):
+            picdiv = soup.find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,picdiv['src'])

     def preprocess_html(self, soup):
         #remove 'advertorial articles'

@ -114,6 +121,7 @@ class TheIndependentNew(BasicNewsRecipe):
                 return None

         items_to_extract = []
+        slideshow_elements = []

         for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
             remove = True

@ -132,6 +140,7 @@ class TheIndependentNew(BasicNewsRecipe):
             if (pattern.search(item['class'])) is not None:
                 if self._FETCH_IMAGES:
                     remove = False
+                    slideshow_elements.append(item)
                 else:
                     remove = True

@ -149,7 +158,8 @@ class TheIndependentNew(BasicNewsRecipe):
         items_to_extract = []

         if self._FETCH_IMAGES:
-            for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
+            for element in slideshow_elements:
+                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
                     if item.img is not None:
                         #use full size image
                         img = item.findNext('img')

@ -157,7 +167,7 @@ class TheIndependentNew(BasicNewsRecipe):
                         img['src'] = item['href']

                         #insert caption if available
-                        if img['title'] is not None and (len(img['title']) > 1):
+                        if img.get('title') and (len(img['title']) > 1):
                             tag = Tag(soup,'h3')
                             text = NavigableString(img['title'])
                             tag.insert(0,text)

@ -262,12 +272,15 @@ class TheIndependentNew(BasicNewsRecipe):

     def _insertRatingStars(self,soup,item):
-        if item.contents is None:
+        if item.contents is None or len(item.contents) < 1:
             return
         rating = item.contents[0]
-        if not rating.isdigit():
-            return None
-        rating = int(item.contents[0])
+        try:
+            rating = float(item.contents[0])
+        except:
+            print 'Could not convert decimal rating to star: malformatted float.'
+            return
         for i in range(1,6):
             star = Tag(soup,'img')
             if i <= rating:

@ -284,7 +297,7 @@ class TheIndependentNew(BasicNewsRecipe):
         items_to_extract = []
         for item in soup.findAll('div', attrs={'class' : 'image'}):
             img = item.findNext('img')
-            if img is not None and img['src'] is not None:
+            if img and img.get('src'):
                 # broken images still point to remote url
                 pattern = re.compile('http://www.independent.co.uk.*')
                 if pattern.match(img["src"]) is not None:

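Several of the fixes in this hunk are the same defensive idiom: on a BeautifulSoup tag, item['attr'] raises KeyError when the attribute is missing, while item.get('attr') returns None, so guards like if img.get('title') and if img and img.get('src') survive malformed markup. A standalone illustration:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    html = '<div><img src="a.jpg"/><img src="b.jpg" title="A caption"/></div>'
    soup = BeautifulSoup(html)

    for img in soup.findAll('img'):
        # img['title'] would raise KeyError on the first tag;
        # .get() just returns None, so this guard is safe for both
        if img.get('title') and len(img['title']) > 1:
            print img['title']
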
View File

@ -1,16 +1,20 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.recipes import BasicNewsRecipe

-class AdvancedUserRecipe1234144423(BasicNewsRecipe):
+class IndianapolisStar(BasicNewsRecipe):
     title = u'Indianapolis Star'
-    oldest_article = 5
+    oldest_article = 10
+    auto_cleanup = True
     language = 'en'
     __author__ = 'Owen Kelly'
     max_articles_per_feed = 100
     cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'

-    feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss&mime=XML'), (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss&mime=XML'), (u'Business Headlines', u'http://www..indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss&mime=XML'), (u'Sports Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=SPORTS&template=rss&mime=XML'), (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'), (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')]
+    feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss'),
+        (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss'),
+        (u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss'),
+        (u'Politics and Government', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS05&template=rss'),
+        (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'),
+        (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')
+        ]

     def print_version(self, url):
         return url + '&template=printart'

recipes/infra_pl.recipe Normal file
View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class INFRA(BasicNewsRecipe):
title = u'INFRA'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
language = 'pl'
no_stylesheets = True
remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]

recipes/izdiham.recipe Normal file
View File

@ -0,0 +1,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1324158549(BasicNewsRecipe):
title = u'izdiham.com'
language = 'tr'
__author__ = 'asalet_r'
oldest_article = 7
max_articles_per_feed = 20
auto_cleanup = True
feeds = [(u'\u0130zdiham', u'http://www.izdiham.com/index.php/feed')]

recipes/japan_news.recipe Normal file
View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class NewsOnJapan(BasicNewsRecipe):
title = u'News On Japan'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://newsonjapan.com/rss/top.xml'),
]

recipes/klip_me.recipe Normal file
View File

@ -0,0 +1,72 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
title = u'Klipme'
__author__ = 'Ken Sun'
publisher = 'Klip.me'
category = 'info, custom, Klip.me'
oldest_article = 365
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
remove_tags = [
dict(name='div', attrs={'id':'text_controls_toggle'})
,dict(name='script')
,dict(name='div', attrs={'id':'text_controls'})
,dict(name='div', attrs={'id':'editing_controls'})
,dict(name='div', attrs={'class':'bar bottom'})
]
use_embedded_content = False
needs_subscription = True
INDEX = u'http://www.klip.me'
LOGIN = INDEX + u'/fav/signin?callback=/fav'
feeds = [
(u'Klip.me unread', u'http://www.klip.me/fav'),
(u'Klip.me starred', u'http://www.klip.me/fav?s=starred')
]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None:
br.open(self.LOGIN)
br.select_form(nr=0)
br['Email'] = self.username
if self.password is not None:
br['Passwd'] = self.password
br.submit()
return br
def parse_index(self):
totalfeeds = []
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('table',attrs={'class':['item','item new']}):
atag = item.a
if atag and atag.has_key('href'):
url = atag['href']
articles.append({
'url' :url
})
totalfeeds.append((feedtitle, articles))
return totalfeeds
def print_version(self, url):
return 'http://www.klip.me' + url
def populate_article_metadata(self, article, soup, first):
article.title = soup.find('title').contents[0].strip()
def postprocess_html(self, soup, first_fetch):
for link_tag in soup.findAll(attrs={"id" : "story"}):
link_tag.insert(0,'<h1>'+soup.find('title').contents[0].strip()+'</h1>')
print link_tag
return soup

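The Klip.me recipe shows the usual shape of a needs_subscription recipe: calibre asks get_browser() for a logged-in browser before fetching anything. Stripped down to just the login handshake, it looks like the sketch below; the URL and the Email/Passwd field names are copied from the recipe above and would differ on any other site:

    from calibre.web.feeds.news import BasicNewsRecipe

    class LoginSketch(BasicNewsRecipe):
        # minimal needs_subscription recipe: everything except the
        # login handshake is omitted; URL and form field names follow
        # the Klip.me recipe above and are placeholders elsewhere
        title = 'Login sketch'
        needs_subscription = True
        LOGIN = 'http://www.klip.me/fav/signin?callback=/fav'

        def get_browser(self):
            br = BasicNewsRecipe.get_browser()  # mechanize browser (old-style call, as above)
            if self.username is not None:
                br.open(self.LOGIN)
                br.select_form(nr=0)            # first form on the page
                br['Email'] = self.username
                if self.password is not None:
                    br['Passwd'] = self.password
                br.submit()
            return br
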
View File

@ -1,5 +1,5 @@
 __license__ = 'GPL v3'
-__copyright__ = '2011, Attis <attis@attis.one.pl>'
+__copyright__ = '2011 Attis <attis@attis.one.pl>, 2012 Tomasz Długosz <tomek3d@gmail.com>'
 __version__ = 'v. 0.1'

 import re

@ -10,7 +10,7 @@ class KopalniaWiedzy(BasicNewsRecipe):
     publisher = u'Kopalnia Wiedzy'
     description = u'Ciekawostki ze świata nauki i techniki'
     encoding = 'utf-8'
-    __author__ = 'Attis'
+    __author__ = 'Attis & Tomasz Długosz'
     language = 'pl'
     oldest_article = 7
     max_articles_per_feed = 100

@ -18,9 +18,9 @@ class KopalniaWiedzy(BasicNewsRecipe):
     remove_javascript = True
     no_stylesheets = True

-    remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
+    remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
     remove_tags_after = dict(attrs={'class':'ad-square'})
-    keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
+    keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
     extra_css = '.topimage {margin-top: 30px}'

     preprocess_regexps = [

recipes/kosmonauta.recipe Normal file
View File

@ -0,0 +1,14 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Kosmonauta(BasicNewsRecipe):
title = u'Kosmonauta.net'
__author__ = 'fenuks'
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy'
language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')]

View File

@ -1,10 +1,9 @@
 __license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.la-razon.com
 '''
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe

 class LaRazon_Bol(BasicNewsRecipe):

@ -16,19 +15,17 @@ class LaRazon_Bol(BasicNewsRecipe):
     oldest_article = 1
     max_articles_per_feed = 200
     no_stylesheets = True
-    encoding = 'cp1252'
+    encoding = 'utf8'
     use_embedded_content = False
     language = 'es_BO'
     publication_type = 'newspaper'
-    delay = 1
     remove_empty_feeds = True
-    cover_url = strftime('http://www.la-razon.com/portadas/%Y%m%d_LaRazon.jpg')
-    masthead_url = 'http://www.la-razon.com/imagenes/logo.jpg'
-    extra_css = """ body{font-family: Arial,Helvetica,sans-serif }
-                    img{margin-bottom: 0.4em}
-                    .noticia-titulo{font-family: Georgia,"Times New Roman",Times,serif}
-                    .lead{font-weight: bold; font-size: 0.8em}
-                """
+    masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'
+    extra_css = """ body{font-family: Georgia,"Times New Roman",Times,serif}
+                    img{margin-bottom: 0.4em; display: block}
+                    .meta{font-size: small; font-family: Arial,Helvetica,sans-serif}
+                """
+    INDEX = 'http://www.la-razon.com/'

     conversion_options = {
         'comment'  : description

@ -37,28 +34,37 @@ class LaRazon_Bol(BasicNewsRecipe):
        , 'language' : language
        }

-    keep_only_tags = [dict(name='div', attrs={'class':['noticia-titulo','noticia-desarrollo']})]
-    remove_tags = [dict(name=['meta','link','form','iframe','embed','object'])]
+    keep_only_tags = [dict(name='div', attrs={'class':['pg-hd', 'pg-bd']})]
+    remove_tags = [
+         dict(name=['meta','link','form','iframe','embed','object'])
+        ,dict(name='div', attrs={'class':'bd'})
+        ]
     remove_attributes = ['width','height']

     feeds = [
-         (u'Editorial'     , u'http://www.la-razon.com/rss_editorial.php' )
-        ,(u'Opinión'       , u'http://www.la-razon.com/rss_opinion.php'   )
-        ,(u'Nacional'      , u'http://www.la-razon.com/rss_nacional.php'  )
-        ,(u'Economia'      , u'http://www.la-razon.com/rss_economia.php'  )
-        ,(u'Ciudades'      , u'http://www.la-razon.com/rss_ciudades.php'  )
-        ,(u'Sociedad'      , u'http://www.la-razon.com/rss_sociedad.php'  )
-        ,(u'Mundo'         , u'http://www.la-razon.com/rss_sociedad.php'  )
-        ,(u'La Revista'    , u'http://www.la-razon.com/rss_larevista.php' )
-        ,(u'Sociales'      , u'http://www.la-razon.com/rss_sociales.php'  )
-        ,(u'Mia'           , u'http://www.la-razon.com/rss_mia.php'       )
-        ,(u'Marcas'        , u'http://www.la-razon.com/rss_marcas.php'    )
-        ,(u'Escape'        , u'http://www.la-razon.com/rss_escape.php'    )
-        ,(u'El Financiero' , u'http://www.la-razon.com/rss_financiero.php')
-        ,(u'Tendencias'    , u'http://www.la-razon.com/rss_tendencias.php')
+         (u'Editorial'     , u'http://www.la-razon.com/rss/opinion/editorial/'     )
+        ,(u'Nacional'      , u'http://www.la-razon.com/rss/nacional/'              )
+        ,(u'Economia'      , u'http://www.la-razon.com/rss/economia/'              )
+        ,(u'Ciudades'      , u'http://www.la-razon.com/rss/ciudades/'              )
+        ,(u'Sociedad'      , u'http://www.la-razon.com/rss/sociedad/'              )
+        ,(u'Mundo'         , u'http://www.la-razon.com/rss/mundo/'                 )
+        ,(u'La Revista'    , u'http://www.la-razon.com/rss/la_revista/'            )
+        ,(u'Sociales'      , u'http://www.la-razon.com/rss/sociales/'              )
+        ,(u'Mia'           , u'http://www.la-razon.com/rss/suplementos/mia/'       )
+        ,(u'Marcas'        , u'http://www.la-razon.com/rss/marcas/'                )
+        ,(u'Escape'        , u'http://www.la-razon.com/rss/suplementos/escape/'    )
+        ,(u'El Financiero' , u'http://www.la-razon.com/rss/suplementos/financiero/')
+        ,(u'Tendencias'    , u'http://www.la-razon.com/rss/suplementos/tendencias/')
         ]

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         return soup

+    def get_cover_url(self):
+        soup = self.index_to_soup(self.INDEX)
+        lightbox = soup.find('div', attrs = {'class' : 'lightbox lightbox-frontpage'})
+        return lightbox.img['src']

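The new get_cover_url() will raise if the frontpage ever drops the lightbox block, since soup.find() then returns None and None.img fails. A more defensive variant of the same lookup, falling back to the declared masthead, might look like this (hypothetical class name, same selectors as the recipe above):

    from calibre.web.feeds.news import BasicNewsRecipe

    class LaRazonCoverSketch(BasicNewsRecipe):
        # hypothetical subclass; INDEX, masthead and the selector are
        # the ones from the recipe above
        title = 'La Razon cover sketch'
        INDEX = 'http://www.la-razon.com/'
        masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'

        def get_cover_url(self):
            soup = self.index_to_soup(self.INDEX)
            lightbox = soup.find('div', attrs={'class': 'lightbox lightbox-frontpage'})
            if lightbox is not None and lightbox.img is not None:
                return lightbox.img.get('src', self.masthead_url)
            return self.masthead_url  # degrade gracefully if the layout changes
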
View File

@ -1,13 +1,12 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
 __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
+description   = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

 '''
 http://www.repubblica.it/
 '''
-import re
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe

@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):
     remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

-    preprocess_regexps = [
-        (re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
-        (re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
-        (re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
-        ]

     def get_article_url(self, article):
         link = BasicNewsRecipe.get_article_url(self, article)
         if link and not '.repubblica.it/' in link:

@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
     remove_tags = [
         dict(name=['object','link','meta','iframe','embed']),
         dict(name='span',attrs={'class':'linkindice'}),
-        dict(name='div', attrs={'class':'bottom-mobile'}),
-        dict(name='div', attrs={'id':['rssdiv','blocco']}),
-        dict(name='div', attrs={'class':'utility'}),
+        dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
+        dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
+        dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
         dict(name='div', attrs={'class':'generalbox'}),
         dict(name='ul', attrs={'id':'hystory'})
         ]

     feeds = [
-        (u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
+        (u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
         (u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
         (u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
         (u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),

@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
             del item['style']
         return soup

+    def preprocess_raw_html(self, raw, url):
+        return '<html><head>'+raw[raw.find('</head>'):]

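The added preprocess_raw_html() keeps '<html><head>' plus everything from the original </head> onward, a cheap way to discard scripts and meta cruft before parsing, replacing the three regexes it deletes. One caveat: str.find() returns -1 on a miss, which would keep only the final character of the page. A standalone version with that edge guarded (the function name is mine, not calibre API):

    def strip_head(raw):
        # standalone version of the preprocess_raw_html() trick above;
        # guards the str.find() miss (-1), which would otherwise keep
        # only the final character of the page
        idx = raw.find('</head>')
        if idx == -1:
            return raw
        return '<html><head>' + raw[idx:]

    print strip_head('<html><head><script></script></head><body>x</body></html>')
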
recipes/lega_nerd.recipe Normal file
View File

@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1326135232(BasicNewsRecipe):
title = u'Lega Nerd'
description = 'nerd / geek culture, pc, comics, music, culture'
language = 'it'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Lega Nerd', u'http://feeds.feedburner.com/LegaNerd')]
__author__ = 'faber1971'
__version__ = 'v1.0'
__date__ = '9, January 2012'

recipes/lets_get_critical.recipe Normal file
View File

@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class LetsGetCritical(BasicNewsRecipe):
title = u"Let's Get Critical"
description = 'Curation / aggregation of criticisms of the arts and culture '
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://www.letsgetcritical.org'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('Architecture','architecture',30),
('Art','art',30),
('Books','books',30),
('Design','design',30),
('Digital','digital',30),
('Food','food',30),
('Movies','movies',30),
('Music','music',30),
('Television','television',30),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/category/'+tag.lower()
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
posts = soup.findAll('div',attrs={'class':'post_multi'})
if len(posts) == 0:
break
for post in posts:
dt = post.find('div',attrs={'class':'title'})
atag = dt.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(atag)
self.log('\tFound article:', title)
self.log('\t', url)
desc = post.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = post.previousSibling
# navigate up sibling to find date
while p:
if hasattr(p,'class') and p['class'] == 'singledate':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds

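For reference, the loop above builds the one structure parse_index() must return: a list of (section title, article list) pairs, where each article is a plain dict. The keys are the ones this recipe fills in; the values below are invented samples:

    # shape of a parse_index() return value, with invented sample data
    feeds = [
        ('Movies', [
            {'title':       'Example review',
             'url':         'http://www.example.com/review',
             'description': '[example.com] One-line teaser',
             'date':        'January 5, 2012'},
        ]),
    ]

    for section, articles in feeds:
        print section, '->', len(articles), 'article(s)'
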
View File

@ -41,7 +41,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
     keep_only_tags = [dict(name='div', attrs={'id':'articulo'})]
     remove_tags = [
          dict(name=['meta','link','form','iframe','embed','object','hr'])
-        ,dict(attrs={'class':['caja_fonts sin_border_bot','pub']})
+        ,dict(attrs={'class':['caja_fonts sin_border_bot','pub','twitter-share-button']})
         ]
     remove_attributes = ['width','height']

View File

@ -14,8 +14,11 @@ class WeeklyLWN(BasicNewsRecipe):
     description = 'Weekly summary of what has happened in the free software world.'
     __author__ = 'Davide Cavalca'
     language = 'en'
+    site_url = 'http://lwn.net'

-    cover_url = 'http://lwn.net/images/lcorner.png'
+    extra_css = 'pre,code,samp,kbd,tt { font-size: 80% }\nblockquote {margin-left:0 }\n* { color: black }\n'
+    cover_url = site_url + '/images/lcorner.png'
     #masthead_url = 'http://lwn.net/images/lcorner.png'
     publication_type = 'magazine'

@ -43,11 +46,29 @@ class WeeklyLWN(BasicNewsRecipe):
             br.submit()
         return br

+    def print_version(self, url):
+        # Strip off anchor
+        url = url.split('#')[0]
+
+        # Prepend site_url
+        if url[0:len(self.site_url)] != self.site_url:
+            url = self.site_url + url
+
+        # Append printable URL parameter
+        print_param = '?format=printable'
+        if url[-len(print_param):] != print_param:
+            url += print_param
+
+        #import sys
+        #print >>sys.stderr, "*** print_version(url):", url
+
+        return url

     def parse_index(self):
         if self.username is not None and self.password is not None:
-            index_url = 'http://lwn.net/current/bigpage?format=printable'
+            index_url = self.print_version('/current/bigpage')
         else:
-            index_url = 'http://lwn.net/free/bigpage?format=printable'
+            index_url = self.print_version('/free/bigpage')
         soup = self.index_to_soup(index_url)
         body = soup.body

@ -56,19 +77,19 @@ class WeeklyLWN(BasicNewsRecipe):
         url_re = re.compile('^/Articles/')

         while True:
-            tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
+            tag_title = body.findNext(attrs={'class':'SummaryHL'})
             if tag_title == None:
                 break

-            tag_section = tag_title.findPrevious(name='p', attrs={'class':'Cat1HL'})
+            tag_section = tag_title.findPrevious(attrs={'class':'Cat1HL'})
             if tag_section == None:
                 section = 'Front Page'
             else:
                 section = tag_section.string

-            tag_section2 = tag_title.findPrevious(name='p', attrs={'class':'Cat2HL'})
+            tag_section2 = tag_title.findPrevious(attrs={'class':'Cat2HL'})
             if tag_section2 != None:
-                if tag_section2.findPrevious(name='p', attrs={'class':'Cat1HL'}) == tag_section:
+                if tag_section2.findPrevious(attrs={'class':'Cat1HL'}) == tag_section:
                     section = "%s: %s" %(section, tag_section2.string)

             if section not in articles.keys():

@ -94,9 +115,10 @@ class WeeklyLWN(BasicNewsRecipe):
             if tag_url == None:
                 break

             article = dict(
                 title=self.tag_to_string(tag_title),
-                url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
+                url=tag_url['href'],
                 description='', content='', date='')
             articles[section].append(article)

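The print_version() added to this recipe is a small idempotent URL normaliser: drop the #anchor, qualify site-relative links, and append the printable query only if it is not already there, so feeding it its own output is harmless. The same logic as a standalone function (the name is mine):

    def printable_url(url, site_url='http://lwn.net'):
        # standalone rendering of the recipe's print_version() logic
        url = url.split('#')[0]              # strip anchor
        if not url.startswith(site_url):     # qualify relative links
            url = site_url + url
        print_param = '?format=printable'
        if not url.endswith(print_param):    # append the query exactly once
            url += print_param
        return url

    print printable_url('/Articles/123456/#Comments')
    # http://lwn.net/Articles/123456/?format=printable
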
Some files were not shown because too many files have changed in this diff.