Merge
@ -2,6 +2,7 @@
|
||||
.check-cache.pickle
|
||||
src/calibre/plugins
|
||||
resources/images.qrc
|
||||
src/calibre/ebooks/oeb/display/test/*.js
|
||||
src/calibre/manual/.build/
|
||||
src/calibre/manual/cli/
|
||||
src/calibre/manual/template_ref.rst
|
||||
@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
|
||||
resources/builtin_recipes.xml
|
||||
resources/builtin_recipes.zip
|
||||
resources/template-functions.json
|
||||
resources/display/*.js
|
||||
setup/installer/windows/calibre/build.log
|
||||
src/calibre/translations/.errors
|
||||
src/cssutils/.svn/
|
||||
|
4210
Changelog.old.yaml
4849
Changelog.yaml
@ -1,5 +1,5 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Dean Cording'
|
||||
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
|
||||
'''
|
||||
abc.net.au/news
|
||||
'''
|
||||
@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class ABCNews(BasicNewsRecipe):
|
||||
title = 'ABC News'
|
||||
__author__ = 'Dean Cording'
|
||||
__author__ = 'Pat Stapleton, Dean Cording'
|
||||
description = 'News from Australia'
|
||||
masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
|
||||
cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
|
||||
@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
|
||||
category = 'News, Australia, World'
|
||||
language = 'en_AU'
|
||||
publication_type = 'newsportal'
|
||||
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
# preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
|
||||
preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
|
||||
,'linearize_tables': False
|
||||
}
|
||||
|
||||
keep_only_tags = dict(id='article')
|
||||
keep_only_tags = [dict(attrs={'class':['article section']})]
|
||||
|
||||
remove_tags = [dict(attrs={'class':['related', 'tags']}),
|
||||
dict(id='statepromo')
|
||||
]
|
||||
remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
|
||||
'inline-content story left', 'inline-content map left contracted', 'published',
|
||||
'story-map', 'statepromo', 'topics', ]})]
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [
|
||||
('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
|
||||
('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
|
||||
('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
|
||||
('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
|
||||
('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
|
||||
('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
|
||||
('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
|
||||
('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
|
||||
('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
|
||||
('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
|
||||
('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
|
||||
('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
|
||||
('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
|
||||
('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
|
||||
('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
|
||||
('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
|
||||
('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
|
||||
('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
|
||||
('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
|
||||
('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
|
||||
]
|
||||
|
@ -1,19 +1,38 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
import re
|
||||
class Adventure_zone(BasicNewsRecipe):
|
||||
title = u'Adventure Zone'
|
||||
__author__ = 'fenuks'
|
||||
description = 'Adventure zone - adventure games from A to Z'
|
||||
category = 'games'
|
||||
language = 'pl'
|
||||
oldest_article = 15
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
oldest_article = 20
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content=False
|
||||
preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
|
||||
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
|
||||
remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'})
|
||||
remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
|
||||
remove_tags_after= dict(id='comments')
|
||||
extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
|
||||
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
|
||||
|
||||
def parse_feeds (self):
|
||||
feeds = BasicNewsRecipe.parse_feeds(self)
|
||||
soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
|
||||
tag=soup.find(name='channel')
|
||||
titles=[]
|
||||
for r in tag.findAll(name='image'):
|
||||
r.extract()
|
||||
art=tag.findAll(name='item')
|
||||
for i in art:
|
||||
titles.append(i.title.string)
|
||||
for feed in feeds:
|
||||
for article in feed.articles[:]:
|
||||
article.title=titles[feed.articles.index(article)]
|
||||
return feeds
|
||||
|
||||
|
||||
def get_cover_url(self):
|
||||
soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
|
||||
cover=soup.find(id='box_OstatninumerAZ')
|
||||
@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
|
||||
|
||||
|
||||
def skip_ad_pages(self, soup):
|
||||
skip_tag = soup.body.findAll(name='a')
|
||||
if skip_tag is not None:
|
||||
for r in skip_tag:
|
||||
if 'articles.php?' in r['href']:
|
||||
if r.strong is not None:
|
||||
word=r.strong.string
|
||||
if ('zapowied' or 'recenzj') in word:
|
||||
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
|
||||
else:
|
||||
None
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('news.php?readmore', 'print.php?type=N&item_id')
|
||||
|
||||
skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
|
||||
skip_tag = skip_tag.findAll(name='a')
|
||||
for r in skip_tag:
|
||||
if r.strong:
|
||||
word=r.strong.string
|
||||
if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
|
||||
return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
|
50
recipes/al_masry_al_youm.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
|
||||
'''
|
||||
almasryalyoum.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class TheDailyNewsEG(BasicNewsRecipe):
|
||||
title = u'al-masry al-youm'
|
||||
__author__ = 'Omm Mishmishah'
|
||||
description = 'Independent News from Egypt'
|
||||
masthead_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
|
||||
cover_url = 'http://www.almasryalyoum.com/sites/default/files/img/english_logo.png'
|
||||
|
||||
auto_cleanup = True
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = False
|
||||
#delay = 1
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'Independent News Egypt'
|
||||
category = 'News, Egypt, World'
|
||||
language = 'en_EG'
|
||||
publication_type = 'newsportal'
|
||||
# preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
#Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
|
||||
preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
,'linearize_tables': False
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':['article section']})]
|
||||
|
||||
remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
|
||||
'inline-content story left', 'inline-content map left contracted', 'published',
|
||||
'story-map', 'statepromo', 'topics', ]})]
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [(u'English News', u'http://www.almasryalyoum.com/en/rss_feed_term/113/rss.xml'),
|
||||
(u'News Features', u'http://www.almasryalyoum.com/en/rss_feed_term/115/rss.xml'),
|
||||
(u'Culture', u'http://www.almasryalyoum.com/en/rss_feed_term/133/rss.xml'),
|
||||
(u'Cinema', u'http://www.almasryalyoum.com/en/rss_feed_term/134/rss.xml')
|
||||
]
|
@ -10,11 +10,11 @@ class Alternet(BasicNewsRecipe):
|
||||
category = 'News, Magazine'
|
||||
description = 'News magazine and online community'
|
||||
feeds = [
|
||||
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
|
||||
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
|
||||
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
|
||||
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
|
||||
]
|
||||
(u'Front Page', u'http://feeds.feedblitz.com/alternet'),
|
||||
(u'Breaking News', u'http://feeds.feedblitz.com/alternet_breaking_news'),
|
||||
(u'Top Ten Campaigns', u'http://feeds.feedblitz.com/alternet_top_10_campaigns'),
|
||||
(u'Special Coverage Areas', u'http://feeds.feedblitz.com/alternet_coverage')
|
||||
]
|
||||
remove_attributes = ['width', 'align','cellspacing']
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
@ -36,3 +36,5 @@ class Alternet(BasicNewsRecipe):
|
||||
self.temp_files[-1].write(html)
|
||||
self.temp_files[-1].close()
|
||||
return self.temp_files[-1].name
|
||||
|
||||
conversion_options = {'linearize_tables': True}
|
||||
|
@ -11,7 +11,6 @@ class AssociatedPress(BasicNewsRecipe):
|
||||
language = 'en'
|
||||
no_stylesheets = True
|
||||
max_articles_per_feed = 15
|
||||
html2lrf_options = ['--force-page-break-before-tag="chapter"']
|
||||
|
||||
|
||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
|
@ -1,5 +1,4 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AstroNEWS(BasicNewsRecipe):
|
||||
title = u'AstroNEWS'
|
||||
__author__ = 'fenuks'
|
||||
@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
max_articles_per_feed = 100
|
||||
auto_cleanup = True
|
||||
#extra_css= 'table {text-align: left;}'
|
||||
no_stylesheets=True
|
||||
cover_url='http://news.astronet.pl/img/logo_news.jpg'
|
||||
# no_stylesheets= True
|
||||
remove_tags=[dict(name='hr')]
|
||||
feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(align=True):
|
||||
del item['align']
|
||||
return soup
|
||||
|
@ -1,61 +1,648 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
##
|
||||
## Title: BBC News, Sport, and Blog Calibre Recipe
|
||||
## Contact: mattst - jmstanfield@gmail.com
|
||||
##
|
||||
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
|
||||
## Copyright: mattst - jmstanfield@gmail.com
|
||||
##
|
||||
## Written: November 2011
|
||||
## Last Edited: 2011-11-19
|
||||
##
|
||||
|
||||
__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
|
||||
__copyright__ = 'mattst - jmstanfield@gmail.com'
|
||||
|
||||
|
||||
'''
|
||||
news.bbc.co.uk
|
||||
BBC News, Sport, and Blog Calibre Recipe
|
||||
'''
|
||||
|
||||
# Import the regular expressions module.
|
||||
import re
|
||||
|
||||
# Import the BasicNewsRecipe class which this class extends.
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class BBC(BasicNewsRecipe):
|
||||
title = 'BBC News'
|
||||
__author__ = 'Darko Miletic, Starson17'
|
||||
description = 'News from UK. '
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
#delay = 1
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'BBC'
|
||||
category = 'news, UK, world'
|
||||
language = 'en_GB'
|
||||
publication_type = 'newsportal'
|
||||
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
|
||||
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
,'linearize_tables': True
|
||||
class BBCNewsSportBlog(BasicNewsRecipe):
|
||||
|
||||
#
|
||||
# **** IMPORTANT USERS READ ME ****
|
||||
#
|
||||
# First select the feeds you want then scroll down below the feeds list
|
||||
# and select the values you want for the other user preferences, like
|
||||
# oldest_article and such like.
|
||||
#
|
||||
#
|
||||
# Select the BBC rss feeds which you want in your ebook.
|
||||
# Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'.
|
||||
#
|
||||
# Eg. ("News Home", "http://feeds.bbci.co.uk/... - include feed.
|
||||
# Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
|
||||
#
|
||||
# There are 68 feeds below which constitute the bulk of the available rss
|
||||
# feeds on the BBC web site. These include 5 blogs by editors and
|
||||
# correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
|
||||
# Wales, Scotland Business), and 7 Welsh language feeds.
|
||||
#
|
||||
# Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
|
||||
# so if "oldest_article = 1.5" (only articles published in the last 36 hours)
|
||||
# you may get some 'empty feeds' which will not then be included in the ebook.
|
||||
#
|
||||
# The 15 feeds currently selected below are simply my default ones.
|
||||
#
|
||||
# Note: With all 68 feeds selected, oldest_article set to 2,
|
||||
# max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
|
||||
# the ebook creation took 29 minutes on my speedy 100 mbps net connection,
|
||||
# fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
|
||||
# More realistically with 15 feeds selected, oldest_article set to 1.5,
|
||||
# max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
|
||||
# it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
|
||||
#
|
||||
# Select / de-select the feeds you want in your ebook.
|
||||
#
|
||||
feeds = [
|
||||
("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
|
||||
("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
|
||||
("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
|
||||
#("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
|
||||
#("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
|
||||
#("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
|
||||
#("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
|
||||
#("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
|
||||
#("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
|
||||
#("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
|
||||
#("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
|
||||
#("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
|
||||
("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
|
||||
("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
|
||||
("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
|
||||
("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
|
||||
("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
|
||||
("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
|
||||
#("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
|
||||
#("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
|
||||
("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
|
||||
("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
|
||||
("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
|
||||
#("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
|
||||
#("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
|
||||
("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
|
||||
#("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
|
||||
#("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
|
||||
#("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
|
||||
("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
|
||||
("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
|
||||
#("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
|
||||
#("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
|
||||
#("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
|
||||
#("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
|
||||
#("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
|
||||
#("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
|
||||
#("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
|
||||
#("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
|
||||
#("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
|
||||
#("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
|
||||
#("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
|
||||
#("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
|
||||
#("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
|
||||
#("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
|
||||
#("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
|
||||
#("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
|
||||
#("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
|
||||
#("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
|
||||
#("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
|
||||
#("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
|
||||
#("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
|
||||
#("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
|
||||
#("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
|
||||
#("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
|
||||
#("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
|
||||
#("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
|
||||
#("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
|
||||
#("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
|
||||
#("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
|
||||
#("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
|
||||
#("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
|
||||
#("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
|
||||
#("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
|
||||
#("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
|
||||
#("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
|
||||
#("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
|
||||
#("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
|
||||
]
|
||||
|
||||
|
||||
# **** SELECT YOUR USER PREFERENCES ****
|
||||
|
||||
# Title to use for the ebook.
|
||||
#
|
||||
title = 'BBC News'
|
||||
|
||||
# A brief description for the ebook.
|
||||
#
|
||||
description = u'BBC web site ebook created using rss feeds.'
|
||||
|
||||
# The max number of articles which may be downloaded from each feed.
|
||||
# I've never seen more than about 70 articles in a single feed in the
|
||||
# BBC feeds.
|
||||
#
|
||||
max_articles_per_feed = 100
|
||||
|
||||
# The max age of articles which may be downloaded from each feed. This is
|
||||
# specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
|
||||
# half days). My default of 1.5 days is the last 36 hours, the point at
|
||||
# which I've decided 'news' becomes 'old news', but be warned this is not
|
||||
# so good for the blogs, technology, magazine, etc., and sports feeds.
|
||||
# You may wish to extend this to 2-5 but watch out ebook creation time will
|
||||
# increase as well. Setting this to 30 will get everything (AFAICT) as long
|
||||
# as max_articles_per_feed remains set high (except for 'Click' which is
|
||||
# v. low volume and its currently oldest article is 4th Feb 2011).
|
||||
#
|
||||
oldest_article = 1.5
|
||||
|
||||
# Number of simultaneous downloads. 20 is consistently working fine on the
|
||||
# BBC News feeds with no problems. Speeds things up from the default of 5.
|
||||
# If you have a lot of feeds and/or have increased oldest_article above 2
|
||||
# then you may wish to try increasing simultaneous_downloads to 25-30,
|
||||
# Or, of course, if you are in a hurry. [I've not tried beyond 20.]
|
||||
#
|
||||
simultaneous_downloads = 20
|
||||
|
||||
# Timeout for fetching files from the server in seconds. The default of
|
||||
# 120 seconds, seems somewhat excessive.
|
||||
#
|
||||
timeout = 30
|
||||
|
||||
# The format string for the date shown on the ebook's first page.
|
||||
# List of all values: http://docs.python.org/library/time.html
|
||||
# Default in news.py has a leading space so that's mirrored here.
|
||||
# As with 'feeds' select/de-select by adding/removing the initial '#',
|
||||
# only one timefmt should be selected, here's a few to choose from.
|
||||
#
|
||||
timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default)
|
||||
#timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30]
|
||||
#timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM]
|
||||
#timefmt = ' [%d %b %Y]' # [14 Nov 2011]
|
||||
#timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18:30]
|
||||
#timefmt = ' [%Y-%m-%d]' # [2011-11-14]
|
||||
#timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30]
|
||||
|
||||
|
||||
|
||||
#
|
||||
# **** IMPORTANT ****
|
||||
#
|
||||
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
|
||||
#
|
||||
# DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
|
||||
#
|
||||
# I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
|
||||
#
|
||||
# **** IMPORTANT ****
|
||||
#
|
||||
|
||||
|
||||
|
||||
# Author of this recipe.
|
||||
__author__ = 'mattst'
|
||||
|
||||
# Specify English as the language of the RSS feeds (ISO-639 code).
|
||||
language = 'en_GB'
|
||||
|
||||
# Set tags.
|
||||
tags = 'news, sport, blog'
|
||||
|
||||
# Set publisher and publication type.
|
||||
publisher = 'BBC'
|
||||
publication_type = 'newspaper'
|
||||
|
||||
# Disable stylesheets from site.
|
||||
no_stylesheets = True
|
||||
|
||||
# Specifies an override encoding for sites that have an incorrect charset
|
||||
# specified. Default of 'None' says to auto-detect. Some other BBC recipes
|
||||
# use 'utf8', which works fine (so use that if necessary) but auto-detecting
|
||||
# with None is working fine, so stick with that for robustness.
|
||||
encoding = None
|
||||
|
||||
# Sets whether a feed has full articles embedded in it. The BBC feeds do not.
|
||||
use_embedded_content = False
|
||||
|
||||
# Removes empty feeds - why keep them!?
|
||||
remove_empty_feeds = True
|
||||
|
||||
# Create a custom title which fits nicely in the Kindle title list.
|
||||
# Requires "import time" above class declaration, and replacing
|
||||
# title with custom_title in conversion_options (right column only).
|
||||
# Example of string below: "BBC News - 14 Nov 2011"
|
||||
#
|
||||
# custom_title = "BBC News - " + time.strftime('%d %b %Y')
|
||||
|
||||
'''
|
||||
# Conversion options for advanced users, but don't forget to comment out the
|
||||
# current conversion_options below. Avoid setting 'linearize_tables' as that
|
||||
# plays havoc with the 'old style' table based pages.
|
||||
#
|
||||
conversion_options = { 'title' : title,
|
||||
'comments' : description,
|
||||
'tags' : tags,
|
||||
'language' : language,
|
||||
'publisher' : publisher,
|
||||
'authors' : publisher,
|
||||
'smarten_punctuation' : True
|
||||
}
|
||||
'''
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['layout-block-a layout-block']})
|
||||
,dict(attrs={'class':['story-body','storybody']})
|
||||
]
|
||||
conversion_options = { 'smarten_punctuation' : True }
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
|
||||
'story-feature wide ', 'story-feature narrow']}),
|
||||
dict(id=['hypertab', 'comment-form']),
|
||||
]
|
||||
# Specify extra CSS - overrides ALL other CSS (IE. Added last).
|
||||
extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
|
||||
.introduction, .first { font-weight: bold; } \
|
||||
.cross-head { font-weight: bold; font-size: 125%; } \
|
||||
.cap, .caption { display: block; font-size: 80%; font-style: italic; } \
|
||||
.cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
|
||||
.byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
|
||||
.correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
|
||||
text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
|
||||
.story-date, .published { font-size: 80%; } \
|
||||
table { width: 100%; } \
|
||||
td img { display: block; margin: 5px auto; } \
|
||||
ul { padding-top: 10px; } \
|
||||
ol { padding-top: 10px; } \
|
||||
li { padding-top: 5px; padding-bottom: 5px; } \
|
||||
h1 { text-align: center; font-size: 175%; font-weight: bold; } \
|
||||
h2 { text-align: center; font-size: 150%; font-weight: bold; } \
|
||||
h3 { text-align: center; font-size: 125%; font-weight: bold; } \
|
||||
h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
|
||||
|
||||
remove_attributes = ['width','height']
|
||||
# Remove various tag attributes to improve the look of the ebook pages.
|
||||
remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
|
||||
'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
|
||||
|
||||
feeds = [
|
||||
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
|
||||
('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
|
||||
('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
|
||||
('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
|
||||
('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
|
||||
('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
|
||||
('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
|
||||
('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
|
||||
('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
|
||||
('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
|
||||
('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
|
||||
('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
|
||||
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
|
||||
]
|
||||
# Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
|
||||
# cause a section of the ebook to start in an unsightly fashion or, more
|
||||
# frequently, a "<br />" will muck up the formatting of a correspondent's byline.
|
||||
# "<br />" and "<br clear/>" are far more frequently used on the table formatted
|
||||
# style of pages, and really spoil the look of the ebook pages.
|
||||
preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
|
||||
(re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
|
||||
|
||||
|
||||
# Create regular expressions for tag keeping and removal to make the matches more
|
||||
# robust against minor changes and errors in the HTML, Eg. double spaces, leading
|
||||
# and trailing spaces, missing hyphens, and such like.
|
||||
# Python regular expression ('re' class) page: http://docs.python.org/library/re.html
|
||||
|
||||
# ***************************************
|
||||
# Regular expressions for keep_only_tags:
|
||||
# ***************************************
|
||||
|
||||
# The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
|
||||
# page which contains the main text of the article. Match storybody variants: 'storybody',
|
||||
# 'story-body', 'story body','storybody ', etc.
|
||||
storybody_reg_exp = '^.*story[_ -]*body.*$'
|
||||
|
||||
# The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
|
||||
# and published date. This is one level above the usual news pages which have the title
|
||||
# and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
|
||||
# resulting in a lot of extra things to be removed by remove_tags.
|
||||
blq_content_reg_exp = '^.*blq[_ -]*content.*$'
|
||||
|
||||
# The BBC has an alternative page design structure, which I suspect is an out-of-date
|
||||
# design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
|
||||
# (travel), and in some sport pages. These alternative pages are table based (which is
|
||||
# why I think they are an out-of-date design) and account for -I'm guesstimating- less
|
||||
# than 1% of all articles. They use a table class 'storycontent' to hold the article
|
||||
# and like blq_content (above) have required lots of extra removal by remove_tags.
|
||||
story_content_reg_exp = '^.*story[_ -]*content.*$'
|
||||
|
||||
# Keep the sections of the HTML which match the list below. The HTML page created by
|
||||
# Calibre will fill <body> with those sections which are matched. Note that the
|
||||
# blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
|
||||
# it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
|
||||
# will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
|
||||
# all). If they are the other way around in keep_only_tags then blq_content_reg_exp
|
||||
# will end up being discarded.
|
||||
keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]
|
||||
|
||||
# ************************************
|
||||
# Regular expressions for remove_tags:
|
||||
# ************************************
|
||||
|
||||
# Regular expression to remove share-help and variant tags. The share-help class
|
||||
# is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
|
||||
# twitter, email. Removed to avoid page clutter.
|
||||
share_help_reg_exp = '^.*share[_ -]*help.*$'
|
||||
|
||||
# Regular expression to remove embedded-hyper and variant tags. This class is used to
|
||||
# display links to other BBC News articles on the same/similar subject.
|
||||
embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
|
||||
|
||||
# Regular expression to remove hypertabs and variant tags. This class is used to
|
||||
# display a tab bar at the top of an article which allows the user to switch to
|
||||
# an article (viewed on the same page) providing further info., 'in depth' analysis,
|
||||
# an editorial, a correspondent's blog entry, and such like. The ability to handle
|
||||
# a tab bar of this nature is currently beyond the scope of this recipe and
|
||||
# possibly of Calibre itself (not sure about that - TO DO - check!).
|
||||
hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
|
||||
|
||||
# Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
|
||||
# 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
|
||||
# This class is used to add additional info. boxes, or small lists, outside of
|
||||
# the main story. TO DO: Work out a way to incorporate these neatly.
|
||||
story_feature_reg_exp = '^.*story[_ -]*feature.*$'
|
||||
|
||||
# Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
|
||||
# 'videoInStoryC'. This class is used to embed video.
|
||||
video_reg_exp = '^.*video.*$'
|
||||
|
||||
# Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
|
||||
# This class is used to embed audio.
|
||||
audio_reg_exp = '^.*audio.*$'
|
||||
|
||||
# Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
|
||||
# This class is used to embed a photo slideshow. See also 'slideshow' below.
|
||||
picture_gallery_reg_exp = '^.*picture.*$'
|
||||
|
||||
# Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
|
||||
# This class is used to embed a slideshow (not necessarily photo) but both
|
||||
# 'slideshow' and 'pictureGallery' are used for slideshows.
|
||||
slideshow_reg_exp = '^.*slide[_ -]*show.*$'
|
||||
|
||||
# Regular expression to remove social-links and variant tags. This class is used to
|
||||
# display links to a BBC blogger's main page, used in various columnists' blogs
|
||||
# (Eg. Nick Robinson, Robert Peston).
|
||||
social_links_reg_exp = '^.*social[_ -]*links.*$'
|
||||
|
||||
# Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
|
||||
# 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
|
||||
# removed by 'story-feature' removal (as they are usually within them), but
|
||||
# not always. The quotation removed is always (AFAICT) in the article text
|
||||
# as well but a 2nd copy is placed in a quote tag to draw attention to it.
|
||||
# The quote class tags may or may not appear in div's.
|
||||
quote_reg_exp = '^.*quote.*$'
|
||||
|
||||
# Regular expression to remove hidden and variant tags, Eg. 'hidden'.
|
||||
# The purpose of these is unclear, they seem to be an internal link to a
|
||||
# section within the article, but the text of the link (Eg. 'Continue reading
|
||||
# the main story') never seems to be displayed anyway. Removed to avoid clutter.
|
||||
# The hidden class tags may or may not appear in div's.
|
||||
hidden_reg_exp = '^.*hidden.*$'
|
||||
|
||||
# Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
|
||||
# Used on the site to display text about registered users entering comments.
|
||||
comment_reg_exp = '^.*comment.*$'
|
||||
|
||||
# Regular expression to remove form and variant tags, Eg. 'comment-form'.
|
||||
# Used on the site to allow registered BBC users to fill in forms, typically
|
||||
# for entering comments about an article.
|
||||
form_reg_exp = '^.*form.*$'
|
||||
|
||||
# Extra things to remove due to the addition of 'blq_content' in keep_only_tags.
|
||||
|
||||
#<div class="story-actions"> Used on sports pages for 'email' and 'print'.
|
||||
story_actions_reg_exp = '^.*story[_ -]*actions.*$'
|
||||
|
||||
#<div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
|
||||
# social networking links).
|
||||
bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
|
||||
|
||||
#<div id="secondary-content" class="content-group">
|
||||
# NOTE: Don't remove class="content-group" that is needed.
|
||||
# Used on sports pages to link to 'similar stories'.
|
||||
secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
|
||||
|
||||
#<div id="featured-content" class="content-group">
|
||||
# NOTE: Don't remove class="content-group" that is needed.
|
||||
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
|
||||
featured_content_reg_exp = '^.*featured[_ -]*content.*$'
|
||||
|
||||
#<div id="navigation">
|
||||
# Used on sports pages to link to pages like 'tables', 'fixtures', etc.
|
||||
# Used sometimes instead of "featured-content" above.
|
||||
navigation_reg_exp = '^.*navigation.*$'
|
||||
|
||||
#<a class="skip" href="#blq-container-inner">Skip to top</a>
|
||||
# Used on sports pages to link to the top of the page.
|
||||
skip_reg_exp = '^.*skip.*$'
|
||||
|
||||
# Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
|
||||
# which are the alternative table design based pages. The purpose of some of these
|
||||
# is not entirely clear from the pages (which are a total mess!).
|
||||
|
||||
# Remove mapping based tags, Eg. <map id="world_map">
|
||||
# The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
|
||||
map_reg_exp = '^.*map.*$'
|
||||
|
||||
# Remove social bookmarking variation, called 'socialBookMarks'.
|
||||
social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
|
||||
|
||||
# Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
|
||||
blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
|
||||
|
||||
# Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
|
||||
# alongside 'socialBookMarks' whenever that appears. I am removing it as well
|
||||
# under the assumption that it can appear alone as well.
|
||||
sharesb_reg_exp = '^.*sharesb.*$'
|
||||
|
||||
# Remove class 'o'. The worst named user created css class of all time. The creator
|
||||
# should immediately be fired. I've seen it used to hold nothing at all but with
|
||||
# 20 or so empty lines in it. Also to hold a single link to another article.
|
||||
# Whatever it was designed to do it is not wanted by this recipe. Exact match only.
|
||||
o_reg_exp = '^o$'
|
||||
|
||||
# Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
|
||||
# use two reg expressions to make removing this (and variants) robust.
|
||||
promo_top_reg_exp = '^.*promotopbg.*$'
|
||||
promo_bottom_reg_exp = '^.*promobottombg.*$'
|
||||
|
||||
# Remove 'nlp', provides heading for link lists. Requires an exact match due to
|
||||
# risk of matching those letters in something needed, unless I see a variation
|
||||
# of 'nlp' used at a later date.
|
||||
nlp_reg_exp = '^nlp$'
|
||||
|
||||
# Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
|
||||
# has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
|
||||
# matching those letters in something needed.
|
||||
mva_or_mvb_reg_exp = '^mv[ab]$'
|
||||
|
||||
# Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
|
||||
mvtb_reg_exp = '^mvtb$'
|
||||
|
||||
# Remove 'blq-toplink', class to provide a link to the top of the page.
|
||||
blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
|
||||
|
||||
# Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
|
||||
# Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
|
||||
# use two reg expressions to make removing this (and variants) robust.
|
||||
prods_services_01_reg_exp = '^.*servicev4.*$'
|
||||
prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
|
||||
|
||||
# Remove -what I think is- some kind of navigation tools helper class, though I am
|
||||
# not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
|
||||
# frequently and it is not wanted. Have decided to use two reg expressions to make
|
||||
# removing this (and variants) robust.
|
||||
blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
|
||||
blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
|
||||
|
||||
# Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
|
||||
# need removing - I have no clue what it does other than it contains links.
|
||||
# Whatever it is - it is not part of the article and is not wanted.
|
||||
puffbox_reg_exp = '^.*puffbox.*$'
|
||||
|
||||
# Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
|
||||
sibtbg_reg_exp = '^.*sibtbg.*$'
|
||||
|
||||
# Remove 'storyextra' - links to relevant articles and external sites.
|
||||
storyextra_reg_exp = '^.*story[_ -]*extra.*$'
|
||||
|
||||
|
||||
remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
|
||||
dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
|
||||
dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
|
||||
]
|
||||
|
||||
# Uses url to create and return the 'printer friendly' version of the url.
|
||||
# In other words the 'print this page' address of the page.
|
||||
#
|
||||
# There are 3 types of urls used in the BBC site's rss feeds. There is just
|
||||
# 1 type for the standard news while there are 2 used for sports feed urls.
|
||||
# Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
|
||||
# there is a major story of interest to 'everyone'. So even if no BBC sports
|
||||
# feeds are added to 'feeds' the logic of this method is still needed to avoid
|
||||
# blank / missing / empty articles which have an index title and then no body.
|
||||
def print_version(self, url):
    """Return the 'print this page' address for an article *url*.

    The RSS feeds link to articles through a redirect path which has to be
    stripped before the printer-friendly page can be requested:

    * sports urls, type 01: contain "go/rss/-/sport1/"
    * sports urls, type 02: contain "go/rss/int/news/-/sport1/"
    * regular news urls:    contain "go/rss/int/news/-/"

    Sports type 02 and regular news urls share the same redirect prefix, so
    they are handled by the same branch. "?print=true" is always appended.
    """
    sport_redirect = "go/rss/-/"
    news_redirect = "go/rss/int/news/-/"

    if (sport_redirect + "sport1/") in url:
        # Sports page urls, type 01.
        direct_url = url.replace(sport_redirect, "")
    else:
        # Sports page urls, type 02, and regular news page urls.
        direct_url = url.replace(news_redirect, "")

    return direct_url + "?print=true"
|
||||
|
||||
|
||||
# Remove articles in feeds based on a string in the article title or url.
|
||||
#
|
||||
# Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
|
||||
# thread, in post with title: "Remove articles from feed", see url:
|
||||
# http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
|
||||
# Many thanks and all credit to Starson17.
|
||||
#
|
||||
# Starson17's code has obviously been altered to suit my requirements.
|
||||
def parse_feeds(self):
    """Parse the feeds via the parent class, then drop unwanted articles.

    An article is removed when its title or url marks it as content that
    does not convert to a readable ebook page: video/audio only items,
    picture galleries, live sports commentary, the news quiz, and cricket
    scorecards. Returns the (filtered) list of feeds.
    """
    # Title markers matched exactly as written. The feeds prefix video and
    # audio only 'articles' with upper case 'VIDEO' / 'AUDIO'; matching the
    # case keeps articles like 'Video game banned' or 'Hi-Def audio...'.
    exact_case_markers = ('VIDEO', 'AUDIO')

    # Title markers matched in any case: photo slideshows ('In Pictures'),
    # user contributed pictures, live running commentaries ('Sportsday Live'
    # and its 'Live - Sport Name' variant; plain 'live' would match too much)
    # and the Flash based 'Quiz of the week' (and month/year variants).
    any_case_markers = ('IN PICTURES', 'YOUR PICTURES', 'SPORTSDAY LIVE',
                        'LIVE - ', 'QUIZ OF THE')

    # Call parent's method.
    parsed_feeds = BasicNewsRecipe.parse_feeds(self)

    for feed in parsed_feeds:
        # Walk over a copy, since articles are removed from the real list.
        for article in list(feed.articles):
            headline = article.title
            unwanted = (
                any(marker in headline for marker in exact_case_markers)
                or any(marker in headline.upper() for marker in any_case_markers)
                # Pages with 'scorecards' in the url only show a cricket
                # scorecard, a mass of tables and css that does not format.
                or 'scorecards' in article.url
            )
            if unwanted:
                feed.articles.remove(article)

    return parsed_feeds
|
||||
|
||||
# End of class and file.
|
||||
|
@ -1,61 +1,44 @@
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
|
||||
|
||||
class SportsIllustratedRecipe(BasicNewsRecipe) :
|
||||
__author__ = 'ape'
|
||||
__copyright__ = 'ape'
|
||||
__author__ = 'a.peter'
|
||||
__copyright__ = 'a.peter'
|
||||
__license__ = 'GPL v3'
|
||||
language = 'de'
|
||||
description = 'Berliner Zeitung'
|
||||
version = 2
|
||||
description = 'Berliner Zeitung RSS'
|
||||
version = 4
|
||||
title = u'Berliner Zeitung'
|
||||
timefmt = ' [%d.%m.%Y]'
|
||||
|
||||
#oldest_article = 7.0
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
publication_type = 'newspaper'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
|
||||
remove_tags_before = dict(name='div', attrs={'class':'newstype'})
|
||||
remove_tags_after = [dict(id='article_text')]
|
||||
|
||||
INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
|
||||
|
||||
def parse_index(self):
    """Build the article index by scraping the site's department pages.

    Loads the page at self.INDEX, follows every link found in the first
    'ressortlist' <ul> (the list of departments), and collects the article
    teasers found inside the 'mainbar' div(s) of each department page.

    Returns a list of [feed_title, articles] pairs in the order the
    departments were first seen. Each article is a dict with the keys
    'title', 'date', 'url' and 'description'.

    Raises IndexError if the index page contains no 'ressortlist' <ul>.
    """
    base = 'http://www.berlinonline.de'
    feed_order = []
    articles = {}
    # Running number used to title teasers that have no headline.
    more = 1

    soup = self.index_to_soup(self.INDEX)

    # Get list of links to departments from index page
    ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
    for ressort in ressort_list[0].findAll('a'):
        feed_title = ressort.string
        # Single-argument form prints the same text on Python 2 and 3.
        print('Analyzing %s' % feed_title)
        if feed_title not in articles:
            articles[feed_title] = []
            feed_order.append(feed_title)
        # Load department page.
        feed = self.index_to_soup(base + ressort['href'])
        # find mainbar div which contains the list of all articles
        for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
            # iterate over all articles
            for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
                if article_teaser.h3 is not None:
                    # Regular teaser: headline and link live in the <h3>.
                    article = {'title': article_teaser.h3.a.string, 'date': u'',
                               'url': base + article_teaser.h3.a['href'], 'description': u''}
                    articles[feed_title].append(article)
                else:
                    # Skip teasers for missing photos
                    if article_teaser.div.p.contents[0].find('Foto:') > -1:
                        continue
                    article = {'title': 'Weitere Artikel ' + str(more), 'date': u'',
                               'url': base + article_teaser.div.p.a['href'], 'description': u''}
                    articles[feed_title].append(article)
                    more += 1
    # Every title in feed_order was added to articles at the same time, so
    # no membership filter is needed here.
    return [[key, articles[key]] for key in feed_order]
|
||||
feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
|
||||
(u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
|
||||
(u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
|
||||
(u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
|
||||
(u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
|
||||
(u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
|
||||
(u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
|
||||
(u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
|
||||
(u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
|
||||
(u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
|
||||
(u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
|
||||
(u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
|
||||
(u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
|
||||
(u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
|
||||
(u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
|
||||
|
||||
def get_masthead_url(self):
|
||||
# Returns a hard-coded URL of a logo image to use as the masthead.
# NOTE(review): two return statements follow each other below; only the first
# can execute. This looks like the old and new line of a diff hunk kept side
# by side - confirm which URL is the intended one.
return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
|
||||
return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
|
||||
|
||||
def print_version(self, url):
    """Return the printer-friendly address for the article at *url*.

    Every '.html' in the url is turned into ',view,printVersion.html'.
    """
    page_suffix = '.html'
    print_suffix = ',view,printVersion.html'
    return print_suffix.join(url.split(page_suffix))
|
||||
|
38
recipes/biamag.recipe
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
bianet.com.tr
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Radikal_tr(BasicNewsRecipe):
    """News recipe for the 'BiaMag' RSS feed published on bianet.org."""

    # Descriptive metadata.
    title = 'BiaMag'
    __author__ = 'Osman Kaysan'
    description = 'Independent News from Turkey'
    publisher = 'BiaMag'
    category = 'news, politics, Turkey'
    language = 'tr'
    masthead_url = 'http://bianet.org/images/biamag_logo.gif'

    # Download limits.
    oldest_article = 15
    max_articles_per_feed = 120

    no_stylesheets = True

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
        'remove_paragraph_spacing': True,
    }

    # Page clean-up: cut everything before the 'manset' div and after the
    # 'habermenu' div, then remove the listed elements from what is left.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'manset'}}
    remove_tags = [
        {'name': 'ul', 'attrs': {'class': ['altul']}},
        {'name': 'div', 'attrs': {'id': ['habermenu']}},
        {'name': 'div', 'attrs': {'class': ['mail']}},
        {'name': 'div', 'attrs': {'class': ['from']}},
    ]
    remove_tags_after = {'name': 'div', 'attrs': {'id': 'habermenu'}}

    feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]

    def preprocess_html(self, soup):
        """Pass every downloaded page through adeify_images and return the result."""
        processed = self.adeify_images(soup)
        return processed
|
38
recipes/biamag_en.recipe
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
bianet.com.tr
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Radikal_tr(BasicNewsRecipe):
    """Recipe for the English edition of Bianet (bianet.org English RSS feed)."""

    # Catalogue metadata.
    title = 'Bianet-English'
    __author__ = 'Osman Kaysan'
    publisher = 'Bianet'
    description = 'Independent News Network from Turkey(English)'
    category = 'news, politics, Turkey'
    language = 'en_TR'
    masthead_url = 'http://bianet.org/images/english_logo.gif'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 150

    no_stylesheets = True

    # Reuses the metadata above, so it must come after those attributes.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
        'remove_paragraph_spacing': True,
    }

    # Page trimming: window between the 'manset' div and the 'habermenu'
    # div, then drop the listed elements.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'manset'}}
    remove_tags_after = {'name': 'div', 'attrs': {'id': 'habermenu'}}
    remove_tags = [
        {'name': 'ul', 'attrs': {'class': ['altul']}},
        {'name': 'div', 'attrs': {'id': ['habermenu']}},
        {'name': 'div', 'attrs': {'class': ['mail']}},
        {'name': 'div', 'attrs': {'class': ['from']}},
    ]

    feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]

    def preprocess_html(self, soup):
        """Pass each downloaded page through ``adeify_images``."""
        adeified = self.adeify_images(soup)
        return adeified
|
38
recipes/bianet.recipe
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
bianet.com.tr
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Radikal_tr(BasicNewsRecipe):
    """Recipe for Bianet (Turkish edition, bianet.org RSS feed)."""

    # Catalogue metadata.
    title = 'Bianet'
    __author__ = 'Osman Kaysan'
    publisher = 'Bianet'
    description = 'Independent News from Turkey'
    category = 'news, politics, Turkey'
    language = 'tr'
    masthead_url = 'http://bianet.org/images/bianet_logo.gif'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 120

    no_stylesheets = True

    # Reuses the metadata above, so it must come after those attributes.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
        'remove_paragraph_spacing': True,
    }

    # Page trimming: window between the 'manset' div and the 'habermenu'
    # div, then drop the listed elements.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'manset'}}
    remove_tags_after = {'name': 'div', 'attrs': {'id': 'habermenu'}}
    remove_tags = [
        {'name': 'ul', 'attrs': {'class': ['altul']}},
        {'name': 'div', 'attrs': {'id': ['habermenu']}},
        {'name': 'div', 'attrs': {'class': ['mail']}},
        {'name': 'div', 'attrs': {'class': ['from']}},
    ]

    feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]

    def preprocess_html(self, soup):
        """Pass each downloaded page through ``adeify_images``."""
        adeified = self.adeify_images(soup)
        return adeified
|
19
recipes/biolog_pl.recipe
Normal file
@ -0,0 +1,19 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Biolog_pl(BasicNewsRecipe):
    """Recipe for Biolog.pl, built from the site's per-topic RSS feeds."""

    # Catalogue metadata.
    title = u'Biolog.pl'
    __author__ = 'fenuks'
    description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
    category = 'biology'
    language = 'pl'
    cover_url = 'http://www.biolog.pl/naukowy,portal,biolog.png'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    no_stylesheets = True

    # Page trimming: window between the element with id 'main' and the
    # 'komentarze' anchor, then drop the 'Komentarze' image.
    remove_tags_before = {'id': 'main'}
    remove_tags_after = {'name': 'a', 'attrs': {'name': 'komentarze'}}
    remove_tags = [{'name': 'img', 'attrs': {'alt': 'Komentarze'}}]

    feeds = [
        (u'Wszystkie', u'http://www.biolog.pl/backend.php'),
        (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'),
        (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'),
        (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'),
        (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'),
        (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'),
        (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php'),
    ]
|
50
recipes/birgun_gazetesi.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Birgun(BasicNewsRecipe):
    """Recipe for the Turkish newspaper Birgun, built from birgun.net RSS feeds."""

    # Catalogue metadata.
    title = u'Birgün Gazetesi'
    __author__ = u'Osman Kaysan'
    description = 'Birgun gazatesi haberleri, kose yazarlari'
    publisher = 'Birgün'
    category = 'news,haberler,turkce,gazete,birgun'
    language = 'tr'
    publication_type = 'newspaper'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 150
    use_embedded_content = False
    remove_empty_feeds = True

    no_stylesheets = True

    # Reuses the metadata above, so it must come after those attributes.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
        'remove_paragraph_spacing': True,
    }

    # ``cover_img_url`` is kept for backward compatibility, but calibre
    # takes the cover from ``cover_url``; without the alias below the
    # logo was never used as a cover.
    cover_img_url = 'http://www.birgun.net/i/birgun.png'
    cover_url = cover_img_url
    masthead_url = 'http://www.birgun.net/i/birgun.png'

    remove_attributes = ['width', 'height']

    # Page trimming: window between the story headline and the first
    # top-aligned table row, then drop by-line and sharing widgets.
    remove_tags_before = dict(name='h2', attrs={'class': 'storyHeadline'})
    remove_tags_after = dict(name='tr', attrs={'valign': 'top'})
    remove_tags = [
        dict(name='div', attrs={'id': 'byLine'}),
        dict(name='div', attrs={'class': 'toollinks'}),
        dict(name='div', attrs={'class': 'main-lead'}),
        dict(name='div', attrs={'class': 'addthis_toolbox addthis_default_style'}),
        dict(name='a', attrs={'class': 'addthis_button'}),
    ]

    feeds = [
        (u'Güncel', u'http://www.birgun.net/actuels.xml'),
        (u'Köşe Yazarları', u'http://www.birgun.net/writer.xml'),
        (u'Politika', u'http://www.birgun.net/politics.xml'),
        (u'Ekonomi', u'http://www.birgun.net/economic.xml'),
        (u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml'),
        (u'Dünya', u'http://www.birgun.net/worlds.xml'),
        (u'Yaşam', u'http://www.birgun.net/lifes.xml'),
    ]
|
44
recipes/birmingham_post.recipe
Normal file
@ -0,0 +1,44 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Recipe for the Birmingham Post, built from birminghampost.net RSS feeds."""

    # Catalogue metadata.
    title = u'Birmingham post'
    description = 'News for Birmingham UK'
    __author__ = 'Dave Asbury'
    language = 'en_GB'
    timefmt = ''
    cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
    masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'

    # Download limits.
    oldest_article = 1
    max_articles_per_feed = 20
    remove_empty_feeds = True

    # Article clean-up is left to auto_cleanup; the manual tag filters are
    # intentionally empty.
    remove_javascript = True
    auto_cleanup = True
    keep_only_tags = []
    remove_tags = []

    feeds = [
        (u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
        (u'Sports', u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
        (u'Bloggs & Comments', u'http://www.birminghampost.net/comment/rss.xml'),
    ]

    # Fixed: the body rule used to end with a stray apostrophe, which made
    # the following h1 rule unparseable, and its ``font`` shorthand listed
    # the family before the size, which is not valid CSS.
    extra_css = '''
        body {font: medium sans-serif;}
        h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
        h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
        span{ font-size:9.5px; font-weight:bold;font-style:italic}
        p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}

        '''
|
@ -1,6 +1,6 @@
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
blic.rs
|
||||
'''
|
||||
@ -73,7 +73,10 @@ class Blic(BasicNewsRecipe):
|
||||
def print_version(self, url):
    """Return the print view of an article: *url* with ``/print`` appended."""
    print_url = ''.join((url, '/print'))
    return print_url
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
return soup
|
||||
def get_cover_url(self):
    """Look up the cover image link on the blic.rs front page.

    Returns the absolute URL built from the ``href`` of the anchor with
    id ``blic_naslovna_print``, or ``None`` when no such anchor is found.
    """
    front_page = self.index_to_soup('http://www.blic.rs/')
    cover_anchor = front_page.find('a', attrs={'id':'blic_naslovna_print'})
    if not cover_anchor:
        return None
    return 'http://www.blic.rs' + cover_anchor['href']
|
||||
|
26
recipes/blues.recipe
Normal file
@ -0,0 +1,26 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
|
||||
'''
|
||||
Changelog:
|
||||
2011-11-27
|
||||
News from BluesRSS.info
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BluesRSS(BasicNewsRecipe):
    """Recipe for Blues News, read from the BluesRSS.info feed."""

    # Catalogue metadata.
    title = 'Blues News'
    __author__ = 'Oskar Kunicki'
    publisher = 'BluesRSS.info'
    description = 'Blues news from around the world'
    category = 'news, blues, USA,UK'
    language = 'en'

    # The same image serves as cover and masthead.
    cover_url = 'http://bluesrss.info/cover.jpg'
    masthead_url = 'http://bluesrss.info/cover.jpg'

    # Download limits.
    oldest_article = 5
    max_articles_per_feed = 100

    no_stylesheets = True

    # Drop the 'wp-pagenavi' div from every page.
    remove_tags = [{'name': 'div', 'attrs': {'class': 'wp-pagenavi'}}]

    feeds = [(u'News', u'http://bluesrss.info/feed/')]
|
@ -10,49 +10,39 @@ http://www.buffalonews.com/RSS/
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1298680852(BasicNewsRecipe):
|
||||
class BuffaloNews(BasicNewsRecipe):
|
||||
title = u'Buffalo News'
|
||||
oldest_article = 2
|
||||
language = 'en'
|
||||
__author__ = 'ChappyOnIce'
|
||||
__author__ = 'ChappyOnIce, Krittika Goyal'
|
||||
max_articles_per_feed = 20
|
||||
encoding = 'utf-8'
|
||||
masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
|
||||
remove_javascript = True
|
||||
extra_css = 'body {text-align: justify;}\n \
|
||||
p {text-indent: 20px;}'
|
||||
auto_cleanup = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'class':['main-content-left']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'id':['commentCount']}),
|
||||
dict(name='div', attrs={'class':['story-list-links']})
|
||||
]
|
||||
|
||||
remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})
|
||||
|
||||
feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Niagara County', u'http://www.buffalonews.com/city/communities/niagara-county/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Business', u'http://www.buffalonews.com/business/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'MoneySmart', u'http://www.buffalonews.com/business/moneysmart/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bills & NFL', u'http://www.buffalonews.com/sports/bills-nfl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Sabres & NHL', u'http://www.buffalonews.com/sports/sabres-nhl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bob DiCesare', u'http://www.buffalonews.com/sports/columns/bob-dicesare/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bucky Gleason', u'http://www.buffalonews.com/sports/columns/bucky-gleason/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Mark Gaughan', u'http://www.buffalonews.com/sports/bills-nfl/inside-the-nfl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Mike Harrington', u'http://www.buffalonews.com/sports/columns/mike-harrington/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Jerry Sullivan', u'http://www.buffalonews.com/sports/columns/jerry-sullivan/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Other Sports Columns', u'http://www.buffalonews.com/sports/columns/other-sports-columns/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Life', u'http://www.buffalonews.com/life/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bruce Andriatch', u'http://www.buffalonews.com/city/columns/bruce-andriatch/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Donn Esmonde', u'http://www.buffalonews.com/city/columns/donn-esmonde/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Rod Watson', u'http://www.buffalonews.com/city/columns/rod-watson/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Entertainment', u'http://www.buffalonews.com/entertainment/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
|
||||
feeds = [
|
||||
(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Niagara County', u'http://www.buffalonews.com/city/communities/niagara-county/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Business', u'http://www.buffalonews.com/business/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'MoneySmart', u'http://www.buffalonews.com/business/moneysmart/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bills & NFL', u'http://www.buffalonews.com/sports/bills-nfl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Sabres & NHL', u'http://www.buffalonews.com/sports/sabres-nhl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bob DiCesare', u'http://www.buffalonews.com/sports/columns/bob-dicesare/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bucky Gleason', u'http://www.buffalonews.com/sports/columns/bucky-gleason/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Mark Gaughan', u'http://www.buffalonews.com/sports/bills-nfl/inside-the-nfl/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Mike Harrington', u'http://www.buffalonews.com/sports/columns/mike-harrington/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Jerry Sullivan', u'http://www.buffalonews.com/sports/columns/jerry-sullivan/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Other Sports Columns', u'http://www.buffalonews.com/sports/columns/other-sports-columns/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Life', u'http://www.buffalonews.com/life/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Bruce Andriatch', u'http://www.buffalonews.com/city/columns/bruce-andriatch/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Donn Esmonde', u'http://www.buffalonews.com/city/columns/donn-esmonde/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Rod Watson', u'http://www.buffalonews.com/city/columns/rod-watson/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Entertainment', u'http://www.buffalonews.com/entertainment/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
|
||||
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
|
||||
]
|
||||
|
||||
|
@ -4,16 +4,16 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
catavencu.ro
|
||||
academiacatavencu.info
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Catavencu(BasicNewsRecipe):
|
||||
class AcademiaCatavencu(BasicNewsRecipe):
|
||||
title = u'Academia Ca\u0163avencu'
|
||||
__author__ = u'Silviu Cotoar\u0103'
|
||||
description = 'Tagma cum laude'
|
||||
publisher = 'Catavencu'
|
||||
publisher = u'Ca\u0163avencu'
|
||||
oldest_article = 5
|
||||
language = 'ro'
|
||||
max_articles_per_feed = 100
|
||||
@ -21,32 +21,31 @@ class Catavencu(BasicNewsRecipe):
|
||||
use_embedded_content = False
|
||||
category = 'Ziare'
|
||||
encoding = 'utf-8'
|
||||
cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
|
||||
cover_url = 'http://www.academiacatavencu.info/images/logo.png'
|
||||
|
||||
conversion_options = {
|
||||
'comments' : description
|
||||
,'tags' : category
|
||||
,'language' : language
|
||||
,'publisher' : publisher
|
||||
,'publisher' : publisher
|
||||
}
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='ul', attrs={'class':'articles'})
|
||||
dict(name='h1', attrs={'class':'art_title'}),
|
||||
dict(name='div', attrs={'class':'art_text'})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['tools']})
|
||||
, dict(name='div', attrs={'class':['share']})
|
||||
, dict(name='div', attrs={'class':['category']})
|
||||
, dict(name='div', attrs={'id':['comments']})
|
||||
dict(name='div', attrs={'class':['desp_m']})
|
||||
, dict(name='div', attrs={'id':['tags']})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'id':'comments'})
|
||||
dict(name='div', attrs={'class':['desp_m']})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Feeds', u'http://catavencu.ro/feed/rss')
|
||||
(u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
|
||||
del item['style']
|
||||
ad=soup.findAll('a')
|
||||
for r in ad:
|
||||
if 'http://www.hustla.pl' in r['href']:
|
||||
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
|
||||
r.extract()
|
||||
gallery=soup.find('div', attrs={'class':'galleryFlash'})
|
||||
if gallery:
|
||||
|
@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
|
||||
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
|
||||
no_stylesheets = True
|
||||
|
||||
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
|
||||
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
|
||||
]
|
||||
|
||||
def print_version(self, url):
|
||||
if url.find('news/article.php') >= 0:
|
||||
@ -46,16 +48,18 @@ class TheCND(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
self.log('\tFound article: ', title, 'at', url)
|
||||
date = a.nextSibling
|
||||
if re.search('cm', date):
|
||||
continue
|
||||
if (date is not None) and len(date)>2:
|
||||
if not articles.has_key(date):
|
||||
articles[date] = []
|
||||
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
|
||||
self.log('\t\tAppend to : ', date)
|
||||
|
||||
self.log('log articles', articles)
|
||||
#self.log('log articles', articles)
|
||||
mostCurrent = sorted(articles).pop()
|
||||
self.title = 'CND ' + mostCurrent
|
||||
|
||||
self.title = 'CND ' + mostCurrent
|
||||
|
||||
feeds.append((self.title, articles[mostCurrent]))
|
||||
|
||||
return feeds
|
||||
|
72
recipes/cnd_weekly.recipe
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
|
||||
'''
|
||||
cnd.org
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TheCND(BasicNewsRecipe):
    """Recipe that assembles CND Weekly issues from the cnd.org front page.

    Articles are scraped from the index page rather than from RSS and are
    grouped into one feed per issue label found next to each link.
    """

    title = 'CND Weekly'
    __author__ = 'Derek Liang'
    description = ''
    INDEX = 'http://cnd.org'
    language = 'zh'
    conversion_options = {'linearize_tables': True}

    # Page trimming: window between the 'articleHead' div and the
    # 'copyright' element, then drop the listed decorations.
    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after = dict(id='copyright')
    remove_tags = [
        dict(name='table', attrs={'align': 'right'}),
        dict(name='img', attrs={'src': 'http://my.cnd.org/images/logo.gif'}),
        dict(name='hr', attrs={}),
        dict(name='small', attrs={}),
    ]
    no_stylesheets = True

    # Strip HTML comments and any "<table width...>" block before parsing.
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
        (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
    ]

    def print_version(self, url):
        """Rewrite an article URL to the matching print.php URL.

        Everything before the first ``=`` is replaced by the print-page
        prefix; news articles and other sections use different prefixes.
        """
        if url.find('news/article.php') >= 0:
            prefix = "http://my.cnd.org/modules/news/print.php?storyid"
        else:
            prefix = "http://my.cnd.org/modules/wfsection/print.php?articleid"
        return re.sub("^[^=]*", prefix, url)

    def parse_index(self):
        """Build the feed list from the front page.

        Returns a list of ``(feed title, articles)`` tuples, one per issue
        label, newest label first (labels are ordered as plain strings).
        """
        soup = self.index_to_soup(self.INDEX)

        articles = {}
        for a in soup.findAll('a', attrs={'target': '_cnd'}):
            url = a['href']
            if url.find('article.php') < 0:
                continue
            if url.startswith('/'):
                url = 'http://cnd.org' + url
            title = self.tag_to_string(a)
            date = a.nextSibling
            # A link with no following sibling has no issue label; test for
            # that before handing the value to re.search, which would
            # otherwise raise TypeError on None.
            if date is None or not re.search('cm', date):
                continue
            self.log('\tFound article: ', title, 'at', url, '@', date)
            if len(date) > 2:
                articles.setdefault(date, []).append(
                    {'title': title, 'url': url, 'description': '', 'date': ''})
                self.log('\t\tAppend to : ', date)

        feeds = []
        for issue in sorted(articles, reverse=True):
            # NOTE(review): self.title is overwritten on every pass, so it
            # ends up naming the last (oldest) issue - confirm this is the
            # intended book title.
            self.title = 'CND ' + issue
            feeds.append((self.title, articles[issue]))

        return feeds

    def populate_article_metadata(self, article, soup, first):
        """Log the text of the article's first ``h3``; metadata is left untouched."""
        header = soup.find('h3')
        self.log('header: ' + self.tag_to_string(header))
|
||||
|
22
recipes/computerworld_pl.recipe
Normal file
@ -0,0 +1,22 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Computerworld_pl(BasicNewsRecipe):
    """Recipe for Computerworld.pl, read from the IDG news RSS feed."""

    # Catalogue metadata.
    title = u'Computerworld.pl'
    __author__ = 'fenuks'
    description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
    category = 'IT'
    language = 'pl'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True

    # Page trimming: keep the div with id 's', cut after the 'rMobi' div,
    # then drop navigation, 'rMobi' and the 'ramka_slx' table.
    keep_only_tags = [dict(name='div', attrs={'id': 's'})]
    remove_tags_after = dict(name='div', attrs={'class': 'rMobi'})
    remove_tags = [
        dict(name='div', attrs={'class': ['nnav', 'rMobi']}),
        dict(name='table', attrs={'class': 'ramka_slx'}),
    ]

    feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]

    def get_cover_url(self):
        """Take the cover from the ``img.prawo`` element on the home page.

        Returns the image's ``src`` when the element is present; otherwise
        falls back to whatever ``cover_url`` already holds (``None`` if the
        attribute is missing) instead of crashing on a missing image.
        """
        soup = self.index_to_soup('http://www.computerworld.pl/')
        cover = soup.find(name='img', attrs={'class': 'prawo'})
        # find() returns None when the layout changes; only overwrite the
        # cover when an image with a src was actually found.
        if cover is not None and cover.get('src'):
            self.cover_url = cover['src']
        return getattr(self, 'cover_url', None)
|
52
recipes/cosmopolitan_uk.recipe
Normal file
@ -0,0 +1,52 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
#from calibre import __appname__
|
||||
from calibre.utils.magick import Image
|
||||
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Recipe for Cosmopolitan UK with images converted to greyscale."""

    # Catalogue metadata.
    title = u'Cosmopolitan UK'
    description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
    __author__ = 'Dave Asbury'
    # last update 21/12/11
    # greyscale code by Starson
    language = 'en_GB'
    cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
    masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 20
    remove_empty_feeds = True

    no_stylesheets = True
    remove_javascript = True

    # Cut the competition-offer module out of the raw HTML.
    preprocess_regexps = [
        (re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->',
                    re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    keep_only_tags = [
        dict(attrs={'class': ['dateAuthor', 'publishDate']}),
        dict(name='div', attrs={'id': ['main_content']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['blogInfo', 'viral_toolbar', 'comment_number', 'prevEntry nav']}),
        dict(name='div', attrs={'class': 'blog_module_about_the_authors'}),
        dict(attrs={'id': ['breadcrumbs', 'comment', 'related_links_list', 'right_rail', 'content_sec_fb_more', 'content_sec_mostpopularstories', 'content-sec_fb_frame_viewfb_bot']}),
        dict(attrs={'class': ['read_liked_that_header', 'fb_back_next_area']}),
        dict(name='li', attrs={'class': 'thumb'}),
    ]

    feeds = [
        (u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'),
        (u'Men', u'http://cosmopolitan.co.uk/men/rss/'),
        (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'),
        (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'),
        (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'),
        (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'),
        (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/'),
    ]

    def postprocess_html(self, soup, first):
        """Convert every image referenced by the page to greyscale in place.

        Each ``img`` tag with a ``src`` is opened, switched to
        ``GrayscaleType`` and saved back to the same location. Returns
        *soup* unchanged.
        """
        # The old "if img < 0: raise RuntimeError('Out of memory')" check
        # compared an Image object with an integer, which can never signal
        # a failed load, so it has been removed.
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
|
@ -5,7 +5,7 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
description = 'News as provide by The Daily Mirror -UK'
|
||||
|
||||
__author__ = 'Dave Asbury'
|
||||
# last updated 30/10/11
|
||||
# last updated 26/12/11
|
||||
language = 'en_GB'
|
||||
|
||||
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
|
||||
@ -13,30 +13,22 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
|
||||
|
||||
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 30
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 20
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
'''
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='div',attrs={'id' : 'body-content'})
|
||||
]
|
||||
|
||||
remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
|
||||
|
||||
auto_cleanup = True
|
||||
remove_tags = [
|
||||
dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
|
||||
dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
|
||||
dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
|
||||
dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
|
||||
dict(name='title'),
|
||||
dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
(re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: '')]
|
||||
|
||||
|
||||
feeds = [
|
||||
@ -53,5 +45,10 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
|
||||
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
|
||||
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
|
||||
|
||||
]
|
||||
extra_css = '''
|
||||
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
|
||||
h1{ font-size:18px;}
|
||||
img { display:block}
|
||||
'''
|
||||
|
||||
|
18
recipes/daily_writing_tips.recipe
Normal file
@ -0,0 +1,18 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class DailyWritingTips(BasicNewsRecipe):
    """Recipe for Daily Writing Tips, using the content embedded in its feed."""

    # Catalogue metadata.
    title = u'Daily Writing Tips'
    __author__ = 'NotTaken'
    language = 'en_GB'
    encoding = 'utf-8'

    # Download limits.
    oldest_article = 7  # days
    max_articles_per_feed = 40

    # Articles come from the feed body itself, without automatic clean-up.
    use_embedded_content = True
    auto_cleanup = False
    no_stylesheets = True

    feeds = [
        ('Latest tips', 'http://feeds2.feedburner.com/DailyWritingTips'),
    ]
|
15
recipes/datasport.recipe
Normal file
@ -0,0 +1,15 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'faber1971'
|
||||
description = 'Italian soccer news website - v1.00 (17, December 2011)'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1324114272(BasicNewsRecipe):
    """Recipe for Datasport, read from the datasport.it calcio RSS feed."""

    # Catalogue metadata.
    title = u'Datasport'
    __author__ = 'faber1971'
    language = 'it'

    # Download limits.
    oldest_article = 1
    max_articles_per_feed = 100

    auto_cleanup = True

    feeds = [
        (u'Datasport', u'http://www.datasport.it/calcio/rss.xml'),
    ]
|
11
recipes/derin_dusunce.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324913694(BasicNewsRecipe):
    """Recipe for Derin Dusunce, read from the derindusunce.org feed."""

    # Catalogue metadata.
    title = u'Derin Dusunce'
    __author__ = 'asalet_r'
    language = 'tr'

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 20

    auto_cleanup = True

    feeds = [
        (u'Derin D\xfc\u015f\xfcnce', u'http://www.derindusunce.org/feed/'),
    ]
|
27
recipes/descopera_org.recipe
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
'''
|
||||
descopera.org
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Descopera(BasicNewsRecipe):
    """Recipe for the descopera.org article feed."""

    # Identification and metadata
    title = u'Descoperă.org'
    __author__ = 'Marius Ignătescu'
    description = 'Descoperă. Placerea de a cunoaște'
    publisher = 'descopera.org'
    category = 'science, technology, culture, history, earth'
    language = 'ro'
    cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'

    # Fetch limits
    oldest_article = 14
    max_articles_per_feed = 100

    # Content handling
    encoding = 'utf8'
    no_stylesheets = True
    remove_attributes = ['width', 'height']
    # Adjacent literals concatenate to the single stylesheet string.
    extra_css = (
        ' body{ font-family: Verdana,Helvetica,Arial,sans-serif }'
        ' .introduction{font-weight: bold}'
        ' .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small}'
        ' .story-feature h2{text-align: center; text-transform: uppercase} '
    )

    # Only the post container is kept; navigation and share widgets go.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['post']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'topnav',
            'box_a',
            'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge',
        ]}),
    ]

    feeds = [
        (u'Articles', u'http://www.descopera.org/feed/'),
    ]

    def preprocess_html(self, soup):
        """Return *soup* after passing it through ``self.adeify_images``."""
        adjusted = self.adeify_images(soup)
        return adjusted
|
@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
|
||||
dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
|
||||
dict(name = 'div', attrs = {'class' : 'uniBox'}),
|
||||
dict(name = 'object', attrs = {}),
|
||||
dict(name = 'h3', attrs = {})
|
||||
dict(name = 'h3', attrs = {}),
|
||||
dict(attrs={'class':'twitter-share-button'})
|
||||
]
|
||||
|
||||
preprocess_regexps = [
|
||||
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
|
||||
(r'\s*</', lambda match: '</'),
|
||||
]
|
||||
]
|
||||
|
||||
def skip_ad_pages(self, soup):
    """Return the raw HTML of the real article when *soup* is an ad page.

    A page counts as an advertisement interstitial when
    ``'Advertisement' in soup.title`` holds.  For such a page the ``href``
    of the first ``<a>`` element is fetched through
    ``self.index_to_soup(..., raw=True)`` and the result is returned.

    Returns ``None`` for ordinary pages, and also when the page has no
    ``<title>`` or no link to follow (previously both cases raised
    ``TypeError``).
    """
    title = soup.title
    # A page without a <title> cannot match; the membership test on None
    # would raise instead of simply answering "not an ad".
    if title is None or 'Advertisement' not in title:
        return None

    anchor = soup.find('a')
    if anchor is None:
        # Nothing to follow, so leave the page to normal processing.
        return None

    return self.index_to_soup(anchor['href'], raw=True)
|
||||
|
12
recipes/dunya_bizim.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324736687(BasicNewsRecipe):
    """Recipe for the category RSS feeds of dunyabizim.com."""

    # Identification
    title = u'D\xfcnya Bizim'
    __author__ = 'asalet_r'
    language = 'tr'

    # Fetch limits
    oldest_article = 7
    max_articles_per_feed = 10

    # Content handling
    auto_cleanup = True

    # One (section title, feed URL) pair per site category.
    feeds = [
        (u'Aktif \u0130mamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=31'),
        (u'Ayr\u0131nt\u0131 Defteri', u'http://dunyabizim.com/servisler/rss.php?kategoriID=58'),
        (u'Baba Kitaplar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=4'),
        (u'Bu da Oldu', u'http://dunyabizim.com/servisler/rss.php?kategoriID=32'),
        (u'\xc7-al\u0131nt\u0131 Yaz\u0131lar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=33'),
        (u'Dar\xfclmedya', u'http://dunyabizim.com/servisler/rss.php?kategoriID=49'),
        (u'Gidenler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=59'),
        (u'G\xfczel Mekanlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=43'),
        (u'\u0130yi Haberler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=18'),
        (u'\u0130yi M\xfczikler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=2'),
        (u'Kalite Dergiler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=3'),
        (u'Konu\u015fa Konu\u015fa', u'http://dunyabizim.com/servisler/rss.php?kategoriID=24'),
        (u'M\xfcstesta G\xfczeller', u'http://dunyabizim.com/servisler/rss.php?kategoriID=65'),
        (u'O \u015eimdi Nerede?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=52'),
        (u'Olsa Ke\u015fke', u'http://dunyabizim.com/servisler/rss.php?kategoriID=34'),
        (u'Orada Ne Oldu?', u'http://dunyabizim.com/servisler/rss.php?kategoriID=38'),
        (u'\xd6nemli Adamlar', u'http://dunyabizim.com/servisler/rss.php?kategoriID=1'),
        (u'Polemik', u'http://dunyabizim.com/servisler/rss.php?kategoriID=39'),
        (u'Sinema', u'http://dunyabizim.com/servisler/rss.php?kategoriID=23'),
        (u'Yalan Haber', u'http://dunyabizim.com/servisler/rss.php?kategoriID=40'),
        (u'Yeni \u015eeyler', u'http://dunyabizim.com/servisler/rss.php?kategoriID=57'),
        (u'Zekeriya Sofras\u0131', u'http://dunyabizim.com/servisler/rss.php?kategoriID=60'),
    ]
|
12
recipes/dunya_bulteni.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1321194347(BasicNewsRecipe):
    """Recipe for the category RSS feeds of dunyabulteni.net."""

    # Identification
    title = u'D\xfcnya B\xfclteni'
    __author__ = 'asalet_r'
    language = 'tr'

    # Fetch limits
    oldest_article = 7
    max_articles_per_feed = 50

    # Content handling
    auto_cleanup = True

    # One (section title, feed URL) pair per site category.
    feeds = [
        (u'Tarih Dosyas\u0131', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=157'),
        (u'R\xf6portaj', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=153'),
        (u'Makale-Yorum', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=174'),
        (u'K\xfclt\xfcr-Sanat', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=66'),
        (u'Hayat\u0131n \u0130\xe7inden', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=200'),
        (u'Haber Analiz', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=123'),
        (u'Gezi-\u0130zlenim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=90'),
        (u'Aile Sa\u011fl\u0131k E\u011fitim', u'http://www.dunyabulteni.net/servisler/rss.php?kategoriID=75'),
    ]
|
58
recipes/dziennik_pl.recipe
Normal file
@ -0,0 +1,58 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
class Dziennik_pl(BasicNewsRecipe):
    """Recipe for dziennik.pl that stitches multi-page articles together."""

    title = u'Dziennik.pl'
    __author__ = 'fenuks'
    description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
    category = 'newspaper'
    language = 'pl'
    cover_url = 'http://6.s.dziennik.pl/images/og_dziennik.jpg'
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    remove_empty_feeds = True
    # Strip the literal "Komentarze:" label from the page source.
    preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
    keep_only_tags = [dict(id='article')]
    # NOTE: append_page() reads both the 'name' and 'attrs' keys of every
    # entry below, so each entry must define both.
    remove_tags = [
        dict(name='div', attrs={'class': ['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}),
        dict(name='a', attrs={'class': 'komentarz'}),
    ]
    feeds = [
        (u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
        (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
        (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
        (u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
        (u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
        (u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
        (u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
        (u'Muzyka', u'http://rss.dziennik.pl/Dziennik-Muzyka'),
        (u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
        (u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
        (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
        (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci'),
    ]

    def append_page(self, soup, appendtag):
        """Append the body of every follow-up page of an article to *appendtag*.

        Follows the ``a.page_next`` links starting from *soup*.  For each
        further page the ``div.article_body`` element is cleaned with the
        ``remove_tags`` rules and inserted at the end of *appendtag*.  The
        ``div.article_paginator`` elements are removed from the result, and
        the ``div.art_src`` elements already collected are dropped once the
        last page is reached.

        Missing paginator or body elements are skipped rather than raising
        ``AttributeError`` on ``None``.
        """
        paginator_attrs = {'class': 'article_paginator'}
        next_attrs = {'class': 'page_next'}

        tag = soup.find('a', attrs=next_attrs)
        if not tag:
            # Single-page article: nothing to append.
            return

        paginator = appendtag.find('div', attrs=paginator_attrs)
        if paginator is not None:
            paginator.extract()

        while tag:
            soup2 = self.index_to_soup(tag['href'])
            tag = soup2.find('a', attrs=next_attrs)
            if not tag:
                # Last page: drop the source boxes gathered so far, before
                # the final page body is appended.
                for r in appendtag.findAll('div', attrs={'class': 'art_src'}):
                    r.extract()
            pagetext = soup2.find(name='div', attrs={'class': 'article_body'})
            if pagetext is None:
                # Page without an article body: skip it, keep following links.
                continue
            for dictionary in self.remove_tags:
                for delete in pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs']):
                    delete.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

        # An appended page body may have brought its own paginator along.
        paginator = appendtag.find('div', attrs=paginator_attrs)
        if paginator is not None:
            paginator.extract()

    def preprocess_html(self, soup):
        """Merge follow-up pages into ``soup.body`` and return *soup*."""
        self.append_page(soup, soup.body)
        return soup
|
46
recipes/echo_online.recipe
Normal file
@ -0,0 +1,46 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
|
||||
'''
|
||||
Fetch echo-online.de
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Echo_Online(BasicNewsRecipe):
    """Recipe for the echo-online.de RSS feeds, fetched in print format."""

    # Identification and metadata
    title = u'Echo Online'  # 2011-12-28 AGe
    __author__ = 'Armin Geller'  # 2011-12-28 AGe
    description = '-Echo Online-'
    publisher = 'Echo Online GmbH'
    category = 'News, Germany'
    language = 'de'
    lang = 'de-DE'
    cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif'

    # Fetch limits and formatting
    oldest_article = 7
    max_articles_per_feed = 50  # 2011-12-28 AGe
    encoding = 'iso-8859-1'
    timefmt = ' [%a, %d %b %Y]'

    # Content handling
    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="bild_gross w270"]'
    remove_tags = [
        dict(name='div', attrs={'class': ["header", "name"]}),
    ]

    feeds = [
        (u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
        (u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
        (u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
        (u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
        (u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
        (u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
        (u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
        (u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
        (u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
        (u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
        (u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
    ]

    def print_version(self, url):
        """Return the print-format URL for the article at *url*.

        The feed URL is opened with ``self.browser.open_novisit`` and the
        address reported by the response's ``geturl()`` is used as the base,
        to which the print-view query string is appended.
        """
        response = self.browser.open_novisit(url)
        resolved = response.geturl()
        return resolved + '?_FRAME=33&_FORMAT=PRINT'
|
||||
|
50
recipes/edge_conversations.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012 Levien van Zon <levien@zonnetjes.net>'
|
||||
|
||||
'''
|
||||
Fetch Edge.org conversations
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class EdgeConversationRSS(BasicNewsRecipe):
    """Recipe that fetches only the conversation items of the Edge.org feed."""

    title = u'Edge.org Conversations'
    __author__ = 'levien'
    language = 'en'
    # Built from adjacent literals; the value is the original four-line text.
    description = (
        'Edge.org offers "open-minded, free ranging, intellectually\n'
        'playful ... an unadorned pleasure in curiosity, a collective expression of\n'
        'wonder at the living and inanimate world ... an ongoing and thrilling\n'
        'colloquium.'
    )
    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True

    keep_only_tags = [
        dict(name='div', attrs={'class': 'HomeLeftPannel IMGCTRL'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'Logo'}),
    ]

    feeds = [(u'Edge RSS', u'http://edge.org/feeds/')]

    def print_version(self, url):
        """Rewrite a ``conversation/`` URL to its ``conversation.php?cid=`` form."""
        pretty_part = 'conversation/'
        query_part = 'conversation.php?cid='
        return url.replace(pretty_part, query_part)

    def parse_feeds(self):
        """Return the parent's parsed feeds, pruned to conversation articles.

        An article is kept only when its title contains ``CONVERSATION`` and
        its URL does not contain ``pdf``.  Each feed's article list is
        modified in place.
        """
        parsed = BasicNewsRecipe.parse_feeds(self)

        for feed in parsed:
            # Slice assignment keeps the same list object, as remove() did.
            feed.articles[:] = [
                entry for entry in feed.articles
                if 'CONVERSATION' in entry.title and 'pdf' not in entry.url
            ]

        return parsed
|
||||
|
@ -5,12 +5,11 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '04 December 2010, desUBIKado'
|
||||
__author__ = 'desUBIKado'
|
||||
__description__ = 'Daily newspaper from Aragon'
|
||||
__version__ = 'v0.07'
|
||||
__date__ = '06, February 2011'
|
||||
__version__ = 'v0.08'
|
||||
__date__ = '13, November 2011'
|
||||
'''
|
||||
elperiodicodearagon.com
|
||||
'''
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
|
||||
description = u'Noticias desde Aragon'
|
||||
publisher = u'elperiodicodearagon.com'
|
||||
category = u'news, politics, Spain, Aragon'
|
||||
oldest_article = 2
|
||||
oldest_article = 1
|
||||
delay = 0
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'es'
|
||||
encoding = 'utf8'
|
||||
encoding = 'iso-8859-1'
|
||||
remove_empty_feeds = True
|
||||
remove_javascript = True
|
||||
|
||||
@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
|
||||
}
|
||||
|
||||
feeds = [
|
||||
(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
|
||||
(u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
|
||||
(u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
|
||||
(u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
|
||||
(u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
|
||||
(u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
|
||||
(u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
|
||||
(u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
|
||||
(u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
|
||||
(u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
|
||||
(u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
|
||||
(u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
|
||||
(u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
|
||||
(u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
|
||||
(u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
|
||||
(u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
|
||||
(u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
|
||||
(u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
|
||||
(u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
|
||||
(u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
|
||||
(u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
|
||||
(u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
|
||||
(u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
|
||||
(u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
|
||||
(u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
|
||||
(u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
|
||||
]
|
||||
|
||||
|
||||
extra_css = '''
|
||||
h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
|
||||
h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
|
||||
h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
|
||||
.columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
|
||||
img{margin-bottom: 0.4em}
|
||||
'''
|
||||
|
||||
remove_attributes = ['height','width']
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]
|
||||
|
||||
|
||||
# Quitar toda la morralla
|
||||
|
||||
remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
|
||||
dict(name='span', attrs={'class':'MasInformacion '}),
|
||||
dict(name='span', attrs={'class':'MasInformacion'}),
|
||||
dict(name='div', attrs={'class':'Middle'}),
|
||||
dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
|
||||
dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
|
||||
dict(name='div', attrs={'class':'MenuEquipo'}),
|
||||
dict(name='div', attrs={'class':'TemasRelacionados'}),
|
||||
dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
|
||||
dict(name='div', attrs={'class':'Recorte'}),
|
||||
dict(name='div', attrs={'id':'NoticiasenRecursos'}),
|
||||
dict(name='div', attrs={'id':'NoticiaEnPapel'}),
|
||||
dict(name='p', attrs={'class':'RecorteEnNoticias'}),
|
||||
dict(name='div', attrs={'id':'Comparte'}),
|
||||
dict(name='div', attrs={'id':'CajaComparte'}),
|
||||
dict(name='a', attrs={'class':'EscribirComentario'}),
|
||||
dict(name='a', attrs={'class':'AvisoComentario'}),
|
||||
dict(name='div', attrs={'class':'CajaAvisoComentario'}),
|
||||
dict(name='div', attrs={'class':'navegaNoticias'}),
|
||||
dict(name='div', attrs={'class':'Mensaje'}),
|
||||
dict(name='div', attrs={'id':'PaginadorDiCom'}),
|
||||
dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
|
||||
dict(name='div', attrs={'id':'CintilloComentario'}),
|
||||
dict(name='div', attrs={'id':'EscribeComentario'}),
|
||||
dict(name='div', attrs={'id':'FormularioComentario'}),
|
||||
dict(name='div', attrs={'id':'FormularioNormas'})]
|
||||
|
||||
# Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion)
|
||||
|
||||
def get_cover_url(self):
|
||||
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
|
||||
return image['src'].rstrip('format=2') + 'format=1'
|
||||
return None
|
||||
|
||||
# Para quitar espacios entre la noticia y los comentarios (lineas 1 y 2)
|
||||
# El indice no apuntaba correctamente al empiece de la noticia (linea 3)
|
||||
# Usamos la versión para móviles
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
|
||||
(re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
|
||||
]
|
||||
|
||||
# Para sustituir el video incrustado de YouTube por una imagen
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
|
||||
if video_yt:
|
||||
video_yt.name = 'img'
|
||||
fuente = video_yt['src']
|
||||
fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
|
||||
video_yt['src'] = fuente2 + '/0.jpg'
|
||||
|
||||
return soup
|
||||
def print_version(self, url):
    """Return *url* with the site root swapped for the mobile ("/m/") root.

    URLs that do not contain the desktop root are returned unchanged.
    """
    desktop_root = 'http://www.elperiodicodearagon.com/'
    mobile_root = 'http://www.elperiodicodearagon.com/m/'
    return url.replace(desktop_root, mobile_root)
|
||||
|
58
recipes/elet_es_irodalom.recipe
Normal file
@ -0,0 +1,58 @@
|
||||
################################################################################
|
||||
#Description: http://es.hu/ RSS channel
|
||||
#Author: Bigpapa (bigpapabig@hotmail.com)
|
||||
#Date: 2012.01.20. - V1.2
|
||||
################################################################################
|
||||
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class elet_es_irodalom(BasicNewsRecipe):
    """Recipe for es.hu sections, with an optional subscriber login."""

    # Identification and metadata
    title = u'\u00c9let \u00e9s Irodalom'
    __author__ = 'Bigpapa'
    category = 'Cikkek'
    language = 'hu'
    publication_type = 'newsportal'
    masthead_url = 'http://www.es.hu/images/logo.jpg'
    timefmt = ' [%Y %b %d, %a]'

    # Fetch limits
    oldest_article = 7
    max_articles_per_feed = 30  # Maximum number of articles per feed stored in the e-book.
    #delay = 1

    # Content handling
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'iso-8859-2'
    extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
    needs_subscription = 'optional'

    keep_only_tags = [
        dict(name='div', attrs={'class': ['doc_author', 'doc_title', 'doc']}),
    ]

    remove_tags = [
        dict(name='a', attrs={'target': ['_TOP']}),
        dict(name='div', attrs={'style': [
            'float: right; margin-left: 5px; margin-bottom: 5px;',
            'float: right; margin-left: 5px; margin-bottom: 5px;',
        ]}),
    ]

    feeds = [
        (u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
        (u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
        (u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
        (u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
        (u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
        (u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
        (u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
        (u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
        (u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml'),
    ]

    # Do not write the login credentials into this code; they are supplied
    # when the download is started.
    def get_browser(self):
        """Return a browser, logged in to es.hu when credentials are set.

        The login form ``userfrmlogin`` is submitted only when both
        ``self.username`` and ``self.password`` are not ``None``.
        """
        br = BasicNewsRecipe.get_browser()
        if self.username is None or self.password is None:
            return br

        br.open('http://www.es.hu/')
        br.select_form(name='userfrmlogin')
        br['cusername'] = self.username
        br['cpassword'] = self.password
        br.submit()
        return br
|
@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
elmundo.es
|
||||
'''
|
||||
|
||||
import re
|
||||
import time
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class ElMundo(BasicNewsRecipe):
|
||||
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'iso8859_15'
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
language = 'es'
|
||||
masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
|
||||
publication_type = 'newspaper'
|
||||
extra_css = """
|
||||
body{font-family: Arial,Helvetica,sans-serif}
|
||||
.metadata_noticia{font-size: small}
|
||||
.pestana_GDP{font-size: small; font-weight:bold}
|
||||
h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
|
||||
.hora{color: red}
|
||||
.update{color: gray}
|
||||
@ -41,22 +45,43 @@ class ElMundo(BasicNewsRecipe):
|
||||
remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
|
||||
remove_attributes = ['lang','border']
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
|
||||
,dict(name='div', attrs={'id':'modulo_multimedia' })
|
||||
dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
|
||||
,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
|
||||
,dict(name='div', attrs={'class':['num_comentarios estirar']})
|
||||
,dict(name='span', attrs={'class':['links_comentar']})
|
||||
,dict(name='div', attrs={'id':['comentar']})
|
||||
,dict(name='ul', attrs={'class':'herramientas' })
|
||||
,dict(name=['object','link','embed','iframe','base','meta'])
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
|
||||
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
|
||||
,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
|
||||
,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
|
||||
,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
|
||||
,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
|
||||
,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
|
||||
,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
|
||||
,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
|
||||
,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
|
||||
,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
|
||||
,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
|
||||
,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
|
||||
,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
|
||||
,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
|
||||
|
||||
,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
|
||||
,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
|
||||
,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
|
||||
,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )
|
||||
|
||||
,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
|
||||
,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
|
||||
,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
|
||||
,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
|
||||
,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
|
||||
,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
|
||||
,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
|
||||
,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
|
||||
,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
|
||||
,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
|
||||
,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
|
||||
,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):
|
||||
def get_article_url(self, article):
    """Return the article's ``guid`` entry, or ``None`` when it has none."""
    guid = article.get('guid')
    return guid
|
||||
|
||||
|
||||
preprocess_regexps = [
|
||||
# Para presentar la imagen de los videos incrustados
|
||||
|
||||
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
|
||||
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
|
||||
(re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
|
||||
|
||||
# Para que no salga la numeración de comentarios: 1, 2, 3 ...
|
||||
|
||||
(re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
|
||||
(re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
|
||||
]
|
||||
|
||||
# Obtener la imagen de portada
|
||||
|
||||
def get_cover_url(self):
    """Return the URL of today's front-page image, or the masthead image.

    The cover URL is built from the local date, e.g.
    ``http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg``.  The URL is
    opened once to check that it can be fetched; if opening it raises, a
    message is logged and the masthead image URL is returned instead.
    """
    # strftime() uses the current local time and zero-pads month and day.
    cover = 'http://img.kiosko.net/' + time.strftime('%Y/%m/%d') + '/es/elmundo.750.jpg'
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)
    except Exception:
        # Any fetch failure means "no cover today".  KeyboardInterrupt and
        # SystemExit are deliberately not caught so a download can be aborted.
        self.log("\nPortada no disponible")
        cover = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
    return cover
|
||||
|
16
recipes/emuzica_pl.recipe
Normal file
@ -0,0 +1,16 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class eMuzyka(BasicNewsRecipe):
    """Recipe for the news and review feeds of emuzyka.pl."""

    # Identification and metadata
    title = u'eMuzyka'
    __author__ = 'fenuks'
    description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
    category = 'music'
    language = 'pl'
    cover_url = 'http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'

    # Fetch limits
    oldest_article = 7
    max_articles_per_feed = 100

    # Content handling
    no_stylesheets = True
    keep_only_tags = [
        dict(name='div', attrs={'id': 'news_container'}),
        dict(name='h3'),
        dict(name='div', attrs={'class': 'review_text'}),
    ]
    remove_tags = [
        dict(name='span', attrs={'id': 'date'}),
    ]

    feeds = [
        (u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'),
        (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2'),
    ]
|
@ -20,7 +20,7 @@ class ESPN(BasicNewsRecipe):
|
||||
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
needs_subscription = True
|
||||
needs_subscription = 'optional'
|
||||
encoding= 'ISO-8859-1'
|
||||
|
||||
remove_tags_before = dict(name='font', attrs={'class':'date'})
|
||||
@ -75,32 +75,30 @@ class ESPN(BasicNewsRecipe):
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser()
|
||||
br.set_handle_refresh(False)
|
||||
url = ('https://r.espn.go.com/members/v3_1/login')
|
||||
raw = br.open(url).read()
|
||||
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
|
||||
with TemporaryFile(suffix='.htm') as fname:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(raw)
|
||||
br.open_local_file(fname)
|
||||
if self.username and self.password:
|
||||
br.set_handle_refresh(False)
|
||||
url = ('https://r.espn.go.com/members/v3_1/login')
|
||||
raw = br.open(url).read()
|
||||
raw = re.sub(r'(?s)<form>.*?id="regsigninbtn".*?</form>', '', raw)
|
||||
with TemporaryFile(suffix='.htm') as fname:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(raw)
|
||||
br.open_local_file(fname)
|
||||
|
||||
br.form = br.forms().next()
|
||||
br.form.find_control(name='username', type='text').value = self.username
|
||||
br.form['password'] = self.password
|
||||
br.submit().read()
|
||||
br.open('http://espn.go.com').read()
|
||||
br.set_handle_refresh(True)
|
||||
br.form = br.forms().next()
|
||||
br.form.find_control(name='username', type='text').value = self.username
|
||||
br.form['password'] = self.password
|
||||
br.submit().read()
|
||||
br.open('http://espn.go.com').read()
|
||||
br.set_handle_refresh(True)
|
||||
return br
|
||||
|
||||
def get_article_url(self, article):
    """Return the article's ``guid`` entry, or ``None`` when it has none."""
    guid = article.get('guid')
    return guid
|
||||
|
||||
def print_version(self, url):
|
||||
|
||||
if 'eticket' in url:
|
||||
return url.partition('&')[0].replace('story?', 'print?')
|
||||
match = re.search(r'story\?(id=\d+)', url)
|
||||
|
@ -1,35 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Gerardo Diez'
|
||||
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
|
||||
__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
|
||||
__version__ = 'v1.01'
|
||||
__date__ = '13, November 2011'
|
||||
|
||||
'''
|
||||
expansion.es
|
||||
[url]http://www.expansion.com/[/url]
|
||||
'''
|
||||
|
||||
import time
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Publico(BasicNewsRecipe):
|
||||
title =u'Expansion.com'
|
||||
__author__ ='Gerardo Diez'
|
||||
publisher =u'Unidad Editorial Información Económica, S.L.'
|
||||
category ='finances, catalunya'
|
||||
oldest_article =1
|
||||
|
||||
class expansion_spanish(BasicNewsRecipe):
|
||||
__author__ ='Gerardo Diez & desUBIKado'
|
||||
description ='Financial news from Spain'
|
||||
title =u'Expansion'
|
||||
publisher =u'Unidad Editorial Internet, S.L.'
|
||||
category ='news, finances, Spain'
|
||||
oldest_article = 2
|
||||
simultaneous_downloads = 10
|
||||
max_articles_per_feed =100
|
||||
simultaneous_downloads =10
|
||||
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
|
||||
timefmt ='[%A, %d %B, %Y]'
|
||||
encoding ='latin'
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
encoding ='iso-8859-15'
|
||||
language ='es'
|
||||
remove_javascript =True
|
||||
no_stylesheets =True
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
|
||||
|
||||
remove_tags =[
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
|
||||
dict(name='span', attrs={'class':['comentarios']}),
|
||||
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
|
||||
]
|
||||
feeds =[
|
||||
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
|
||||
@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
|
||||
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
|
||||
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
|
||||
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
|
||||
|
||||
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
|
||||
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
|
||||
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
|
||||
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
|
||||
|
||||
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
|
||||
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
|
||||
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
|
||||
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
|
||||
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
|
||||
(u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
|
||||
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
|
||||
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
|
||||
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
|
||||
|
||||
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
|
||||
|
||||
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
|
||||
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
|
||||
|
||||
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
|
||||
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
|
||||
|
||||
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
|
||||
(u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
(u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
]
|
||||
|
||||
# Obtener la imagen de portada
|
||||
|
||||
def get_cover_url(self):
    """Return the URL of today's front page, with a static logo as fallback.

    The cover address is built from the local date as
    ``http://img5.kiosko.net/YYYY/MM/DD/es/expansion.750.jpg``.  The URL is
    opened once to check that it is reachable; if opening fails, a message
    is logged and a fixed Expansion logo URL is returned instead.
    """
    # Example: http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg
    cover = ('http://img5.kiosko.net/' + time.strftime('%Y/%m/%d') +
             '/es/expansion.750.jpg')
    br = BasicNewsRecipe.get_browser()
    try:
        br.open(cover)
    except Exception:
        # Best effort: any failure to open the image falls back to the
        # logo, but KeyboardInterrupt/SystemExit are no longer swallowed.
        self.log("\nPortada no disponible")
        cover = 'http://www.aproahp.org/enlaces/images/diario_expansion.gif'
    return cover
|
||||
|
||||
|
||||
|
||||
# Para que no salte la publicidad al recuperar la noticia, y que siempre se recupere
|
||||
# la página web, mando la variable "t" con la hora "linux" o "epoch" actual
|
||||
# haciendole creer al sitio web que justo se acaba de ver la publicidad
|
||||
|
||||
def print_version(self, url):
    """Return *url* with the current epoch time added as a ``t`` parameter.

    Every ``.html`` in the URL becomes ``.html?t=<seconds since epoch>``.
    According to the comment preceding this method in the recipe, the
    timestamp makes the site believe the advertisement has just been shown,
    so the article page itself is always retrieved.
    """
    stamp = int(time.time())
    return url.replace('.html', '.html?t=%d' % stamp)
|
||||
|
||||
|
||||
|
||||
_processed_links = []
|
||||
|
||||
def get_article_url(self, article):
    """Return the original article URL, or None if it should be skipped.

    Links whose last path component is ``story01.htm`` (the "feedsportal"
    form) carry the real address, escaped, in the preceding path component;
    it is decoded back to a normal ``http://`` URL.  A link that has
    already been returned for another feed yields ``None`` so the article
    is not downloaded twice.

    :param article: feed entry offering a dict-style ``get('link')``
    :return: the article URL, or ``None`` when the entry has no link or the
        link is a duplicate
    """
    link = article.get('link', None)
    if link is None:
        # The original returned the article object itself here, which is
        # not a URL; report "no link" explicitly instead.
        return None

    # Recover the original article URL from the "feedsportal" one.
    parts = link.split('/')
    if parts[-1] == "story01.htm":
        encoded = parts[-2]
        # Order matters: '0A' -> '0' must run last, otherwise the zero it
        # produces could combine with the next character into a new token.
        escapes = (
            ('0B', '.'), ('0C', '/'), ('0D', '?'), ('0E', '-'),
            ('0F', '='), ('0G', '&'), ('0N', '.com'), ('0L0S', 'www.'),
            ('0A', '0'),
        )
        for token, plain in escapes:
            encoded = encoded.replace(token, plain)
        link = "http://" + encoded

    # Drop articles that were already seen in another feed.
    if link in self._processed_links:
        return None
    self._processed_links.append(link)
    return link
|
||||
|
||||
|
||||
|
||||
# Un poco de css para mejorar la presentación de las noticias
|
||||
|
||||
extra_css = '''
|
||||
.entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
|
||||
.fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
|
||||
'''
|
||||
|
||||
|
||||
|
||||
# Para presentar la imagen de los videos incrustados
|
||||
|
||||
# Rewrite the embedded-video JavaScript so the video's still image is shown:
# the script is closed before "var imagen" (which becomes an <img src), the
# trailing '.jpg";' of the assignment is turned into the end of the tag, and
# a new commented-out script block is opened at "var id_reproductor".
preprocess_regexps = [
    (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
    # The dot is escaped so only a literal ".jpg" is matched; unescaped it
    # matched any character and replaced that character with ".".
    (re.compile(r'\.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
    (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
]
|
||||
|
30
recipes/fhm_uk.recipe
Normal file
@ -0,0 +1,30 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Recipe for FHM UK, built from three feed43.com feeds."""

    # last updated 27/12/11
    __author__ = 'Dave Asbury'
    title = u'FHM UK'
    description = 'Good News for Men'
    language = 'en_GB'

    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'

    oldest_article = 28
    max_articles_per_feed = 12
    remove_empty_feeds = True
    no_stylesheets = True
    # Left disabled by the author:
    #   auto_cleanup = True
    #   articles_are_obfuscated = True

    # Only the headline, the main image and the article containers are kept.
    keep_only_tags = [
        {'name': 'h1'},
        {'name': 'img', 'attrs': {'id': 'ctl00_Body_imgMainImage'}},
        {'name': 'div', 'attrs': {'id': ['articleLeft']}},
        {'name': 'div', 'attrs': {'class': ['imagesCenterArticle',
                                            'containerCenterArticle',
                                            'articleBody']}},
    ]

    feeds = [
        (u'From the Homepage', u'http://feed43.com/8053226782885416.xml'),
        (u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
        (u'Gaming', u'http://feed43.com/0755006465351035.xml'),
    ]
|
18
recipes/fisco_oggi.recipe
Normal file
@ -0,0 +1,18 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'faber1971'
|
||||
description = 'Website of Italian Governament Income Agency (about revenue, taxation, taxes)- v1.00 (17, December 2011)'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1324112023(BasicNewsRecipe):
    """Recipe for Fisco Oggi, one feed per taxonomy term of fiscooggi.it."""

    __author__ = 'faber1971'
    title = u'Fisco Oggi'
    language = 'it'

    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        (u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'),
        (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'),
        (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'),
        (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'),
        (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'),
        (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'),
        (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'),
        (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed'),
    ]
|
||||
|
@ -1,57 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Focus_pl(BasicNewsRecipe):
|
||||
title = u'Focus.pl'
|
||||
oldest_article = 15
|
||||
max_articles_per_feed = 100
|
||||
__author__ = 'fenuks'
|
||||
language = 'pl'
|
||||
description ='polish scientific monthly magazine'
|
||||
class FocusRecipe(BasicNewsRecipe):
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = u'intromatyk <intromatyk@gmail.com>'
|
||||
language = 'pl'
|
||||
version = 1
|
||||
|
||||
title = u'Focus'
|
||||
publisher = u'Gruner + Jahr Polska'
|
||||
category = u'News'
|
||||
description = u'Newspaper'
|
||||
category='magazine'
|
||||
cover_url=''
|
||||
remove_empty_feeds= True
|
||||
no_stylesheets=True
|
||||
remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
|
||||
remove_tags_after=dict(name='div', attrs={'class':'clear'})
|
||||
feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
|
||||
(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
|
||||
(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
|
||||
(u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
|
||||
(u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
|
||||
(u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
|
||||
(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
|
||||
(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
|
||||
(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100000
|
||||
recursions = 0
|
||||
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
encoding = 'utf-8'
|
||||
# Seems to work best, but YMMV
|
||||
simultaneous_downloads = 5
|
||||
|
||||
r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
|
||||
keep_only_tags =[]
|
||||
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))
|
||||
|
||||
remove_tags =[]
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
|
||||
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))
|
||||
|
||||
extra_css = '''
|
||||
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
|
||||
h1{text-align: left;}
|
||||
h2{font-size: medium; font-weight: bold;}
|
||||
p.lead {font-weight: bold; text-align: left;}
|
||||
.authordate {font-size: small; color: #696969;}
|
||||
.fot{font-size: x-small; color: #666666;}
|
||||
'''
|
||||
|
||||
|
||||
|
||||
]
|
||||
feeds = [
|
||||
('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
|
||||
('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
|
||||
('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
|
||||
('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
|
||||
('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
|
||||
('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
|
||||
('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
|
||||
]
|
||||
|
||||
def skip_ad_pages(self, soup):
|
||||
tag=soup.find(name='a')
|
||||
if tag:
|
||||
new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
|
||||
return new_soup
|
||||
|
||||
def append_page(self, appendtag):
|
||||
tag=appendtag.find(name='div', attrs={'class':'arrows'})
|
||||
if tag:
|
||||
nexturl='http://www.focus.pl/'+tag.a['href']
|
||||
for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
|
||||
rem.extract()
|
||||
while nexturl:
|
||||
soup2=self.index_to_soup(nexturl)
|
||||
nexturl=None
|
||||
pagetext=soup2.find(name='div', attrs={'class':'txt'})
|
||||
tag=pagetext.find(name='div', attrs={'class':'arrows'})
|
||||
for r in tag.findAll(name='a'):
|
||||
if u'Następne' in r.string:
|
||||
nexturl='http://www.focus.pl/'+r['href']
|
||||
for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
|
||||
rem.extract()
|
||||
pos = len(appendtag.contents)
|
||||
appendtag.insert(pos, pagetext)
|
||||
if ('advertisement' in soup.find('title').string.lower()):
|
||||
href = soup.find('a').get('href')
|
||||
return self.index_to_soup(href, raw=True)
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_cover_url(self):
|
||||
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
|
||||
@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
|
||||
self.cover_url='http://www.focus.pl/' + tag.a['href']
|
||||
return getattr(self, 'cover_url', self.cover_url)
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
self.append_page(soup.body)
|
||||
return soup
|
||||
def print_version(self, url):
    """Map an article URL to its ``/do-druku/1`` (print) counterpart.

    URLs served through ``focus.pl.feedsportal.com`` are first decoded:
    everything from 11 characters past the ``focus0Bpl`` marker is appended
    to ``http://www.focus.pl/``, then ``0C`` becomes ``/``, every ``A`` is
    dropped, ``0E`` becomes ``-`` and the trailing ``/nc/1//story01.htm``
    is replaced by ``/do-druku/1``.  Any other URL only has ``/nc/1``
    replaced by ``/do-druku/1``.
    """
    if 'focus.pl.feedsportal.com' not in url:
        return url.replace('/nc/1', '/do-druku/1')

    start = url.find('focus0Bpl') + 11
    decoded = 'http://www.focus.pl/' + url[start:]
    # Applied strictly in this order, as each step works on the previous
    # step's output.
    substitutions = (
        ('0C', '/'),
        ('A', ''),
        ('0E', '-'),
        ('/nc/1//story01.htm', '/do-druku/1'),
    )
    for old, new in substitutions:
        decoded = decoded.replace(old, new)
    return decoded
|
@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
|
||||
__author__ = 'fluzao'
|
||||
description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
|
||||
u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
|
||||
INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'
|
||||
|
||||
#found this to be the easiest place to find the index page (13-Nov-2011).
|
||||
# searching for the "Indice Geral" link
|
||||
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
|
||||
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
|
||||
|
||||
language = 'pt'
|
||||
no_stylesheets = True
|
||||
max_articles_per_feed = 40
|
||||
remove_javascript = True
|
||||
needs_subscription = True
|
||||
remove_tags_before = dict(name='b')
|
||||
|
||||
remove_tags_before = dict(name='p')
|
||||
remove_tags = [dict(name='td', attrs={'align':'center'})]
|
||||
remove_attributes = ['height','width']
|
||||
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
|
||||
|
||||
# fixes the problem with the section names
|
||||
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
|
||||
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
|
||||
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
|
||||
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
|
||||
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
|
||||
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
|
||||
'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
|
||||
|
||||
# this solves the problem with truncated content in Kindle
|
||||
conversion_options = {'linearize_tables' : True}
|
||||
|
||||
# this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
|
||||
# Indice e Comunicar Erros
|
||||
preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
|
||||
re.DOTALL|re.IGNORECASE), lambda match: r''),
|
||||
(re.compile(r'<BR><BR>Próximo Texto:.*<!--/NOTICIA-->',
|
||||
preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
|
||||
re.DOTALL|re.IGNORECASE), lambda match: r'')]
|
||||
|
||||
def get_browser(self):
|
||||
@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
soup = self.index_to_soup(self.INDEX)
|
||||
#Searching for the index page on the HOMEPAGE
|
||||
hpsoup = self.index_to_soup(self.HOMEPAGE)
|
||||
indexref = hpsoup.find('a', href=re.compile('^indices.*'))
|
||||
self.log('--> tag containing the today s index: ', indexref)
|
||||
INDEX = indexref['href']
|
||||
INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
|
||||
self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
|
||||
# ... and taking the opportunity to get the cover image link
|
||||
coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
|
||||
if coverurl:
|
||||
self.log('--> tag containing the today s cover: ', coverurl)
|
||||
coverurl = coverurl.replace('htm', 'jpg')
|
||||
coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
|
||||
self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
|
||||
self.cover_url = coverurl
|
||||
|
||||
#soup = self.index_to_soup(self.INDEX)
|
||||
soup = self.index_to_soup(INDEX)
|
||||
|
||||
feeds = []
|
||||
articles = []
|
||||
section_title = "Preambulo"
|
||||
@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
|
||||
self.log('--> new section title: ', section_title)
|
||||
if strpost.startswith('<a href'):
|
||||
url = post['href']
|
||||
#this bit is kept if they ever go back to the old format (pre Nov-2011)
|
||||
if url.startswith('/fsp'):
|
||||
url = 'http://www1.folha.uol.com.br'+url
|
||||
#
|
||||
if url.startswith('http://www1.folha.uol.com.br/fsp'):
|
||||
#url = 'http://www1.folha.uol.com.br'+url
|
||||
title = self.tag_to_string(post)
|
||||
self.log()
|
||||
self.log('--> post: ', post)
|
||||
@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
|
||||
# keeping the front page url
|
||||
minha_capa = feeds[0][1][1]['url']
|
||||
|
||||
# removing the 'Preambulo' section
|
||||
# removing the first section (now called 'top')
|
||||
del feeds[0]
|
||||
|
||||
# creating the url for the cover image
|
||||
coverurl = feeds[0][1][0]['url']
|
||||
coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
|
||||
coverurl = coverurl.replace('01.htm', '.jpg')
|
||||
self.cover_url = coverurl
|
||||
|
||||
# inserting the cover page as the first article (nicer for kindle users)
|
||||
feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
|
||||
return feeds
|
||||
|
||||
|
||||
|
50
recipes/formulaas.recipe
Normal file
@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2011, Silviu Cotoar\u0103'
|
||||
'''
|
||||
formula-as.ro
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class FormulaAS(BasicNewsRecipe):
    """Recipe for the Romanian publication Formula AS (formula-as.ro)."""

    __author__ = u'Silviu Cotoar\u0103'
    title = u'Formula AS'
    publisher = u'Formula AS'
    description = u'Formula AS'
    category = 'Ziare,Romania'
    language = 'ro'
    encoding = 'utf-8'
    cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'

    oldest_article = 5
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # Metadata handed to the conversion step, taken from the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'item padded'}},
    ]

    remove_tags = [
        {'name': 'ul', 'attrs': {'class': 'subtitle lower'}},
    ]

    remove_tags_after = [
        {'name': 'ul', 'attrs': {'class': 'subtitle lower'}},
        {'name': 'div', 'attrs': {'class': 'item-brief-options'}},
    ]

    feeds = [
        (u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml'),
    ]

    def preprocess_html(self, soup):
        """Return *soup* after passing it through ``adeify_images``."""
        processed = self.adeify_images(soup)
        return processed
|
@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
|
||||
max_articles_per_feed = 100
|
||||
use_embedded_content = False
|
||||
no_stylesheets = True
|
||||
|
||||
cover_url='http://www.frazpc.pl/images/logo.png'
|
||||
feeds = [
|
||||
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
|
||||
(u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
|
||||
@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':'comments_box'})
|
||||
]
|
||||
|
||||
remove_tags_after=dict(name='div', attrs={'class':'content'})
|
||||
preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]
|
||||
|
||||
remove_attributes = [ 'width', 'height' ]
|
||||
|
35
recipes/gazeta_pl_szczecin.recipe
Normal file
@ -0,0 +1,35 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
import re
|
||||
import string
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GazetaPlSzczecin(BasicNewsRecipe):
    """Recipe for the Szczecin section of Gazeta.pl.

    The single feed is served through feedsportal, so article links are
    decoded back to szczecin.gazeta.pl addresses and then mapped to a
    ``2029020,<n>,<n>.html`` page by ``print_version``.
    """

    title = u'Gazeta.pl Szczecin'
    description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
    __author__ = u'Michał Szkutnik'
    __license__ = u'GPL v3'
    language = 'pl'
    publisher = 'Agora S.A.'
    category = 'news, szczecin'
    oldest_article = 2
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
    cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
    feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]

    def get_article_url(self, article):
        """Decode the feedsportal link of *article* into the real URL.

        :param article: feed entry exposing a ``link`` attribute
        :return: ``http://szczecin...`` URL, or ``None`` when the entry has
            no link or the link is not in the expected feedsportal form
            (previously this raised AttributeError on the failed match)
        """
        link = getattr(article, 'link', None)
        if not link:
            return None
        match = re.search(r"/0L(szczecin.*)/story01.htm", link)
        if match is None:
            return None
        decoded = match.group(1)
        # '0A' -> '0' has to run last: done earlier, the zero it produces
        # could pair with the following character and form a new token.
        escapes = (("0B", "."), ("0C", "/"), ("0H", ","), ("0I", "_"),
                   ("0A", "0"))
        for token, plain in escapes:
            decoded = decoded.replace(token, plain)
        return "http://" + decoded

    def print_version(self, url):
        """Build the ``2029020,<id>,<id>.html`` page address for *url*.

        The second and third comma-separated numbers of the article URL are
        reused.  A URL that does not have that shape is returned unchanged
        instead of raising.
        """
        match = re.search(r"/(\d*),(\d*),(\d*),.*\.html", url)
        if match is None:
            return url
        return "http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html" % (
            match.group(2), match.group(3))
|
90
recipes/givemesomethingtoread.recipe
Normal file
@ -0,0 +1,90 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GiveMeSomethingToRead(BasicNewsRecipe):
    """Recipe that builds one feed per category of givemesomethingtoread.com.

    The site is scraped directly: each category's listing pages are walked
    until enough articles have been collected, and every ``h2`` headline
    that links to an external site becomes an article.
    """

    title = u'Give Me Something To Read'
    description = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://givemesomethingtoread.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('The Arts','arts',25),
        ('Science','science',30),
        ('Technology','technology',30),
        ('Politics','politics',20),
        ('Media','media',30),
        ('Crime','crime',15),
        ('Other articles','',10)
    ]

    def parse_index(self):
        """Scrape every category and return ``[(name, [article, ...]), ...]``.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys.  A URL is only reported once across all categories,
        and categories that yield no article are left out.
        """
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        seen_urls = set()
        # Captures the host name (without "www.") to prefix descriptions.
        regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)

        for (cat_name, tag, max_articles) in self.CATEGORIES:
            tagurl = '/tagged/' + tag if tag else ''
            self.log('Reading category:', cat_name)

            articles = []
            pageno = 1

            # The page limit guards against walking forever on a category
            # that never fills up.
            while len(articles) < max_articles and pageno < 100:
                if pageno > 1:
                    page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
                else:
                    page = self.INDEX + tagurl
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception:
                    # A page that cannot be fetched ends this category;
                    # what has been collected so far is kept.
                    break

                headers = soup.findAll('h2')
                if not headers:
                    break

                for header in headers:
                    atag = header.find('a')
                    # Headlines without a link cannot become articles.
                    if atag is None or not atag.get('href'):
                        continue
                    url = atag['href']
                    # skip promotionals and duplicate
                    if url.startswith(('http://givemesomethingtoread', '/')) or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)

                    desc = header.parent.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = "[%s] %s" % (m.group(2), desc)

                    # Walk back through the preceding siblings to find the
                    # h3, which contains the date.
                    date = ''
                    p = header.parent.previousSibling
                    while p:
                        if hasattr(p,'name') and p.name == 'h3':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling

                    articles.append({'title':title,'url':url,'description':desc,'date':date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GlasgowHerald(BasicNewsRecipe):
|
||||
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
|
||||
language = 'en_GB'
|
||||
|
||||
__author__ = 'Kovid Goyal'
|
||||
use_embedded_content = False
|
||||
|
||||
keep_only_tags = [dict(attrs={'class':'article'})]
|
||||
remove_tags = [
|
||||
dict(id=['pic-nav']),
|
||||
dict(attrs={'class':['comments-top']})
|
||||
]
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
#keep_only_tags = [dict(attrs={'class':'article'})]
|
||||
#remove_tags = [
|
||||
#dict(id=['pic-nav']),
|
||||
#dict(attrs={'class':['comments-top']})
|
||||
#]
|
||||
|
||||
|
||||
feeds = [
|
||||
@ -25,5 +28,4 @@ class GlasgowHerald(BasicNewsRecipe):
|
||||
(u'Arts & Entertainment',
|
||||
u'http://www.heraldscotland.com/cmlink/1.768',),
|
||||
(u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]
|
||||
|
||||
|
||||
|
||||
|
@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
|
||||
{'class':['articleTools', 'pagination', 'Ads', 'topad',
|
||||
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Register the first image of an article's first page as its thumbnail.

    Nothing happens for pages other than the first, when this recipe object
    has no ``add_toc_thumbnail`` attribute, or when the page contains no
    ``img`` tag.
    """
    if not first or not hasattr(self, 'add_toc_thumbnail'):
        return
    image = soup.find('img')
    if image is None:
        return
    self.add_toc_thumbnail(article, image['src'])
|
||||
|
||||
|
||||
#Use the mobile version rather than the web version
|
||||
def print_version(self, url):
    """Return *url* with its query string replaced by ``?service=mobile``.

    Everything after the last ``?`` is dropped.  A URL without any ``?`` is
    kept whole; previously ``rpartition`` left the base empty in that case
    and the result was just ``?service=mobile``.
    """
    base, separator, _query = url.rpartition('?')
    if not separator:
        # No query string at all: rpartition put the whole URL in the last
        # slot, so fall back to the URL itself.
        base = url
    return base + '?service=mobile'
|
||||
|
13
recipes/goal.recipe
Normal file
@ -0,0 +1,13 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1325677767(BasicNewsRecipe):
    """Recipe for the Italian edition of Goal (goal.com/it)."""

    __author__ = 'faber1971'
    title = u'Goal'
    description = 'Sports news from Italy'
    language = 'it'

    oldest_article = 1
    max_articles_per_feed = 100
    auto_cleanup = True
    remove_tags_after = [{'id': 'article_content'}]

    feeds = [
        (u'Goal', u'http://www.goal.com/it/feeds/news?fmt=rss'),
    ]
|
||||
|
@ -12,7 +12,6 @@ class GN(BasicNewsRecipe):
|
||||
EDITION = 0
|
||||
|
||||
__author__ = 'Piotr Kontek'
|
||||
title = u'Gość niedzielny'
|
||||
description = 'Weekly magazine'
|
||||
encoding = 'utf-8'
|
||||
no_stylesheets = True
|
||||
@ -20,6 +19,8 @@ class GN(BasicNewsRecipe):
|
||||
remove_javascript = True
|
||||
temp_files = []
|
||||
simultaneous_downloads = 1
|
||||
masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
|
||||
title = u'Gość niedzielny'
|
||||
|
||||
articles_are_obfuscated = True
|
||||
|
||||
@ -64,7 +65,6 @@ class GN(BasicNewsRecipe):
|
||||
if img != None:
|
||||
a = img.parent
|
||||
self.EDITION = a['href']
|
||||
self.title = img['alt']
|
||||
self.cover_url = 'http://www.gosc.pl' + img['src']
|
||||
if not first:
|
||||
break
|
||||
|
76
recipes/grantland.recipe
Normal file
@ -0,0 +1,76 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class GrantLand(BasicNewsRecipe):
    """Recipe for Grantland, scraped from the site's listing pages.

    One feed is produced per entry of ``CATEGORIES``; articles are the
    story/post links found on that category's page.
    """

    title = u"Grantland"
    description = 'Writings on Sports & Pop Culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = True
    # auto_cleanup is too aggressive sometimes and we end up with blank articles
    auto_cleanup = False
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 90

    cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
    masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

    INDEX = 'http://www.grantland.com'
    CATEGORIES = [
        # comment out second line if you don't want older articles
        # (user friendly name, url suffix, max number of articles to load)
        ('Today in Grantland', '', 20),
        ('In Case You Missed It', 'incaseyoumissedit', 35),
    ]

    remove_tags = [
        {'name': ['style', 'aside', 'nav', 'footer', 'script']},
        {'name': 'h1', 'text': 'Grantland'},
        {'id': ['header', 'col-right']},
        {'class': ['connect_widget']},
        {'name': 'section', 'class': re.compile(r'\b(ad|module)\b')},
    ]

    preprocess_regexps = [
        # remove blog banners
        (re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>', re.DOTALL|re.IGNORECASE), lambda m: ''),
    ]

    def parse_index(self):
        """Return ``[(category name, [{'title', 'url'}, ...]), ...]``.

        A URL is listed only once across all categories, links without a
        text title are ignored, and each category stops at its configured
        maximum.  Categories that end up empty are omitted.
        """
        feeds = []
        seen_urls = set()
        article_href = re.compile(r'(story|post)/_/id/\d+')

        for cat_name, suffix, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = []

            soup = self.index_to_soup("%s/%s" % (self.INDEX, suffix))

            # Search the main column when present, otherwise the whole page.
            main = soup.find('div', id='col-main')
            if main is None:
                main = soup

            for anchor in main.findAll('a', href=article_href):
                url = anchor['href']
                if url in seen_urls:
                    continue
                title = anchor.string
                # blank title probably means <a href=".."><img /></a>. skip
                if not title:
                    continue
                self.log('\tFound article:', title)
                self.log('\t', url)
                articles.append({'title': title, 'url': url})
                seen_urls.add(url)

                if len(articles) >= max_articles:
                    break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
|
43
recipes/gs24_pl.recipe
Normal file
@ -0,0 +1,43 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
import re
|
||||
import string
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1322322819(BasicNewsRecipe):
    # Recipe for GS24.pl, fetched through the per-region RSS feeds below.
    title = u'GS24.pl (Głos Szczeciński)'
    description = u'Internetowy serwis Głosu Szczecińskiego'
    __author__ = u'Michał Szkutnik'
    __license__ = u'GPL v3'
    language = 'pl'
    publisher = 'Media Regionalne sp. z o.o.'
    category = 'news, szczecin'
    oldest_article = 2
    max_articles_per_feed = 100
    auto_cleanup = True
    cover_url = "http://www.gs24.pl/images/top_logo.png"

    feeds = [
        # (u'Wszystko', u'http://www.gs24.pl/rss.xml'),
        (u'Szczecin', u'http://www.gs24.pl/szczecin.xml'),
        (u'Stargard', u'http://www.gs24.pl/stargard.xml'),
        (u'Świnoujście', u'http://www.gs24.pl/swinoujscie.xml'),
        (u'Goleniów', u'http://www.gs24.pl/goleniow.xml'),
        (u'Gryfice', u'http://www.gs24.pl/gryfice.xml'),
        (u'Kamień Pomorski', u'http://www.gs24.pl/kamienpomorski.xml'),
        (u'Police', u'http://www.gs24.pl/police.xml'),
        (u'Region', u'http://www.gs24.pl/region.xml'),
        (u'Sport', u'http://www.gs24.pl/sport.xml'),
    ]

    def get_article_url(self, article):
        """Return the direct article URL decoded from the feed link.

        The feed link is expected to carry the real address between
        ``/0L0S`` and ``/story01.htm``, with punctuation written as
        two-character ``0X`` escapes. When the link is missing or does not
        have that shape, the stock ``BasicNewsRecipe.get_article_url`` is
        used instead of failing on a ``None`` match.
        """
        link = getattr(article, 'link', None)
        match = re.search(r"/0L0S(gs24.*)/story01\.htm", link) if link else None
        if match is None:
            return BasicNewsRecipe.get_article_url(self, article)

        decoded = match.group(1)
        replacements = {"0B": ".", "0C": "/", "0H": ",", "0I": "_", "0D": "?", "0F": "="}
        for escape, char in replacements.items():
            decoded = decoded.replace(escape, char)
        # "0A" stands for a literal zero. It is expanded last so that the
        # zero it produces cannot be paired with a following letter and be
        # mistaken for another escape.
        decoded = decoded.replace("0A", "0")
        return "http://" + decoded

    def print_version(self, url):
        """Return the URL of the print template for *url*."""
        return url + "&Template=printpicart"
|
@ -9,6 +9,7 @@ www.guardian.co.uk
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from datetime import date
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
|
||||
class Guardian(BasicNewsRecipe):
|
||||
|
||||
@ -16,16 +17,19 @@ class Guardian(BasicNewsRecipe):
|
||||
if date.today().weekday() == 6:
|
||||
base_url = "http://www.guardian.co.uk/theobserver"
|
||||
cover_pic = 'Observer digital edition'
|
||||
masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
|
||||
else:
|
||||
base_url = "http://www.guardian.co.uk/theguardian"
|
||||
cover_pic = 'Guardian digital edition'
|
||||
masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'
|
||||
|
||||
__author__ = 'Seabound and Sujata Raman'
|
||||
language = 'en_GB'
|
||||
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript = True
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
remove_javascript = True
|
||||
encoding = 'utf-8'
|
||||
|
||||
# List of section titles to ignore
|
||||
# For example: ['Sport']
|
||||
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
|
||||
dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
|
||||
dict(name='ul', attrs={'class':["pagination"]}),
|
||||
dict(name='ul', attrs={'id':["content-actions"]}),
|
||||
# article history link
|
||||
dict(name='a', attrs={'class':["rollover history-link"]}),
|
||||
# "a version of this article ..." spiel
|
||||
dict(name='div' , attrs = { 'class' : ['section']}),
|
||||
# "about this article" js dialog
|
||||
dict(name='div', attrs={'class':["share-top",]}),
|
||||
# author picture
|
||||
dict(name='img', attrs={'class':["contributor-pic-small"]}),
|
||||
# embedded videos/captions
|
||||
dict(name='span',attrs={'class' : ['inline embed embed-media']}),
|
||||
#dict(name='img'),
|
||||
]
|
||||
use_embedded_content = False
|
||||
@ -65,8 +79,21 @@ class Guardian(BasicNewsRecipe):
|
||||
url = None
|
||||
return url
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Register the first image of the article as its TOC thumbnail.

    Does nothing unless *first* is true and this recipe object provides
    ``add_toc_thumbnail``; also does nothing when *soup* has no ``img``.
    """
    if not first or not hasattr(self, 'add_toc_thumbnail'):
        return
    image = soup.find('img')
    if image is None:
        return
    self.add_toc_thumbnail(article, image['src'])
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
# multiple html sections in soup, useful stuff in the first
|
||||
html = soup.find('html')
|
||||
soup2 = BeautifulSoup()
|
||||
soup2.insert(0,html)
|
||||
|
||||
soup = soup2
|
||||
|
||||
for item in soup.findAll(style=True):
|
||||
del item['style']
|
||||
|
||||
@ -74,7 +101,18 @@ class Guardian(BasicNewsRecipe):
|
||||
del item['face']
|
||||
for tag in soup.findAll(name=['ul','li']):
|
||||
tag.name = 'div'
|
||||
|
||||
|
||||
# removes number next to rating stars
|
||||
items_to_remove = []
|
||||
rating_container = soup.find('div', attrs = {'class': ['rating-container']})
|
||||
if rating_container:
|
||||
for item in rating_container:
|
||||
if isinstance(item, Tag) and str(item.name) == 'span':
|
||||
items_to_remove.append(item)
|
||||
|
||||
for item in items_to_remove:
|
||||
item.extract()
|
||||
|
||||
return soup
|
||||
|
||||
def find_sections(self):
|
||||
|
@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
|
||||
from urlparse import urlparse
|
||||
import re
|
||||
|
||||
class HackerNews(BasicNewsRecipe):
|
||||
title = 'Hacker News'
|
||||
__author__ = 'Tom Scholl'
|
||||
class HNWithCommentsLink(BasicNewsRecipe):
|
||||
title = 'HN With Comments Link'
|
||||
__author__ = 'Tom Scholl & David Kerschner'
|
||||
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
|
||||
publisher = 'Y Combinator'
|
||||
category = 'news, programming, it, technology'
|
||||
@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
|
||||
body = body + comments
|
||||
return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'
|
||||
|
||||
def parse_feeds(self):
    """Parse the feeds as the base class does and keep the first feed's articles.

    The article list of the first parsed feed is stored on
    ``self.hn_articles``; the parsed feeds are returned unchanged.
    """
    parsed = super(HNWithCommentsLink, self).parse_feeds()
    first_feed = parsed[0]
    self.hn_articles = first_feed.articles
    return parsed
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
if url.startswith('http://news.ycombinator.com'):
|
||||
content = self.get_hn_content(url)
|
||||
@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
|
||||
else:
|
||||
content = self.get_readable_content(url)
|
||||
|
||||
article = 0
|
||||
for a in self.hn_articles:
|
||||
if a.url == url:
|
||||
article = a
|
||||
|
||||
content = re.sub(r'</body>\s*</html>\s*$', '', content) + article.summary + '</body></html>'
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_fa.html'))
|
||||
self.temp_files[-1].write(content)
|
||||
self.temp_files[-1].close()
|
||||
|
11
recipes/haksoz.recipe
Normal file
@ -0,0 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324739199(BasicNewsRecipe):
    # Recipe identity.
    title = u'Haks\xf6z'
    oldest_article = 7
    max_articles_per_feed = 20
    auto_cleanup = True
    language = 'tr'
    __author__ = 'asalet_r'

    # Single RSS feed, given as a (title, url) pair.
    feeds = [
        (u'Haks\xf6z', u'http://www.haksozhaber.net/rss/'),
    ]
|
58
recipes/hamilton_spectator.recipe
Normal file
@ -0,0 +1,58 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
'''
|
||||
Hamilton Spectator Calibre Recipe
|
||||
'''
|
||||
class HamiltonSpectator(BasicNewsRecipe):
    # Recipe for the Hamilton Spectator, driven entirely by the RSS
    # feeds listed below.
    title = u'Hamilton Spectator'
    oldest_article = 2
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = u'Eric Coolman'
    publisher = u'thespec.com'
    description = u'Ontario Canada Newspaper'
    category = u'News, Ontario, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_CA'
    encoding = 'utf-8'

    # (section title, feed url) pairs.
    feeds = [
        (u'Top Stories', u'http://www.thespec.com/rss?query=/&assetType=Article'),
        (u'All News', u'http://www.thespec.com/rss?query=/news&assetType=Article'),
        (u'Local', u'http://www.thespec.com/rss?query=/local&assetType=Article'),
        (u'Ontario', u'http://www.thespec.com/rss?query=/ontario&assetType=Article'),
        (u'Canada', u'http://www.thespec.com/rss?query=/canada&assetType=Article'),
        (u'World News', u'http://www.thespec.com/rss?query=/world&assetType=Article'),
        (u'Business', u'http://www.thespec.com/rss?query=/business&assetType=Article'),
        (u'Crime', u'http://www.thespec.com/rss?query=/crime&assetType=Article'),
        (u'All Sports', u'http://www.thespec.com/rss?query=/sports&assetType=Article'),
        (u'Ticats', u'http://www.thespec.com/rss?query=/sports/ticats&assetType=Article'),
        (u'Bulldogs', u'http://www.thespec.com/rss?query=/sports/bulldogs&assetType=Article'),
        (u'High School Sports', u'http://www.thespec.com/rss?query=/sports/highschools&assetType=Article'),
        (u'Local Sports', u'http://www.thespec.com/rss?query=/sports/local&assetType=Article'),
        # Written with double quotes: the previous u'What''s On' was two
        # adjacent literals, which Python joins into u'Whats On'.
        (u"What's On", u'http://www.thespec.com/rss?query=/whatson&assetType=Article'),
        (u'Arts and Entertainment', u'http://www.thespec.com/rss?query=/whatson/artsentertainment&assetType=Article'),
        (u'Books', u'http://www.thespec.com/rss?query=/whatson/books&assetType=Article'),
        (u'Movies', u'http://www.thespec.com/rss?query=/whatson/movies&assetType=Article'),
        (u'Music', u'http://www.thespec.com/rss?query=/whatson/music&assetType=Article'),
        (u'Restaurant Reviews', u'http://www.thespec.com/rss?query=/whatson/restaurants&assetType=Article'),
        (u'Opinion', u'http://www.thespec.com/rss?query=/opinion&assetType=Article'),
        (u'Opinion Columns', u'http://www.thespec.com/rss?query=/opinion/columns&assetType=Article'),
        (u'Cartoons', u'http://www.thespec.com/rss?query=/opinion/cartoons&assetType=Article'),
        (u'Letters', u'http://www.thespec.com/rss?query=/opinion/letters&assetType=Article'),
        (u'Editorial', u'http://www.thespec.com/rss?query=/opinion/editorial&assetType=Article'),
        (u'Community', u'http://www.thespec.com/rss?query=/community&assetType=Article'),
        (u'Education', u'http://www.thespec.com/rss?query=/community/education&assetType=Article'),
        (u'Faith', u'http://www.thespec.com/rss?query=/community/faith&assetType=Article'),
        (u'Contests', u'http://www.thespec.com/rss?query=/community/contests&assetType=Article'),
        (u'Living', u'http://www.thespec.com/rss?query=/living&assetType=Article'),
        (u'Food', u'http://www.thespec.com/rss?query=/living/food&assetType=Article'),
        (u'Health and Fitness', u'http://www.thespec.com/rss?query=/living/healthfitness&assetType=Article'),
        (u'Your Home', u'http://www.thespec.com/rss?query=/living/home&assetType=Article'),
        (u'Travel', u'http://www.thespec.com/rss?query=/living/travel&assetType=Article'),
        (u'Family and Parenting', u'http://www.thespec.com/rss?query=/living/familyparenting&assetType=Article'),
        (u'Style', u'http://www.thespec.com/rss?query=/living/style&assetType=Article'),
    ]
|
||||
|
@ -1,11 +1,11 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
|
||||
title = 'heise online'
|
||||
title = 'Heise-online'
|
||||
description = 'News vom Heise-Verlag'
|
||||
__author__ = 'schuster'
|
||||
masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
|
||||
publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
oldest_article = 2
|
||||
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
remove_empty_feeds = True
|
||||
timeout = 5
|
||||
no_stylesheets = True
|
||||
encoding = 'utf-8'
|
||||
|
||||
|
||||
remove_tags_after = dict(name ='p', attrs={'class':'editor'})
|
||||
remove_tags = [{'class':'navi_top_container'},
|
||||
remove_tags = [dict(id='navi_top_container'),
|
||||
dict(id='navi_bottom'),
|
||||
dict(id='mitte_rechts'),
|
||||
dict(id='navigation'),
|
||||
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
|
||||
dict(id='seiten_navi'),
|
||||
dict(id='adbottom'),
|
||||
dict(id='sitemap'),
|
||||
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
|
||||
]
|
||||
dict(name='div', attrs={'id':'sitemap'}),
|
||||
dict(name='ul', attrs={'class':'erste_zeile'}),
|
||||
dict(name='ul', attrs={'class':'zweite_zeile'}),
|
||||
dict(name='div', attrs={'class':'navi_top_container'})]
|
||||
|
||||
feeds = [
|
||||
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
|
||||
('iX', 'http://www.heise.de/ix/news/news.rdf'),
|
||||
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
|
||||
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
|
||||
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
|
||||
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
|
||||
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
|
||||
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
|
||||
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
|
||||
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
|
||||
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
|
||||
('Mac & i', 'http://www.heise.de/mac-and-i/news.rdf'),
|
||||
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
|
||||
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
|
||||
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
|
||||
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
|
||||
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
|
||||
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
|
||||
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
|
||||
('iX', 'http://www.heise.de/ix/news/news.rdf'),
|
||||
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
|
||||
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
|
||||
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
|
||||
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
|
||||
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
|
||||
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
|
||||
]
|
||||
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
|
||||
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]
|
||||
|
||||
def print_version(self, url):
    """Return *url* with the ``?view=print`` query string appended."""
    print_suffix = '?view=print'
    return url + print_suffix
|
||||
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import urllib, re
|
||||
|
||||
class HindustanTimes(BasicNewsRecipe):
|
||||
title = u'Hindustan Times'
|
||||
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
|
||||
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
    '''
    HT uses a variant of the feedportal RSS ad display mechanism

    First try to pull the target URL out of the ``bookmark.cfm`` link in
    the article summary. If that fails for any reason, open the default
    article URL, take the second-to-last path segment of the address the
    request ends up at, and decode its two-character ``0X`` escapes.
    '''
    try:
        s = article.summary
        return urllib.unquote(
            re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
    except Exception:
        # Best effort only: any failure here means we use the redirect
        # based lookup below.
        pass
    url = BasicNewsRecipe.get_article_url(self, article)
    res = self.browser.open_novisit(url)
    url = res.geturl().split('/')[-2]
    escapes = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
               '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://',
               '0S': 'www.'}
    # Decode in one left-to-right pass. Chained str.replace() calls over
    # an unordered dict could expand '0A' to '0' first, after which a
    # literal "0B" in the address would wrongly be turned into ".".
    # Unknown escapes are left untouched.
    return re.sub(r'0[A-Z]',
                  lambda m: escapes.get(m.group(0), m.group(0)), url)
|
||||
|
||||
|
||||
|
@ -4,56 +4,20 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2010, matek09, matek09@gmail.com'
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
import re
|
||||
|
||||
class Histmag(BasicNewsRecipe):
|
||||
title = u'Histmag'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
cover_url='http://histmag.org/grafika/loga/histmag-logo-2-300px.png'
|
||||
__author__ = 'matek09'
|
||||
description = u"Artykuly historyczne i publicystyczne"
|
||||
encoding = 'utf-8'
|
||||
#preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
|
||||
no_stylesheets = True
|
||||
language = 'pl'
|
||||
remove_javascript = True
|
||||
keep_only_tags=[dict(id='article')]
|
||||
remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})]
|
||||
|
||||
title = u'Histmag'
|
||||
__author__ = 'matek09'
|
||||
description = u"Artykuly historyczne i publicystyczne"
|
||||
encoding = 'utf-8'
|
||||
no_stylesheets = True
|
||||
language = 'pl'
|
||||
remove_javascript = True
|
||||
#max_articles_per_feed = 1
|
||||
remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
|
||||
remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
|
||||
#keep_only_tags =[]
|
||||
#keep_only_tags.append(dict(name = 'h2'))
|
||||
#keep_only_tags.append(dict(name = 'p'))
|
||||
|
||||
remove_tags =[]
|
||||
remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
|
||||
remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
|
||||
remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))
|
||||
|
||||
preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),
|
||||
(re.compile(r'<span>'), lambda match: '<br><br><span>')]
|
||||
extra_css = '''
|
||||
.left {font-size: x-small}
|
||||
.right {font-size: x-small}
|
||||
'''
|
||||
|
||||
def find_articles(self, soup):
    """Return one article dict per ``div.text`` element found in *soup*.

    Each dict has the keys ``title``, ``url``, ``date`` and
    ``description``. The URL is the link's href prefixed with the site
    root, and the date is the text before the first ``|``.
    """
    def describe(entry):
        link = entry.h3.a
        return {
            'title': self.tag_to_string(link),
            'url': 'http://www.histmag.org/' + link['href'],
            'date': self.tag_to_string(entry.next('p')).split('|')[0],
            'description': self.tag_to_string(entry.next('p', podpis=False)),
        }

    return [describe(entry)
            for entry in soup.findAll('div', attrs={'class': 'text'})]
|
||||
|
||||
def parse_index(self):
    """Return the three Histmag sections as ``(title, articles)`` pairs.

    Each index page is fetched in turn and handed to
    ``self.find_articles``.
    """
    sections = (
        (u"Artykuly historyczne", 'http://histmag.org/?arc=4&dx=0'),
        (u"Artykuly publicystyczne", 'http://histmag.org/?arc=5&dx=0'),
        (u"Wydarzenia", 'http://histmag.org/?arc=1&dx=0'),
    )
    return [(heading, self.find_articles(self.index_to_soup(address)))
            for heading, address in sections]
|
||||
|
||||
|
||||
feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
|
||||
|
@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
|
||||
category = 'history'
|
||||
language = 'pl'
|
||||
oldest_article = 8
|
||||
remove_empty_feeds=True
|
||||
max_articles_per_feed = 100
|
||||
|
||||
feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
|
||||
# Section feeds as (title, url) pairs. Every entry must be a 2-tuple: the
# last one used to be written as (u'Konkursy'), (u'http://...'), which is
# two bare strings rather than a pair.
feeds = [
    (u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
    (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
    (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
    (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
    (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
    (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
    (u'Rekonstykcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
    (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
    (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss'),
]
|
||||
|
@ -1,44 +1,58 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
################################################################################
|
||||
#Description: http://hvg.hu/ RSS channel
|
||||
#Author: Bigpapa (bigpapabig@hotmail.com)
|
||||
#Date: 2011.12.20. - V1.1
|
||||
################################################################################
|
||||
|
||||
class HVG(BasicNewsRecipe):
|
||||
title = 'HVG.HU'
|
||||
__author__ = u'István Papp'
|
||||
description = u'Friss hírek a HVG-től'
|
||||
timefmt = ' [%Y. %b. %d., %a.]'
|
||||
oldest_article = 4
|
||||
language = 'hu'
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
encoding = 'utf8'
|
||||
publisher = 'HVG Online'
|
||||
category = u'news, hírek, hvg'
|
||||
extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
|
||||
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
|
||||
remove_tags_before = dict(id='pg-content')
|
||||
remove_javascript = True
|
||||
remove_empty_feeds = True
|
||||
class hvg(BasicNewsRecipe):
|
||||
title = u'HVG'
|
||||
__author__ = 'Bigpapa'
|
||||
language = 'hu'
|
||||
oldest_article = 5 # Hany napos legyen a legregebbi cikk amit leszedjen.
|
||||
max_articles_per_feed = 5 # Az adott e-bookban tarolt cikkek feedenkenti maximalis szamat adja meg.
|
||||
no_stylesheets = True
|
||||
encoding = 'utf8'
|
||||
extra_css = ' h2 { font:bold 28px} '
|
||||
|
||||
feeds = [
|
||||
(u'Itthon', u'http://hvg.hu/rss/itthon')
|
||||
,(u'Világ', u'http://hvg.hu/rss/vilag')
|
||||
,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
|
||||
,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
|
||||
,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
|
||||
,(u'Karrier', u'http://hvg.hu/rss/karrier')
|
||||
,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
|
||||
,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
|
||||
,(u'Kultúra', u'http://hvg.hu/rss/kultura')
|
||||
,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
|
||||
,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
|
||||
,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
|
||||
,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
|
||||
,(u'Sport', u'http://hvg.hu/rss/sport')
|
||||
]
|
||||
remove_attributes = ['style','font', 'href']
|
||||
|
||||
def print_version(self, url):
    """Return *url* with every ``#rss`` replaced by ``/print``."""
    rss_marker = '#rss'
    print_path = '/print'
    return print_path.join(url.split(rss_marker))
|
||||
keep_only_tags = [
|
||||
dict(name='div', attrs={'id':['pg-content']})
|
||||
]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
|
||||
dict(name='table', attrs={'class':['banner2', 'monocle']}),
|
||||
dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
|
||||
dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
|
||||
dict(name='h3', attrs={'class':['hthree']}),
|
||||
dict(name='ul', attrs={'class':['defaultul']}),
|
||||
dict(name='form', attrs={'id':['commentForm']}),
|
||||
dict(name='h6', attrs={'class':['hthree']}),
|
||||
dict(name='h6', attrs={'class':['more2']}),
|
||||
dict(name='img', attrs={'class':['framed']}),
|
||||
dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),
|
||||
|
||||
|
||||
|
||||
]
|
||||
|
||||
feeds = [
|
||||
# (u'\xd6sszes', 'http://hvg.hu/rss'),
|
||||
(u'Itthon', 'http://hvg.hu/rss/itthon'),
|
||||
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
|
||||
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
|
||||
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
|
||||
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
|
||||
(u'Karrier', 'http://hvg.hu/rss/karrier'),
|
||||
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
|
||||
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
|
||||
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
|
||||
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
|
||||
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
|
||||
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
|
||||
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
|
||||
(u'Sport', 'http://hvg.hu/rss/sport')
|
||||
]
|
BIN
recipes/icons/biolog_pl.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
BIN
recipes/icons/blues.png
Normal file
After Width: | Height: | Size: 910 B |
BIN
recipes/icons/computerworld_pl.png
Normal file
After Width: | Height: | Size: 373 B |
BIN
recipes/icons/descopera_org.png
Normal file
After Width: | Height: | Size: 9.3 KiB |
BIN
recipes/icons/dziennik_pl.png
Normal file
After Width: | Height: | Size: 481 B |
BIN
recipes/icons/formulaas.png
Normal file
After Width: | Height: | Size: 687 B |
BIN
recipes/icons/infra_pl.png
Normal file
After Width: | Height: | Size: 1.5 KiB |
BIN
recipes/icons/kosmonauta_pl.png
Normal file
After Width: | Height: | Size: 1.2 KiB |
BIN
recipes/icons/mlody_technik_pl.png
Normal file
After Width: | Height: | Size: 2.1 KiB |
BIN
recipes/icons/moneynews.png
Normal file
After Width: | Height: | Size: 914 B |
BIN
recipes/icons/novilist_novine_hr.png
Normal file
After Width: | Height: | Size: 241 B |
BIN
recipes/icons/novilist_portal_hr.png
Normal file
After Width: | Height: | Size: 944 B |
BIN
recipes/icons/rionegro.png
Normal file
After Width: | Height: | Size: 817 B |
BIN
recipes/icons/skylife.png
Normal file
After Width: | Height: | Size: 3.3 KiB |
BIN
recipes/icons/zaman.png
Normal file
After Width: | Height: | Size: 999 B |
68
recipes/ideal_almeria.recipe
Normal file
@ -0,0 +1,68 @@
|
||||
# encoding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    # Almería edition of Ideal, fetched from the RSS feeds listed below.
    title = u'Ideal (Edición Almería)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Almería y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Almería'
    publication_type = 'Newspaper'

    # Download settings.
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True

    # Artwork and styling.
    masthead_url = u'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    cover_url = u'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = u' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # Conversion metadata, taken from the attributes defined above.
    conversion_options = dict(
        comment=description,
        tags=category,
        publisher=publisher,
        language=language,
    )

    # Keep the element with id "title" plus the listed classes.
    keep_only_tags = [
        {'attrs': {'id': 'title'}},
        {'attrs': {'class': ['overhead', 'headline', 'subhead', 'date', 'text', 'noticia_cont', 'desarrollo']}},
    ]

    remove_tags = [{'name': 'ul'}]

    remove_attributes = ['width', 'height']

    # (section title, feed url) pairs.
    feeds = [
        (u'Última Hora', u'http://www.ideal.es/almeria/rss/feeds/ultima.xml'),
        (u'Portada', u'http://www.ideal.es/almeria/portada.xml'),
        (u'Local', u'http://www.ideal.es/almeria/rss/feeds/granada.xml'),
        (u'Deportes', u'http://www.ideal.es/almeria/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.ideal.es/almeria/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.ideal.es/almeria/rss/feeds/cultura.xml'),
        (u'Economía', u'http://www.ideal.es/almeria/rss/feeds/economia.xml'),
        (u'Costa', u'http://www.ideal.es/almeria/rss/feeds/costa.xml'),
        (u'Puerta Purchena', u'http://www.ideal.es/almeria/rss/feeds/puerta_purchena.xml'),
        (u'Andalucía', u'http://www.ideal.es/almeria/rss/feeds/andalucia.xml'),
        (u'España', u'http://www.ideal.es/almeria/rss/feeds/espana.xml'),
        (u'Mundo', u'http://www.ideal.es/almeria/rss/feeds/internacional.xml'),
        (u'Vivir', u'http://www.ideal.es/almeria/rss/feeds/vivir.xml'),
        (u'Opinión', u'http://www.ideal.es/almeria/rss/feeds/opinion.xml'),
        (u'Televisión', u'http://www.ideal.es/almeria/rss/feeds/television.xml'),
        (u'Contraportada', u'http://www.ideal.es/almeria/rss/feeds/contraportada.xml'),
    ]
|
||||
|
69
recipes/ideal_granada.recipe
Normal file
@ -0,0 +1,69 @@
|
||||
# encoding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    # Granada edition of Ideal, fetched from the RSS feeds listed below.
    title = u'Ideal (Edición Granada)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Granada y el resto del mundo'
    publisher = 'Ideal'
    category = 'News, Politics, Spain, Granada'
    publication_type = 'Newspaper'

    # Download settings.
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'es'
    remove_empty_feeds = True

    # Artwork and styling.
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # Conversion metadata, taken from the attributes defined above.
    conversion_options = dict(
        comment=description,
        tags=category,
        publisher=publisher,
        language=language,
    )

    # Keep the element with id "title" plus the listed classes.
    keep_only_tags = [
        {'attrs': {'id': 'title'}},
        {'attrs': {'class': ['overhead', 'headline', 'subhead', 'date', 'text', 'noticia_cont', 'desarrollo']}},
    ]

    remove_tags = [{'name': 'ul'}]

    remove_attributes = ['width', 'height']

    # (section title, feed url) pairs.
    feeds = [
        (u'Última Hora', u'http://www.ideal.es/granada/rss/feeds/ultima.xml'),
        (u'Portada', u'http://www.ideal.es/granada/portada.xml'),
        (u'Local', u'http://www.ideal.es/granada/rss/feeds/granada.xml'),
        (u'Deportes', u'http://www.ideal.es/granada/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.ideal.es/granada/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.ideal.es/granada/rss/feeds/cultura.xml'),
        (u'Economía', u'http://www.ideal.es/granada/rss/feeds/economia.xml'),
        (u'Costa', u'http://www.ideal.es/granada/rss/feeds/costa.xml'),
        (u'La Carrera', u'http://www.ideal.es/granada/rss/feeds/la_carrera.xml'),
        (u'Puerta Real', u'http://www.ideal.es/granada/rss/feeds/puerta_real.xml'),
        (u'Andalucía', u'http://www.ideal.es/granada/rss/feeds/andalucia.xml'),
        (u'España', u'http://www.ideal.es/granada/rss/feeds/espana.xml'),
        (u'Mundo', u'http://www.ideal.es/granada/rss/feeds/internacional.xml'),
        (u'Vivir', u'http://www.ideal.es/granada/rss/feeds/vivir.xml'),
        (u'Opinión', u'http://www.ideal.es/granada/rss/feeds/opinion.xml'),
        (u'Televisión', u'http://www.ideal.es/granada/rss/feeds/television.xml'),
        (u'Contraportada', u'http://www.ideal.es/granada/rss/feeds/contraportada.xml'),
    ]
|
||||
|
67
recipes/ideal_jaen.recipe
Normal file
@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Josemi Liébana <office at josemi-liebana.com>'
|
||||
__copyright__ = 'Josemi Liébana'
|
||||
__version__ = 'v0.1'
|
||||
__date__ = '5 January 2012'
|
||||
|
||||
|
||||
'''
|
||||
www.ideal.es
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class Ideal(BasicNewsRecipe):
    """Calibre news recipe for the Jaen edition of the Spanish daily Ideal.

    Articles are collected from the per-section RSS feeds published under
    http://www.ideal.es/jaen/ and reduced to the headline/body containers
    listed in ``keep_only_tags``.
    """

    # --- Metadata ---------------------------------------------------------
    title = u'Ideal (Edición Jaén)'
    __author__ = u'Josemi Liébana'
    description = u'Noticias de Jaén y el resto del mundo'
    publisher = 'Ideal'
    category = u'News, Politics, Spain, Jaén'
    publication_type = 'Newspaper'
    language = 'es'

    # --- Download behaviour -----------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    remove_empty_feeds = True

    # --- Presentation -----------------------------------------------------
    masthead_url = 'http://www.ideal.es/img/rd.logotipo2_ideal.gif'
    # NOTE(review): this cover is a dated Granada front page (path contains
    # /granada/ and 201112/24); confirm whether a Jaen cover URL exists.
    cover_url = 'http://www.ideal.es/granada/noticias/201112/24/Media/Granada/portada--647x894.JPG'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} .photo-caption{font-size: x-small} '

    # The metadata conversion option is spelled 'comments'; the previous
    # 'comment' key did not correspond to any conversion option.
    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Clean-up rules ---------------------------------------------------
    keep_only_tags = [
        dict(attrs={'id':'title'}),
        dict(attrs={'class':['overhead','headline','subhead','date','text','noticia_cont','desarrollo']}),
    ]

    remove_tags = [dict(name='ul')]

    remove_attributes = ['width','height']

    # --- Feeds ------------------------------------------------------------
    feeds = [
        (u'Última Hora', u'http://www.ideal.es/jaen/rss/feeds/ultima.xml'),
        (u'Portada', u'http://www.ideal.es/jaen/portada.xml'),
        # NOTE(review): the "Local" feed is named granada.xml under the /jaen/
        # path; this looks copied from the Granada recipe -- verify the URL.
        (u'Local', u'http://www.ideal.es/jaen/rss/feeds/granada.xml'),
        (u'Deportes', u'http://www.ideal.es/jaen/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.ideal.es/jaen/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.ideal.es/jaen/rss/feeds/cultura.xml'),
        (u'Economía', u'http://www.ideal.es/jaen/rss/feeds/economia.xml'),
        (u'Costa', u'http://www.ideal.es/jaen/rss/feeds/costa.xml'),
        (u'Andalucía', u'http://www.ideal.es/jaen/rss/feeds/andalucia.xml'),
        (u'España', u'http://www.ideal.es/jaen/rss/feeds/espana.xml'),
        (u'Mundo', u'http://www.ideal.es/jaen/rss/feeds/internacional.xml'),
        (u'Vivir', u'http://www.ideal.es/jaen/rss/feeds/vivir.xml'),
        (u'Opinión', u'http://www.ideal.es/jaen/rss/feeds/opinion.xml'),
        (u'Televisión', u'http://www.ideal.es/jaen/rss/feeds/television.xml'),
        (u'Contraportada', u'http://www.ideal.es/jaen/rss/feeds/contraportada.xml'),
    ]
|
||||
|
@ -1,63 +1,30 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Derry FitzGerald'
|
||||
'''
|
||||
iht.com
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
|
||||
class NYTimesGlobal(BasicNewsRecipe):
|
||||
title = u'NY Times Global'
|
||||
language = 'en'
|
||||
__author__ = 'Krittika Goyal'
|
||||
oldest_article = 1 #days
|
||||
max_articles_per_feed = 25
|
||||
use_embedded_content = False
|
||||
|
||||
class InternationalHeraldTribune(BasicNewsRecipe):
|
||||
title = u'The International Herald Tribune'
|
||||
__author__ = 'Derry FitzGerald'
|
||||
language = 'en'
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 30
|
||||
no_stylesheets = True
|
||||
auto_cleanup = True
|
||||
|
||||
remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
|
||||
dict(name=['form'])]
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'<!-- webtrends.*', re.DOTALL),
|
||||
lambda m:'</body></html>')
|
||||
]
|
||||
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
|
||||
|
||||
remove_empty_feeds = True
|
||||
|
||||
feeds = [
|
||||
(u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
|
||||
(u'Business', u'http://www.iht.com/rss/business.xml'),
|
||||
(u'Americas', u'http://www.iht.com/rss/america.xml'),
|
||||
(u'Europe', u'http://www.iht.com/rss/europe.xml'),
|
||||
(u'Asia', u'http://www.iht.com/rss/asia.xml'),
|
||||
(u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
|
||||
(u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
|
||||
(u'Technology', u'http://www.iht.com/rss/technology.xml'),
|
||||
(u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
|
||||
(u'Sports', u'http://www.iht.com/rss/sports.xml'),
|
||||
(u'Culture', u'http://www.iht.com/rss/arts.xml'),
|
||||
(u'Style and Design', u'http://www.iht.com/rss/style.xml'),
|
||||
(u'Travel', u'http://www.iht.com/rss/travel.xml'),
|
||||
(u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
|
||||
(u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
|
||||
(u'Properties', u'http://www.iht.com/rss/properties.xml')
|
||||
]
|
||||
temp_files = []
|
||||
articles_are_obfuscated = True
|
||||
|
||||
masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
br = self.get_browser()
|
||||
br.open(url)
|
||||
response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
|
||||
html = response1.read()
|
||||
|
||||
self.temp_files.append(PersistentTemporaryFile('_iht.html'))
|
||||
self.temp_files[-1].write(html)
|
||||
self.temp_files[-1].close()
|
||||
return self.temp_files[-1].name
|
||||
('NYTimes',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
|
||||
('NYTimes global',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
|
||||
('World',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
|
||||
('U.S.',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
|
||||
('Business',
|
||||
'http://feeds.nytimes.com/nyt/rss/Business'),
|
||||
('Sports',
|
||||
'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
|
||||
('Technology',
|
||||
'http://feeds.nytimes.com/nyt/rss/Technology'),
|
||||
]
|
||||
|
12
recipes/iktibas.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324739406(BasicNewsRecipe):
    """Recipe for the Turkish magazine Iktibas, driven by its single RSS feed."""

    # Identification
    __author__ = 'asalet_r'
    title = u'\u0130ktibas'
    language = 'tr'

    # Fetch limits: items from the last week, at most 20 per feed
    max_articles_per_feed = 20
    oldest_article = 7

    # Let calibre's automatic clean-up extract the article text
    auto_cleanup = True

    feeds = [
        (u'\u0130ktibas', u'http://www.iktibasdergisi.com/rss/rss.xml'),
    ]
|
@ -1,27 +1,26 @@
|
||||
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
||||
|
||||
import string, re
|
||||
from calibre import strftime
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
|
||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||
|
||||
|
||||
class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
|
||||
# flag to enable/disable article graphics on business pages/some others
|
||||
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
|
||||
# -max dimensions can be altered using the .pictureContainer img selector in the css
|
||||
_FETCH_ARTICLE_GRAPHICS = True
|
||||
|
||||
|
||||
#Flag to enable/disable image fetching (not business)
|
||||
_FETCH_IMAGES = True
|
||||
|
||||
|
||||
|
||||
|
||||
#used for converting rating to stars
|
||||
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
|
||||
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
|
||||
|
||||
|
||||
|
||||
|
||||
title = u'The Independent'
|
||||
__author__ = 'Will'
|
||||
description = 'The latest in UK News and World News from The \
|
||||
@ -40,28 +39,30 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
encoding = 'utf-8'
|
||||
remove_tags =[
|
||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']})
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||
dict(attrs={'style' : re.compile('.*')}),
|
||||
]
|
||||
|
||||
|
||||
keep_only_tags =[dict(attrs={'id':'main'})]
|
||||
recursions = 0
|
||||
|
||||
|
||||
# fixes non compliant html nesting and 'marks' article graphics links
|
||||
preprocess_regexps = [
|
||||
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
||||
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
||||
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
|
||||
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
||||
]
|
||||
|
||||
|
||||
]
|
||||
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
, 'tags' : category
|
||||
, 'publisher' : publisher
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
extra_css = """
|
||||
h1{font-family: Georgia,serif }
|
||||
body{font-family: Verdana,Arial,Helvetica,sans-serif}
|
||||
@ -81,124 +82,133 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
.articleContent {display: block; clear:left;}
|
||||
.storyTop{}
|
||||
.pictureContainer img { max-width: 400px; max-height: 400px;}
|
||||
"""
|
||||
|
||||
"""
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
|
||||
|
||||
_processed_urls = []
|
||||
|
||||
|
||||
|
||||
|
||||
def get_article_url(self, article):
|
||||
url = super(self.__class__,self).get_article_url(article)
|
||||
|
||||
|
||||
title = article.get('title', None)
|
||||
if title and re.search("^Video:",title):
|
||||
return None
|
||||
|
||||
#remove duplicates
|
||||
|
||||
#remove duplicates
|
||||
if not (url in self._processed_urls):
|
||||
self._processed_urls.append(url)
|
||||
else:
|
||||
url = None
|
||||
return url
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
if first and hasattr(self, 'add_toc_thumbnail'):
|
||||
picdiv = soup.find('img')
|
||||
if picdiv is not None:
|
||||
self.add_toc_thumbnail(article,picdiv['src'])
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
|
||||
#remove 'advertorial articles'
|
||||
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
|
||||
if strapline:
|
||||
for para in strapline.findAll('p'):
|
||||
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
|
||||
and para.contents[0] == 'ADVERTORIAL FEATURE':
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
slideshow_elements = []
|
||||
|
||||
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
|
||||
remove = True
|
||||
pattern = re.compile('((articleContent)|(title))$')
|
||||
if (pattern.search(item['class'])) is not None:
|
||||
remove = False
|
||||
|
||||
|
||||
# corrections
|
||||
# story content always good
|
||||
pattern = re.compile('storyContent')
|
||||
pattern = re.compile('storyContent')
|
||||
if (pattern.search(item['class'])) is not None:
|
||||
remove = False
|
||||
|
||||
|
||||
#images
|
||||
pattern = re.compile('slideshow')
|
||||
pattern = re.compile('slideshow')
|
||||
if (pattern.search(item['class'])) is not None:
|
||||
if self._FETCH_IMAGES:
|
||||
remove = False
|
||||
slideshow_elements.append(item)
|
||||
else:
|
||||
remove = True
|
||||
|
||||
|
||||
#social widgets always bad
|
||||
pattern = re.compile('socialwidget')
|
||||
pattern = re.compile('socialwidget')
|
||||
if (pattern.search(item['class'])) is not None:
|
||||
remove = True
|
||||
|
||||
|
||||
if remove:
|
||||
items_to_extract.append(item)
|
||||
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
if self._FETCH_IMAGES:
|
||||
for item in soup.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||
if item.img is not None:
|
||||
#use full size image
|
||||
img = item.findNext('img')
|
||||
|
||||
img['src'] = item['href']
|
||||
|
||||
#insert caption if available
|
||||
if img['title'] is not None and (len(img['title']) > 1):
|
||||
tag = Tag(soup,'h3')
|
||||
text = NavigableString(img['title'])
|
||||
tag.insert(0,text)
|
||||
|
||||
#picture before text
|
||||
img.extract()
|
||||
item.insert(0,img)
|
||||
item.insert(1,tag)
|
||||
|
||||
# remove link
|
||||
item.name = "div"
|
||||
item["class"]='image'
|
||||
del item["href"]
|
||||
|
||||
|
||||
item.extract()
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
if self._FETCH_IMAGES:
|
||||
for element in slideshow_elements:
|
||||
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
|
||||
if item.img is not None:
|
||||
#use full size image
|
||||
img = item.findNext('img')
|
||||
|
||||
img['src'] = item['href']
|
||||
|
||||
#insert caption if available
|
||||
if img.get('title') and (len(img['title']) > 1):
|
||||
tag = Tag(soup,'h3')
|
||||
text = NavigableString(img['title'])
|
||||
tag.insert(0,text)
|
||||
|
||||
#picture before text
|
||||
img.extract()
|
||||
item.insert(0,img)
|
||||
item.insert(1,tag)
|
||||
|
||||
# remove link
|
||||
item.name = "div"
|
||||
item["class"]='image'
|
||||
del item["href"]
|
||||
|
||||
|
||||
#remove empty subtitles
|
||||
"""
|
||||
currently the subtitle is located in first paragraph after
|
||||
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
|
||||
some point.
|
||||
"""
|
||||
some point.
|
||||
"""
|
||||
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
|
||||
if subtitle is not None:
|
||||
subtitleText = subtitle.findNext('p')
|
||||
if subtitleText is not None:
|
||||
if len(subtitleText.contents[0]) <= 1 :
|
||||
if len(subtitleText.contents[0]) <= 1 :
|
||||
subtitleText.extract()
|
||||
subtitle.extract()
|
||||
|
||||
|
||||
|
||||
|
||||
#replace rating numbers with stars
|
||||
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
|
||||
if item is not None:
|
||||
soup2 = self._insertRatingStars(soup,item)
|
||||
if soup2 is not None:
|
||||
soup = soup2
|
||||
|
||||
|
||||
|
||||
|
||||
#remove empty paragraph tags in storyTop which can leave a space
|
||||
#between first paragraph and rest of story
|
||||
nested_content = False
|
||||
nested_content = False
|
||||
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
|
||||
for item in storyTop.findAll('p'):
|
||||
for nested in item:
|
||||
@ -207,19 +217,19 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
break
|
||||
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
|
||||
items_to_extract.append(item)
|
||||
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
|
||||
item.extract()
|
||||
|
||||
items_to_extract = []
|
||||
|
||||
|
||||
#remove line breaks immediately next to tags with default margins
|
||||
#to prevent double line spacing and narrow columns of text
|
||||
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
|
||||
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
|
||||
|
||||
|
||||
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
|
||||
|
||||
|
||||
#replace article graphics link with the graphics themselves
|
||||
if self._FETCH_ARTICLE_GRAPHICS:
|
||||
items_to_insert = []
|
||||
@ -231,20 +241,20 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
if isinstance(child,Tag):
|
||||
if str(child.name) == 'a':
|
||||
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
|
||||
|
||||
|
||||
for item in items_to_insert:
|
||||
item[0].replaceWith(item[1])
|
||||
|
||||
item[0].replaceWith(item[1])
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
|
||||
item.extract()
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_article_graphic(self,old_item,url,soup):
|
||||
|
||||
|
||||
items_to_insert = []
|
||||
|
||||
|
||||
if re.search('\.jpg$',str(url)):
|
||||
div = Tag(soup,'div')
|
||||
div['class'] = 'pictureContainer'
|
||||
@ -254,20 +264,23 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
div.insert(0,img)
|
||||
items_to_insert.append((old_item,div,))
|
||||
return items_to_insert
|
||||
|
||||
|
||||
soup2 = self.index_to_soup(url)
|
||||
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
|
||||
items_to_insert.append((old_item,item),)
|
||||
return items_to_insert
|
||||
|
||||
|
||||
|
||||
|
||||
def _insertRatingStars(self,soup,item):
|
||||
if item.contents is None:
|
||||
if item.contents is None or len(item.contents) < 1:
|
||||
return
|
||||
rating = item.contents[0]
|
||||
if not rating.isdigit():
|
||||
return None
|
||||
rating = int(item.contents[0])
|
||||
|
||||
try:
|
||||
rating = float(item.contents[0])
|
||||
except:
|
||||
print 'Could not convert decimal rating to star: malformatted float.'
|
||||
return
|
||||
for i in range(1,6):
|
||||
star = Tag(soup,'img')
|
||||
if i <= rating:
|
||||
@ -277,26 +290,26 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
star['alt'] = 'star number ' + str(i)
|
||||
item.insert(i,star)
|
||||
#item.contents[0] = NavigableString('(' + str(rating) + ')')
|
||||
item.contents[0] = ''
|
||||
|
||||
item.contents[0] = ''
|
||||
|
||||
def postprocess_html(self,soup, first_fetch):
|
||||
#find broken images and remove captions
|
||||
items_to_extract = []
|
||||
for item in soup.findAll('div', attrs={'class' : 'image'}):
|
||||
img = item.findNext('img')
|
||||
if img is not None and img['src'] is not None:
|
||||
# broken images still point to remote url
|
||||
pattern = re.compile('http://www.independent.co.uk.*')
|
||||
if img and img.get('src'):
|
||||
# broken images still point to remote url
|
||||
pattern = re.compile('http://www.independent.co.uk.*')
|
||||
if pattern.match(img["src"]) is not None:
|
||||
caption = img.findNextSibling('h3')
|
||||
if caption is not None:
|
||||
items_to_extract.append(caption)
|
||||
items_to_extract.append(img)
|
||||
|
||||
|
||||
for item in items_to_extract:
|
||||
item.extract()
|
||||
item.extract()
|
||||
return soup
|
||||
|
||||
|
||||
def _recurisvely_linearise_tag_tree(
|
||||
self,
|
||||
item,
|
||||
@ -311,25 +324,25 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
if not (isinstance(item,Tag)):
|
||||
return linearised
|
||||
for nested in item:
|
||||
linearised.append(nested)
|
||||
linearised.append(nested)
|
||||
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
|
||||
return linearised
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_previous_tag(self,current_index, tag_tree):
|
||||
if current_index == 0:
|
||||
return None
|
||||
else:
|
||||
return tag_tree[current_index - 1]
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_next_tag(self,current_index, tag_tree):
|
||||
if current_index < len(tag_tree) - 1:
|
||||
return tag_tree[current_index + 1]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
|
||||
def _list_match(self,test_str, list_regex):
|
||||
for regex in list_regex:
|
||||
match = re.match(regex, test_str)
|
||||
@ -338,24 +351,24 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
return False
|
||||
|
||||
def _remove_undesired_line_breaks_from_tag(self,parent,soup):
|
||||
|
||||
|
||||
if parent is None:
|
||||
return
|
||||
|
||||
|
||||
|
||||
|
||||
tag_tree = self._recurisvely_linearise_tag_tree(parent)
|
||||
items_to_remove = []
|
||||
|
||||
|
||||
|
||||
|
||||
for item in tag_tree:
|
||||
if item == u'\n':
|
||||
items_to_remove.append(item)
|
||||
continue;
|
||||
|
||||
|
||||
for item in items_to_remove:
|
||||
tag_tree.remove(item)
|
||||
|
||||
|
||||
|
||||
|
||||
spaced_tags = [r'p', r'h\d', r'blockquote']
|
||||
tags_to_extract = []
|
||||
tags_to_replace = []
|
||||
@ -363,41 +376,41 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
if isinstance(tag, Tag):
|
||||
if str(tag) == '<br />':
|
||||
previous_tag = self._get_previous_tag(i, tag_tree)
|
||||
|
||||
|
||||
if isinstance(previous_tag, Tag):
|
||||
previous_tag_is_spaced = previous_tag is not None\
|
||||
and self._list_match(str(previous_tag.name),
|
||||
spaced_tags)
|
||||
else:
|
||||
previous_tag_is_spaced = False
|
||||
|
||||
|
||||
next_tag = self._get_next_tag(i, tag_tree)
|
||||
|
||||
|
||||
if isinstance(next_tag, Tag):
|
||||
next_tag_is_spaced = next_tag is not None\
|
||||
and self._list_match(str(next_tag.name), spaced_tags)
|
||||
else:
|
||||
next_tag_is_spaced = False
|
||||
|
||||
|
||||
if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
|
||||
or i == len(tag_tree) - 1:
|
||||
tags_to_extract.append(tag)
|
||||
else:
|
||||
tags_to_replace.append((tag,NavigableString(' '),))
|
||||
|
||||
|
||||
|
||||
|
||||
for pair in tags_to_replace:
|
||||
pair[0].replaceWith(pair[1])
|
||||
pair[0].replaceWith(pair[1])
|
||||
for tag in tags_to_extract:
|
||||
tag.extract()
|
||||
|
||||
|
||||
feeds = [
|
||||
(u'News - UK',
|
||||
u'http://www.independent.co.uk/news/uk/?service=rss'),
|
||||
(u'News - World',
|
||||
u'http://www.independent.co.uk/news/world/?service=rss'),
|
||||
(u'News - Business',
|
||||
u'http://www.independent.co.uk/news/business/?service=rss'),
|
||||
u'http://www.independent.co.uk/news/business/?service=rss'),
|
||||
(u'News - People',
|
||||
u'http://www.independent.co.uk/news/people/?service=rss'),
|
||||
(u'News - Science',
|
||||
@ -497,4 +510,4 @@ class TheIndependentNew(BasicNewsRecipe):
|
||||
(u'IndyBest',
|
||||
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
|
||||
]
|
||||
|
||||
|
||||
|
@ -1,16 +1,20 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1234144423(BasicNewsRecipe):
|
||||
title = u'Indianapolis Star'
|
||||
oldest_article = 5
|
||||
language = 'en'
|
||||
class IndianapolisStar(BasicNewsRecipe):
|
||||
title = u'Indianapolis Star'
|
||||
oldest_article = 10
|
||||
auto_cleanup = True
|
||||
language = 'en'
|
||||
__author__ = 'Owen Kelly'
|
||||
max_articles_per_feed = 100
|
||||
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
|
||||
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss'),
|
||||
(u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss'),
|
||||
(u'Business Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss'),
|
||||
(u'Politics and Government', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS05&template=rss'),
|
||||
(u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'),
|
||||
(u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')
|
||||
]
|
||||
|
||||
__author__ = 'Owen Kelly'
|
||||
max_articles_per_feed = 100
|
||||
|
||||
cover_url = u'http://www2.indystar.com/frontpage/images/today.jpg'
|
||||
|
||||
feeds = [(u'Community Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LOCAL&template=rss&mime=XML'), (u'News Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=NEWS&template=rss&mime=XML'), (u'Business Headlines', u'http://www..indystar.com/apps/pbcs.dll/section?Category=BUSINESS&template=rss&mime=XML'), (u'Sports Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=SPORTS&template=rss&mime=XML'), (u'Lifestyle Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=LIVING&template=rss&mime=XML'), (u'Opinion Headlines', u'http://www.indystar.com/apps/pbcs.dll/section?Category=OPINION&template=rss&mime=XML')]
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '&template=printart'
|
||||
def print_version(self, url):
|
||||
return url + '&template=printart'
|
||||
|
17
recipes/infra_pl.recipe
Normal file
@ -0,0 +1,17 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class INFRA(BasicNewsRecipe):
    """Calibre news recipe for the Polish news service INFRA (infra.org.pl).

    Each article is trimmed to the region between the ``contentheading``
    headline and the ``pagenav`` navigation element, which is itself removed.
    """

    title = u'INFRA'
    __author__ = 'fenuks'
    description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
    category = 'UFO'
    language = 'pl'
    cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'

    oldest_article = 7
    # Previously assigned twice with the same value; one assignment suffices.
    max_articles_per_feed = 100

    # Was misspelled "no_stylesheers", so the setting was silently ignored.
    no_stylesheets = True

    remove_tags_before = dict(name='h2', attrs={'class':'contentheading'})
    remove_tags_after = dict(attrs={'class':'pagenav'})
    remove_tags = [dict(attrs={'class':'pagenav'})]

    feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
|
12
recipes/izdiham.com.recipe
Normal file
@ -0,0 +1,12 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class BasicUserRecipe1324158549(BasicNewsRecipe):
    """Recipe for the Turkish site izdiham.com, driven by its single RSS feed."""

    # Identification
    __author__ = 'asalet_r'
    title = u'izdiham.com'
    language = 'tr'

    # Fetch limits: items from the last week, at most 20 per feed
    max_articles_per_feed = 20
    oldest_article = 7

    # Let calibre's automatic clean-up extract the article text
    auto_cleanup = True

    feeds = [
        (u'\u0130zdiham', u'http://www.izdiham.com/index.php/feed'),
    ]
|
18
recipes/japan_news.recipe
Normal file
@ -0,0 +1,18 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class NewsOnJapan(BasicNewsRecipe):
    """Recipe for newsonjapan.com, built from the site's top-stories RSS feed."""

    # Identification
    __author__ = 'Krittika Goyal'
    title = u'News On Japan'
    language = 'en'

    # Fetch limits
    oldest_article = 1  # days
    max_articles_per_feed = 25

    # Feed entries only link to the articles; the pages are downloaded,
    # stripped of site stylesheets and cleaned automatically.
    use_embedded_content = False
    no_stylesheets = True
    auto_cleanup = True

    feeds = [
        ('News', 'http://newsonjapan.com/rss/top.xml'),
    ]
|
72
recipes/klip_me.recipe
Normal file
@ -0,0 +1,72 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1299694372(BasicNewsRecipe):
    """Calibre recipe that downloads the articles saved in a Klip.me account.

    The "feeds" are the account's HTML listing pages rather than RSS, so the
    index is built by scraping them in ``parse_index``. A login is performed
    in ``get_browser`` when credentials are supplied.
    """

    title = u'Klipme'
    __author__ = 'Ken Sun'
    publisher = 'Klip.me'
    category = 'info, custom, Klip.me'
    oldest_article = 365
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    remove_tags = [
        dict(name='div', attrs={'id':'text_controls_toggle'}),
        dict(name='script'),
        dict(name='div', attrs={'id':'text_controls'}),
        dict(name='div', attrs={'id':'editing_controls'}),
        dict(name='div', attrs={'class':'bar bottom'}),
    ]
    use_embedded_content = False
    needs_subscription = True
    INDEX = u'http://www.klip.me'
    LOGIN = INDEX + u'/fav/signin?callback=/fav'

    feeds = [
        (u'Klip.me unread', u'http://www.klip.me/fav'),
        # Label previously read "started"; the feed lists starred items.
        (u'Klip.me starred', u'http://www.klip.me/fav?s=starred'),
    ]

    def get_browser(self):
        """Return a browser, logged in to Klip.me when a username is set."""
        # NOTE(review): called without an instance, exactly as before; confirm
        # this matches the get_browser signature of the targeted calibre
        # release before changing it.
        br = BasicNewsRecipe.get_browser()
        if self.username is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['Email'] = self.username
            if self.password is not None:
                br['Passwd'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Scrape every listing page and return ``[(feed title, articles)]``.

        Each article is a dict holding only its (site-relative) ``url``; the
        title is filled in later by ``populate_article_metadata``.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, 'Fetching feed'+' %s...'%(feedtitle if feedtitle else feedurl))
            soup = self.index_to_soup(feedurl)
            articles = []
            for item in soup.findAll('table',attrs={'class':['item','item new']}):
                atag = item.a
                # Only rows whose first link carries an href yield an article.
                if atag and atag.has_key('href'):
                    articles.append({'url': atag['href']})
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

    def print_version(self, url):
        """Turn the site-relative article path into an absolute URL."""
        return 'http://www.klip.me' + url

    def _page_title(self, soup):
        """Return the stripped text of the page's <title>, or None if absent/empty."""
        title_tag = soup.find('title')
        if title_tag is None or not title_tag.contents:
            return None
        return title_tag.contents[0].strip()

    def populate_article_metadata(self, article, soup, first):
        """Name the article after the downloaded page's <title>, when it has one."""
        page_title = self._page_title(soup)
        if page_title is not None:
            article.title = page_title

    def postprocess_html(self, soup, first_fetch):
        """Prepend the page title as a heading to every ``id="story"`` element."""
        # Looked up once; pages without a usable <title> are left unchanged
        # instead of raising.
        page_title = self._page_title(soup)
        if page_title is not None:
            for link_tag in soup.findAll(attrs={"id" : "story"}):
                link_tag.insert(0,'<h1>'+page_title+'</h1>')
        return soup
|
||||
|
@ -1,79 +1,79 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Attis <attis@attis.one.pl>'
|
||||
__copyright__ = '2011 Attis <attis@attis.one.pl>, 2012 Tomasz Długosz <tomek3d@gmail.com>'
|
||||
__version__ = 'v. 0.1'
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class KopalniaWiedzy(BasicNewsRecipe):
|
||||
title = u'Kopalnia Wiedzy'
|
||||
publisher = u'Kopalnia Wiedzy'
|
||||
description = u'Ciekawostki ze świata nauki i techniki'
|
||||
encoding = 'utf-8'
|
||||
__author__ = 'Attis'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
INDEX = u'http://kopalniawiedzy.pl/'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
|
||||
remove_tags_after = dict(attrs={'class':'ad-square'})
|
||||
keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
|
||||
extra_css = '.topimage {margin-top: 30px}'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
||||
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
|
||||
(re.compile(u'<br /><br />'),
|
||||
lambda match: '<br\/>')
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
|
||||
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
|
||||
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
|
||||
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
|
||||
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
|
||||
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
|
||||
]
|
||||
|
||||
def is_link_wanted(self, url, tag):
|
||||
return tag['class'] == 'next'
|
||||
|
||||
def remove_beyond(self, tag, next):
|
||||
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||
after = getattr(tag, next)
|
||||
while after is not None:
|
||||
ns = getattr(tag, next)
|
||||
after.extract()
|
||||
after = ns
|
||||
tag = tag.parent
|
||||
|
||||
def append_page(self, soup, appendtag, position):
|
||||
pager = soup.find('a',attrs={'class':'next'})
|
||||
if pager:
|
||||
nexturl = self.INDEX + pager['href']
|
||||
soup2 = self.index_to_soup(nexturl)
|
||||
texttag = soup2.find('div', attrs={'id':'articleContent'})
|
||||
|
||||
tag = texttag.find(attrs={'class':'pages'})
|
||||
self.remove_beyond(tag, 'nextSibling')
|
||||
|
||||
newpos = len(texttag.contents)
|
||||
self.append_page(soup2,texttag,newpos)
|
||||
title = u'Kopalnia Wiedzy'
|
||||
publisher = u'Kopalnia Wiedzy'
|
||||
description = u'Ciekawostki ze świata nauki i techniki'
|
||||
encoding = 'utf-8'
|
||||
__author__ = 'Attis & Tomasz Długosz'
|
||||
language = 'pl'
|
||||
oldest_article = 7
|
||||
max_articles_per_feed = 100
|
||||
INDEX = u'http://kopalniawiedzy.pl/'
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
|
||||
appendtag.insert(position,texttag)
|
||||
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'}}, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}, {'name':'div', 'attrs': {'class':'article-time-and-cat'}}, {'name':'p', 'attrs': {'class':'tags'}}]
|
||||
remove_tags_after = dict(attrs={'class':'ad-square'})
|
||||
keep_only_tags = [dict(name="div", attrs={'class':'article-text text-small'})]
|
||||
extra_css = '.topimage {margin-top: 30px}'
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
|
||||
lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
|
||||
(re.compile(u'<br /><br />'),
|
||||
lambda match: '<br\/>')
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
|
||||
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
|
||||
(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
|
||||
(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
|
||||
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
|
||||
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
|
||||
]
|
||||
|
||||
def is_link_wanted(self, url, tag):
    """Return whether the link represented by ``tag`` is wanted.

    Only anchors whose ``class`` attribute is exactly ``'next'`` qualify.

    :param url: address the anchor points to (not inspected here).
    :param tag: the anchor element; attribute lookup is done with ``get``.
    :return: ``True`` when the anchor's class equals ``'next'``, else ``False``.
    """
    # .get() rather than tag['class']: an anchor without any class attribute
    # is simply "not wanted" instead of raising KeyError.
    return tag.get('class') == 'next'
|
||||
|
||||
def remove_beyond(self, tag, next):
    """Detach every node that lies beyond ``tag`` in one direction.

    Starting at ``tag`` and then moving up through its ancestors, all
    siblings reachable through the attribute named by ``next`` are extracted.
    The climb stops at the element named ``body`` (which is left alone) or
    when there is no parent left.

    :param tag: node to start from; ``None`` makes the call a no-op.
    :param next: name of the sibling attribute to follow, e.g.
        ``'nextSibling'``.
    """
    while tag is not None and getattr(tag, 'name', None) != 'body':
        after = getattr(tag, next)
        while after is not None:
            # Read the following node from `after` itself *before* detaching
            # it.  Reading it from `tag` (as before) handed back the node that
            # was about to be removed, so each node was extracted twice and
            # the loop only ended because extract() relinks the siblings.
            following = getattr(after, next)
            after.extract()
            after = following
        tag = tag.parent
|
||||
|
||||
def append_page(self, soup, appendtag, position):
    """Pull the continuation pages of an article into ``appendtag``.

    If ``soup`` contains an ``<a class="next">`` link, the linked page is
    fetched, its ``<div id="articleContent">`` is trimmed of everything after
    the element with class ``pages``, further continuation pages are appended
    to that div recursively, and the div is finally inserted into
    ``appendtag``.

    :param soup: parsed page that may carry a "next" link.
    :param appendtag: element that receives the continuation content.
    :param position: index at which the content is inserted into
        ``appendtag``.
    """
    pager = soup.find('a', attrs={'class': 'next'})
    if not pager:
        return

    nexturl = self.INDEX + pager['href']
    soup2 = self.index_to_soup(nexturl)
    texttag = soup2.find('div', attrs={'id': 'articleContent'})
    if texttag is None:
        # The continuation page has no article container; there is nothing to
        # append, and calling .find() on None would abort the whole article.
        return

    # `tag` may be None when the page has no pager block; remove_beyond
    # treats None as "nothing to trim".
    tag = texttag.find(attrs={'class': 'pages'})
    self.remove_beyond(tag, 'nextSibling')

    # Later pages go after everything already present in this page's text.
    newpos = len(texttag.contents)
    self.append_page(soup2, texttag, newpos)

    appendtag.insert(position, texttag)
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
    """Merge continuation pages into the article and drop page furniture.

    Calls ``append_page`` to insert continuation content into ``soup.body``
    at index 3, then extracts every ``<div class="pages">`` and every
    ``<p class="wykop">`` element.

    This method used to be defined twice, verbatim; the second copy silently
    replaced the first, so a single definition is kept.

    :param soup: parsed article page; modified in place.
    :return: the same ``soup`` object.
    """
    self.append_page(soup, soup.body, 3)

    # (tag name, class) pairs of the elements to remove from the page.
    for name, css_class in (('div', 'pages'), ('p', 'wykop')):
        for item in soup.findAll(name, attrs={'class': css_class}):
            item.extract()

    return soup
|
||||
|
14
recipes/kosmonauta_pl.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
class Kosmonauta(BasicNewsRecipe):
    # Feed-only recipe for Kosmonauta.net: purely declarative settings, no
    # methods are overridden.

    # --- identification -----------------------------------------------------
    title = u'Kosmonauta.net'
    __author__ = 'fenuks'
    # Polish: "Polish-language portal entirely dedicated to space missions
    # and space exploration."
    description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
    category = 'astronomy'
    language = 'pl'
    cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'

    # --- fetch limits and clean-up -----------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True

    # --- sources: (feed title, feed URL) -----------------------------------
    feeds = [
        (u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html'),
    ]
|
@ -1,10 +1,9 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
|
||||
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.la-razon.com
|
||||
'''
|
||||
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LaRazon_Bol(BasicNewsRecipe):
|
||||
@ -16,19 +15,17 @@ class LaRazon_Bol(BasicNewsRecipe):
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 200
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
encoding = 'utf8'
|
||||
use_embedded_content = False
|
||||
language = 'es_BO'
|
||||
publication_type = 'newspaper'
|
||||
delay = 1
|
||||
remove_empty_feeds = True
|
||||
cover_url = strftime('http://www.la-razon.com/portadas/%Y%m%d_LaRazon.jpg')
|
||||
masthead_url = 'http://www.la-razon.com/imagenes/logo.jpg'
|
||||
extra_css = """ body{font-family: Arial,Helvetica,sans-serif }
|
||||
img{margin-bottom: 0.4em}
|
||||
.noticia-titulo{font-family: Georgia,"Times New Roman",Times,serif}
|
||||
.lead{font-weight: bold; font-size: 0.8em}
|
||||
"""
|
||||
masthead_url = 'http://www.la-razon.com/static/LRZRazon/images/lrz-logo.png'
|
||||
extra_css = """ body{font-family: Georgia,"Times New Roman",Times,serif}
|
||||
img{margin-bottom: 0.4em; display: block}
|
||||
.meta{font-size: small; font-family: Arial,Helvetica,sans-serif}
|
||||
"""
|
||||
INDEX = 'http://www.la-razon.com/'
|
||||
|
||||
conversion_options = {
|
||||
'comment' : description
|
||||
@ -37,28 +34,37 @@ class LaRazon_Bol(BasicNewsRecipe):
|
||||
, 'language' : language
|
||||
}
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['noticia-titulo','noticia-desarrollo']})]
|
||||
remove_tags = [dict(name=['meta','link','form','iframe','embed','object'])]
|
||||
keep_only_tags = [dict(name='div', attrs={'class':['pg-hd', 'pg-bd']})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','link','form','iframe','embed','object'])
|
||||
,dict(name='div', attrs={'class':'bd'})
|
||||
]
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
feeds = [
|
||||
(u'Editorial' , u'http://www.la-razon.com/rss_editorial.php' )
|
||||
,(u'Opinión' , u'http://www.la-razon.com/rss_opinion.php' )
|
||||
,(u'Nacional' , u'http://www.la-razon.com/rss_nacional.php' )
|
||||
,(u'Economia' , u'http://www.la-razon.com/rss_economia.php' )
|
||||
,(u'Ciudades' , u'http://www.la-razon.com/rss_ciudades.php' )
|
||||
,(u'Sociedad' , u'http://www.la-razon.com/rss_sociedad.php' )
|
||||
,(u'Mundo' , u'http://www.la-razon.com/rss_sociedad.php' )
|
||||
,(u'La Revista' , u'http://www.la-razon.com/rss_larevista.php' )
|
||||
,(u'Sociales' , u'http://www.la-razon.com/rss_sociales.php' )
|
||||
,(u'Mia' , u'http://www.la-razon.com/rss_mia.php' )
|
||||
,(u'Marcas' , u'http://www.la-razon.com/rss_marcas.php' )
|
||||
,(u'Escape' , u'http://www.la-razon.com/rss_escape.php' )
|
||||
,(u'El Financiero' , u'http://www.la-razon.com/rss_financiero.php')
|
||||
,(u'Tendencias' , u'http://www.la-razon.com/rss_tendencias.php')
|
||||
(u'Editorial' , u'http://www.la-razon.com/rss/opinion/editorial/' )
|
||||
,(u'Nacional' , u'http://www.la-razon.com/rss/nacional/' )
|
||||
,(u'Economia' , u'http://www.la-razon.com/rss/economia/' )
|
||||
,(u'Ciudades' , u'http://www.la-razon.com/rss/ciudades/' )
|
||||
,(u'Sociedad' , u'http://www.la-razon.com/rss/sociedad/' )
|
||||
,(u'Mundo' , u'http://www.la-razon.com/rss/mundo/' )
|
||||
,(u'La Revista' , u'http://www.la-razon.com/rss/la_revista/' )
|
||||
,(u'Sociales' , u'http://www.la-razon.com/rss/sociales/' )
|
||||
,(u'Mia' , u'http://www.la-razon.com/rss/suplementos/mia/' )
|
||||
,(u'Marcas' , u'http://www.la-razon.com/rss/marcas/' )
|
||||
,(u'Escape' , u'http://www.la-razon.com/rss/suplementos/escape/' )
|
||||
,(u'El Financiero' , u'http://www.la-razon.com/rss/suplementos/financiero/')
|
||||
,(u'Tendencias' , u'http://www.la-razon.com/rss/suplementos/tendencias/')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
    """Remove the inline ``style`` attribute from every element that has one.

    :param soup: parsed article page; modified in place.
    :return: the same ``soup`` object.
    """
    styled_elements = soup.findAll(style=True)
    for element in styled_elements:
        del element['style']
    return soup
|
||||
|
||||
def get_cover_url(self):
    """Return the image address shown in the front-page lightbox.

    The page at ``self.INDEX`` is fetched and the ``src`` of the image inside
    ``<div class="lightbox lightbox-frontpage">`` is returned.

    :return: the image URL, or -- when the div, its image or the ``src``
        attribute is missing -- the recipe's ``cover_url`` attribute if one is
        set, otherwise ``None``.
    """
    soup = self.index_to_soup(self.INDEX)
    lightbox = soup.find('div', attrs={'class': 'lightbox lightbox-frontpage'})
    img = lightbox.img if lightbox is not None else None
    src = img.get('src') if img is not None else None
    if src:
        return src
    # A changed front-page layout must not abort the whole download with an
    # AttributeError/KeyError; fall back to whatever cover is configured.
    return getattr(self, 'cover_url', None)
|
||||
|
||||
|
||||
|
@ -1,13 +1,12 @@
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
|
||||
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
|
||||
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
|
||||
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'
|
||||
|
||||
'''
|
||||
http://www.repubblica.it/
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
@ -32,12 +31,6 @@ class LaRepubblica(BasicNewsRecipe):
|
||||
"""
|
||||
|
||||
remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']
|
||||
|
||||
preprocess_regexps = [
|
||||
(re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
|
||||
(re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
|
||||
(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
link = BasicNewsRecipe.get_article_url(self, article)
|
||||
@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
|
||||
remove_tags = [
|
||||
dict(name=['object','link','meta','iframe','embed']),
|
||||
dict(name='span',attrs={'class':'linkindice'}),
|
||||
dict(name='div', attrs={'class':'bottom-mobile'}),
|
||||
dict(name='div', attrs={'id':['rssdiv','blocco']}),
|
||||
dict(name='div', attrs={'class':'utility'}),
|
||||
dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
|
||||
dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
|
||||
dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
|
||||
dict(name='div', attrs={'class':'generalbox'}),
|
||||
dict(name='ul', attrs={'id':'hystory'})
|
||||
]
|
||||
|
||||
feeds = [
|
||||
(u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
|
||||
(u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
|
||||
(u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
|
||||
(u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
|
||||
(u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
|
||||
@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
|
||||
del item['style']
|
||||
return soup
|
||||
|
||||
def preprocess_raw_html(self, raw, url):
    """Replace everything before ``</head>`` with a bare ``<html><head>``.

    :param raw: the page markup as text.
    :param url: address the markup came from (not used).
    :return: ``'<html><head>'`` followed by ``raw`` from its first
        ``</head>`` onwards; ``raw`` unchanged when it has no ``</head>``.
    """
    head_end = raw.find('</head>')
    if head_end < 0:
        # find() returns -1 when the marker is absent; slicing with it would
        # keep only the last character and discard the whole document.
        return raw
    return '<html><head>' + raw[head_end:]
|
||||
|
14
recipes/lega_nerd.recipe
Normal file
@ -0,0 +1,14 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1326135232(BasicNewsRecipe):
    # Feed-only recipe for Lega Nerd: purely declarative settings, no methods
    # are overridden.

    # --- authorship ---------------------------------------------------------
    __author__ = 'faber1971'
    __version__ = 'v1.0'
    # NOTE(review): the number in the class name, read as a Unix timestamp,
    # is 9 January 2012 -- the year below may be a typo; confirm with author.
    __date__ = '9, January 2011'

    # --- identification -----------------------------------------------------
    title = u'Lega Nerd'
    description = 'nerd / geek culture, pc, comics, music, culture'
    language = 'it'

    # --- fetch limits and clean-up -----------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True

    # --- sources: (feed title, feed URL) -----------------------------------
    feeds = [
        (u'Lega Nerd', u'http://feeds.feedburner.com/LegaNerd'),
    ]
|
94
recipes/letsgetcritical.recipe
Normal file
@ -0,0 +1,94 @@
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class LetsGetCritical(BasicNewsRecipe):
    # Recipe for letsgetcritical.org.  The index is built by walking the
    # site's category listing pages (see parse_index) instead of RSS feeds.
    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    # Captures the host name of an article URL (group 2, without "www.").
    _HOST_RE = re.compile(r'http://(www\.)?([^/:]+)', re.I)

    def parse_index(self):
        """Build the feed list by scraping the category listing pages.

        For every entry of ``CATEGORIES`` the listing pages are read one after
        another until the category's article limit is reached, a page cannot
        be fetched, a page contains no ``post_multi`` div, or page number 99
        has been read.  URLs are de-duplicated across all categories.

        :return: list of ``(category name, [article dict, ...])`` tuples; a
            category without articles is omitted.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        # Shared by all categories so that an article is listed only once.
        seen_urls = set()

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/category/' + tag.lower() if tag else ''
            self.log('Reading category:', cat_name)

            articles = []
            pageno = 1
            while len(articles) < max_articles and pageno < 100:
                # The first listing page has no "/page/N" suffix.
                if pageno > 1:
                    page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
                else:
                    page = self.INDEX + tagurl
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception:
                    # Any fetch/parse failure ends this category.  Unlike a
                    # bare "except:", KeyboardInterrupt and SystemExit still
                    # propagate.
                    self.log('\tCould not read page, leaving category:', page)
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if not posts:
                    break

                for post in posts:
                    article = self._parse_post(post, seen_urls)
                    if article is None:
                        continue
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds

    def _parse_post(self, post, seen_urls):
        """Turn one ``post_multi`` div into an article dict, or ``None`` to skip it.

        ``seen_urls`` is updated with the URL of every accepted post.
        """
        heading = post.find('div', attrs={'class': 'title'})
        atag = heading.find('a') if heading is not None else None
        url = atag.get('href') if atag is not None else None
        if not url:
            # Post without a title link: nothing to download.
            return None
        # skip promotionals and duplicate
        if url.startswith(('http://letsgetcritical', '/')) or url in seen_urls:
            return None
        seen_urls.add(url)

        title = self.tag_to_string(atag)
        self.log('\tFound article:', title)
        self.log('\t', url)

        quote = post.find('blockquote')
        desc = self.tag_to_string(quote) if quote is not None else ''
        host = self._HOST_RE.match(url)
        if host:
            # Prefix the description with the host the article lives on.
            desc = "[%s] %s" % (host.group(2), desc)

        return {
            'title': title,
            'url': url,
            'description': desc,
            'date': self._find_post_date(post),
        }

    def _find_post_date(self, post):
        """Return the text of the nearest preceding ``singledate`` sibling, or ``''``."""
        node = post.previousSibling
        # navigate up sibling to find date
        while node is not None:
            # Text nodes have no get(); tags without a class yield None.  The
            # former hasattr(p, 'class') / p['class'] test raised KeyError on
            # any sibling tag lacking a class attribute.
            if hasattr(node, 'get') and node.get('class') == 'singledate':
                return self.tag_to_string(node)
            node = node.previousSibling
        return ''
|
||||
|
@ -41,7 +41,7 @@ class LosTiempos_Bol(BasicNewsRecipe):
|
||||
keep_only_tags = [dict(name='div', attrs={'id':'articulo'})]
|
||||
remove_tags = [
|
||||
dict(name=['meta','link','form','iframe','embed','object','hr'])
|
||||
,dict(attrs={'class':['caja_fonts sin_border_bot','pub']})
|
||||
,dict(attrs={'class':['caja_fonts sin_border_bot','pub','twitter-share-button']})
|
||||
]
|
||||
remove_attributes = ['width','height']
|
||||
|
||||
|
@ -14,8 +14,11 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
description = 'Weekly summary of what has happened in the free software world.'
|
||||
__author__ = 'Davide Cavalca'
|
||||
language = 'en'
|
||||
site_url = 'http://lwn.net'
|
||||
|
||||
cover_url = 'http://lwn.net/images/lcorner.png'
|
||||
extra_css = 'pre,code,samp,kbd,tt { font-size: 80% }\nblockquote {margin-left:0 }\n* { color: black }\n'
|
||||
|
||||
cover_url = site_url + '/images/lcorner.png'
|
||||
#masthead_url = 'http://lwn.net/images/lcorner.png'
|
||||
publication_type = 'magazine'
|
||||
|
||||
@ -43,11 +46,29 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def print_version(self, url):
    """Return the printable-format address for ``url``.

    The fragment (``#...``) is dropped, ``self.site_url`` is put in front
    unless the address already begins with it, and ``?format=printable`` is
    added unless the address already ends with it.

    :param url: absolute or site-relative article address.
    :return: the rewritten address.
    """
    suffix = '?format=printable'

    # Everything from the first '#' on is the anchor; keep what precedes it.
    address = url.partition('#')[0]

    if not address.startswith(self.site_url):
        address = self.site_url + address

    if not address.endswith(suffix):
        address = address + suffix

    return address
|
||||
|
||||
def parse_index(self):
|
||||
if self.username is not None and self.password is not None:
|
||||
index_url = 'http://lwn.net/current/bigpage?format=printable'
|
||||
index_url = self.print_version('/current/bigpage')
|
||||
else:
|
||||
index_url = 'http://lwn.net/free/bigpage?format=printable'
|
||||
index_url = self.print_version('/free/bigpage')
|
||||
soup = self.index_to_soup(index_url)
|
||||
body = soup.body
|
||||
|
||||
@ -56,19 +77,19 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
url_re = re.compile('^/Articles/')
|
||||
|
||||
while True:
|
||||
tag_title = body.findNext(name='p', attrs={'class':'SummaryHL'})
|
||||
tag_title = body.findNext(attrs={'class':'SummaryHL'})
|
||||
if tag_title == None:
|
||||
break
|
||||
|
||||
tag_section = tag_title.findPrevious(name='p', attrs={'class':'Cat1HL'})
|
||||
tag_section = tag_title.findPrevious(attrs={'class':'Cat1HL'})
|
||||
if tag_section == None:
|
||||
section = 'Front Page'
|
||||
else:
|
||||
section = tag_section.string
|
||||
|
||||
tag_section2 = tag_title.findPrevious(name='p', attrs={'class':'Cat2HL'})
|
||||
tag_section2 = tag_title.findPrevious(attrs={'class':'Cat2HL'})
|
||||
if tag_section2 != None:
|
||||
if tag_section2.findPrevious(name='p', attrs={'class':'Cat1HL'}) == tag_section:
|
||||
if tag_section2.findPrevious(attrs={'class':'Cat1HL'}) == tag_section:
|
||||
section = "%s: %s" %(section, tag_section2.string)
|
||||
|
||||
if section not in articles.keys():
|
||||
@ -94,9 +115,10 @@ class WeeklyLWN(BasicNewsRecipe):
|
||||
if tag_url == None:
|
||||
break
|
||||
|
||||
|
||||
article = dict(
|
||||
title=self.tag_to_string(tag_title),
|
||||
url= 'http://lwn.net' + tag_url['href'].split('#')[0] + '?format=printable',
|
||||
url=tag_url['href'],
|
||||
description='', content='', date='')
|
||||
articles[section].append(article)
|
||||
|
||||
|