Sync to trunk.

commit d9babbc43b
Author: John Schember
Date:   2011-12-26 17:56:09 -05:00

612 changed files with 206194 additions and 119528 deletions


@@ -2,6 +2,7 @@
 .check-cache.pickle
 src/calibre/plugins
 resources/images.qrc
+src/calibre/ebooks/oeb/display/test/*.js
 src/calibre/manual/.build/
 src/calibre/manual/cli/
 src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
 resources/builtin_recipes.xml
 resources/builtin_recipes.zip
 resources/template-functions.json
+resources/display/*.js
 setup/installer/windows/calibre/build.log
 src/calibre/translations/.errors
 src/cssutils/.svn/

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,5 +1,5 @@
 __license__   = 'GPL v3'
-__copyright__ = '2010, Dean Cording'
+__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
 '''
 abc.net.au/news
 '''
@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
 class ABCNews(BasicNewsRecipe):
     title = 'ABC News'
-    __author__ = 'Dean Cording'
+    __author__ = 'Pat Stapleton, Dean Cording'
     description = 'News from Australia'
     masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
     cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
     category = 'News, Australia, World'
     language = 'en_AU'
     publication_type = 'newsportal'
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    # Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
+    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
     conversion_options = {
         'comments'  : description
         ,'tags'     : category
@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
         ,'linearize_tables': False
     }
-    keep_only_tags = dict(id='article')
-    remove_tags = [dict(attrs={'class':['related', 'tags']}),
-                   dict(id='statepromo')
-                  ]
+    keep_only_tags = [dict(attrs={'class':['article section']})]
+    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
+                   'inline-content story left', 'inline-content map left contracted', 'published',
+                   'story-map', 'statepromo', 'topics', ]})]
     remove_attributes = ['width','height']
     feeds = [
-        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
-        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
-        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
-        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
-        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
-        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
-        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
-        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
-        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
-        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
+        ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
+        ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
+        ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
+        ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
+        ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
+        ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
+        ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
+        ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
+        ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
+        ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
     ]
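
A quick standalone sanity check of the new map-link pattern above; the sample markup is hypothetical, not taken from abc.net.au:

import re

# Pattern from the commit: remove inline map links while leaving other
# inline-caption elements (ordinary image captions) untouched.
pattern = re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL)

sample = ('<p>Story text.</p>'
          '<a class="inline-caption" href="http://maps.google.com/?q=sydney">Map</a>'
          '<span class="inline-caption">Photo: AAP</span>')

print(pattern.sub('', sample))
# <p>Story text.</p><span class="inline-caption">Photo: AAP</span>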


@@ -1,19 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
 class Adventure_zone(BasicNewsRecipe):
     title = u'Adventure Zone'
     __author__ = 'fenuks'
     description = 'Adventure zone - adventure games from A to Z'
     category = 'games'
     language = 'pl'
-    oldest_article = 15
-    max_articles_per_feed = 100
     no_stylesheets = True
+    oldest_article = 20
+    max_articles_per_feed = 100
+    use_embedded_content=False
+    preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
     remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
-    remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'})
+    remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
+    remove_tags_after= dict(id='comments')
     extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
     feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
+    def parse_feeds (self):
+        feeds = BasicNewsRecipe.parse_feeds(self)
+        soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
+        tag=soup.find(name='channel')
+        titles=[]
+        for r in tag.findAll(name='image'):
+            r.extract()
+        art=tag.findAll(name='item')
+        for i in art:
+            titles.append(i.title.string)
+        for feed in feeds:
+            for article in feed.articles[:]:
+                article.title=titles[feed.articles.index(article)]
+        return feeds
     def get_cover_url(self):
         soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
         cover=soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
     def skip_ad_pages(self, soup):
-        skip_tag = soup.body.findAll(name='a')
-        if skip_tag is not None:
-            for r in skip_tag:
-                if 'articles.php?' in r['href']:
-                    if r.strong is not None:
-                        word=r.strong.string
-                        if ('zapowied' or 'recenzj') in word:
-                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
-                        else:
-                            None
-    def print_version(self, url):
-        return url.replace('news.php?readmore', 'print.php?type=N&item_id')
+        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
+        skip_tag = skip_tag.findAll(name='a')
+        for r in skip_tag:
+            if r.strong:
+                word=r.strong.string
+                if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
+                    return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
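
Worth noting: the rewrite also fixes a subtle precedence bug in the old keyword test. In ('zapowied' or 'recenzj') in word, the parenthesised expression evaluates first and is simply 'zapowied', so 'recenzj' was never checked. A minimal demonstration:

word = u'recenzja gry'

# Old, buggy test: ('zapowied' or 'recenzj') evaluates to 'zapowied',
# so only the first substring is ever checked.
print(('zapowied' or 'recenzj') in word)              # False

# New, correct test from the commit:
print(('zapowied' in word) or ('recenzj' in word))    # True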


@@ -1,5 +1,4 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-
 class AstroNEWS(BasicNewsRecipe):
     title = u'AstroNEWS'
     __author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
     language = 'pl'
     oldest_article = 8
     max_articles_per_feed = 100
-    auto_cleanup = True
+    #extra_css= 'table {text-align: left;}'
+    no_stylesheets=True
     cover_url='http://news.astronet.pl/img/logo_news.jpg'
-    # no_stylesheets= True
+    remove_tags=[dict(name='hr')]
     feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
     def print_version(self, url):
         return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
+    def preprocess_html(self, soup):
+        for item in soup.findAll(align=True):
+            del item['align']
+        return soup
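
The added preprocess_html simply strips deprecated align attributes before conversion. A standalone sketch of the same idea, using bs4 rather than the BeautifulSoup build bundled with calibre; the sample markup is made up:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<table align="center"><tr><td align="left">x</td></tr></table>',
                     'html.parser')

# Same loop as the recipe method: drop every align attribute found.
for item in soup.findAll(align=True):
    del item['align']

print(soup)  # <table><tr><td>x</td></tr></table>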


@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = u'2011, Silviu Cotoar\u0103'
+'''
+b365.realitatea.net
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class b365Realitatea(BasicNewsRecipe):
+    title = u'b365 Realitatea'
+    __author__ = u'Silviu Cotoar\u0103'
+    publisher = u'b365 Realitatea'
+    description = u'b365 Realitatea'
+    oldest_article = 5
+    language = 'ro'
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    category = 'Ziare,Romania,Bucuresti'
+    encoding = 'utf-8'
+    cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png'
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+    }
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'newsArticle'})
+    ]
+    remove_tags = [
+        dict(name='div', attrs={'class':'date'})
+        , dict(name='dic', attrs={'class':'addthis_toolbox addthis_default_style'})
+        , dict(name='div', attrs={'class':'related_posts'})
+        , dict(name='div', attrs={'id':'RelevantiWidget'})
+    ]
+    remove_tags_after = [
+        dict(name='div', attrs={'id':'RelevantiWidget'})
+    ]
+    feeds = [
+        (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/')
+    ]
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)


@@ -1,61 +1,648 @@
-__license__   = 'GPL v3'
-__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
-'''
-news.bbc.co.uk
-'''
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-class BBC(BasicNewsRecipe):
-    title                 = 'BBC News'
-    __author__            = 'Darko Miletic, Starson17'
-    description           = 'News from UK. '
-    oldest_article        = 2
-    max_articles_per_feed = 100
-    no_stylesheets        = True
-    #delay                = 1
-    use_embedded_content  = False
-    encoding              = 'utf8'
-    publisher             = 'BBC'
-    category              = 'news, UK, world'
-    language              = 'en_GB'
-    publication_type      = 'newsportal'
-    extra_css             = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
-    preprocess_regexps    = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
-    conversion_options = {
-        'comments'  : description
-        ,'tags'     : category
-        ,'language' : language
-        ,'publisher': publisher
-        ,'linearize_tables': True
-    }
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['layout-block-a layout-block']})
-        ,dict(attrs={'class':['story-body','storybody']})
-    ]
-    remove_tags = [
-        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
-                                         'story-feature wide ', 'story-feature narrow']}),
-        dict(id=['hypertab', 'comment-form']),
-    ]
-    remove_attributes = ['width','height']
-    feeds = [
-        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
-        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
-        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
-        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
-        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
-        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
-        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
-        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
-        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
-        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
-        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
-        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
-        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
-    ]
+##
+## Title:       BBC News, Sport, and Blog Calibre Recipe
+## Contact:     mattst - jmstanfield@gmail.com
+##
+## License:     GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+## Copyright:   mattst - jmstanfield@gmail.com
+##
+## Written:     November 2011
+## Last Edited: 2011-11-19
+##
+
+__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+__copyright__ = 'mattst - jmstanfield@gmail.com'
+
+'''
+BBC News, Sport, and Blog Calibre Recipe
+'''
+
+# Import the regular expressions module.
+import re
+
+# Import the BasicNewsRecipe class which this class extends.
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class BBCNewsSportBlog(BasicNewsRecipe):
+
+    #
+    #    **** IMPORTANT USERS READ ME ****
+    #
+    # First select the feeds you want then scroll down below the feeds list
+    # and select the values you want for the other user preferences, like
+    # oldest_article and such like.
+    #
+    #
+    # Select the BBC rss feeds which you want in your ebook.
+    # Selected feed have NO '#' at their start, de-selected feeds begin with a '#'.
+    #
+    # Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
+    # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
+    #
+    # There are 68 feeds below which constitute the bulk of the available rss
+    # feeds on the BBC web site. These include 5 blogs by editors and
+    # correspondants, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
+    # Wales, Scotland Business), and 7 Welsh language feeds.
+    #
+    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
+    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
+    # you may get some 'empty feeds' which will not then be included in the ebook.
+    #
+    # The 15 feeds currently selected below are simply my default ones.
+    #
+    # Note: With all 68 feeds selected, oldest_article set to 2,
+    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
+    # the ebook creation took 29 minutes on my speedy 100 mbps net connection,
+    # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
+    # More realistically with 15 feeds selected, oldest_article set to 1.5,
+    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
+    # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
+    #
+    # Select / de-select the feeds you want in your ebook.
+    #
+    feeds = [
+        ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
+        ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
+        ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
+        #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
+        #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
+        #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
+        #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
+        #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
+        #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
+        #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
+        #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
+        #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
+        ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
+        ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
+        ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
+        ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
+        ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
+        ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
+        #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
+        #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
+        ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
+        ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
+        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
+        #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
+        #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
+        ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
+        #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
+        #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
+        #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
+        ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
+        ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
+        #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
+        #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
+        #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
+        #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
+        #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
+        #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
+        #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
+        #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
+        #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
+        #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
+        #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
+        #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
+        #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
+        #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
+        #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
+        #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
+        #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
+        #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
+        #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
+        #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
+        #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
+        #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
+        #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
+        #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
+        #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
+        #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
+        #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
+        #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
+        #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
+        #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
+        #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
+        #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
+        #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
+        #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
+        #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
+        #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
+        #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
+    ]
+
+    #    **** SELECT YOUR USER PREFERENCES ****
+
+    # Title to use for the ebook.
+    #
+    title = 'BBC News'
+
+    # A brief description for the ebook.
+    #
+    description = u'BBC web site ebook created using rss feeds.'
+
+    # The max number of articles which may be downloaded from each feed.
+    # I've never seen more than about 70 articles in a single feed in the
+    # BBC feeds.
+    #
+    max_articles_per_feed = 100
+
+    # The max age of articles which may be downloaded from each feed. This is
+    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
+    # half days). My default of 1.5 days is the last 36 hours, the point at
+    # which I've decided 'news' becomes 'old news', but be warned this is not
+    # so good for the blogs, technology, magazine, etc., and sports feeds.
+    # You may wish to extend this to 2-5 but watch out ebook creation time will
+    # increase as well. Setting this to 30 will get everything (AFAICT) as long
+    # as max_articles_per_feed remains set high (except for 'Click' which is
+    # v. low volume and its currently oldest article is 4th Feb 2011).
+    #
+    oldest_article = 1.5
+
+    # Number of simultaneous downloads. 20 is consistantly working fine on the
+    # BBC News feeds with no problems. Speeds things up from the defualt of 5.
+    # If you have a lot of feeds and/or have increased oldest_article above 2
+    # then you may wish to try increasing simultaneous_downloads to 25-30,
+    # Or, of course, if you are in a hurry. [I've not tried beyond 20.]
+    #
+    simultaneous_downloads = 20
+
+    # Timeout for fetching files from the server in seconds. The default of
+    # 120 seconds, seems somewhat excessive.
+    #
+    timeout = 30
+
+    # The format string for the date shown on the ebook's first page.
+    # List of all values: http://docs.python.org/library/time.html
+    # Default in news.py has a leading space so that's mirrored here.
+    # As with 'feeds' select/de-select by adding/removing the initial '#',
+    # only one timefmt should be selected, here's a few to choose from.
+    #
+    timefmt = ' [%a, %d %b %Y]'            # [Fri, 14 Nov 2011] (Calibre default)
+    #timefmt = ' [%a, %d %b %Y %H:%M]'     # [Fri, 14 Nov 2011 18:30]
+    #timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]
+    #timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
+    #timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18.30]
+    #timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
+    #timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]
+
+    #
+    #    **** IMPORTANT ****
+    #
+    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
+    #
+    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
+    #
+    #    I MEAN IT, YES I DO, ABSOLUTELY, AT YOU OWN RISK. :)
+    #
+    #    **** IMPORTANT ****
+    #
+
+    # Author of this recipe.
+    __author__ = 'mattst'
+
+    # Specify English as the language of the RSS feeds (ISO-639 code).
+    language = 'en_GB'
+
+    # Set tags.
+    tags = 'news, sport, blog'
+
+    # Set publisher and publication type.
+    publisher = 'BBC'
+    publication_type = 'newspaper'
+
+    # Disable stylesheets from site.
+    no_stylesheets = True
+
+    # Specifies an override encoding for sites that have an incorrect charset
+    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
+    # use 'utf8', which works fine (so use that if necessary) but auto-detecting
+    # with None is working fine, so stick with that for robustness.
+    encoding = None
+
+    # Sets whether a feed has full articles embedded in it. The BBC feeds do not.
+    use_embedded_content = False
+
+    # Removes empty feeds - why keep them!?
+    remove_empty_feeds = True
+
+    # Create a custom title which fits nicely in the Kindle title list.
+    # Requires "import time" above class declaration, and replacing
+    # title with custom_title in conversion_options (right column only).
+    # Example of string below: "BBC News - 14 Nov 2011"
+    #
+    # custom_title = "BBC News - " + time.strftime('%d %b %Y')
+
+    '''
+    # Conversion options for advanced users, but don't forget to comment out the
+    # current conversion_options below. Avoid setting 'linearize_tables' as that
+    # plays havoc with the 'old style' table based pages.
+    #
+    conversion_options = { 'title'       : title,
+                           'comments'    : description,
+                           'tags'        : tags,
+                           'language'    : language,
+                           'publisher'   : publisher,
+                           'authors'     : publisher,
+                           'smarten_punctuation' : True
+                         }
+    '''
+
+    conversion_options = { 'smarten_punctuation' : True }
+
+    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
+    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
+                 .introduction, .first { font-weight: bold; } \
+                 .cross-head { font-weight: bold; font-size: 125%; } \
+                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
+                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
+                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
+                 .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
+                 text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
+                 .story-date, .published { font-size: 80%; } \
+                 table { width: 100%; } \
+                 td img { display: block; margin: 5px auto; } \
+                 ul { padding-top: 10px; } \
+                 ol { padding-top: 10px; } \
+                 li { padding-top: 5px; padding-bottom: 5px; } \
+                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
+                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
+                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
+                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
+
+    # Remove various tag attributes to improve the look of the ebook pages.
+    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
+                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]
+
+    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
+    # cause a section of the ebook to start in an unsightly fashion or, more
+    # frequently, a "<br />" will muck up the formatting of a correspondant's byline.
+    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
+    # style of pages, and really spoil the look of the ebook pages.
+    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
+                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
+
+    # Create regular expressions for tag keeping and removal to make the matches more
+    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
+    # and trailing spaces, missing hyphens, and such like.
+    # Python regular expression ('re' class) page: http://docs.python.org/library/re.html
+
+    # ***************************************
+    # Regular expressions for keep_only_tags:
+    # ***************************************
+
+    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
+    # page which contains the main text of the article. Match storybody variants: 'storybody',
+    # 'story-body', 'story body','storybody ', etc.
+    storybody_reg_exp = '^.*story[_ -]*body.*$'
+
+    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
+    # and published date. This is one level above the usual news pages which have the title
+    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
+    # resulting in a lot of extra things to be removed by remove_tags.
+    blq_content_reg_exp = '^.*blq[_ -]*content.*$'
+
+    # The BBC has an alternative page design structure, which I suspect is an out-of-date
+    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
+    # (travel), and in some sport pages. These alternative pages are table based (which is
+    # why I think they are an out-of-date design) and account for -I'm guesstimaking- less
+    # than 1% of all articles. They use a table class 'storycontent' to hold the article
+    # and like blq_content (above) have required lots of extra removal by remove_tags.
+    story_content_reg_exp = '^.*story[_ -]*content.*$'
+
+    # Keep the sections of the HTML which match the list below. The HTML page created by
+    # Calibre will fill <body> with those sections which are matched. Note that the
+    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
+    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
+    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
+    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
+    # will end up being discarded.
+    keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
+                       dict(name='div',   attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]
+
+    # ************************************
+    # Regular expressions for remove_tags:
+    # ************************************
+
+    # Regular expression to remove share-help and variant tags. The share-help class
+    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
+    # twitter, email. Removed to avoid page clutter.
+    share_help_reg_exp = '^.*share[_ -]*help.*$'
+
+    # Regular expression to remove embedded-hyper and variant tags. This class is used to
+    # display links to other BBC News articles on the same/similar subject.
+    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
+
+    # Regular expression to remove hypertabs and variant tags. This class is used to
+    # display a tab bar at the top of an article which allows the user to switch to
+    # an article (viewed on the same page) providing further info., 'in depth' analysis,
+    # an editorial, a correspondant's blog entry, and such like. The ability to handle
+    # a tab bar of this nature is currently beyond the scope of this recipe and
+    # possibly of Calibre itself (not sure about that - TO DO - check!).
+    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
+
+    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
+    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
+    # This class is used to add additional info. boxes, or small lists, outside of
+    # the main story. TO DO: Work out a way to incorporate these neatly.
+    story_feature_reg_exp = '^.*story[_ -]*feature.*$'
+
+    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
+    # 'videoInStoryC'. This class is used to embed video.
+    video_reg_exp = '^.*video.*$'
+
+    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
+    # This class is used to embed audio.
+    audio_reg_exp = '^.*audio.*$'
+
+    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
+    # This class is used to embed a photo slideshow. See also 'slideshow' below.
+    picture_gallery_reg_exp = '^.*picture.*$'
+
+    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
+    # This class is used to embed a slideshow (not necessarily photo) but both
+    # 'slideshow' and 'pictureGallery' are used for slideshows.
+    slideshow_reg_exp = '^.*slide[_ -]*show.*$'
+
+    # Regular expression to remove social-links and variant tags. This class is used to
+    # display links to a BBC bloggers main page, used in various columnist's blogs
+    # (Eg. Nick Robinson, Robert Preston).
+    social_links_reg_exp = '^.*social[_ -]*links.*$'
+
+    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
+    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
+    # removed by 'story-feature' removal (as they are usually within them), but
+    # not always. The quotation removed is always (AFAICT) in the article text
+    # as well but a 2nd copy is placed in a quote tag to draw attention to it.
+    # The quote class tags may or may not appear in div's.
+    quote_reg_exp = '^.*quote.*$'
+
+    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
+    # The purpose of these is unclear, they seem to be an internal link to a
+    # section within the article, but the text of the link (Eg. 'Continue reading
+    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
+    # The hidden class tags may or may not appear in div's.
+    hidden_reg_exp = '^.*hidden.*$'
+
+    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
+    # Used on the site to display text about registered users entering comments.
+    comment_reg_exp = '^.*comment.*$'
+
+    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
+    # Used on the site to allow registered BBC users to fill in forms, typically
+    # for entering comments about an article.
+    form_reg_exp = '^.*form.*$'
+
+    # Extra things to remove due to the addition of 'blq_content' in keep_only_tags.
+
+    #<div class="story-actions"> Used on sports pages for 'email' and 'print'.
+    story_actions_reg_exp = '^.*story[_ -]*actions.*$'
+
+    #<div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
+    # social networking links).
+    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
+
+    #<div id="secondary-content" class="content-group">
+    # NOTE: Don't remove class="content-group" that is needed.
+    # Used on sports pages to link to 'similar stories'.
+    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
+
+    #<div id="featured-content" class="content-group">
+    # NOTE: Don't remove class="content-group" that is needed.
+    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
+    featured_content_reg_exp = '^.*featured[_ -]*content.*$'
+
+    #<div id="navigation">
+    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
+    # Used sometimes instead of "featured-content" above.
+    navigation_reg_exp = '^.*navigation.*$'
+
+    #<a class="skip" href="#blq-container-inner">Skip to top</a>
+    # Used on sports pages to link to the top of the page.
+    skip_reg_exp = '^.*skip.*$'
+
+    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
+    # which are the alterative table design based pages. The purpose of some of these
+    # is not entirely clear from the pages (which are a total mess!).
+
+    # Remove mapping based tags, Eg. <map id="world_map">
+    # The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
+    map_reg_exp = '^.*map.*$'
+
+    # Remove social bookmarking variation, called 'socialBookMarks'.
+    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
+
+    # Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
+    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
+
+    # Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
+    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
+    # under the assumption that it can appear alone as well.
+    sharesb_reg_exp = '^.*sharesb.*$'
+
+    # Remove class 'o'. The worst named user created css class of all time. The creator
+    # should immediately be fired. I've seen it used to hold nothing at all but with
+    # 20 or so empty lines in it. Also to hold a single link to another article.
+    # Whatever it was designed to do it is not wanted by this recipe. Exact match only.
+    o_reg_exp = '^o$'
+
+    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
+    # use two reg expressions to make removing this (and variants) robust.
+    promo_top_reg_exp = '^.*promotopbg.*$'
+    promo_bottom_reg_exp = '^.*promobottombg.*$'
+
+    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
+    # risk of matching those letters in something needed, unless I see a variation
+    # of 'nlp' used at a later date.
+    nlp_reg_exp = '^nlp$'
+
+    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
+    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
+    # matching those letters in something needed.
+    mva_or_mvb_reg_exp = '^mv[ab]$'
+
+    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
+    mvtb_reg_exp = '^mvtb$'
+
+    # Remove 'blq-toplink', class to provide a link to the top of the page.
+    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
+
+    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
+    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
+    # use two reg expressions to make removing this (and variants) robust.
+    prods_services_01_reg_exp = '^.*servicev4.*$'
+    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
+
+    # Remove -what I think is- some kind of navigation tools helper class, though I am
+    # not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
+    # frequently and it is not wanted. Have decided to use two reg expressions to make
+    # removing this (and variants) robust.
+    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
+    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
+
+    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
+    # need removing - I have no clue what it does other than it contains links.
+    # Whatever it is - it is not part of the article and is not wanted.
+    puffbox_reg_exp = '^.*puffbox.*$'
+
+    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
+    sibtbg_reg_exp = '^.*sibtbg.*$'
+
+    # Remove 'storyextra' - links to relevant articles and external sites.
+    storyextra_reg_exp = '^.*story[_ -]*extra.*$'
+
+    remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
+                    dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
+                    dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
+                    dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
+                    dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
+                    dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
+    ]
+
+    # Uses url to create and return the 'printer friendly' version of the url.
+    # In other words the 'print this page' address of the page.
+    #
+    # There are 3 types of urls used in the BBC site's rss feeds. There is just
+    # 1 type for the standard news while there are 2 used for sports feed urls.
+    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
+    # there is a major story of interest to 'everyone'. So even if no BBC sports
+    # feeds are added to 'feeds' the logic of this method is still needed to avoid
+    # blank / missing / empty articles which have an index title and then no body.
+    def print_version(self, url):
+
+        # Handle sports page urls type 01:
+        if (url.find("go/rss/-/sport1/") != -1):
+            temp_url = url.replace("go/rss/-/", "")
+
+        # Handle sports page urls type 02:
+        elif (url.find("go/rss/int/news/-/sport1/") != -1):
+            temp_url = url.replace("go/rss/int/news/-/", "")
+
+        # Handle regular news page urls:
+        else:
+            temp_url = url.replace("go/rss/int/news/-/", "")
+
+        # Always add "?print=true" to the end of the url.
+        print_url = temp_url + "?print=true"
+
+        return print_url
+
+    # Remove articles in feeds based on a string in the article title or url.
+    #
+    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
+    # thread, in post with title: "Remove articles from feed", see url:
+    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
+    # Many thanks and all credit to Starson17.
+    #
+    # Starson17's code has obviously been altered to suite my requirements.
+    def parse_feeds(self):
+
+        # Call parent's method.
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+        # Loop through all feeds.
+        for feed in feeds:
+
+            # Loop through all articles in feed.
+            for article in feed.articles[:]:
+
+                # Match key words and remove article if there's a match.
+
+                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
+                # as a title prefix. Just match upper case 'VIDEO', so that
+                # articles like 'Video game banned' won't be matched and removed.
+                if 'VIDEO' in article.title:
+                    feed.articles.remove(article)
+
+                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
+                # as a title prefix. Just match upper case 'AUDIO', so that
+                # articles like 'Hi-Def audio...' won't be matched and removed.
+                elif 'AUDIO' in article.title:
+                    feed.articles.remove(article)
+
+                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
+                # 'In pictures', and 'in pictures', somewhere in their title.
+                # Match any case of that phrase.
+                elif 'IN PICTURES' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # As above, but user contributed pictures. Match any case.
+                elif 'YOUR PICTURES' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # 'Sportsday Live' are articles which contain a constantly and
+                # dynamically updated 'running commentary' during a live sporting
+                # event. Match any case.
+                elif 'SPORTSDAY LIVE' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
+                # These are being matched below using 'Live - ' because removing all
+                # articles with 'live' in their titles would remove some articles
+                # that are in fact not live sports pages. Match any case.
+                elif 'LIVE - ' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
+                # the 'Quiz of the' part in anticipation of monthly and yearly
+                # variants. Match any case.
+                elif 'QUIZ OF THE' in article.title.upper():
+                    feed.articles.remove(article)
+
+                # Remove articles with 'scorecards' in the url. These are BBC sports
+                # pages which just display a cricket scorecard. The pages have a mass
+                # of table and css entries to display the scorecards nicely. Probably
+                # could make them work with this recipe, but might take a whole day
+                # of work to sort out all the css - basically a formatting nightmare.
+                elif 'scorecards' in article.url:
+                    feed.articles.remove(article)
+
+        return feeds
+
+# End of class and file.
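
A quick standalone check that the storybody pattern really is robust to the class-name variants described in the comments above; the sample names are illustrative:

import re

storybody = re.compile('^.*story[_ -]*body.*$', re.IGNORECASE)

# '[_ -]*' matches any run (including none) of underscores, spaces, or hyphens,
# and the '.*' anchors tolerate surrounding text and trailing spaces.
for cls in ('storybody', 'story-body', 'story body', 'storybody ', 'Story_Body extra'):
    assert storybody.match(cls) is not None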


@@ -1,61 +1,44 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
-import re
+
+'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
 
 class SportsIllustratedRecipe(BasicNewsRecipe) :
-    __author__    = 'ape'
-    __copyright__ = 'ape'
+    __author__    = 'a.peter'
+    __copyright__ = 'a.peter'
     __license__   = 'GPL v3'
     language      = 'de'
-    description = 'Berliner Zeitung'
-    version = 2
+    description = 'Berliner Zeitung RSS'
+    version = 4
     title = u'Berliner Zeitung'
     timefmt = ' [%d.%m.%Y]'
+    #oldest_article = 7.0
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
     publication_type = 'newspaper'
-    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
-    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
-    def parse_index(self):
-        base = 'http://www.berlinonline.de'
-        answer = []
-        articles = {}
-        more = 1
-        soup = self.index_to_soup(self.INDEX)
-        # Get list of links to ressorts from index page
-        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
-        for ressort in ressort_list[0].findAll('a'):
-            feed_title = ressort.string
-            print 'Analyzing', feed_title
-            if not articles.has_key(feed_title):
-                articles[feed_title] = []
-                answer.append(feed_title)
-            # Load ressort page.
-            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
-            # find mainbar div which contains the list of all articles
-            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
-                # iterate over all articles
-                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
-                    # extract title of article
-                    if article_teaser.h3 != None:
-                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
-                        articles[feed_title].append(article)
-                    else:
-                        # Skip teasers for missing photos
-                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
-                            continue
-                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
-                        articles[feed_title].append(article)
-                        more += 1
-        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
-        return answer
+    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
+    remove_tags_after = [dict(id='article_text')]
+    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
+             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
+             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
+             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
+             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
+             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
+             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
+             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
+             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
+             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
+             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
+             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
+             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
+             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
+             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
 
     def get_masthead_url(self):
-        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
+        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
+
+    def print_version(self, url):
+        return url.replace('.html', ',view,printVersion.html')
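
The new print_version here is a plain suffix substitution. A sketch with a made-up article URL, for illustration only:

def print_version(url):
    # Same one-liner as the recipe method above, module-level for the demo.
    return url.replace('.html', ',view,printVersion.html')

# Hypothetical article URL - not a real berliner-zeitung.de link.
print(print_version('http://www.berliner-zeitung.de/politik/beispiel,123456.html'))
# http://www.berliner-zeitung.de/politik/beispiel,123456,view,printVersion.html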


@@ -1,4 +1,3 @@
 __license__   = 'GPL v3'
 __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
-
 '''
@@ -18,11 +17,17 @@ class Berlingske_dk(BasicNewsRecipe):
     no_stylesheets = True
     remove_empty_feeds = True
     use_embedded_content = False
-    remove_javascript = True
     publication_type = 'newspaper'
     encoding = 'utf8'
     language = 'da'
-    masthead_url = 'http://www.berlingske.dk/sites/all/themes/bm/img/layout/masthead_bg.gif'
-    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1,.manchet,.byline{font-family: Cambria,Georgia,Times,"Times New Roman",serif } '
+    auto_cleanup = True
+    extra_css = '''
+        .manchet {color:#888888;}
+        .dateline {font-size: x-small; color:#444444;}
+        .manchet,.dateline { font-family: Cambria,Georgia,Times,"Times New Roman",serif }
+        .body {font-family: Arial,Helvetica,sans-serif }
+    '''
 
     conversion_options = {
         'comment' : description
@@ -32,18 +37,14 @@ class Berlingske_dk(BasicNewsRecipe):
     }
 
     feeds = [
-        (u'Breaking news' , u'http://www.berlingske.dk/breaking/rss' )
-        ,(u'Seneste nyt'  , u'http://www.berlingske.dk/seneste/rss' )
-        ,(u'Topnyheder'   , u'http://www.berlingske.dk/top/rss' )
-        ,(u'Danmark'      , u'http://www.berlingske.dk/danmark/seneste/rss' )
-        ,(u'Verden'       , u'http://www.berlingske.dk/verden/seneste/rss' )
-        ,(u'Klima'        , u'http://www.berlingske.dk/klima/seneste/rss' )
-        ,(u'Debat'        , u'http://www.berlingske.dk/debat/seneste/rss' )
-        ,(u'Koebenhavn'   , u'http://www.berlingske.dk/koebenhavn/seneste/rss')
-        ,(u'Politik'      , u'http://www.berlingske.dk/politik/seneste/rss' )
-        ,(u'Kultur'       , u'http://www.berlingske.dk/kultur/seneste/rss' )
+        (u'Breaking news' , u'http://www.b.dk/breaking/rss' )
+        ,(u'Seneste nyt'  , u'http://www.b.dk/seneste/rss' )
+        ,(u'Topnyheder'   , u'http://www.b.dk/top/rss' )
+        ,(u'Danmark'      , u'http://www.b.dk/danmark/seneste/rss' )
+        ,(u'Verden'       , u'http://www.b.dk/verden/seneste/rss' )
+        ,(u'Klima'        , u'http://www.b.dk/klima/seneste/rss' )
+        ,(u'Debat'        , u'http://www.b.dk/debat/seneste/rss' )
+        ,(u'Koebenhavn'   , u'http://www.b.dk/koebenhavn/seneste/rss')
+        ,(u'Politik'      , u'http://www.b.dk/politik/seneste/rss' )
+        ,(u'Kultur'       , u'http://www.b.dk/kultur/seneste/rss' )
     ]
-    keep_only_tags = [dict(attrs={'class':['first','pt-article']})]
-    remove_tags = [dict(name=['object','link','base','iframe','embed'])]

recipes/biamag.recipe (new file)

@@ -0,0 +1,38 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'BiaMag'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News from Turkey'
+    publisher = 'BiaMag'
+    category = 'news, politics, Turkey'
+    oldest_article = 15
+    max_articles_per_feed = 120
+    masthead_url = 'http://bianet.org/images/biamag_logo.gif'
+    language = 'tr'
+    no_stylesheets = True
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+    feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/biamag_en.recipe (new file)

@@ -0,0 +1,38 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'Bianet-English'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News Network from Turkey(English)'
+    publisher = 'Bianet'
+    category = 'news, politics, Turkey'
+    oldest_article = 7
+    max_articles_per_feed = 150
+    masthead_url = 'http://bianet.org/images/english_logo.gif'
+    language = 'en_TR'
+    no_stylesheets = True
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+    feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/bianet.recipe (new file)

@@ -0,0 +1,38 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+bianet.com.tr
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Radikal_tr(BasicNewsRecipe):
+    title = 'Bianet'
+    __author__ = 'Osman Kaysan'
+    description = 'Independent News from Turkey'
+    publisher = 'Bianet'
+    category = 'news, politics, Turkey'
+    oldest_article = 7
+    max_articles_per_feed = 120
+    masthead_url = 'http://bianet.org/images/bianet_logo.gif'
+    language = 'tr'
+    no_stylesheets = True
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+    remove_tags_before = dict(name='div', attrs={'class':'manset'})
+    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
+    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})
+    feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]
+    def preprocess_html(self, soup):
+        return self.adeify_images(soup)

recipes/biolog_pl.recipe (new file)

@@ -0,0 +1,19 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Biolog_pl(BasicNewsRecipe):
+    title = u'Biolog.pl'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_empty_feeds=True
+    __author__ = 'fenuks'
+    description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
+    category = 'biology'
+    language = 'pl'
+    cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
+    no_stylesheets = True
+    #keeps_only_tags=[dict(id='main')]
+    remove_tags_before=dict(id='main')
+    remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
+    remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
+    feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]


@@ -0,0 +1,50 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Birgun (BasicNewsRecipe):
+    title = u'Birgün Gazetesi'
+    __author__ = u'Osman Kaysan'
+    oldest_article = 7
+    max_articles_per_feed =150
+    use_embedded_content = False
+    description = 'Birgun gazatesi haberleri, kose yazarlari'
+    publisher = 'Birgün'
+    category = 'news,haberler,turkce,gazete,birgun'
+    language = 'tr'
+    no_stylesheets = True
+    publication_type = 'newspaper'
+    conversion_options = {
+        'comments'  : description
+        ,'tags'     : category
+        ,'language' : language
+        ,'publisher': publisher
+        ,'linearize_tables': True
+        ,'remove_paragraph_spacing': True,
+    }
+    cover_img_url = 'http://www.birgun.net/i/birgun.png'
+    masthead_url = 'http://www.birgun.net/i/birgun.png'
+    remove_attributes = ['width','height']
+    remove_tags_before = dict(name='h2', attrs={'class':'storyHeadline'})
+    #remove_tags_after = dict(name='div', attrs={'class':'toollinks'})
+    remove_tags_after = dict(name='tr', attrs={'valign':'top'})
+    remove_tags = [ dict(name='div', attrs={'id':'byLine'}), dict(name='div', attrs={'class':'toollinks'})
+        , dict(name='div', attrs={'class':'main-lead'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
+        , dict(name='a', attrs={'class':'addthis_button'})]
+    remove_empty_feeds= True
+    feeds = [
+        ( u'Güncel', u'http://www.birgun.net/actuels.xml')
+        ,( u'Köşe Yazarları', u'http://www.birgun.net/writer.xml')
+        ,( u'Politika', u'http://www.birgun.net/politics.xml')
+        ,( u'Ekonomi', u'http://www.birgun.net/economic.xml')
+        ,( u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml')
+        ,( u'Dünya', u'http://www.birgun.net/worlds.xml')
+        ,( u'Yaşam', u'http://www.birgun.net/lifes.xml')
+    ]

View File

@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'
masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'
keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Blogs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: medium sans-serif;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

recipes/blues.recipe Normal file
View File

@ -0,0 +1,26 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BluesRSS(BasicNewsRecipe):
title = 'Blues News'
__author__ = 'Oskar Kunicki'
description ='Blues news from around the world'
publisher = 'BluesRSS.info'
category = 'news, blues, USA,UK'
oldest_article = 5
max_articles_per_feed = 100
language = 'en'
cover_url = 'http://bluesrss.info/cover.jpg'
masthead_url = 'http://bluesrss.info/cover.jpg'
no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]
feeds = [(u'News', u'http://bluesrss.info/feed/')]

View File

@ -10,49 +10,39 @@ http://www.buffalonews.com/RSS/
 from calibre.web.feeds.news import BasicNewsRecipe

-class AdvancedUserRecipe1298680852(BasicNewsRecipe):
+class BuffaloNews(BasicNewsRecipe):
     title = u'Buffalo News'
     oldest_article = 2
     language = 'en'
-    __author__ = 'ChappyOnIce'
+    __author__ = 'ChappyOnIce, Krittika Goyal'
     max_articles_per_feed = 20
     encoding = 'utf-8'
     masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
-    remove_javascript = True
-    extra_css = 'body {text-align: justify;}\n \
-        p {text-indent: 20px;}'
-    keep_only_tags = [
-        dict(name='div', attrs={'class':['main-content-left']})
-    ]
-    remove_tags = [
-        dict(name='div', attrs={'id':['commentCount']}),
-        dict(name='div', attrs={'class':['story-list-links']})
-    ]
-    remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})
-    feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
+    auto_cleanup = True
+    remove_empty_feeds = True
+
+    feeds = [
+        (u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Niagara County', u'http://www.buffalonews.com/city/communities/niagara-county/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Business', u'http://www.buffalonews.com/business/?widget=rssfeed&view=feed&contentId=77944'),
         (u'MoneySmart', u'http://www.buffalonews.com/business/moneysmart/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Bills & NFL', u'http://www.buffalonews.com/sports/bills-nfl/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Sabres & NHL', u'http://www.buffalonews.com/sports/sabres-nhl/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Bob DiCesare', u'http://www.buffalonews.com/sports/columns/bob-dicesare/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Bucky Gleason', u'http://www.buffalonews.com/sports/columns/bucky-gleason/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Mark Gaughan', u'http://www.buffalonews.com/sports/bills-nfl/inside-the-nfl/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Mike Harrington', u'http://www.buffalonews.com/sports/columns/mike-harrington/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Jerry Sullivan', u'http://www.buffalonews.com/sports/columns/jerry-sullivan/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Other Sports Columns', u'http://www.buffalonews.com/sports/columns/other-sports-columns/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Life', u'http://www.buffalonews.com/life/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Bruce Andriatch', u'http://www.buffalonews.com/city/columns/bruce-andriatch/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Donn Esmonde', u'http://www.buffalonews.com/city/columns/donn-esmonde/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Rod Watson', u'http://www.buffalonews.com/city/columns/rod-watson/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Entertainment', u'http://www.buffalonews.com/entertainment/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
         (u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
     ]

recipes/catavencii.recipe Normal file
View File

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencii.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Catavencii(BasicNewsRecipe):
title = u'Ca\u0163avencii'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Ca\u0163avencii'
description = u'Ca\u0163avencii'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.simonatache.ro/wp-content/uploads/2011/06/catavencii-logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'id':'content'})
]
remove_tags = [
dict(name='div', attrs={'id':'breadcrumbs'})
, dict(name='span', attrs={'class':'info'})
, dict(name='div', attrs={'id':'social-media-article'})
]
remove_tags_after = [
dict(name='div', attrs={'id':'social-media-article'})
]
feeds = [
(u'\u0218tiri', u'http://www.catavencii.ro/rss')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -4,16 +4,16 @@
 __license__ = 'GPL v3'
 __copyright__ = u'2011, Silviu Cotoar\u0103'
 '''
-catavencu.ro
+academiacatavencu.info
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

-class Catavencu(BasicNewsRecipe):
+class AcademiaCatavencu(BasicNewsRecipe):
     title = u'Academia Ca\u0163avencu'
     __author__ = u'Silviu Cotoar\u0103'
     description = 'Tagma cum laude'
-    publisher = 'Catavencu'
+    publisher = u'Ca\u0163avencu'
     oldest_article = 5
     language = 'ro'
     max_articles_per_feed = 100
@ -21,32 +21,31 @@ class Catavencu(BasicNewsRecipe):
     use_embedded_content = False
     category = 'Ziare'
     encoding = 'utf-8'
-    cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
+    cover_url = 'http://www.academiacatavencu.info/images/logo.png'
     conversion_options = {
         'comments' : description
        ,'tags' : category
        ,'language' : language
        ,'publisher' : publisher
     }
     keep_only_tags = [
-        dict(name='ul', attrs={'class':'articles'})
+        dict(name='h1', attrs={'class':'art_title'}),
+        dict(name='div', attrs={'class':'art_text'})
     ]
     remove_tags = [
-        dict(name='div', attrs={'class':['tools']})
-        , dict(name='div', attrs={'class':['share']})
-        , dict(name='div', attrs={'class':['category']})
-        , dict(name='div', attrs={'id':['comments']})
+        dict(name='div', attrs={'class':['desp_m']})
+        , dict(name='div', attrs={'id':['tags']})
     ]
     remove_tags_after = [
-        dict(name='div', attrs={'id':'comments'})
+        dict(name='div', attrs={'class':['desp_m']})
     ]
     feeds = [
-        (u'Feeds', u'http://catavencu.ro/feed/rss')
+        (u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
     ]

     def preprocess_html(self, soup):

View File

@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
             del item['style']
         ad=soup.findAll('a')
         for r in ad:
-            if 'http://www.hustla.pl' in r['href']:
+            if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
                 r.extract()
         gallery=soup.find('div', attrs={'class':'galleryFlash'})
         if gallery:

View File

@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
     remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
     no_stylesheets = True
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+    preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
+                           (re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
+                         ]

     def print_version(self, url):
         if url.find('news/article.php') >= 0:
@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
             title = self.tag_to_string(a)
             self.log('\tFound article: ', title, 'at', url)
             date = a.nextSibling
+            if re.search('cm', date):
+                continue
             if (date is not None) and len(date)>2:
                 if not articles.has_key(date):
                     articles[date] = []
                 articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
                 self.log('\t\tAppend to : ', date)
-        self.log('log articles', articles)
+        #self.log('log articles', articles)
         mostCurrent = sorted(articles).pop()
         self.title = 'CND ' + mostCurrent

recipes/cnd_weekly.recipe Normal file
View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class TheCND(BasicNewsRecipe):
title = 'CND Weekly'
__author__ = 'Derek Liang'
description = ''
INDEX = 'http://cnd.org'
language = 'zh'
conversion_options = {'linearize_tables':True}
remove_tags_before = dict(name='div', id='articleHead')
remove_tags_after = dict(id='copyright')
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]
def print_version(self, url):
if url.find('news/article.php') >= 0:
return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
else:
return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
feeds = []
articles = {}
for a in soup.findAll('a', attrs={'target':'_cnd'}):
url = a['href']
if url.find('article.php') < 0 :
continue
if url.startswith('/'):
url = 'http://cnd.org'+url
title = self.tag_to_string(a)
date = a.nextSibling
if not re.search('cm', date):
continue
self.log('\tFound article: ', title, 'at', url, '@', date)
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)
sorted_articles = sorted(articles)
while sorted_articles:
mostCurrent = sorted_articles.pop()
self.title = 'CND ' + mostCurrent
feeds.append((self.title, articles[mostCurrent]))
return feeds
def populate_article_metadata(self, article, soup, first):
header = soup.find('h3')
self.log('header: ' + self.tag_to_string(header))
pass
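The print_version rewrite above relies on these article links differing only before the first '='. A quick trace with a made-up story id (the id and hostname path are illustrative only):

import re

# Hypothetical cnd.org article link; only the digits after '=' are invented.
url = 'http://my.cnd.org/modules/news/article.php?storyid=12345'

# "^[^=]*" is anchored at the start and eats everything before the first '=',
# so the whole article.php prefix is swapped for the print.php one in one pass.
print(re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url))
# -> http://my.cnd.org/modules/news/print.php?storyid=12345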

View File

@ -0,0 +1,22 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Computerworld_pl(BasicNewsRecipe):
title = u'Computerworld.pl'
__author__ = 'fenuks'
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
#from calibre import __appname__
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Cosmopolitan UK'
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'
__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
preprocess_regexps = [
(re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'
keep_only_tags = [
dict(attrs={'class' : ['dateAuthor', 'publishDate']}),
dict(name='div',attrs ={'id' : ['main_content']})
]
remove_tags = [
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]
feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]
def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup

View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class DailyWritingTips(BasicNewsRecipe):
title = u'Daily Writing Tips'
language = 'en_GB'
__author__ = 'NotTaken'
oldest_article = 7 #days
max_articles_per_feed = 40
use_embedded_content = True
no_stylesheets = True
auto_cleanup = False
encoding = 'utf-8'
feeds = [
('Latest tips',
'http://feeds2.feedburner.com/DailyWritingTips'),
]

recipes/datasport.recipe Normal file
View File

@ -0,0 +1,15 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Italian soccer news website - v1.00 (17, December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324114272(BasicNewsRecipe):
title = u'Datasport'
language = 'it'
__author__ = 'faber1971'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True
feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Descopera(BasicNewsRecipe):
title = u'Descoperă.org'
__author__ = 'Marius Ignătescu'
description = 'Descoperă. Placerea de a cunoaște'
publisher = 'descopera.org'
category = 'science, technology, culture, history, earth'
language = 'ro'
oldest_article = 14
max_articles_per_feed = 100
encoding = 'utf8'
no_stylesheets = True
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['post']})]
remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
remove_attributes = ['width','height']
cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
feeds = [(u'Articles', u'http://www.descopera.org/feed/')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
             dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
             dict(name = 'div', attrs = {'class' : 'uniBox'}),
             dict(name = 'object', attrs = {}),
-            dict(name = 'h3', attrs = {})
+            dict(name = 'h3', attrs = {}),
+            dict(attrs={'class':'twitter-share-button'})
         ]

     preprocess_regexps = [
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
             (r'\s*</', lambda match: '</'),
         ]
     ]
+
+    def skip_ad_pages(self, soup):
+        if 'Advertisement' in soup.title:
+            nexturl=soup.find('a')['href']
+            return self.index_to_soup(nexturl, raw=True)

View File

@ -0,0 +1,58 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Dziennik_pl(BasicNewsRecipe):
title = u'Dziennik.pl'
__author__ = 'fenuks'
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
def append_page(self, soup, appendtag):
tag=soup.find('a', attrs={'class':'page_next'})
if tag:
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
while tag:
soup2= self.index_to_soup(tag['href'])
tag=soup2.find('a', attrs={'class':'page_next'})
if not tag:
for r in appendtag.findAll('div', attrs={'class':'art_src'}):
r.extract()
pagetext = soup2.find(name='div', attrs={'class':'article_body'})
for dictionary in self.remove_tags:
v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
for delete in v:
delete.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'article_paginator'}):
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup

View File

@ -0,0 +1,47 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u' Echo Online'
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-17
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'
oldest_article = 7
max_articles_per_feed = 2
no_stylesheets = True
auto_cleanup = True
remove_javascript = True
feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]
def print_version(self, url):
return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'
remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-ash2/41801_145340745513489_893927_n.jpg' # 2011-12-16 AGe
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif' # 2011-12-16 AGe
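print_version above first lets open_novisit().geturl() resolve any redirects, then appends the query string that asks the site for its print layout; with a hypothetical article URL the result is:

# The article path here is invented; only the appended query string comes from the recipe.
url = 'http://www.echo-online.de/darmstadt/beispiel-artikel_12345.htm'
print(url + '?_FRAME=33&_FORMAT=PRINT')
# -> http://www.echo-online.de/darmstadt/beispiel-artikel_12345.htm?_FRAME=33&_FORMAT=PRINT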

View File

@ -55,12 +55,17 @@ class Economist(BasicNewsRecipe):
 '''

     def get_cover_url(self):
-        br = self.browser
-        br.open(self.INDEX)
-        issue = br.geturl().split('/')[4]
-        self.log('Fetching cover for issue: %s'%issue)
-        cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
-        return cover_url
+        soup = self.index_to_soup('http://www.economist.com/printedition/covers')
+        div = soup.find('div', attrs={'class':lambda x: x and
+                                      'print-cover-links' in x})
+        a = div.find('a', href=True)
+        url = a.get('href')
+        if url.startswith('/'):
+            url = 'http://www.economist.com' + url
+        soup = self.index_to_soup(url)
+        div = soup.find('div', attrs={'class':'cover-content'})
+        img = div.find('img', src=True)
+        return img.get('src')

     def parse_index(self):
         return self.economist_parse_index()

View File

@ -39,13 +39,17 @@ class Economist(BasicNewsRecipe):
     delay = 1

     def get_cover_url(self):
-        br = self.browser
-        br.open(self.INDEX)
-        issue = br.geturl().split('/')[4]
-        self.log('Fetching cover for issue: %s'%issue)
-        cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
-        return cover_url
+        soup = self.index_to_soup('http://www.economist.com/printedition/covers')
+        div = soup.find('div', attrs={'class':lambda x: x and
+                                      'print-cover-links' in x})
+        a = div.find('a', href=True)
+        url = a.get('href')
+        if url.startswith('/'):
+            url = 'http://www.economist.com' + url
+        soup = self.index_to_soup(url)
+        div = soup.find('div', attrs={'class':'cover-content'})
+        img = div.find('img', src=True)
+        return img.get('src')

     def parse_index(self):
         try:
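Both Economist recipes replace the date-derived cover URL with a two-step scrape of the covers page. A standalone sketch of that lookup, assuming requests and BeautifulSoup purely for illustration (inside the recipes, index_to_soup does the fetching) and valid only for the site's 2011-era markup:

import requests
from bs4 import BeautifulSoup

base = 'http://www.economist.com'
# Step 1: the covers index links to the current issue's cover page.
soup = BeautifulSoup(requests.get(base + '/printedition/covers').text, 'html.parser')
div = soup.find('div', class_=lambda c: c and 'print-cover-links' in c)
href = div.find('a', href=True)['href']
if href.startswith('/'):
    href = base + href
# Step 2: the issue page carries the full-size cover image.
issue = BeautifulSoup(requests.get(href).text, 'html.parser')
print(issue.find('div', class_='cover-content').find('img', src=True)['src'])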

View File

@ -5,12 +5,11 @@
 __license__ = 'GPL v3'
 __copyright__ = '04 December 2010, desUBIKado'
 __author__ = 'desUBIKado'
 __description__ = 'Daily newspaper from Aragon'
-__version__ = 'v0.07'
-__date__ = '06, February 2011'
+__version__ = 'v0.08'
+__date__ = '13, November 2011'
 '''
 elperiodicodearagon.com
 '''

-import re
 from calibre.web.feeds.news import BasicNewsRecipe

@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
     description = u'Noticias desde Aragon'
     publisher = u'elperiodicodearagon.com'
     category = u'news, politics, Spain, Aragon'
-    oldest_article = 2
+    oldest_article = 1
     delay = 0
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
     language = 'es'
-    encoding = 'utf8'
+    encoding = 'iso-8859-1'
     remove_empty_feeds = True
     remove_javascript = True

@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
     }

     feeds = [
-        (u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
-        (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
-        (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
-        (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
-        (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
-        (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
-        (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
-        (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
-        (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
-        (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
+        (u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
+        (u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
+        (u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
+        (u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
+        (u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
+        (u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
+        (u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
+        (u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
+        (u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
+        (u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
+        (u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
+        (u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
+        (u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
+        (u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
+        (u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
+        (u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
     ]

-    extra_css = '''
-        h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
-        h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
-        h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
-        .columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
-        img{margin-bottom: 0.4em}
-    '''
-
     remove_attributes = ['height','width']

-    keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
+    keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]

-    # Strip out all the clutter
-    remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
-        dict(name='span', attrs={'class':'MasInformacion '}),
-        dict(name='span', attrs={'class':'MasInformacion'}),
-        dict(name='div', attrs={'class':'Middle'}),
-        dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
-        dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
-        dict(name='div', attrs={'class':'MenuEquipo'}),
-        dict(name='div', attrs={'class':'TemasRelacionados'}),
-        dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
-        dict(name='div', attrs={'class':'Recorte'}),
-        dict(name='div', attrs={'id':'NoticiasenRecursos'}),
-        dict(name='div', attrs={'id':'NoticiaEnPapel'}),
-        dict(name='p', attrs={'class':'RecorteEnNoticias'}),
-        dict(name='div', attrs={'id':'Comparte'}),
-        dict(name='div', attrs={'id':'CajaComparte'}),
-        dict(name='a', attrs={'class':'EscribirComentario'}),
-        dict(name='a', attrs={'class':'AvisoComentario'}),
-        dict(name='div', attrs={'class':'CajaAvisoComentario'}),
-        dict(name='div', attrs={'class':'navegaNoticias'}),
-        dict(name='div', attrs={'class':'Mensaje'}),
-        dict(name='div', attrs={'id':'PaginadorDiCom'}),
-        dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
-        dict(name='div', attrs={'id':'CintilloComentario'}),
-        dict(name='div', attrs={'id':'EscribeComentario'}),
-        dict(name='div', attrs={'id':'FormularioComentario'}),
-        dict(name='div', attrs={'id':'FormularioNormas'})]

     # Grab the print edition front page (the format=1 image has the higher resolution)
     def get_cover_url(self):
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
             return image['src'].rstrip('format=2') + 'format=1'
         return None

-    # Remove the blank space between the article and the comments (lines 1 and 2);
-    # the index did not point at the real start of the article (line 3)
-    preprocess_regexps = [
-        (re.compile(r'<p>&nbsp;</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
-    ]
-
-    # Replace each embedded YouTube video with a thumbnail image
-    def preprocess_html(self, soup):
-        for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
-            if video_yt:
-                video_yt.name = 'img'
-                fuente = video_yt['src']
-                fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
-                video_yt['src'] = fuente2 + '/0.jpg'
-        return soup
+    # Use the mobile version of the site
+    def print_version(self, url):
+        return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
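The new print_version simply reroutes every article through the site's mobile pages, which are much lighter to clean up. With a hypothetical article URL:

# The article path is invented; the replace() call is the one from the recipe.
url = 'http://www.elperiodicodearagon.com/noticias/aragon/ejemplo_123456.html'
print(url.replace('http://www.elperiodicodearagon.com/',
                  'http://www.elperiodicodearagon.com/m/'))
# -> http://www.elperiodicodearagon.com/m/noticias/aragon/ejemplo_123456.html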

View File

@ -0,0 +1,48 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2010.12.01. - V1.0
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe
class elet_es_irodalom(BasicNewsRecipe):
title = u'Elet es Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 20 # maximum number of articles to keep per feed in the generated e-book
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '
keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]
remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]
feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]

View File

@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 elmundo.es
 '''

+import re
+import time
 from calibre.web.feeds.news import BasicNewsRecipe

 class ElMundo(BasicNewsRecipe):
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
     no_stylesheets = True
     use_embedded_content = False
     encoding = 'iso8859_15'
+    remove_javascript = True
+    remove_empty_feeds = True
     language = 'es'
     masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
     publication_type = 'newspaper'
     extra_css = """
         body{font-family: Arial,Helvetica,sans-serif}
         .metadata_noticia{font-size: small}
+        .pestana_GDP{font-size: small; font-weight:bold}
         h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
         .hora{color: red}
         .update{color: gray}
@ -41,8 +45,11 @@ class ElMundo(BasicNewsRecipe):
     remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
     remove_attributes = ['lang','border']
     remove_tags = [
-        dict(name='div', attrs={'class':['herramientas','publicidad_google']})
-        ,dict(name='div', attrs={'id':'modulo_multimedia' })
+        dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
+        ,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
+        ,dict(name='div', attrs={'class':['num_comentarios estirar']})
+        ,dict(name='span', attrs={'class':['links_comentar']})
+        ,dict(name='div', attrs={'id':['comentar']})
         ,dict(name='ul', attrs={'class':'herramientas' })
         ,dict(name=['object','link','embed','iframe','base','meta'])
     ]
@ -50,13 +57,31 @@ class ElMundo(BasicNewsRecipe):
     feeds = [
         (u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
         ,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
-        ,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
-        ,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
+        ,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
+        ,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
         ,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
         ,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
-        ,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
-        ,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
-        ,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
+        ,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
+        ,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
+        ,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
+        ,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
+        ,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
+        ,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
+        ,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )
+        ,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
+        ,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
+        ,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
+        ,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
+        ,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
+        ,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
+        ,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
+        ,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
+        ,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
+        ,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
+        ,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
+        ,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
     ]

     def preprocess_html(self, soup):
@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):

     def get_article_url(self, article):
         return article.get('guid', None)
+
+    preprocess_regexps = [
+        # Show a still image in place of embedded videos
+        (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
+        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
+        (re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
+        # Suppress the comment numbering: 1, 2, 3 ...
+        (re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
+        (re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
+    ]
+
+    # Fetch the cover image
+    def get_cover_url(self):
+        cover = None
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        #http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg
+        cover='http://img.kiosko.net/'+ year + '/' + month + '/' + day +'/es/elmundo.750.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nPortada no disponible")
+            cover ='http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
+        return cover
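The cover lookup above keys a kiosko.net image URL off today's date and only falls back to the masthead when the probe fails; the date arithmetic reduces to:

import time

# Builds today's candidate cover URL the same way get_cover_url above does.
st = time.localtime()
print('http://img.kiosko.net/%d/%.2d/%.2d/es/elmundo.750.jpg'
      % (st.tm_year, st.tm_mon, st.tm_mday))
# e.g. http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg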

recipes/emuzica_pl.recipe Normal file
View File

@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class eMuzyka(BasicNewsRecipe):
title = u'eMuzyka'
__author__ = 'fenuks'
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]

View File

@ -1,35 +1,43 @@
 #!/usr/bin/env python

 __license__ = 'GPL v3'
-__author__ = 'Gerardo Diez'
-__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
-description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
-__docformat__ = 'restructuredtext en'
+__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
+__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
+__version__ = 'v1.01'
+__date__ = '13, November 2011'
 '''
-expansion.es
+[url]http://www.expansion.com/[/url]
 '''

+import time
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe

-class Publico(BasicNewsRecipe):
-    title =u'Expansion.com'
-    __author__ ='Gerardo Diez'
-    publisher =u'Unidad Editorial Información Económica, S.L.'
-    category ='finances, catalunya'
-    oldest_article =1
+class expansion_spanish(BasicNewsRecipe):
+    __author__ ='Gerardo Diez & desUBIKado'
+    description ='Financial news from Spain'
+    title =u'Expansion'
+    publisher =u'Unidad Editorial Internet, S.L.'
+    category ='news, finances, Spain'
+    oldest_article = 2
+    simultaneous_downloads = 10
     max_articles_per_feed =100
-    simultaneous_downloads =10
-    cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
-    timefmt ='[%A, %d %B, %Y]'
-    encoding ='latin'
+    timefmt = '[%a, %d %b, %Y]'
+    encoding ='iso-8859-15'
     language ='es'
-    remove_javascript =True
-    no_stylesheets =True
+    use_embedded_content = False
+    remove_javascript = True
+    no_stylesheets = True
+    remove_empty_feeds = True

     keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
     remove_tags =[
-        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
-        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
+        dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
+        dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
         dict(name='span', attrs={'class':['comentarios']}),
         dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
-        dict(name='div', attrs={'id':['comentarios_lectores_listado']})
+        dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
     ]
     feeds =[
         (u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
         (u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
         (u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
         (u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
         (u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
         (u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
-        (u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
+        (u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
         (u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
-        (u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
+        (u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
         (u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
         (u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
         (u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
         (u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
-        (u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
+        (u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
-        (u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
+        (u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
         (u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
-        (u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
+        (u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
-        (u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
+        (u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
-        (u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
+        (u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
         (u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
         (u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
         (u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
-        (u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
-        (u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
+        (u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
+        (u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
         (u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
-        (u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
+        (u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
         (u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
         (u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
-        (u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
+        (u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
         (u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
-        (u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
+        (u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
         (u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
         (u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
-        (u'Catalu&ntilde;a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
+        (u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
-        (u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
+        (u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
     ]

+    # Fetch the cover image
+    def get_cover_url(self):
+        cover = None
+        st = time.localtime()
+        year = str(st.tm_year)
+        month = "%.2d" % st.tm_mon
+        day = "%.2d" % st.tm_mday
+        #[url]http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg[/url]
+        cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
+        br = BasicNewsRecipe.get_browser()
+        try:
+            br.open(cover)
+        except:
+            self.log("\nPortada no disponible")
+            cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
+        return cover
+
+    # To keep the ad interstitial from firing when an article is fetched, and
+    # to always get the web page, send the current Unix/epoch time in the "t"
+    # parameter so the site believes the ad has just been shown
+    def print_version(self, url):
+        st = time.time()
+        segundos = str(int(st))
+        parametros = '.html?t=' + segundos
+        return url.replace('.html', parametros)
+
+    _processed_links = []
+    def get_article_url(self, article):
+        # Recover the original article URL from the "feedsportal" wrapper
+        link = article.get('link', None)
+        if link is None:
+            return article
+        if link.split('/')[-1]=="story01.htm":
+            link=link.split('/')[-2]
+            a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
+            b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
+            for i in range(0,len(a)):
+                link=link.replace(a[i],b[i])
+            link="http://"+link
+        # Drop articles duplicated across feeds
+        if not (link in self._processed_links):
+            self._processed_links.append(link)
+        else:
+            link = None
+        return link
+
+    # A little css to improve the presentation of the articles
+    extra_css = '''
+        .entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
+        .fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
+    '''
+
+    # Show a still image in place of embedded videos
+    preprocess_regexps = [
+        (re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
+        (re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
+        (re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
+    ]
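The feedsportal decoder in get_article_url above is an ordered series of string substitutions over the next-to-last path segment. Tracing it with a made-up wrapped link (the /s/abcdef segment and the target path are invented):

link = ('http://rss.feedsportal.com/c/32992/f/532693/s/abcdef/l/'
        '0L0Sexpansion0N0Cmercados0C10A0Bhtml/story01.htm')

seg = link.split('/')[-2]                # '0L0Sexpansion0N0Cmercados0C10A0Bhtml'
a = ['0B','0C','0D','0E','0F','0G','0N','0L0S','0A']
b = ['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(len(a)):                  # order matters: '0N' before '0L0S', '0A' last
    seg = seg.replace(a[i], b[i])
print('http://' + seg)
# -> http://www.expansion.com/mercados/100.html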

recipes/fisco_oggi.recipe Normal file
View File

@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Website of the Italian Government Income Agency (about revenue, taxation, taxes) - v1.00 (17, December 2011)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1324112023(BasicNewsRecipe):
title = u'Fisco Oggi'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True
feeds = [(u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'), (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'), (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'), (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'), (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'), (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'), (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'), (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')]

View File

@ -1,57 +1,68 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class FocusRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = u'intromatyk <intromatyk@gmail.com>'
    language = 'pl'
    version = 1

    title = u'Focus'
    publisher = u'Gruner + Jahr Polska'
    category = u'News'
    description = u'Newspaper'
    category = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    oldest_article = 7
    max_articles_per_feed = 100000
    recursions = 0

    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    # Seems to work best, but YMMV
    simultaneous_downloads = 5

    r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')

    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))

    remove_tags = []
    remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
    remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
    remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
    remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
    remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1{text-align: left;}
        h2{font-size: medium; font-weight: bold;}
        p.lead {font-weight: bold; text-align: left;}
        .authordate {font-size: small; color: #696969;}
        .fot{font-size: x-small; color: #666666;}
    '''

    feeds = [
        ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
        ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
        ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
        ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
        ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
        ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
        ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        if ('advertisement' in soup.find('title').string.lower()):
            href = soup.find('a').get('href')
            return self.index_to_soup(href, raw=True)
        else:
            return None
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
        return getattr(self, 'cover_url', self.cover_url)
    def print_version(self, url):
        if url.count('focus.pl.feedsportal.com'):
            u = url.find('focus0Bpl')
            u = 'http://www.focus.pl/' + url[u + 11:]
            u = u.replace('0C', '/')
            u = u.replace('A', '')
            u = u.replace('0E', '-')
            u = u.replace('/nc/1//story01.htm', '/do-druku/1')
        else:
            u = url.replace('/nc/1', '/do-druku/1')
        return u

View File

@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
    __author__ = 'fluzao'
    description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                  u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
    #found this to be the easiest place to find the index page (13-Nov-2011).
    # searching for the "Indice Geral" link
    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
    language = 'pt'
    no_stylesheets = True
    max_articles_per_feed = 40
    remove_javascript = True
    needs_subscription = True

    remove_tags_before = dict(name='p')
    remove_tags = [dict(name='td', attrs={'align': 'center'})]
    remove_attributes = ['height', 'width']

    # fixes the problem with the section names
    section_dict = {'cotidian': 'cotidiano', 'ilustrad': 'ilustrada',
        'quadrin': 'quadrinhos', 'opiniao': u'opini\xE3o',
        'ciencia': u'ci\xeancia', 'saude': u'sa\xfade',
        'ribeirao': u'ribeir\xE3o', 'equilibrio': u'equil\xedbrio',
        'imoveis': u'im\xf3veis', 'negocios': u'neg\xf3cios',
        'veiculos': u've\xedculos', 'corrida': 'folha corrida'}

    # this solves the problem with truncated content in Kindle
    conversion_options = {'linearize_tables': True}

    # this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
    # Indice e Comunicar Erros
    preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
        re.DOTALL|re.IGNORECASE), lambda match: r'')]
    def get_browser(self):
@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):
    def parse_index(self):
        #Searching for the index page on the HOMEPAGE
        hpsoup = self.index_to_soup(self.HOMEPAGE)
        indexref = hpsoup.find('a', href=re.compile('^indices.*'))
        self.log('--> tag containing the today s index: ', indexref)
        INDEX = indexref['href']
        INDEX = 'http://www1.folha.uol.com.br/fsp/' + INDEX
        self.log('--> INDEX after extracting href and adding prefix: ', INDEX)

        # ... and taking the opportunity to get the cover image link
        coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
        if coverurl:
            self.log('--> tag containing the today s cover: ', coverurl)
            coverurl = coverurl.replace('htm', 'jpg')
            coverurl = 'http://www1.folha.uol.com.br/fsp/images/' + coverurl
            self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
            self.cover_url = coverurl

        #soup = self.index_to_soup(self.INDEX)
        soup = self.index_to_soup(INDEX)

        feeds = []
        articles = []
        section_title = "Preambulo"
@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
                    self.log('--> new section title: ', section_title)
                if strpost.startswith('<a href'):
                    url = post['href']
                    #this bit is kept if they ever go back to the old format (pre Nov-2011)
                    if url.startswith('/fsp'):
                        url = 'http://www1.folha.uol.com.br' + url
                    #
                    if url.startswith('http://www1.folha.uol.com.br/fsp'):
                        #url = 'http://www1.folha.uol.com.br'+url
                        title = self.tag_to_string(post)
                        self.log()
                        self.log('--> post: ', post)
@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
        # keeping the front page url
        minha_capa = feeds[0][1][1]['url']

        # removing the first section (now called 'top')
        del feeds[0]

        # inserting the cover page as the first article (nicer for kindle users)
        feeds.insert(0, (u'primeira p\xe1gina', [{'title': u'Primeira p\xe1gina', 'url': minha_capa}]))
        return feeds

recipes/formulaas.recipe Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
formula-as.ro
'''
from calibre.web.feeds.news import BasicNewsRecipe
class FormulaAS(BasicNewsRecipe):
title = u'Formula AS'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Formula AS'
description = u'Formula AS'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'item padded'})
]
remove_tags = [
dict(name='ul', attrs={'class':'subtitle lower'})
]
remove_tags_after = [
dict(name='ul', attrs={'class':'subtitle lower'}),
dict(name='div', attrs={'class':'item-brief-options'})
]
feeds = [
(u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml')
]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -1,35 +1,61 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Christian Schmitt'
'''
fr-online.de
'''
from calibre.web.feeds.recipes import BasicNewsRecipe

class FROnlineRecipe(BasicNewsRecipe):
    title = 'Frankfurter Rundschau'
    __author__ = 'maccs'
    description = 'Nachrichten aus D und aller Welt'
    encoding = 'utf-8'
    masthead_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
    publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
    category = 'news, germany, world'
    language = 'de'
    publication_type = 'newspaper'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 1          # Increase this number if you're interested in older articles
    max_articles_per_feed = 50  # Seems a reasonable number to me
    extra_css = '''
        body { font-family: "arial", "verdana", "geneva", sans-serif; font-size: 12px; margin: 0px; background-color: #ffffff;}
        .imgSubline{background-color: #f4f4f4; font-size: 0.8em;}
        .p--heading-1 {font-weight: bold;}
        .calibre_navbar {font-size: 0.8em; font-family: "arial", "verdana", "geneva", sans-serif;}
    '''
    keep_only_tags = [{'class': 'ArticleHeadlineH1'}, {'class': 'article_text'}]
    cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
    cover_margins = (100, 150, '#ffffff')

    feeds = []
    feeds.append(('Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'))
    feeds.append(('Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'))
    feeds.append(('Meinung', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'))
    feeds.append(('Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'))
    feeds.append(('Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'))
    feeds.append(('Eintracht Frankfurt', u'http://www.fr-online.de/sport/eintracht-frankfurt/-/1473446/1473446/-/view/asFeed/-/index.xml'))
    feeds.append(('Kultur und Medien', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'))
    feeds.append(('Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'))
    feeds.append(('Frankfurt', u'http://www.fr-online.de/frankfurt/-/1472798/1472798/-/view/asFeed/-/index.xml'))
    feeds.append(('Rhein-Main', u'http://www.fr-online.de/rhein-main/-/1472796/1472796/-/view/asFeed/-/index.xml'))
    feeds.append(('Hanau', u'http://www.fr-online.de/rhein-main/hanau/-/1472866/1472866/-/view/asFeed/-/index.xml'))
    feeds.append(('Darmstadt', u'http://www.fr-online.de/rhein-main/darmstadt/-/1472858/1472858/-/view/asFeed/-/index.xml'))
    feeds.append(('Wiesbaden', u'http://www.fr-online.de/rhein-main/wiesbaden/-/1472860/1472860/-/view/asFeed/-/index.xml'))
    feeds.append(('Offenbach', u'http://www.fr-online.de/rhein-main/offenbach/-/1472856/1472856/-/view/asFeed/-/index.xml'))
    feeds.append(('Bad Homburg', u'http://www.fr-online.de/rhein-main/bad-homburg/-/1472864/1472864/-/view/asFeed/-/index.xml'))
    feeds.append(('Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'))
    feeds.append(('Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml'))

    def print_version(self, url):
        return url.replace('index.html', 'view/printVersion/-/index.html')

View File

@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    cover_url = 'http://www.frazpc.pl/images/logo.png'

    feeds = [
        (u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
        (u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
        dict(name='div', attrs={'class':'comments_box'})
    ]
    remove_tags_after = dict(name='div', attrs={'class':'content'})

    preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]
    remove_attributes = [ 'width', 'height' ]

View File

@ -0,0 +1,35 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
import string
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Agora S.A.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]
def get_article_url(self, article):
s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s
def print_version(self, url):
s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
no1 = s.group(2)
no2 = s.group(3)
return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)

View File

@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
headers = soup.findAll('h2')
if len(headers) == 0:
break
for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicates
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
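The CATEGORIES list above is the only part a user needs to edit; for instance, a hypothetical trimmed-down configuration:

CATEGORIES = [
    ('Science', 'science', 30),
    ('Technology', 'technology', 30),
]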

View File

@ -1,4 +1,3 @@
from calibre.web.feeds.news import BasicNewsRecipe

class GlasgowHerald(BasicNewsRecipe):
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
    language = 'en_GB'
    __author__ = 'Kovid Goyal'
    use_embedded_content = False

    no_stylesheets = True
    auto_cleanup = True

    #keep_only_tags = [dict(attrs={'class':'article'})]
    #remove_tags = [
    #    dict(id=['pic-nav']),
    #    dict(attrs={'class':['comments-top']})
    #]

    feeds = [
@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):
        u'http://www.heraldscotland.com/cmlink/1.768',),
        (u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]

View File

@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
        {'class':['articleTools', 'pagination', 'Ads', 'topad',
            'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
    #Use the mobile version rather than the web version
    def print_version(self, url):
        return url.rpartition('?')[0] + '?service=mobile'

View File

@ -12,7 +12,6 @@ class GN(BasicNewsRecipe):
    EDITION = 0
    __author__ = 'Piotr Kontek'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    no_stylesheets = True
@ -20,6 +19,8 @@ class GN(BasicNewsRecipe):
    remove_javascript = True
    temp_files = []
    simultaneous_downloads = 1
    masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
    title = u'Gość niedzielny'

    articles_are_obfuscated = True
@ -64,7 +65,6 @@ class GN(BasicNewsRecipe):
            if img != None:
                a = img.parent
                self.EDITION = a['href']
                self.cover_url = 'http://www.gosc.pl' + img['src']
                if not first:
                    break

recipes/grantland.recipe Normal file
View File

@ -0,0 +1,96 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'Barty'
max_articles_per_feed = 100
no_stylesheets = False
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 365
cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'
INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]
remove_tags = [
{'name':['head','style','script']},
{'id':['header']},
{'class':re.compile(r'\bside|\bad\b|floatright|tags')}
]
remove_tags_before = {'class':'wrapper'}
remove_tags_after = [{'id':'content'}]
preprocess_regexps = [
# <header> tags with an img inside are just blog banners, don't need them
# note: there are other useful <header> tags so we don't want to just strip all of them
(re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE),lambda m: ''),
# delete everything between the *last* <hr class="small" /> and </article>
(re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL|re.IGNORECASE),lambda m: '<hr class="small" /></article>'),
]
extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; }
img + cite { display:block; text-align:right}"""
def parse_index(self):
feeds = []
seen_urls = set([])
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []
page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
headers = soup.findAll('h2' if tag=='' else 'h3')
for header in headers:
tag = header.find('a')
if tag is None or not hasattr(tag,'href'):
continue
url = tag['href']
if url.startswith('/'):
url = self.INDEX + url
if url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(tag)
if 'Podcast:' in title or 'In Case You Missed It' in title:
continue
desc = dt = ''
par = header.parent
#tag = par.find('cite')
#if tag is not None:
# desc = '['+self.tag_to_string(tag) + '] '
tag = par.find('div')
if tag is not None:
desc = desc + self.tag_to_string(tag)
tag = tag.find('time')
if tag is not None:
dt = self.tag_to_string( tag)
self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url,'description':desc,'date':dt})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
def print_version(self, url):
return url+'?view=print'
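A quick standalone check of the first cleanup regex above, on hypothetical blog-banner markup:

import re

pat = re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE)
html = '<header class="blog"><img src="banner.jpg">The Banner</header><p>story</p>'
print(pat.sub('', html))  # -> <p>story</p>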

recipes/gs24_pl.recipe Normal file
View File

@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import re
import string
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1322322819(BasicNewsRecipe):
title = u'GS24.pl (Głos Szczeciński)'
description = u'Internetowy serwis Głosu Szczecińskiego'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Media Regionalne sp. z o.o.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
cover_url = "http://www.gs24.pl/images/top_logo.png"
feeds = [
# (u'Wszystko', u'http://www.gs24.pl/rss.xml'),
(u'Szczecin', u'http://www.gs24.pl/szczecin.xml'),
(u'Stargard', u'http://www.gs24.pl/stargard.xml'),
(u'Świnoujście', u'http://www.gs24.pl/swinoujscie.xml'),
(u'Goleniów', u'http://www.gs24.pl/goleniow.xml'),
(u'Gryfice', u'http://www.gs24.pl/gryfice.xml'),
(u'Kamień Pomorski', u'http://www.gs24.pl/kamienpomorski.xml'),
(u'Police', u'http://www.gs24.pl/police.xml'),
(u'Region', u'http://www.gs24.pl/region.xml'),
(u'Sport', u'http://www.gs24.pl/sport.xml'),
]
def get_article_url(self, article):
s = re.search("""/0L0S(gs24.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_", "0D" : "?", "0F" : "="}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s
def print_version(self, url):
return url + "&Template=printpicart"

View File

@ -9,6 +9,7 @@ www.guardian.co.uk
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class Guardian(BasicNewsRecipe):
@ -16,16 +17,19 @@ class Guardian(BasicNewsRecipe):
    if date.today().weekday() == 6:
        base_url = "http://www.guardian.co.uk/theobserver"
        cover_pic = 'Observer digital edition'
        masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
    else:
        base_url = "http://www.guardian.co.uk/theguardian"
        cover_pic = 'Guardian digital edition'
        masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'

    __author__ = 'Seabound and Sujata Raman'
    language = 'en_GB'

    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    encoding = 'utf-8'

    # List of section titles to ignore
    # For example: ['Sport']
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
dict(name='div', attrs={'class':["guardian-tickets promo-component",]}), dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
dict(name='ul', attrs={'class':["pagination"]}), dict(name='ul', attrs={'class':["pagination"]}),
dict(name='ul', attrs={'id':["content-actions"]}), dict(name='ul', attrs={'id':["content-actions"]}),
# article history link
dict(name='a', attrs={'class':["rollover history-link"]}),
# "a version of this article ..." speil
dict(name='div' , attrs = { 'class' : ['section']}),
# "about this article" js dialog
dict(name='div', attrs={'class':["share-top",]}),
# author picture
dict(name='img', attrs={'class':["contributor-pic-small"]}),
# embedded videos/captions
dict(name='span',attrs={'class' : ['inline embed embed-media']}),
#dict(name='img'), #dict(name='img'),
] ]
use_embedded_content = False use_embedded_content = False
@ -65,8 +79,21 @@ class Guardian(BasicNewsRecipe):
            url = None
        return url

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def preprocess_html(self, soup):
        # multiple html sections in soup, useful stuff in the first
        html = soup.find('html')
        soup2 = BeautifulSoup()
        soup2.insert(0, html)
        soup = soup2

        for item in soup.findAll(style=True):
            del item['style']
@ -75,6 +102,17 @@ class Guardian(BasicNewsRecipe):
        for tag in soup.findAll(name=['ul', 'li']):
            tag.name = 'div'

        # removes number next to rating stars
        items_to_remove = []
        rating_container = soup.find('div', attrs={'class': ['rating-container']})
        if rating_container:
            for item in rating_container:
                if isinstance(item, Tag) and str(item.name) == 'span':
                    items_to_remove.append(item)
        for item in items_to_remove:
            item.extract()

        return soup

    def find_sections(self):

View File

@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
import re

class HNWithCommentsLink(BasicNewsRecipe):
    title = 'HN With Comments Link'
    __author__ = 'Tom Scholl & David Kerschner'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
            body = body + comments
        return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'

    def parse_feeds(self):
        a = super(HNWithCommentsLink, self).parse_feeds()
        self.hn_articles = a[0].articles
        return a

    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
        else:
            content = self.get_readable_content(url)

        article = 0
        for a in self.hn_articles:
            if a.url == url:
                article = a

        content = re.sub(r'</body>\s*</html>\s*$', '', content) + article.summary + '</body></html>'

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content)
        self.temp_files[-1].close()

View File

@ -1,11 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe(BasicNewsRecipe):
    title = 'Heise-online'
    description = 'News vom Heise-Verlag'
    __author__ = 'schuster'
    masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
    publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
    use_embedded_content = False
    language = 'de'
    oldest_article = 2
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
    remove_empty_feeds = True
    timeout = 5
    no_stylesheets = True
    encoding = 'utf-8'

    remove_tags_after = dict(name='p', attrs={'class':'editor'})
    remove_tags = [dict(id='navi_top_container'),
        dict(id='navi_bottom'),
        dict(id='mitte_rechts'),
        dict(id='navigation'),
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
        dict(id='seiten_navi'),
        dict(id='adbottom'),
        dict(id='sitemap'),
        dict(name='div', attrs={'id':'sitemap'}),
        dict(name='ul', attrs={'class':'erste_zeile'}),
        dict(name='ul', attrs={'class':'zweite_zeile'}),
        dict(name='div', attrs={'class':'navi_top_container'})]
    feeds = [
        ('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
        ('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
        ('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
        ('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
        ('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
        ('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
        ('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
        ('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
        ('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
        ('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
        ('iX', 'http://www.heise.de/ix/news/news.rdf'),
        ('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
        ('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
        ('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
        ('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
        ('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
        ('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]

    def print_version(self, url):
        return url + '?view=print'

View File

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
import urllib, re

class HindustanTimes(BasicNewsRecipe):
    title = u'Hindustan Times'
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
            'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
    ]

    def get_article_url(self, article):
        '''
        HT uses a variant of the feedportal RSS ad display mechanism
        '''
        try:
            s = article.summary
            return urllib.unquote(
                re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
        except:
            pass
        url = BasicNewsRecipe.get_article_url(self, article)
        res = self.browser.open_novisit(url)
        url = res.geturl().split('/')[-2]
        encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
                    '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
                    'www.'}
        for k, v in encoding.iteritems():
            url = url.replace(k, v)
        return url
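A sketch of the slug decoding above, run on a hypothetical feedsportal slug (Python 2, like the recipe):

encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
            '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S': 'www.'}
slug = '0L0Shindustantimes0N0Cnews0Eindia0Carticle10Bhtml'  # hypothetical
for k, v in encoding.iteritems():
    slug = slug.replace(k, v)
print(slug)  # -> http://www.hindustantimes.com/news-india/article1.html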

View File

@ -4,56 +4,20 @@ __license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re

class Histmag(BasicNewsRecipe):
    title = u'Histmag'
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://histmag.org/grafika/loga/histmag-logo-2-300px.png'
    __author__ = 'matek09'
    description = u"Artykuly historyczne i publicystyczne"
    encoding = 'utf-8'
    #preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    keep_only_tags = [dict(id='article')]
    remove_tags = [dict(name='p', attrs={'class': 'article-tags'})]

    feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'),
        (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'),
        (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'),
        (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'),
        (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]

View File

@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
    category = 'history'
    language = 'pl'
    oldest_article = 8
    remove_empty_feeds = True
    max_articles_per_feed = 100

    feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
        (u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
        (u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
        (u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
        (u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
        (u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
        (u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
        (u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
        (u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]

View File

@ -1,44 +1,58 @@
################################################################################
#Description: http://hvg.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.20. - V1.1
################################################################################

from calibre.web.feeds.news import BasicNewsRecipe

class hvg(BasicNewsRecipe):
    title = u'HVG'
    __author__ = 'Bigpapa'
    language = 'hu'
    oldest_article = 5        # How many days back to look for articles.
    max_articles_per_feed = 5 # Maximum number of articles to keep per feed in the e-book.
    no_stylesheets = True
    encoding = 'utf8'
    extra_css = ' h2 { font:bold 28px} '
    remove_javascript = True
    remove_empty_feeds = True

    remove_attributes = ['style', 'font', 'href']

    keep_only_tags = [
        dict(name='div', attrs={'id': ['pg-content']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
        dict(name='table', attrs={'class': ['banner2', 'monocle']}),
        dict(name='div', attrs={'id': ['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
        dict(name='div', attrs={'style': ['float: right; margin-bottom: 5px;', 'display: none;']}),
        dict(name='h3', attrs={'class': ['hthree']}),
        dict(name='ul', attrs={'class': ['defaultul']}),
        dict(name='form', attrs={'id': ['commentForm']}),
        dict(name='h6', attrs={'class': ['hthree']}),
        dict(name='h6', attrs={'class': ['more2']}),
        dict(name='img', attrs={'class': ['framed']}),
        dict(name='td', attrs={'class': ['greyboxbody', 'embedvideobody', 'embedvideofooter', 'embedvideobottom']}),
    ]

    feeds = [
        # (u'\xd6sszes', 'http://hvg.hu/rss'),
        (u'Itthon', 'http://hvg.hu/rss/itthon'),
        (u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
        (u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
        (u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
        (u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
        (u'Karrier', 'http://hvg.hu/rss/karrier'),
        (u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
        (u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
        (u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
        (u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
        (u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
        (u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
        (u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
        (u'Sport', 'http://hvg.hu/rss/sport')
    ]

[Binary icon files changed in this commit; contents not shown. New files include
recipes/icons/biolog_pl.png (1.2 KiB), recipes/icons/blues.png (910 B),
recipes/icons/formulaas.png (687 B), recipes/icons/infra_pl.png (1.5 KiB),
recipes/icons/moneynews.png (914 B), recipes/icons/skylife.png (3.3 KiB),
recipes/icons/zaman.png (999 B), plus several other icons ranging from 323 B to 9.3 KiB.]

View File

@ -1,8 +1,8 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gambarini, based on Darko Miletic'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 09-11-2011'

'''
http://www.ilgiornale.it/
'''
@ -11,7 +11,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

class IlGiornale(BasicNewsRecipe):
    __author__ = 'GAMBARINI'
    description = 'Italian daily newspaper'
    cover_url = 'http://www.ilgiornale.it/img_v1/logo.gif'
@ -23,9 +23,8 @@ class IlGiornale(BasicNewsRecipe):
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    conversion_options = {'linearize_tables': True}
@ -38,11 +37,11 @@ class IlGiornale(BasicNewsRecipe):
    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        all_print_tags = soup.find('div', {'id': 'print_article'})
        print_link = all_print_tags.a
        if print_link is None:
            return url
        return 'http://www.ilgiornale.it' + print_link['href']

    feeds = [

View File

@ -1,33 +1,60 @@
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

class TheIndependentNew(BasicNewsRecipe):

    # flag to enable/disable article graphics on business pages/some others
    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
    # -max dimensions can be altered using the .pictureContainer img selector in the css
    _FETCH_ARTICLE_GRAPHICS = True

    #Flag to enable/disable image fetching (not business)
    _FETCH_IMAGES = True

    #used for converting rating to stars
    _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
    _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'

    title = u'The Independent'
    __author__ = 'Will'
    description = 'The latest in UK News and World News from The \
        Independent. Wide range of international and local news, sports \
        news, commentary and opinion pieces. Independent News - Breaking news \
        that matters. Your daily comprehensive news source - The \
        Independent Newspaper'
    publisher = 'The Independent'
    category = 'news, UK'
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    language = 'en_GB'
    publication_type = 'newspaper'
    masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
    encoding = 'utf-8'
    remove_tags = [
        dict(attrs={'id': ['RelatedArtTag', 'renderBiography']}),
        dict(attrs={'class': ['autoplay', 'openBiogPopup']}),
        dict(name='img', attrs={'alt': ['Get Adobe Flash player']}),
        dict(attrs={'style': re.compile('.*')}),
    ]

    keep_only_tags = [dict(attrs={'id': 'main'})]
    recursions = 0

    # fixes non compliant html nesting and 'marks' article graphics links
    preprocess_regexps = [
        (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
         lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
        (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
         lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
    ]
    conversion_options = {
        'comment' : description
@ -36,51 +63,451 @@ class TheIndependent(BasicNewsRecipe):
        ,'language' : language
    }
    extra_css = """
        h1{font-family: Georgia,serif }
        body{font-family: Verdana,Arial,Helvetica,sans-serif}
        img{margin-bottom: 0.4em; display:block}
        .starRating img {float: left}
        .starRating {margin-top:0.4em; display: block}
        .image {clear:left; font-size: x-small; color:#888888;}
        .articleByTimeLocation {font-size: x-small; color:#888888;
            margin-bottom:0.2em ; margin-top:0.2em ; display:block}
        .subtitle {clear:left}
        .column-1 h1 { color: #191919}
        .column-1 h2 { color: #333333}
        .column-1 h3 { color: #444444}
        .column-1 p { color: #777777}
        .column-1 p,a,h1,h2,h3 { margin: 0; }
        .column-1 div{color:#888888; margin: 0;}
        .articleContent {display: block; clear:left;}
        .storyTop{}
        .pictureContainer img { max-width: 400px; max-height: 400px;}
        """

    oldest_article = 1
    max_articles_per_feed = 100

    _processed_urls = []
    feeds = [
        (u'UK', u'http://www.independent.co.uk/news/uk/rss'),
        (u'World', u'http://www.independent.co.uk/news/world/rss'),
        (u'Business', u'http://www.independent.co.uk/news/business/rss'),
        (u'People', u'http://www.independent.co.uk/news/people/rss'),
        (u'Science', u'http://www.independent.co.uk/news/science/rss'),
        (u'Media', u'http://www.independent.co.uk/news/media/rss'),
        (u'Education', u'http://www.independent.co.uk/news/education/rss'),
        (u'Leading Articles', u'http://www.independent.co.uk/opinion/leading-articles/rss'),
        (u'Commentators', u'http://www.independent.co.uk/opinion/commentators/rss'),
        (u'Columnists', u'http://www.independent.co.uk/opinion/columnists/rss'),
        (u'Letters', u'http://www.independent.co.uk/opinion/letters/rss'),
        (u'Big Question', u'http://www.independent.co.uk/extras/big-question/rss'),
        (u'Sport', u'http://www.independent.co.uk/sport/rss'),
        (u'Life&Style', u'http://www.independent.co.uk/life-style/rss'),
        (u'Arts&Entertainment', u'http://www.independent.co.uk/arts-entertainment/rss'),
        (u'Travel', u'http://www.independent.co.uk/travel/rss'),
        (u'Money', u'http://www.independent.co.uk/money/rss')
    ]
    def get_article_url(self, article):
        url = super(self.__class__, self).get_article_url(article)

        title = article.get('title', None)
        if title and re.search("^Video:", title):
            return None

        #remove duplicates
        if not (url in self._processed_urls):
            self._processed_urls.append(url)
        else:
            url = None
        return url
    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
for item in soup.body.findAll(['author','preform']):
item.name='span'
for item in soup.body.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.body.findAll('div', attrs={'class':['clear-o','body','photoCaption']}):
item.name = 'p'
for item in soup.body.findAll('div'):
if not item.attrs and not item.contents:
item.extract()
soup2 = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
soup2.body.replaceWith(soup.body)
return soup2
#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None
items_to_extract = []
slideshow_elements = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
pattern = re.compile('((articleContent)|(title))$')
if (pattern.search(item['class'])) is not None:
remove = False
# corrections
# story content always good
pattern = re.compile('storyContent')
if (pattern.search(item['class'])) is not None:
remove = False
#images
pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True
#social widgets always bad
pattern = re.compile('socialwidget')
if (pattern.search(item['class'])) is not None:
remove = True
if remove:
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
if self._FETCH_IMAGES:
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
img = item.findNext('img')
img['src'] = item['href']
#insert caption if available
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3')
text = NavigableString(img['title'])
tag.insert(0,text)
#picture before text
img.extract()
item.insert(0,img)
item.insert(1,tag)
# remove link
item.name = "div"
item["class"]='image'
del item["href"]
#remove empty subtitles
"""
currently the subtitle is located in first paragraph after
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
some point.
"""
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None:
subtitleText = subtitle.findNext('p')
if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract()
subtitle.extract()
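# illustrative sketch (assumed markup) of what the block above removes -- a
# subtitle heading whose following paragraph is effectively empty:
#   <h3 class="subtitle">...</h3>
#   <p> </p>   <-- near-empty paragraph, extracted together with the heading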
#replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
if item is not None:
soup2 = self._insertRatingStars(soup,item)
if soup2 is not None:
soup = soup2
#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
for item in items_to_insert:
item[0].replaceWith(item[1])
for item in items_to_extract:
item.extract()
return soup
def _get_article_graphic(self,old_item,url,soup):
items_to_insert = []
if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert
soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert
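# _get_article_graphic returns (old_tag, replacement) pairs: a link ending in
# .jpg is turned into an inline <div class="pictureContainer"><img/></div>,
# while any other link is fetched and its picture widgets are inlined instead.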
def _insertRatingStars(self,soup,item):
if item.contents is None or len(item.contents) < 1:
return
rating = item.contents[0]
try:
rating = float(item.contents[0])
except:
print 'Could not convert decimal rating to star: malformed float.'
return
for i in range(1,6):
star = Tag(soup,'img')
if i <= rating:
star['src'] = self._STAR_URL
else:
star['src'] = self._NO_STAR_URL
star['alt'] = 'star number ' + str(i)
item.insert(i,star)
#item.contents[0] = NavigableString('(' + str(rating) + ')')
item.contents[0] = ''
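# worked example: for a rating of 3.5 the loop above inserts full stars for
# i = 1..3 (since i <= 3.5) and empty stars for i = 4 and 5, so a 3.5 rating
# renders as three full and two empty star images.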
def postprocess_html(self,soup, first_fetch):
#find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3')
if caption is not None:
items_to_extract.append(caption)
items_to_extract.append(img)
for item in items_to_extract:
item.extract()
return soup
def _recursively_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recursively_linearise_tag_tree(nested,linearised, count)
return linearised
def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]
def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None
def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False
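# re.match() anchors at the start of the string, so a pattern such as r'h\d'
# in the spaced_tags list below matches any of the h1..h6 heading tag names.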
def _remove_undesired_line_breaks_from_tag(self,parent,soup):
if parent is None:
return
tag_tree = self._recursively_linearise_tag_tree(parent)
items_to_remove = []
for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue
for item in items_to_remove:
tag_tree.remove(item)
spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)
if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False
next_tag = self._get_next_tag(i, tag_tree)
if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False
if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))
for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
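# net effect (sketch): a <br /> adjacent to a <p>, h1..h6 or <blockquote> is
# dropped outright, since those tags already carry default margins, while a
# <br /> between two runs of plain text is replaced by a single space.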
feeds = [
(u'News - UK',
u'http://www.independent.co.uk/news/uk/?service=rss'),
(u'News - World',
u'http://www.independent.co.uk/news/world/?service=rss'),
(u'News - Business',
u'http://www.independent.co.uk/news/business/?service=rss'),
(u'News - People',
u'http://www.independent.co.uk/news/people/?service=rss'),
(u'News - Science',
u'http://www.independent.co.uk/news/science/?service=rss'),
(u'News - Media',
u'http://www.independent.co.uk/news/media/?service=rss'),
(u'News - Education',
u'http://www.independent.co.uk/news/education/?service=rss'),
(u'News - Obituaries',
u'http://www.independent.co.uk/news/obituaries/?service=rss'),
(u'News - Corrections',
u'http://www.independent.co.uk/news/corrections/?service=rss'
),
(u'Opinion',
u'http://www.independent.co.uk/opinion/?service=rss'),
(u'Environment',
u'http://www.independent.co.uk/environment/?service=rss'),
(u'Sport - Athletics',
u'http://www.independent.co.uk/sport/general/athletics/?service=rss'
),
(u'Sport - Cricket',
u'http://www.independent.co.uk/sport/cricket/?service=rss'),
(u'Sport - Football',
u'http://www.independent.co.uk/sport/football/?service=rss'),
(u'Sport - Golf',
u'http://www.independent.co.uk/sport/golf/?service=rss'),
(u'Sport - Motor racing',
u'http://www.independent.co.uk/sport/motor-racing/?service=rss'
),
(u'Sport - Olympics',
u'http://www.independent.co.uk/sport/olympics/?service=rss'),
(u'Sport - Racing',
u'http://www.independent.co.uk/sport/racing/?service=rss'),
(u'Sport - Rugby League',
u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
(u'Sport - Rugby Union',
u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'
),
(u'Sport - Sailing',
u'http://www.independent.co.uk/sport/general/sailing/?service=rss'
),
(u'Sport - Tennis',
u'http://www.independent.co.uk/sport/tennis/?service=rss'),
(u'Sport - Others',
u'http://www.independent.co.uk/sport/general/others/?service=rss'
),
(u'Life & Style - Fashion',
u'http://www.independent.co.uk/life-style/fashion/?service=rss'
),
(u'Life & Style -Food & Drink',
u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'
),
(u'Life & Style - Health and Families',
u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'
),
(u'Life & Style - House & Home',
u'http://www.independent.co.uk/life-style/house-and-home/'),
(u'Life & Style - History',
u'http://www.independent.co.uk/life-style/history/?service=rss'
),
(u'Life & Style - Gadgets & Tech',
u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'
),
(u'Life & Style - Motoring',
u'http://www.independent.co.uk/life-style/motoring/?service=rss'
),
(u'Arts & Ents - Art',
u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'
),
(u'Arts & Ents - Architecture',
u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'
),
(u'Arts & Ents - Music',
u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'
),
(u'Arts & Ents - Classical',
u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'
),
(u'Arts & Ents - Films',
u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'
),
(u'Arts & Ents - TV',
u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'
),
(u'Arts & Ents - Theatre and Dance',
u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'
),
(u'Arts & Ents - Comedy',
u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'
),
(u'Arts & Ents - Books',
u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'
),
(u'Travel', u'http://www.independent.co.uk/travel/?service=rss'
),
(u'Money', u'http://www.independent.co.uk/money/?service=rss'),
(u'IndyBest',
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
]

recipes/infra_pl.recipe Normal file
View File

@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe
class INFRA(BasicNewsRecipe):
title = u'INFRA'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
language = 'pl'
no_stylesheets = True
remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]

recipes/japan_news.recipe Normal file
View File

@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
class NewsOnJapan(BasicNewsRecipe):
title = u'News On Japan'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
feeds = [
('News',
'http://newsonjapan.com/rss/top.xml'),
]

View File

@ -0,0 +1,14 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
class Kosmonauta(BasicNewsRecipe):
title = u'Kosmonauta.net'
__author__ = 'fenuks'
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy'
language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')]

View File

@ -11,7 +11,7 @@ __description__ = 'Italian weekly magazine'
from calibre.web.feeds.news import BasicNewsRecipe
class Espresso(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, Gabriele Marini'
__author__ = 'Lorenzo Vigentini, Gabriele Marini, Krittika Goyal'
description = 'Italian weekly magazine'
cover_url = 'http://espresso.repubblica.it/images/logo_espresso.gif'
@ -26,10 +26,9 @@ class Espresso(BasicNewsRecipe):
oldest_article = 16
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
auto_cleanup = True
feeds = [
@ -42,36 +41,3 @@ class Espresso(BasicNewsRecipe):
(u'Chiesa: HomePage', u'http://data.kataweb.it/rss/chiesa/homepage/it'),
(u'Chiesa: Speciali e Focus', u'http://data.kataweb.it/rss/chiesa/speciali_e_focus/it')
]
def print_version(self,url):
print url[7:25]
if url[7:25] == 'temi.repubblica.it':
return url + '/?printpage=undefined'
elif url[7:25] == 'www.chiesa.espress':
return url
return url + '/&print=true'
keep_only_tags = [
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
dict(name='div', attrs={'id':['content-second-right','content2']})
]
remove_tags = [
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
dict(name='ul',attrs={'id':'user-utility'}),
dict(name=['script','noscript','iframe'])
]
# extra_css = '''
# h1 {font-family:Times New Roman,"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:24px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:18px;}
# h2 {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:18px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
# h3 {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
# h4 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
# h5 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
# .firma {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:15px; text-decoration:none;}
# .testo {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:10px;}
# '''

View File

@ -1,13 +1,12 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'
'''
http://www.repubblica.it/
'''
import re
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):
remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']
preprocess_regexps = [
(re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
(re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
]
def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self, article)
if link and not '.repubblica.it/' in link:
@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
remove_tags = [
dict(name=['object','link','meta','iframe','embed']),
dict(name='span',attrs={'class':'linkindice'}),
dict(name='div', attrs={'class':'bottom-mobile'}),
dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
dict(name='div', attrs={'id':['rssdiv','blocco']}),
dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
dict(name='div', attrs={'class':'utility'}),
dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
dict(name='div', attrs={'class':'generalbox'}),
dict(name='ul', attrs={'id':'hystory'})
]
feeds = [
(u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
(u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
(u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
del item['style']
return soup
def preprocess_raw_html(self, raw, url):
return '<html><head>'+raw[raw.find('</head>'):]
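# the slice above keeps everything from the original </head> onwards and
# prepends a minimal head of its own, i.e. it discards the whole noisy <head>
# section in one step instead of the three regexes removed above.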

View File

@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class LetsGetCritical(BasicNewsRecipe):
title = u"Let's Get Critical"
description = 'Curation / aggregation of criticisms of the arts and culture '
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://www.letsgetcritical.org'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('Architecture','architecture',30),
('Art','art',30),
('Books','books',30),
('Design','design',30),
('Digital','digital',30),
('Food','food',30),
('Movies','movies',30),
('Music','music',30),
('Television','television',30),
('Other articles','',10)
]
def parse_index(self):
self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)
for category in self.CATEGORIES:
(cat_name, tag, max_articles) = category
tagurl = '' if tag=='' else '/category/'+tag.lower()
self.log('Reading category:', cat_name)
articles = []
pageno = 1
while len(articles) < max_articles and pageno < 100:
page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1
self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break
posts = soup.findAll('div',attrs={'class':'post_multi'})
if len(posts) == 0:
break
for post in posts:
dt = post.find('div',attrs={'class':'title'})
atag = dt.find('a')
url = atag['href']
# skip promotionals and duplicate
if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(atag)
self.log('\tFound article:', title)
self.log('\t', url)
desc = post.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = post.previousSibling
# navigate up sibling to find date
while p:
if hasattr(p,'class') and p['class'] == 'singledate':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break
if articles:
feeds.append((cat_name, articles))
return feeds
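# pagination note: pages are fetched as <INDEX><tag>/page/2, /page/3, ... until
# a page yields no 'post_multi' divs, the category's max_articles quota is
# filled, or the hard stop of 100 pages is reached.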

View File

@ -1,95 +1,117 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stat can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
#print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False
#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
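# fallback: when DrMerry's personal debug/stats plugins are not installed,
# all debug levels and stats collection are simply switched off, so the
# recipe still runs unchanged for everyone else.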
''' Version 1.2, updated cover image to match the changed website.
added info date on title
version 1.4 Updated tags, delay and added autoclean 22-09-2011
version 1.5 Changes due to changes in site
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
Added some processing on pictures
Removed links in html
Removed extra white characters
changed handling of self closing span
'''
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
changed è into &egrave;
updated remove tags
removed keep_only tags
Version 1.8 26-11-2011
added remove tag: article-slideshow
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 2
oldest_article = 10
max_articles_per_feed = 100
max_articles_per_feed = 15
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
#delay = 1
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
#auto_cleanup = True
timeout = 2
#auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
remove_tags_before = dict(name='div', attrs={'id':'date'})
remove_tags_after = dict(name='div', attrs={'class':'article-body'})
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height']
use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\
.article-box-fact.module-title {clear:both;border-top:1px solid black;border-bottom:4px solid black;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;line-height: 1.15;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{float: left;display: inline;width: 567px;margin-left: 19px;border-right: 1px solid #CACACA;padding-right: 9px;}\
div.column-1-2 {float: left;display: inline;width: 373px;padding-right: 7px;border-right: 1px solid #CACACA;}\
p.article-image-caption {font-size: 12px;font-weight: 300;line-height: 1.4;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}'
keep_only_tags = [dict(name='div', attrs={'class':[ 'article-image-caption-2column', 'article-image-caption-3column', 'article-body', 'article-box-fact']}),
dict(name='div', attrs={'id':['date']}),
dict(name='h1', attrs={'class':['title']}),
dict(name='h2', attrs={'class':['subtitle']})]
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
div.column-1-2 {display: inline;padding-right: 7px;}\
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '<hr class="merryhr" />'),
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
lambda match: ''),
remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap',
'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links',
'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', 'article-page-auto-pushes', 'footer-edit']}),
dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}),
dict(name='iframe')]
preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'([\s>])([^\s>]+)(<span[^>]+) />', re.DOTALL|re.IGNORECASE),
lambda match: match.group(1) + match.group(3) + '>' + match.group(2) + '</span>'),
]
def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
#width, height = img.size
#print '***img is: ', iurl, '\n****width is: ', width, 'height is: ', height
img.trim(0)
img.save(iurl)
'''
#width, height = img.size
#print '***TRIMMED img width is: ', width, 'height is: ', height
left=0
top=0
border_color='#ffffff'
width, height = img.size
#print '***retrieved img width is: ', width, 'height is: ', height
height_correction = 1.17
canvas = create_canvas(width, height*height_correction,border_color)
canvas.compose(img, left, top)
#img = canvas
canvas.save(iurl)
#width, height = canvas.size
#print '***NEW img width is: ', width, 'height is: ', height
'''
return soup
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
if SHOWDEBUG0 == True:
if KEEPSTATS == True:
statinfo = 'generated stats:'
statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup
feeds = [
@ -105,6 +127,291 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup
def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
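# note: img.trim(0) crops the uniform-colour border around each downloaded
# image in place (fuzz factor 0), which is presumably why no extra
# canvas/resize pass is needed here any more.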
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup
class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print
notshown
return soup
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup
def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
for part in allfactsparent:
if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except:
errorOccured=True
mlog.addTrace()
else:
errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return
def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup
def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup
def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup
def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup
# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)
def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup
def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
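# the findAll below selects tags that contain no child tags and whose text is
# empty or only whitespace/&nbsp;, while skipping self-closing tags like <img/>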
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or emptymatches.match(tag.string) is not None) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag and not lookuptag == None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup
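# in short: everything before the id="date" element and everything after the
# share-and-byline block (or the gallery text) is stripped, at every nesting
# level, leaving only the article body between those two anchors.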

View File

@ -5,8 +5,8 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
description = 'News as provided by The Metro - UK'
__author__ = 'Dave Asbury'
#last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 20
@ -26,15 +26,17 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
keep_only_tags = [
dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
dict(attrs={'class':['img-cnt figure']}),
dict(attrs={'class':['art-img']}),
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
remove_tags = [
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [ feeds = [
@ -42,9 +44,9 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
extra_css = '''
body {font: sans-serif medium;}'
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

View File

@ -10,6 +10,10 @@ __MakePeriodical__ = True
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
@ -24,6 +28,10 @@ __Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
@ -52,6 +60,7 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
@ -59,11 +68,15 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang
# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
title = 'Ming Pao - Hong Kong'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
@ -108,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>") lambda match: "</b>")
] ]
elif __Region__ == 'Vancouver': elif __Region__ == 'Vancouver':
title = 'Ming Pao - Vancouver' if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -126,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
title = 'Ming Pao - Toronto'
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@ -160,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
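# worked example for the new HK offsets (+8h, -4.5h): 20:30 UTC maps to
# 00:00 on the next local fetch-day, i.e. the download date only rolls over
# to the new day once it is 4:30am in Hong Kong.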
@ -185,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
else:
return self.get_dtlocal().strftime("%Y-%m-%d")
def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")
def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")
def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
@ -533,12 +564,22 @@ class MPRecipe(BasicNewsRecipe):
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
if item.startswith(u'\u3010'):
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
@ -643,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
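# fallback behaviour: when no usable first paragraph is found, the Kindle
# article view gets a character count (the u'\uff08' + count + u'\u5b57\uff09'
# string above, i.e. "(N characters)") instead of a real text summary.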
# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
title = self.short_title()
# change 1: allow our own flag to tell if a periodical is to be generated
# also use customed date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))
#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))
language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
if mp is not None and os.access(mp, os.R_OK):
from calibre.ebooks.metadata.opf2 import Guide
ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
ref.type = 'masthead'
ref.title = 'Masthead Image'
opf.guide.append(ref)
manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
manifest.append(os.path.join(dir, 'index.html'))
manifest.append(os.path.join(dir, 'index.ncx'))
# Get cover
cpath = getattr(self, 'cover_path', None)
if cpath is None:
pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
if self.default_cover(pf):
cpath = pf.name
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
manifest.append(cpath)
# Get masthead
mpath = getattr(self, 'masthead_path', None)
if mpath is not None and os.access(mpath, os.R_OK):
manifest.append(mpath)
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
opf.create_manifest_from_files_in(manifest)
for mani in opf.manifest:
if mani.path.endswith('.ncx'):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'
entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}
        def feed_index(num, parent):
            f = feeds[num]
@@ -728,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
-                   parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                       play_order=po, author=auth, description=desc)
                    parent.add_item('%sindex.html'%adir, None,
                        a.title if a.title else _('Untitled Article'),
                        play_order=po, author=auth,
                        description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
@@ -751,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                    prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                    templ = self.navbar.generate(True, num, j, len(f),
                                    not self.has_single_feed,
-                                   a.orig_url, self.publisher, prefix=prefix,
                                    a.orig_url, __appname__, prefix=prefix,
                                    center=self.center_navbar)
                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                    body.insert(len(body.contents), elem)
@@ -774,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
@@ -787,3 +907,5 @@ class MPRecipe(BasicNewsRecipe):
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Toronto'
# Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
-# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish to use hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_opf() routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
            from create_opf(). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of the situation that, in txt source parsing, the article content does not start with the special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
-import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
-       title = 'Ming Pao - Hong Kong'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category = 'Chinese, News, Hong Kong'
-       extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                          dict(attrs={'class':['heading']}), # for heading from txt
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
                          dict(attrs={'class':['content']}), # for content from txt
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                         dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
                          dict(attrs={'class':['images']}) # for images from txt
                         ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
@@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
                           lambda match: "</b>")
                          ]
    elif __Region__ == 'Vancouver':
-       title = 'Ming Pao - Vancouver'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
        else:
            title = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category = 'Chinese, News, Vancouver'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
                           lambda match: ''),
                          ]
    elif __Region__ == 'Toronto':
-       title = 'Ming Pao - Toronto'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category = 'Chinese, News, Toronto'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
    conversion_options = {'linearize_tables':True}
    timefmt = ''

-   def image_url_processor(cls, baseurl, url):
-       # trick: break the url at the first occurance of digit, add an additional
-       # '_' at the front
-       # not working, may need to move this to preprocess_html() method
-       # minIdx = 10000
-       # i0 = url.find('0')
-       # if i0 >= 0 and i0 < minIdx:
-       #     minIdx = i0
-       # i1 = url.find('1')
-       # if i1 >= 0 and i1 < minIdx:
-       #     minIdx = i1
-       # i2 = url.find('2')
-       # if i2 >= 0 and i2 < minIdx:
-       #     minIdx = i2
-       # i3 = url.find('3')
-       # if i3 >= 0 and i0 < minIdx:
-       #     minIdx = i3
-       # i4 = url.find('4')
-       # if i4 >= 0 and i4 < minIdx:
-       #     minIdx = i4
-       # i5 = url.find('5')
-       # if i5 >= 0 and i5 < minIdx:
-       #     minIdx = i5
-       # i6 = url.find('6')
-       # if i6 >= 0 and i6 < minIdx:
-       #     minIdx = i6
-       # i7 = url.find('7')
-       # if i7 >= 0 and i7 < minIdx:
-       #     minIdx = i7
-       # i8 = url.find('8')
-       # if i8 >= 0 and i8 < minIdx:
-       #     minIdx = i8
-       # i9 = url.find('9')
-       # if i9 >= 0 and i9 < minIdx:
-       #     minIdx = i9
-       return url
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
-           # convert UTC to local hk time - at HKT 5.30am, all news are available
-           dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-           # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
            # convert UTC to local hk time - at HKT 4.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
        return dt_local
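The timedelta arithmetic in get_dtlocal() encodes both the UTC offset and the hour at which a new issue becomes available: for Hong Kong, subtracting a further 4.5 hours means the date only rolls over at 4:30am HKT. A standalone illustration of the same arithmetic:

import datetime

# Hong Kong is UTC+8; the extra 4.5-hour subtraction delays the rollover.
dt_utc = datetime.datetime.utcnow()
dt_hk = dt_utc + datetime.timedelta(hours=8) - datetime.timedelta(hours=4.5)
print(dt_hk.strftime('%Y%m%d'))
# e.g. at 2011-12-19 03:00 HKT (18:00 UTC previous day) this still prints
# 20111218, so the previous day's paper is fetched.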
    def get_fetchdate(self):
-       return self.get_dtlocal().strftime("%Y%m%d")
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
-       return self.get_dtlocal().strftime("%Y-%m-%d")
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
-       return self.get_dtlocal().strftime("%d")
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")
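The __Date__ override lets all of these helpers serve a fixed YYYYMMDD string instead of consulting the clock. Roughly equivalent standalone logic (the function name is hypothetical, for illustration only):

import datetime

def fetch_parts(date_override, dt):
    # slice the override if given, else format the supplied datetime
    if date_override != '':
        return date_override[0:4], date_override[4:6], date_override[6:8]
    return dt.strftime('%Y'), dt.strftime('%m'), dt.strftime('%d')

print(fetch_parts('20111218', datetime.datetime.utcnow()))  # ('2011', '12', '18')
print(fetch_parts('', datetime.datetime(2011, 12, 19)))     # ('2011', '12', '19')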
    def get_cover_url(self):
        if __Region__ == 'Hong Kong':
@@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
                              (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
                              (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                              (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
-                             (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
-                             (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
-               articles = self.parse_section2(url, keystr)
                              (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
                             ]:
                if __InclPremium__ == True:
                    articles = self.parse_section2_txt(url, keystr)
                else:
                    articles = self.parse_section2(url, keystr)
                if articles:
                    feeds.append((title, articles))

            if __InclPremium__ == True:
                # parse column section articles directly from .txt files
                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                                          ]:
                    articles = self.parse_section2_txt(url, keystr)
                    if articles:
                        feeds.append((title, articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                articles = self.parse_section(url)
@@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
        else:
            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
-                              (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - editorial
-           ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
-           if ed_articles:
-               feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
            #if ed_articles:
            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
            # special - finance
            #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
-           fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
-           if fin_articles:
-               feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
            #if fin_articles:
            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
                articles = self.parse_section2_txt(url, keystr)
                if articles:
                    feeds.append((title, articles))

-           for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
-                              (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
-               articles = self.parse_section(url)
            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
            #    articles = self.parse_section(url)
            #    if articles:
            #        feeds.append((title, articles))

            # special - entertainment
-           ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
-           if ent_articles:
-               feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
            #if ent_articles:
            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
                                      ]:
                articles = self.parse_section2_txt(url, keystr)
                if articles:
                    feeds.append((title, articles))

            if __InclPremium__ == True:
                # parse column section articles directly from .txt files
                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
                                          ]:
                    articles = self.parse_section2_txt(url, keystr)
                    if articles:
                        feeds.append((title, articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
                if articles:
                    feeds.append((title, articles))

-           # special- columns
-           col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
-           if col_articles:
-               feeds.append((u'\u5c08\u6b04 Columns', col_articles))
        elif __Region__ == 'Vancouver':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
                    title = self.tag_to_string(a)
                    url = a.get('href', False)
                    url = 'http://news.mingpao.com/' + dateStr + '/' +url
                    # replace the url to the print-friendly version
                    if __ParsePFF__ == True:
                        if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
                            url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
                            url = re.sub('%2F.*%2F', '/', url)
                            title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
                            url = url.replace('%2Etxt', '_print.htm')
                            url = url.replace('%5F', '_')
                        else:
                            url = url.replace('.htm', '_print.htm')
                    if url not in included_urls and url.rfind('Redirect') == -1:
                        current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                        included_urls.append(url)
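The Redirect branch above rewrites a premium article URL into its printer-friendly form. A standalone trace of those substitutions (the sample URL is invented for illustration; real ones carry the date twice plus %2F/%5F escapes):

import re

url = 'http://news.mingpao.com/20111218/Redirect20111218%2Fsub%2Fgaa1%5F1%2Etxt'
dateStr = '20111218'
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)  # drop the Redirect segment
url = re.sub('%2F.*%2F', '/', url)                    # collapse the escaped path
url = url.replace('%2Etxt', '_print.htm').replace('%5F', '_')
print(url)  # http://news.mingpao.com/20111218/gaa1_1_print.htm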
@@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
@@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
-               url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
-               current_articles.append({'title': title, 'url': url, 'description': ''})
-               included_urls.append(url)
                try:
                    br.open_novisit(url)
                    url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
                    current_articles.append({'title': title, 'url': url, 'description': ''})
                    included_urls.append(url)
                except:
                    print 'skipping a premium article'
        current_articles.reverse()
        return current_articles

    # parse from text file of life.mingpao.com
    def parse_section2_txt(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles
@@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
        current_articles.reverse()
        return current_articles
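On the premium check in parse_section2() above: with redirect handling switched off, a premium link (which redirects away from the article) raises instead of opening, so the recipe simply skips it. A sketch of the same idea with plain mechanize; the recipe uses calibre's open_novisit(), but stock open() fails the same way for this purpose:

import mechanize

br = mechanize.Browser()
br.set_handle_redirect(False)

def is_fetchable(url):
    # a 3xx response raises once redirect handling is disabled
    try:
        br.open(url)
        return True
    except Exception:
        return False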
# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
                splitter = re.compile(r'\n')  # split the text file at line breaks
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
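The hi-res handling above tries the article image's .gif variant first and, if that request fails, falls back to prefixing the file name with '_' to reach the larger .jpg (a naming convention inferred from this code; the example name is made up):

def hi_res_candidates(src):
    # src like 'src="photos/abc.jpg"'
    gif = src.replace('jpg"', 'gif"')          # preferred .gif variant
    pos = src.rfind('/')
    underscored = src[:pos + 1] + '_' + src[pos + 1:]  # '_'-prefixed hi-res .jpg
    return [gif, underscored]

print(hi_res_candidates('src="photos/abc.jpg"'))
# ['src="photos/abc.gif"', 'src="photos/_abc.jpg"']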
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
@@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
                del item['absmiddle']
        return soup
def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])
try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
# override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
-       if __UseChineseTitle__ == True:
-           if __Region__ == 'Hong Kong':
-               title = u'\u660e\u5831 (\u9999\u6e2f)'
-           elif __Region__ == 'Vancouver':
-               title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
-           elif __Region__ == 'Toronto':
-               title = u'\u660e\u5831 (\u591a\u502b\u591a)'
-       else:
-           title = self.short_title()
-       # if not generating a periodical, force date to apply in title
-       if __MakePeriodical__ == False:
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be generated
        # also use custom date instead of current time
        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
-       if True:
-           mi = MetaInformation(title, [self.publisher])
-           mi.publisher = self.publisher
-           mi.author_sort = self.publisher
-           if __MakePeriodical__ == True:
-               mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
-           else:
-               mi.publication_type = self.publication_type+':'+self.short_title()
-           #mi.timestamp = nowf()
-           mi.timestamp = self.get_dtlocal()
-           mi.comments = self.description
-           if not isinstance(mi.comments, unicode):
-               mi.comments = mi.comments.decode('utf-8', 'replace')
-           #mi.pubdate = nowf()
-           mi.pubdate = self.get_dtlocal()
        # change 2: __appname__ replaced by newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
        if __MakePeriodical__ == True:
            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        else:
            mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        # change 4: in the following, all the nowf() are changed to adjusted time
        # This one doesn't matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles
        #article_titles, aseen = [], set()
        #for f in feeds:
        #    for a in f:
        #        if a.title and a.title not in aseen:
        #            aseen.add(a.title)
        #            article_titles.append(force_unicode(a.title, 'utf-8'))
        #mi.comments = self.description
        #if not isinstance(mi.comments, unicode):
        #    mi.comments = mi.comments.decode('utf-8', 'replace')
        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
        #    '\n\n'.join(article_titles))
        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)
        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)
        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)
        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}
        def feed_index(num, parent):
            f = feeds[num]
@@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
-                   parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
-                       play_order=po, author=auth, description=desc)
                    parent.add_item('%sindex.html'%adir, None,
                        a.title if a.title else _('Untitled Article'),
                        play_order=po, author=auth,
                        description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
@@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                    prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
                    templ = self.navbar.generate(True, num, j, len(f),
                                    not self.has_single_feed,
-                                   a.orig_url, self.publisher, prefix=prefix,
                                    a.orig_url, __appname__, prefix=prefix,
                                    center=self.center_navbar)
                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                    body.insert(len(body.contents), elem)
@@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
@@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
@@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support
-# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
-# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
-# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
-# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you wish to use hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''
'''
Change Log:
2011/12/18: update the overridden create_opf() routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
            from create_opf(). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
            download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of the situation that, in txt source parsing, the article content does not start with the special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''
-import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
-       title = 'Ming Pao - Hong Kong'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u9999\u6e2f)'
        else:
            title = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category = 'Chinese, News, Hong Kong'
-       extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                          dict(attrs={'class':['heading']}), # for heading from txt
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
                          dict(attrs={'class':['content']}), # for content from txt
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
-                         dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
                          dict(attrs={'class':['images']}) # for images from txt
                         ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
@@ -90,7 +121,10 @@ class MPRecipe(BasicNewsRecipe):
                           lambda match: "</b>")
                          ]
    elif __Region__ == 'Vancouver':
-       title = 'Ming Pao - Vancouver'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
        else:
            title = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category = 'Chinese, News, Vancouver'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -108,7 +142,10 @@ class MPRecipe(BasicNewsRecipe):
                           lambda match: ''),
                          ]
    elif __Region__ == 'Toronto':
-       title = 'Ming Pao - Toronto'
        if __UseChineseTitle__ == True:
            title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category = 'Chinese, News, Toronto'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
@@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
    conversion_options = {'linearize_tables':True}
    timefmt = ''

-   def image_url_processor(cls, baseurl, url):
-       # trick: break the url at the first occurance of digit, add an additional
-       # '_' at the front
-       # not working, may need to move this to preprocess_html() method
-       # minIdx = 10000
-       # i0 = url.find('0')
-       # if i0 >= 0 and i0 < minIdx:
-       #     minIdx = i0
-       # i1 = url.find('1')
-       # if i1 >= 0 and i1 < minIdx:
-       #     minIdx = i1
-       # i2 = url.find('2')
-       # if i2 >= 0 and i2 < minIdx:
-       #     minIdx = i2
-       # i3 = url.find('3')
-       # if i3 >= 0 and i0 < minIdx:
-       #     minIdx = i3
-       # i4 = url.find('4')
-       # if i4 >= 0 and i4 < minIdx:
-       #     minIdx = i4
-       # i5 = url.find('5')
-       # if i5 >= 0 and i5 < minIdx:
-       #     minIdx = i5
-       # i6 = url.find('6')
-       # if i6 >= 0 and i6 < minIdx:
-       #     minIdx = i6
-       # i7 = url.find('7')
-       # if i7 >= 0 and i7 < minIdx:
-       #     minIdx = i7
-       # i8 = url.find('8')
-       # if i8 >= 0 and i8 < minIdx:
-       #     minIdx = i8
-       # i9 = url.find('9')
-       # if i9 >= 0 and i9 < minIdx:
-       #     minIdx = i9
-       return url
    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
-           # convert UTC to local hk time - at HKT 5.30am, all news are available
-           dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
-           # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
            # convert UTC to local hk time - at HKT 4.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@@ -193,13 +193,34 @@ class MPRecipe(BasicNewsRecipe):
        return dt_local

    def get_fetchdate(self):
-       return self.get_dtlocal().strftime("%Y%m%d")
        if __Date__ <> '':
            return __Date__
        else:
            return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
-       return self.get_dtlocal().strftime("%Y-%m-%d")
        if __Date__ <> '':
            return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        if __Date__ <> '':
            return __Date__[0:4]
        else:
            return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        if __Date__ <> '':
            return __Date__[4:6]
        else:
            return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
-       return self.get_dtlocal().strftime("%d")
        if __Date__ <> '':
            return __Date__[6:8]
        else:
            return self.get_dtlocal().strftime("%d")
    def get_cover_url(self):
        if __Region__ == 'Hong Kong':
@@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'), (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'), (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'), (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'), (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]: ]:
articles = self.parse_section2(url, keystr) if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles: if articles:
feeds.append((title, articles)) feeds.append((title, articles))
if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))
for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url) articles = self.parse_section(url)
@@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
        else:
            for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                               (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                               (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                               (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
                articles = self.parse_section(url)
                if articles:
                    feeds.append((title, articles))

            # special - editorial
            #ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
            #if ed_articles:
            #    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

            for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                               (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):
            # special - finance
            #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
            #fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
            #if fin_articles:
            #    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

            for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
                articles = self.parse_section2_txt(url, keystr)
                if articles:
                    feeds.append((title, articles))

            #for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
            #                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
            #    articles = self.parse_section(url)
            #    if articles:
            #        feeds.append((title, articles))

            # special - entertainment
            #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
            #if ent_articles:
            #    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

            for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
                                       ]:
                articles = self.parse_section2_txt(url, keystr)
                if articles:
                    feeds.append((title, articles))

            if __InclPremium__ == True:
                # parse column section articles directly from .txt files
                for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl')
                                           ]:
                    articles = self.parse_section2_txt(url, keystr)
                    if articles:
                        feeds.append((title, articles))

            for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                               (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
                if articles:
                    feeds.append((title, articles))

        elif __Region__ == 'Vancouver':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
                title = self.tag_to_string(a)
                url = a.get('href', False)
                url = 'http://news.mingpao.com/' + dateStr + '/' + url
                # replace the url with the print-friendly version
                if __ParsePFF__ == True:
                    if url.rfind('Redirect') != -1 and __InclPremium__ == True:
                        url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
                        url = re.sub('%2F.*%2F', '/', url)
                        title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
                        url = url.replace('%2Etxt', '_print.htm')
                        url = url.replace('%5F', '_')
                    else:
                        url = url.replace('.htm', '_print.htm')
                if url not in included_urls and url.rfind('Redirect') == -1:
                    current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                    included_urls.append(url)
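                # Traced on a hypothetical premium link (illustration only,
                # not a real URL): an encoded redirect such as
                #   .../20111226/Redirect...20111226%2F...%2Fgaa1%2Etxt
                # comes out as .../20111226/gaa1_print.htm after collapsing
                # the doubled date, decoding %2F to '/', and mapping
                # %2Etxt to '_print.htm' and %5F to '_'.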
@@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)

@@ -350,7 +409,29 @@ class MPRecipe(BasicNewsRecipe):
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                try:
                    br.open_novisit(url)
                    url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
                    current_articles.append({'title': title, 'url': url, 'description': ''})
                    included_urls.append(url)
                except:
                    print 'skipping a premium article'
        current_articles.reverse()
        return current_articles
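    # The try/except above doubles as a paywall probe: with redirects
    # disabled, open_novisit() raises on the HTTP redirect that premium links
    # return, so those articles are skipped. The same pattern in isolation
    # (a sketch with a placeholder URL):
    # br = mechanize.Browser()
    # br.set_handle_redirect(False)
    # try:
    #     br.open_novisit(some_url)   # raises if the server redirects
    #     reachable = True
    # except Exception:
    #     reachable = False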
    # parse from text file of life.mingpao.com
    def parse_section2_txt(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/')  # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

@@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
        current_articles.reverse()
        return current_articles
    # preprocess those .txt and javascript based files
    def preprocess_raw_html(self, raw_html, url):
        new_html = raw_html
        if url.rfind('ftp') != -1 or url.rfind('_print.htm') != -1:
            if url.rfind('_print.htm') != -1:
                # javascript based file
                splitter = re.compile(r'\n')
                new_raw_html = '<html><head><title>Untitled</title></head>'
                new_raw_html = new_raw_html + '<body>'
                for item in splitter.split(raw_html):
                    if item.startswith('var heading1 ='):
                        heading = item.replace('var heading1 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        new_raw_html = new_raw_html + '<div class="heading">' + heading
                    if item.startswith('var heading2 ='):
                        heading = item.replace('var heading2 = \'', '')
                        heading = heading.replace('\'', '')
                        heading = heading.replace(';', '')
                        if heading != '':
                            new_raw_html = new_raw_html + '<br>' + heading + '</div>'
                        else:
                            new_raw_html = new_raw_html + '</div>'
                    if item.startswith('var content ='):
                        content = item.replace("var content = ", '')
                        content = content.replace('\'', '')
                        content = content.replace(';', '')
                        new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
                    if item.startswith('var photocontent ='):
                        photo = item.replace('var photocontent = \'', '')
                        photo = photo.replace('\'', '')
                        photo = photo.replace(';', '')
                        photo = photo.replace('<tr>', '')
                        photo = photo.replace('<td>', '')
                        photo = photo.replace('</tr>', '')
                        photo = photo.replace('</td>', '<br>')
                        photo = photo.replace('class="photo"', '')
                        new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
                new_html = new_raw_html + '</body></html>'
            else:
                # .txt based file
                splitter = re.compile(r'\n')  # split on newlines
                new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
                next_is_img_txt = False
                title_started = False
                title_break_reached = False
                met_article_start_char = False
                for item in splitter.split(raw_html):
                    item = item.strip()
                    # if title already reached but break between title and content not yet found, record title_break_reached
                    if title_started == True and title_break_reached == False and item == '':
                        title_break_reached = True
                    # if title reached and title_break_reached and met_article_start_char == False and item is not empty,
                    # start content
                    elif title_started == True and title_break_reached == True and met_article_start_char == False:
                        if item != '':
                            met_article_start_char = True
                            new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                            #if item.startswith(u'\u3010'):
                            #    met_article_start_char = True
                            #    new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
                    else:
                        if next_is_img_txt == False:
                            if item.startswith("=@"):
                                print 'skip movie link'
                            elif item.startswith("=?"):
                                next_is_img_txt = True
                                new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
                            elif item.startswith('=='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[2:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
                            elif item.startswith('='):
                                next_is_img_txt = True
                                if False:
                                    # TODO: check existence of .gif first
                                    newimg = '_' + item[1:].strip() + '.jpg'
                                    new_raw_html += '<img src="' + newimg + '" /><p>\n'
                                else:
                                    new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
                            else:
                                if next_is_img_txt == False and met_article_start_char == False:
                                    if item != '':
                                        if title_started == False:
                                            #print 'Title started at ', item
                                            new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
                                            title_started = True
                                        else:
                                            new_raw_html = new_raw_html + item + '\n'
                                    else:
                                        new_raw_html = new_raw_html + item + '<p>\n'
                        else:
                            next_is_img_txt = False
                            new_raw_html = new_raw_html + item + '\n'
                new_html = new_raw_html + '</div></body></html>'
        #raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
        if __HiResImg__ == True:
            # TODO: add a _ in front of an image url
            if url.rfind('news.mingpao.com') > -1:
                imglist = re.findall('src="?.*?jpg"', new_html)
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                for img in imglist:
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        # find the location of the first _
                        pos = img.find('_')
                        if pos > -1:
                            # if found, insert _ after the first _
                            newimg = img[0:pos] + '_' + img[pos:]
                            new_html = new_html.replace(img, newimg)
                        else:
                            # if not found, insert _ after "
                            new_html = new_html.replace(img[1:], '"_' + img[1:])
            elif url.rfind('life.mingpao.com') > -1:
                imglist = re.findall('src=\'?.*?jpg\'', new_html)
                br = mechanize.Browser()
                br.set_handle_redirect(False)
                #print 'Img list: ', imglist, '\n'
                for img in imglist:
                    #print 'Found img: ', img
                    gifimg = img.replace('jpg\'', 'gif\'')
                    try:
                        gifurl = re.sub(r'dailynews.*txt', '', url)
                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.rfind('/')
                        newimg = img[0:pos+1] + '_' + img[pos+1:]
                        new_html = new_html.replace(img, newimg)
                # repeat with src quoted by double quotes, for text parsed from src txt
                imglist = re.findall('src="?.*?jpg"', new_html)
                for img in imglist:
                    #print 'Found img: ', img
                    gifimg = img.replace('jpg"', 'gif"')
                    try:
                        #print 'url', url
                        pos = url.rfind('/')
                        gifurl = url[:pos+1]
                        #print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
                        br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
                        new_html = new_html.replace(img, gifimg)
                    except:
                        pos = img.find('"')
                        newimg = img[0:pos+1] + '_' + img[pos+1:]
                        #print 'Use hi-res img', newimg
                        new_html = new_html.replace(img, newimg)
        return new_html
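    # The line-by-line 'var x = ...' scanning above could also be done with
    # one regex per variable; a compact sketch (the pattern is an assumption
    # about the site's javascript layout, not part of the recipe):
    # def _js_var(raw, name):
    #     m = re.search(r"var %s = '(.*?)';" % name, raw)
    #     return m.group(1) if m is not None else ''
    # heading = _js_var(raw_html, 'heading1')
    # content = _js_var(raw_html, 'content')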
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']

@@ -447,77 +684,153 @@ class MPRecipe(BasicNewsRecipe):
            del item['absmiddle']
        return soup
    def populate_article_metadata(self, article, soup, first):
        # thumbnails shouldn't be available if using hi-res images
        if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])

        try:
            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
                # look for content
                articlebodies = soup.findAll('div', attrs={'id':'newscontent'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'id':'newscontent01'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'class':'content'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'id':'font'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            # the text may or may not be enclosed in <p></p> tag
                            paras = articlebody.findAll('p')
                            if not paras:
                                paras = articlebody
                            textFound = False
                            for p in paras:
                                if not textFound:
                                    summary_candidate = self.tag_to_string(p).strip()
                                    summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
                                    if len(summary_candidate) > 0:
                                        article.summary = article.text_summary = summary_candidate
                                        textFound = True
            else:
                # display a simple text
                #article.summary = article.text_summary = u'\u66f4\u591a......'
                # display word counts
                counts = 0
                articlebodies = soup.findAll('div', attrs={'id':'newscontent'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'id':'newscontent01'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'class':'content'})
                if not articlebodies:
                    articlebodies = soup.findAll('div', attrs={'id':'font'})
                if articlebodies:
                    for articlebody in articlebodies:
                        # the text may or may not be enclosed in <p></p> tag
                        paras = articlebody.findAll('p')
                        if not paras:
                            paras = articlebody
                        for p in paras:
                            summary_candidate = self.tag_to_string(p).strip()
                            counts += len(summary_candidate)
                    article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
        except:
            self.log("Error creating article descriptions")
            return
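    # When no summary is wanted, the fallback above publishes a character
    # count wrapped in full-width parentheses; for example (illustrative
    # value only):
    # u'\uff08' + str(1250) + u'\u5b57\uff09'  ->  （1250字）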
    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be generated
        # also use custom date instead of current time
        if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
        # change 2: __appname__ replaced by newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
        if __MakePeriodical__ == True:
            mi.publication_type = 'periodical:' + self.publication_type + ':' + self.short_title()
        else:
            mi.publication_type = self.publication_type + ':' + self.short_title()
        #mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        # change 4: in the following, all the nowf() are changed to adjusted time
        # This one doesn't matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles
        #article_titles, aseen = [], set()
        #for f in feeds:
        #    for a in f:
        #        if a.title and a.title not in aseen:
        #            aseen.add(a.title)
        #            article_titles.append(force_unicode(a.title, 'utf-8'))

        #mi.comments = self.description
        #if not isinstance(mi.comments, unicode):
        #    mi.comments = mi.comments.decode('utf-8', 'replace')
        #mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
        #    '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in kindle title
        #mi.pubdate = nowf()
        # now appears to need the time field to be > 12.00noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}
        def feed_index(num, parent):
            f = feeds[num]

@@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None,
                            a.title if a.title else _('Untitled Article'),
                            play_order=po, author=auth,
                            description=desc, toc_thumbnail=tt)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])

@@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)

@@ -578,7 +894,7 @@ class MPRecipe(BasicNewsRecipe):
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)

@@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)

View File

@@ -0,0 +1,15 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe

class Mlody_technik(BasicNewsRecipe):
    title = u'Mlody technik'
    __author__ = 'fenuks'
    description = u'Młody technik'
    category = 'science'
    language = 'pl'
    cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100
    #keep_only_tags = [dict(id='container')]
    feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')]

View File

@@ -1,9 +1,7 @@
__license__   = 'GPL v3'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.moneynews.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

@@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
    title = 'Moneynews.com'
    __author__ = 'Darko Miletic'
    description = 'Financial news worldwide'
    publisher = 'Newsmax.com'
    language = 'en'
    category = 'news, finances, USA, business'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'

    conversion_options = {
          'comment'          : description
        , 'tags'             : category
        , 'publisher'        : publisher
        , 'language'         : language
        , 'linearize_tables' : True
    }

    feeds = [
         (u'Street Talk'          , u'http://www.moneynews.com/rss/StreetTalk/8.xml')
        ,(u'Finance News'         , u'http://www.moneynews.com/rss/FinanceNews/4.xml')
        ,(u'Economy'              , u'http://www.moneynews.com/rss/Economy/2.xml')
        ,(u'Companies'            , u'http://www.moneynews.com/rss/Companies/6.xml')
        ,(u'Markets'              , u'http://www.moneynews.com/rss/Markets/7.xml')
        ,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
    ]

    keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
    remove_tags = [
        dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
        dict(name=['object','link','embed','form','meta'])
    ]

    def print_version(self, url):
        nodeid = url.rpartition('/')[2]
        return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid
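    # For example (hypothetical URL), rpartition('/') splits at the last '/',
    # so only the trailing node id is kept:
    # 'http://www.moneynews.com/x/y/12345'.rpartition('/')[2]  ->  '12345'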

View File

@@ -7,6 +7,7 @@ class naczytniki(BasicNewsRecipe):
    language = 'pl'
    description = 'everything about e-readers'
    category = 'readers'
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 100
    remove_tags_after = dict(name='div', attrs={'class':'sociable'})

View File

@@ -6,11 +6,7 @@ www.nin.co.rs
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Nin(BasicNewsRecipe):
    title = 'NIN online'

@@ -80,59 +76,11 @@ class Nin(BasicNewsRecipe):
                return self.PREFIX + item.img['src']
        return cover_url

    feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        return url.replace('.co.yu', '.co.rs')

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):

recipes/nol.recipe
View File

@@ -0,0 +1,54 @@
################################################################################
# Description: http://nol.hu/ RSS channel
# Author: Bigpapa (bigpapabig@hotmail.com)
# Date: 2011.12.18. - V1.1
################################################################################
from calibre.web.feeds.recipes import BasicNewsRecipe

class NOL(BasicNewsRecipe):
    title = u'NOL'
    __author__ = 'Bigpapa'
    oldest_article = 5
    max_articles_per_feed = 5  # maximum number of articles kept per feed in the generated e-book
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    language = 'hu'
    publication_type = 'newsportal'

    conversion_options = {
        'linearize_tables': True,
    }

    keep_only_tags = [
        dict(name='table', attrs={'class':['article-box']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['h','ad-container-outer','tags noborder','ad-container-inner','image-container-lead','tags','related-container']}),
        dict(name='h4'),
        dict(name='tfoot'),
        dict(name='td', attrs={'class':['foot']}),
        dict(name='span', attrs={'class':['image-container-caption']}),
    ]

    feeds = [
        # (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
        (u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
        (u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
        (u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
        (u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
        (u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
        (u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
        (u'Sport', 'http://nol.hu/feed/sport.rss'),
        (u'Noller', 'http://nol.hu/feed/noller.rss'),
        (u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
        (u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
        (u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
        (u'Voks', 'http://nol.hu/feed/voks.rss'),
    ]

View File

@@ -1,20 +1,21 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class Nowa_Fantastyka(BasicNewsRecipe):
    title = u'Nowa Fantastyka'
    oldest_article = 7
    __author__ = 'fenuks'
    language = 'pl'
    encoding = 'latin2'
    description = 'site for fantasy readers'
    category = 'fantasy'
    max_articles_per_feed = 100
    INDEX = 'http://www.fantastyka.pl/'
    no_stylesheets = True
    needs_subscription = 'optional'
    remove_tags_before = dict(attrs={'class':'belka1-tlo-md'})
    #remove_tags_after = dict(name='span', attrs={'class':'naglowek-oceny'})
    remove_tags_after = dict(name='td', attrs={'class':'belka1-bot'})
    remove_tags = [dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'), dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]
    feeds = []

    def find_articles(self, url):
        articles = []

@@ -45,3 +46,13 @@ class Nowa_Fantastyka(BasicNewsRecipe):
        cover = soup.find(name='img', attrs={'class':'okladka'})
        self.cover_url = self.INDEX + cover['src']
        return getattr(self, 'cover_url', self.cover_url)

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.fantastyka.pl/')
            br.select_form(nr=0)
            br['login'] = self.username
            br['pass'] = self.password
            br.submit()
        return br
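    # Note: select_form(nr=0) binds the first <form> on the page; the 'login'
    # and 'pass' keys must match the site's input field names.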

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''

@@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
        return soup

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div', attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:

View File

@@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):
        return soup

    def populate_article_metadata(self, article, soup, first):
        if first and hasattr(self, 'add_toc_thumbnail'):
            idxdiv = soup.find('div', attrs={'class':'articleSpanImage'})
            if idxdiv is not None:
                if idxdiv.img:
                    self.add_toc_thumbnail(article, idxdiv.img['src'])
            else:
                img = soup.find('img')
                if img is not None:
                    self.add_toc_thumbnail(article, img['src'])
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:

View File

@@ -23,7 +23,7 @@ class OSNewsRecipe(BasicNewsRecipe):
    oldest_article = 7
    max_articles_per_feed = 100
    cover_url = 'http://osnews.pl/wp-content/themes/osnews/img/logo.png'

    extra_css = '''
        .news-heading {font-size:150%}
        .newsinformations li {display:inline;}

@@ -44,7 +44,9 @@ class OSNewsRecipe(BasicNewsRecipe):
        dict(name='div', attrs={'class': 'sociable'}),
        dict(name='div', attrs={'class': 'post_prev'}),
        dict(name='div', attrs={'class': 'post_next'}),
        dict(name='div', attrs={'class': 'clr'}),
        dict(name='div', attrs={'class': 'tw_button'}),
        dict(name='div', attrs={'style': 'width:56px;height:60px;float:left;margin-right:10px'})
    ]

    preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__license__   = 'GPL v3'
'''
calibre recipe for prospectmagazine.co.uk (subscription)
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class ProspectMagUK(BasicNewsRecipe):
    title = u'Prospect Magazine'
    description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
    __author__ = 'barty, duluoz'
    timefmt = ' [%d %B %Y]'
    no_stylesheets = True
    publication_type = 'magazine'
    masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
    category = 'news, UK'
    language = 'en_GB'
    max_articles_per_feed = 100
    auto_cleanup = True
    needs_subscription = True

    auto_cleanup_keep = '//div[@class="lead_image"]'
    remove_tags = [{'class':['shareinpost','postutils','postinfo']}]

    INDEX = 'http://www.prospectmagazine.co.uk/current-issue'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.prospectmagazine.co.uk/wp-login.php')
            br.select_form(name='loginform')
            br['log'] = self.username
            br['pwd'] = self.password
            br.submit()
        return br

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        #div = soup.find('h1', text=re.compile(r'Issue \d+'))
        #fname = self.tag_to_string(div) if div is not None else 'Current Issue'
        div = soup.find('div', id='cover_image')
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                src = img['src']
                if src.startswith('/'):
                    src = 'http://www.prospectmagazine.co.uk' + src
                self.cover_url = src
        feeds = []
        # loop through sections
        for sect in soup.findAll('div', attrs={'class':'sectionheading'}):
            fname = self.tag_to_string(sect).replace('>', '').strip()
            self.log('Found section', fname)
            articles = []
            # note: can't just find siblings with class='post' because that will also
            # grab all the articles belonging to the sections that follow.
            for item in sect.findNextSiblings('div', attrs={'class':True}):
                if not 'post' in item['class']: break
                a = item.find('a', href=True)
                if a is None: continue
                url = a['href']
                title = self.tag_to_string(a)
                p = item.find('p')
                desc = self.tag_to_string(p) if p is not None else ''
                art = {'title':title, 'description':desc, 'date':' ', 'url':url}
                p = item.find(attrs={'class':re.compile('author')})
                self.log('\tFound article:', title, '::', url)
                if p is not None:
                    art['author'] = self.tag_to_string(p).strip()
                articles.append(art)
            feeds.append((fname, articles))
        return feeds

View File

@@ -42,6 +42,9 @@ class Radikal_tr(BasicNewsRecipe):
        ,(u'Politika'      , u'http://www.radikal.com.tr/d/rss/Rss_98.xml')
        ,(u'Dis Haberler'  , u'http://www.radikal.com.tr/d/rss/Rss_100.xml')
        ,(u'Ekonomi'       , u'http://www.radikal.com.tr/d/rss/Rss_101.xml')
        ,(u'Radikal Iki'   , u'http://www.radikal.com.tr/d/rss/Rss_42.xml')
        ,(u'Radikal Hayat' , u'http://www.radikal.com.tr/d/rss/Rss_41.xml')
        ,(u'Radikal Kitap' , u'http://www.radikal.com.tr/d/rss/Rss_40.xml')
    ]

    def print_version(self, url):

View File

@@ -29,22 +29,7 @@ class RollingStones(BasicNewsRecipe):
    max_articles_per_feed = 25
    use_embedded_content = False
    no_stylesheets = True
    auto_cleanup = True
    remove_javascript = True

    feeds = [
        (u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'),

@@ -58,25 +43,7 @@ class RollingStones(BasicNewsRecipe):

    def print_version(self, url):
        return url + '?print=true'

View File

@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class rynekzdrowia(BasicNewsRecipe):
    title = u'Rynek Zdrowia'
    __author__ = u'spi630'
    language = 'pl'
    masthead_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
    cover_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
    oldest_article = 3
    max_articles_per_feed = 25
    no_stylesheets = True
    auto_cleanup = True
    remove_empty_feeds = True
    remove_tags_before = dict(name='h3')

    feeds = [
        (u'Finanse i Zarz\u0105dzanie', u'http://www.rynekzdrowia.pl/Kanal/finanse.html'),
        (u'Inwestycje', u'http://www.rynekzdrowia.pl/Kanal/inwestycje.html'),
        (u'Aparatura i wyposa\u017cenie', u'http://www.rynekzdrowia.pl/Kanal/aparatura.html'),
        (u'Informatyka', u'http://www.rynekzdrowia.pl/Kanal/informatyka.html'),
        (u'Prawo', u'http://www.rynekzdrowia.pl/Kanal/prawo.html'),
        (u'Polityka zdrowotna', u'http://www.rynekzdrowia.pl/Kanal/polityka_zdrowotna.html'),
        (u'Ubezpieczenia Zdrowotne', u'http://www.rynekzdrowia.pl/Kanal/ubezpieczenia.html'),
        (u'Farmacja', u'http://www.rynekzdrowia.pl/Kanal/farmacja.html'),
        (u'Badania i rozw\xf3j', u'http://www.rynekzdrowia.pl/Kanal/badania.html'),
        (u'Nauka', u'http://www.rynekzdrowia.pl/Kanal/nauka.html'),
        (u'Po godzinach', u'http://www.rynekzdrowia.pl/Kanal/godziny.html'),
        (u'Us\u0142ugi medyczne', u'http://www.rynekzdrowia.pl/Kanal/uslugi.html'),
    ]

    def print_version(self, url):
        url = url.replace('.html', ',drukuj.html')
        return url

View File

@@ -11,17 +11,16 @@ from calibre.web.feeds.news import BasicNewsRecipe

class Salon_com(BasicNewsRecipe):
    title = 'Salon.com'
    __author__ = 'Kovid Goyal'
    description = 'Salon.com - Breaking news, opinion, politics, entertainment, sports and culture.'
    timefmt = ' [%b %d, %Y]'
    language = 'en'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="art"]'
    remove_empty_feeds = True
    remove_tags_before = dict(name='h2')

    feeds = [
        ('News & Politics', 'http://feeds.salon.com/salon/news'),

@@ -40,5 +39,5 @@ class Salon_com(BasicNewsRecipe):
    ]

    def print_version(self, url):
        return url + '/print/'

View File

@@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class spn(BasicNewsRecipe):
    title = u'Salonica Press News'
    __author__ = "SteliosGero"
    oldest_article = 3
    max_articles_per_feed = 100
    auto_cleanup = True
    category = 'news, GR'
    language = 'el'

    feeds = [
        (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae', u'http://www.spnews.gr/politiki?format=feed&amp;type=rss'),
        (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', u'http://www.spnews.gr/oikonomia?format=feed&amp;type=rss'),
        (u'\u0391\u03c5\u03c4\u03bf\u03b4\u03b9\u03bf\u03af\u03ba\u03b7\u03c3\u03b7', u'http://www.spnews.gr/aftodioikisi?format=feed&amp;type=rss'),
        (u'\u039a\u03bf\u03b9\u03bd\u03c9\u03bd\u03af\u03b1', u'http://www.spnews.gr/koinonia?format=feed&amp;type=rss'),
        (u'\u0391\u03b8\u03bb\u03b7\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/sports?format=feed&amp;type=rss'),
        (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae', u'http://www.spnews.gr/diethni?format=feed&amp;type=rss'),
        (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/politismos?format=feed&amp;type=rss'),
        (u'Media', u'http://www.spnews.gr/media-news?format=feed&amp;type=rss'),
        (u'\u0396\u03c9\u03ae', u'http://www.spnews.gr/zoi?format=feed&amp;type=rss'),
        (u'\u03a4\u03b5\u03c7\u03bd\u03bf\u03bb\u03bf\u03b3\u03af\u03b1', u'http://spnews.gr/texnologia?format=feed&amp;type=rss'),
        (u'\u03a0\u03b5\u03c1\u03b9\u03b2\u03ac\u03bb\u03bb\u03bf\u03bd', u'http://spnews.gr/periballon?format=feed&amp;type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parapolitika?format=feed&amp;type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03b4\u03b7\u03bc\u03bf\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/paradimotika?format=feed&amp;type=rss'),
        (u'\u03a0\u03b1\u03c1\u03b1\u03b1\u03b8\u03bb\u03b7\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parathlitika?format=feed&amp;type=rss'),
        (u'\u0391\u03c0\u03cc\u03c8\u03b5\u03b9\u03c2', u'http://spnews.gr/apopseis?format=feed&amp;type=rss'),
        (u'\u03a3\u03c5\u03bd\u03b5\u03cd\u03be\u03b5\u03b9\u03c2', u'http://spnews.gr/synenteykseis?format=feed&amp;type=rss'),
        (u'Alert!', u'http://spnews.gr/alert?format=feed&amp;type=rss')
    ]

    def print_version(self, url):
        return url + '?tmpl=component&print=1&layout=default&page='

Some files were not shown because too many files have changed in this diff.