Sync to trunk.
@@ -2,6 +2,7 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
src/calibre/ebooks/oeb/display/test/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
resources/builtin_recipes.xml
resources/builtin_recipes.zip
resources/template-functions.json
resources/display/*.js
setup/installer/windows/calibre/build.log
src/calibre/translations/.errors
src/cssutils/.svn/
4210  Changelog.old.yaml
4706  Changelog.yaml
@@ -1,5 +1,5 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Dean Cording'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'
'''
abc.net.au/news
'''
@@ -8,7 +8,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe

class ABCNews(BasicNewsRecipe):
    title = 'ABC News'
    __author__ = 'Dean Cording'
    __author__ = 'Pat Stapleton, Dean Cording'
    description = 'News from Australia'
    masthead_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
    cover_url = 'http://www.abc.net.au/news/assets/v5/images/common/logo-news.png'
@@ -23,7 +23,9 @@ class ABCNews(BasicNewsRecipe):
    category = 'News, Australia, World'
    language = 'en_AU'
    publication_type = 'newsportal'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    # Remove annoying map links (inline-caption class is also used for some image captions! hence regex to match maps.google)
    preprocess_regexps = [(re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments'  : description
        ,'tags'     : category
@@ -32,23 +34,23 @@ class ABCNews(BasicNewsRecipe):
        ,'linearize_tables': False
    }

    keep_only_tags = dict(id='article')
    keep_only_tags = [dict(attrs={'class':['article section']})]

    remove_tags = [dict(attrs={'class':['related', 'tags']}),
                   dict(id='statepromo')
                  ]
    remove_tags = [dict(attrs={'class':['related', 'tags', 'tools', 'attached-content ready',
                   'inline-content story left', 'inline-content map left contracted', 'published',
                   'story-map', 'statepromo', 'topics', ]})]

    remove_attributes = ['width','height']

    feeds = [
        ('Top Stories', 'http://www.abc.net.au/news/syndicate/topstoriesrss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/indexes/idx-act/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/indexes/sydney/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/indexes/melbourne/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/indexes/brisbane/rss.xml'),
        ('Perth', 'http://www.abc.net.au/news/indexes/perth/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/indexes/idx-australia/rss.xml'),
        ('World', 'http://www.abc.net.au/news/indexes/world/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/indexes/business/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/tag/science-and-technology/rss.xml'),
        ('Top Stories', 'http://www.abc.net.au/news/feed/45910/rss.xml'),
        ('Canberra', 'http://www.abc.net.au/news/feed/6910/rss.xml'),
        ('Sydney', 'http://www.abc.net.au/news/feed/10232/rss.xml'),
        ('Melbourne', 'http://www.abc.net.au/news/feed/21708/rss.xml'),
        ('Brisbane', 'http://www.abc.net.au/news/feed/12858/rss.xml'),
        ('Perth', 'feed://www.abc.net.au/news/feed/24886/rss.xml'),
        ('Australia', 'http://www.abc.net.au/news/feed/46182/rss.xml'),
        ('World', 'http://www.abc.net.au/news/feed/52278/rss.xml'),
        ('Business', 'http://www.abc.net.au/news/feed/51892/rss.xml'),
        ('Science and Technology', 'http://www.abc.net.au/news/feed/2298/rss.xml'),
    ]
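A note on the mechanism the hunk above relies on: calibre's preprocess_regexps entries are (compiled pattern, substitution callable) pairs that the framework applies to each article's raw HTML before parsing. A minimal self-contained sketch of that behaviour, using an invented HTML fragment for illustration:

import re

# The (pattern, callable) pair from the hunk above, applied by hand the same
# way the recipe framework applies it to downloaded article HTML.
preprocess_regexps = [
    (re.compile(r'<a class="inline-caption" href="http://maps\.google\.com.*?/a>', re.DOTALL),
     lambda m: ''),
]

html = '<p>Story text</p><a class="inline-caption" href="http://maps.google.com/syd">Map</a>'
for pattern, substitute in preprocess_regexps:
    html = pattern.sub(substitute, html)
print(html)  # -> <p>Story text</p>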
@@ -1,19 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe

import re
class Adventure_zone(BasicNewsRecipe):
    title = u'Adventure Zone'
    __author__ = 'fenuks'
    description = 'Adventure zone - adventure games from A to Z'
    category = 'games'
    language = 'pl'
    oldest_article = 15
    max_articles_per_feed = 100
    no_stylesheets = True
    oldest_article = 20
    max_articles_per_feed = 100
    use_embedded_content = False
    preprocess_regexps = [(re.compile(r"<td class='capmain'>Komentarze</td>", re.IGNORECASE), lambda m: '')]
    remove_tags_before = dict(name='td', attrs={'class':'main-bg'})
    remove_tags_after = dict(name='td', attrs={'class':'main-body middle-border'})
    remove_tags = [dict(name='img', attrs={'alt':'Drukuj'})]
    remove_tags_after = dict(id='comments')
    extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
    feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        soup = self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
        tag = soup.find(name='channel')
        titles = []
        for r in tag.findAll(name='image'):
            r.extract()
        art = tag.findAll(name='item')
        for i in art:
            titles.append(i.title.string)
        for feed in feeds:
            for article in feed.articles[:]:
                article.title = titles[feed.articles.index(article)]
        return feeds


    def get_cover_url(self):
        soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
        cover = soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):


    def skip_ad_pages(self, soup):
        skip_tag = soup.body.findAll(name='a')
        if skip_tag is not None:
        skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
        skip_tag = skip_tag.findAll(name='a')
            for r in skip_tag:
                if 'articles.php?' in r['href']:
                    if r.strong is not None:
                    if r.strong:
                        word = r.strong.string
                        if ('zapowied' or 'recenzj') in word:
                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
                        else:
                            None

    def print_version(self, url):
        return url.replace('news.php?readmore', 'print.php?type=N&item_id')

                        if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
                            return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
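The skip_ad_pages rewrite above also fixes a classic Python pitfall: `('zapowied' or 'recenzj')` evaluates to just 'zapowied' (`or` returns its first truthy operand), so the old test never searched for 'recenzj' at all. A short illustration, using an invented sample string:

word = 'recenzja gry'                               # hypothetical link text
print(('zapowied' or 'recenzj') in word)            # False: only 'zapowied' is ever tested
print(('zapowied' in word) or ('recenzj' in word))  # True: the corrected form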
@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AstroNEWS(BasicNewsRecipe):
    title = u'AstroNEWS'
    __author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    auto_cleanup = True
    #extra_css = 'table {text-align: left;}'
    no_stylesheets = True
    cover_url = 'http://news.astronet.pl/img/logo_news.jpg'
    # no_stylesheets = True
    remove_tags = [dict(name='hr')]
    feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]

    def print_version(self, url):
        return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')

    def preprocess_html(self, soup):
        for item in soup.findAll(align=True):
            del item['align']
        return soup
52  recipes/b365realitatea.recipe  (new file)
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
b365.realitatea.net
'''

from calibre.web.feeds.news import BasicNewsRecipe

class b365Realitatea(BasicNewsRecipe):
    title = u'b365 Realitatea'
    __author__ = u'Silviu Cotoar\u0103'
    publisher = u'b365 Realitatea'
    description = u'b365 Realitatea'
    oldest_article = 5
    language = 'ro'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    category = 'Ziare,Romania,Bucuresti'
    encoding = 'utf-8'
    cover_url = 'http://b365.realitatea.net/wp-content/themes/b/images/b365-logo.png'

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':'newsArticle'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':'date'})
        , dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
        , dict(name='div', attrs={'class':'related_posts'})
        , dict(name='div', attrs={'id':'RelevantiWidget'})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'id':'RelevantiWidget'})
    ]
    feeds = [
        (u'\u0218tiri', u'http://b365.realitatea.net/rss-full/')
    ]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
@@ -1,61 +1,648 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
##
## Title: BBC News, Sport, and Blog Calibre Recipe
## Contact: mattst - jmstanfield@gmail.com
##
## License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright: mattst - jmstanfield@gmail.com
##
## Written: November 2011
## Last Edited: 2011-11-19
##

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'


'''
news.bbc.co.uk
BBC News, Sport, and Blog Calibre Recipe
'''

# Import the regular expressions module.
import re

# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe

class BBC(BasicNewsRecipe):
    title = 'BBC News'
    __author__ = 'Darko Miletic, Starson17'
    description = 'News from UK. '
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'BBC'
    category = 'news, UK, world'
    language = 'en_GB'
    publication_type = 'newsportal'
    extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments'         : description
        ,'tags'            : category
        ,'language'        : language
        ,'publisher'       : publisher
        ,'linearize_tables': True
    }

    keep_only_tags = [
        dict(name='div', attrs={'class':['layout-block-a layout-block']})
        ,dict(attrs={'class':['story-body','storybody']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
            'story-feature wide ', 'story-feature narrow']}),
        dict(id=['hypertab', 'comment-form']),
    ]

    remove_attributes = ['width','height']
class BBCNewsSportBlog(BasicNewsRecipe):

    #
    #    **** IMPORTANT USERS READ ME ****
    #
    # First select the feeds you want, then scroll down below the feeds list
    # and select the values you want for the other user preferences, like
    # oldest_article and such like.
    #
    #
    # Select the BBC rss feeds which you want in your ebook.
    # Selected feeds have NO '#' at their start; de-selected feeds begin with a '#'.
    #
    # Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
    # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
    #
    # There are 68 feeds below which constitute the bulk of the available rss
    # feeds on the BBC web site. These include 5 blogs by editors and
    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
    # Wales, Scotland Business), and 7 Welsh language feeds.
    #
    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click),
    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
    # you may get some 'empty feeds' which will not then be included in the ebook.
    #
    # The 15 feeds currently selected below are simply my default ones.
    #
    # Note: With all 68 feeds selected, oldest_article set to 2,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
    # the ebook creation took 29 minutes on my speedy 100 mbps net connection,
    # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
    # More realistically, with 15 feeds selected, oldest_article set to 1.5,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
    # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
    #
    # Select / de-select the feeds you want in your ebook.
    #
    feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
        ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
        ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
        ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
        ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
        ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
        ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
        ("News Home", "http://feeds.bbci.co.uk/news/rss.xml"),
        ("UK", "http://feeds.bbci.co.uk/news/uk/rss.xml"),
        ("World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
        #("England", "http://feeds.bbci.co.uk/news/england/rss.xml"),
        #("Scotland", "http://feeds.bbci.co.uk/news/scotland/rss.xml"),
        #("Wales", "http://feeds.bbci.co.uk/news/wales/rss.xml"),
        #("N. Ireland", "http://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
        #("Africa", "http://feeds.bbci.co.uk/news/world/africa/rss.xml"),
        #("Asia", "http://feeds.bbci.co.uk/news/world/asia/rss.xml"),
        #("Europe", "http://feeds.bbci.co.uk/news/world/europe/rss.xml"),
        #("Latin America", "http://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
        #("Middle East", "http://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
        ("US & Canada", "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
        ("Politics", "http://feeds.bbci.co.uk/news/politics/rss.xml"),
        ("Science/Environment", "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
        ("Technology", "http://feeds.bbci.co.uk/news/technology/rss.xml"),
        ("Magazine", "http://feeds.bbci.co.uk/news/magazine/rss.xml"),
        ("Entertainment/Arts", "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
        #("Health", "http://feeds.bbci.co.uk/news/health/rss.xml"),
        #("Education/Family", "http://feeds.bbci.co.uk/news/education/rss.xml"),
        ("Business", "http://feeds.bbci.co.uk/news/business/rss.xml"),
        ("Special Reports", "http://feeds.bbci.co.uk/news/special_reports/rss.xml"),
        ("Also in the News", "http://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
        #("Newsbeat", "http://www.bbc.co.uk/newsbeat/rss.xml"),
        #("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
        ("Blog: Nick Robinson (Political Editor)", "http://feeds.bbci.co.uk/news/correspondents/nickrobinson/rss.sxml"),
        #("Blog: Mark D'Arcy (Parliamentary Correspondent)", "http://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        #("Blog: Robert Peston (Business Editor)", "http://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        #("Blog: Stephanie Flanders (Economics Editor)", "http://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
        ("Blog: Rory Cellan-Jones (Technology correspondent)", "http://feeds.bbci.co.uk/news/correspondents/rorycellanjones/rss.sxml"),
        ("Sport Front Page", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
        #("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
        #("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
        #("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
        #("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
        #("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
        #("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
        #("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
        #("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
        #("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
        #("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
        #("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
        #("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
        #("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
        #("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
        #("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
        #("N. Ireland Politics", "http://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
        #("Scotland Politics", "http://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
        #("Scotland Business", "http://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
        #("E. Scotland, Edinburgh & Fife", "http://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
        #("W. Scotland & Glasgow", "http://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
        #("Highlands & Islands", "http://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
        #("NE. Scotland, Orkney & Shetland", "http://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
        #("South Scotland", "http://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
        #("Central Scotland & Tayside", "http://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
        #("Wales Politics", "http://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
        #("NW. Wales", "http://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
        #("NE. Wales", "http://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
        #("Mid. Wales", "http://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
        #("SW. Wales", "http://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
        #("SE. Wales", "http://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
        #("Newyddion - News in Welsh", "http://feeds.bbci.co.uk/newyddion/rss.xml"),
        #("Gwleidyddiaeth", "http://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
        #("Gogledd-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
        #("Gogledd-Orllewin", "http://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
        #("Canolbarth", "http://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
        #("De-Ddwyrain", "http://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
        #("De-Orllewin", "http://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
    ]


    # **** SELECT YOUR USER PREFERENCES ****

    # Title to use for the ebook.
    #
    title = 'BBC News'

    # A brief description for the ebook.
    #
    description = u'BBC web site ebook created using rss feeds.'

    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5, but watch out: ebook creation time will
    # increase as well. Setting this to 30 will get everything (AFAICT) as long
    # as max_articles_per_feed remains set high (except for 'Click' which is
    # v. low volume and its currently oldest article is 4th Feb 2011).
    #
    oldest_article = 1.5

    # Number of simultaneous downloads. 20 is consistently working fine on the
    # BBC News feeds with no problems. Speeds things up from the default of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20

    # Timeout for fetching files from the server in seconds. The default of
    # 120 seconds seems somewhat excessive.
    #
    timeout = 30

    # The format string for the date shown on the ebook's first page.
    # List of all values: http://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds', select/de-select by adding/removing the initial '#';
    # only one timefmt should be selected. Here are a few to choose from.
    #
    timefmt = ' [%a, %d %b %Y]'            # [Fri, 14 Nov 2011] (Calibre default)
    #timefmt = ' [%a, %d %b %Y %H:%M]'     # [Fri, 14 Nov 2011 18:30]
    #timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]
    #timefmt = ' [%d %b %Y]'               # [14 Nov 2011]
    #timefmt = ' [%d %b %Y %H:%M]'         # [14 Nov 2011 18:30]
    #timefmt = ' [%Y-%m-%d]'               # [2011-11-14]
    #timefmt = ' [%Y-%m-%d-%H-%M]'         # [2011-11-14-18-30]



    #
    #    **** IMPORTANT ****
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
    #
    #    **** IMPORTANT ****
    #



    # Author of this recipe.
    __author__ = 'mattst'

    # Specify English as the language of the RSS feeds (ISO-639 code).
    language = 'en_GB'

    # Set tags.
    tags = 'news, sport, blog'

    # Set publisher and publication type.
    publisher = 'BBC'
    publication_type = 'newspaper'

    # Disable stylesheets from site.
    no_stylesheets = True

    # Specifies an override encoding for sites that have an incorrect charset
    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
    # use 'utf8', which works fine (so use that if necessary), but auto-detecting
    # with None is working fine, so stick with that for robustness.
    encoding = None

    # Sets whether a feed has full articles embedded in it. The BBC feeds do not.
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # Create a custom title which fits nicely in the Kindle title list.
    # Requires "import time" above the class declaration, and replacing
    # title with custom_title in conversion_options (right column only).
    # Example of the string below: "BBC News - 14 Nov 2011"
    #
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')

    '''
    # Conversion options for advanced users, but don't forget to comment out the
    # current conversion_options below. Avoid setting 'linearize_tables' as that
    # plays havoc with the 'old style' table based pages.
    #
    conversion_options = { 'title'       : title,
                           'comments'    : description,
                           'tags'        : tags,
                           'language'    : language,
                           'publisher'   : publisher,
                           'authors'     : publisher,
                           'smarten_punctuation' : True
                         }
    '''

    conversion_options = { 'smarten_punctuation' : True }

    # Specify extra CSS - overrides ALL other CSS (IE. added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                 .introduction, .first { font-weight: bold; } \
                 .cross-head { font-weight: bold; font-size: 125%; } \
                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                    .correspondent-portrait img, .byline-lead-in, .name, .bbc-role { display: block; \
                    text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
                 .story-date, .published { font-size: 80%; } \
                 table { width: 100%; } \
                 td img { display: block; margin: 5px auto; } \
                 ul { padding-top: 10px; } \
                 ol { padding-top: 10px; } \
                 li { padding-top: 5px; padding-bottom: 5px; } \
                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'

    # Remove various tag attributes to improve the look of the ebook pages.
    remove_attributes = [ 'border', 'cellspacing', 'align', 'cellpadding', 'colspan',
                          'valign', 'vspace', 'hspace', 'alt', 'width', 'height' ]

    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
    # cause a section of the ebook to start in an unsightly fashion or, more
    # frequently, a "<br />" will muck up the formatting of a correspondent's byline.
    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
    # style of pages, and really spoil the look of the ebook pages.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]


    # Create regular expressions for tag keeping and removal to make the matches more
    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
    # and trailing spaces, missing hyphens, and such like.
    # Python regular expression ('re' class) page: http://docs.python.org/library/re.html

    # ***************************************
    # Regular expressions for keep_only_tags:
    # ***************************************

    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
    # page which contains the main text of the article. Match storybody variants: 'storybody',
    # 'story-body', 'story body', 'storybody ', etc.
    storybody_reg_exp = '^.*story[_ -]*body.*$'

    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
    # and published date. This is one level above the usual news pages which have the title
    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
    # resulting in a lot of extra things to be removed by remove_tags.
    blq_content_reg_exp = '^.*blq[_ -]*content.*$'

    # The BBC has an alternative page design structure, which I suspect is an out-of-date
    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
    # (travel), and in some sport pages. These alternative pages are table based (which is
    # why I think they are an out-of-date design) and account for -I'm guesstimating- less
    # than 1% of all articles. They use a table class 'storycontent' to hold the article
    # and, like blq_content (above), have required lots of extra removal by remove_tags.
    story_content_reg_exp = '^.*story[_ -]*content.*$'

    # Keep the sections of the HTML which match the list below. The HTML page created by
    # Calibre will fill <body> with those sections which are matched. Note that
    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
    # will end up being discarded.
    keep_only_tags = [ dict(name='table', attrs={'class':re.compile(story_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'class':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'id':re.compile(blq_content_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)}),
                       dict(name='div',   attrs={'id':re.compile(storybody_reg_exp, re.IGNORECASE)}) ]

    # ************************************
    # Regular expressions for remove_tags:
    # ************************************

    # Regular expression to remove share-help and variant tags. The share-help class
    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
    # twitter, email. Removed to avoid page clutter.
    share_help_reg_exp = '^.*share[_ -]*help.*$'

    # Regular expression to remove embedded-hyper and variant tags. This class is used to
    # display links to other BBC News articles on the same/similar subject.
    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'

    # Regular expression to remove hypertabs and variant tags. This class is used to
    # display a tab bar at the top of an article which allows the user to switch to
    # an article (viewed on the same page) providing further info., 'in depth' analysis,
    # an editorial, a correspondent's blog entry, and such like. The ability to handle
    # a tab bar of this nature is currently beyond the scope of this recipe and
    # possibly of Calibre itself (not sure about that - TO DO - check!).
    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'

    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
    # This class is used to add additional info. boxes, or small lists, outside of
    # the main story. TO DO: Work out a way to incorporate these neatly.
    story_feature_reg_exp = '^.*story[_ -]*feature.*$'

    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
    # 'videoInStoryC'. This class is used to embed video.
    video_reg_exp = '^.*video.*$'

    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
    # This class is used to embed audio.
    audio_reg_exp = '^.*audio.*$'

    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
    # This class is used to embed a photo slideshow. See also 'slideshow' below.
    picture_gallery_reg_exp = '^.*picture.*$'

    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
    # This class is used to embed a slideshow (not necessarily photo) but both
    # 'slideshow' and 'pictureGallery' are used for slideshows.
    slideshow_reg_exp = '^.*slide[_ -]*show.*$'

    # Regular expression to remove social-links and variant tags. This class is used to
    # display links to a BBC bloggers main page, used in various columnists' blogs
    # (Eg. Nick Robinson, Robert Peston).
    social_links_reg_exp = '^.*social[_ -]*links.*$'

    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
    # removed by 'story-feature' removal (as they are usually within them), but
    # not always. The quotation removed is always (AFAICT) in the article text
    # as well, but a 2nd copy is placed in a quote tag to draw attention to it.
    # The quote class tags may or may not appear in div's.
    quote_reg_exp = '^.*quote.*$'

    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
    # The purpose of these is unclear; they seem to be an internal link to a
    # section within the article, but the text of the link (Eg. 'Continue reading
    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
    # The hidden class tags may or may not appear in div's.
    hidden_reg_exp = '^.*hidden.*$'

    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
    # Used on the site to display text about registered users entering comments.
    comment_reg_exp = '^.*comment.*$'

    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
    # Used on the site to allow registered BBC users to fill in forms, typically
    # for entering comments about an article.
    form_reg_exp = '^.*form.*$'

    # Extra things to remove due to the addition of 'blq_content' in keep_only_tags.

    # <div class="story-actions"> Used on sports pages for 'email' and 'print'.
    story_actions_reg_exp = '^.*story[_ -]*actions.*$'

    # <div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
    # social networking links).
    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'

    # <div id="secondary-content" class="content-group">
    # NOTE: Don't remove class="content-group"; that is needed.
    # Used on sports pages to link to 'similar stories'.
    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'

    # <div id="featured-content" class="content-group">
    # NOTE: Don't remove class="content-group"; that is needed.
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    featured_content_reg_exp = '^.*featured[_ -]*content.*$'

    # <div id="navigation">
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    # Used sometimes instead of "featured-content" above.
    navigation_reg_exp = '^.*navigation.*$'

    # <a class="skip" href="#blq-container-inner">Skip to top</a>
    # Used on sports pages to link to the top of the page.
    skip_reg_exp = '^.*skip.*$'

    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
    # which are the alternative table design based pages. The purpose of some of these
    # is not entirely clear from the pages (which are a total mess!).

    # Remove mapping based tags, Eg. <map id="world_map">
    # The dynamic maps don't seem to work during ebook creation. TO DO: Investigate.
    map_reg_exp = '^.*map.*$'

    # Remove social bookmarking variation, called 'socialBookMarks'.
    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'

    # Remove page navigation tools, like 'search', 'email', 'print', called 'blq-mast'.
    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'

    # Remove 'sharesb'; I think this is a generic 'sharing' class. It seems to appear
    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
    # under the assumption that it can appear alone too.
    sharesb_reg_exp = '^.*sharesb.*$'

    # Remove class 'o'. The worst named user created css class of all time. The creator
    # should immediately be fired. I've seen it used to hold nothing at all but with
    # 20 or so empty lines in it. Also to hold a single link to another article.
    # Whatever it was designed to do, it is not wanted by this recipe. Exact match only.
    o_reg_exp = '^o$'

    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    promo_top_reg_exp = '^.*promotopbg.*$'
    promo_bottom_reg_exp = '^.*promobottombg.*$'

    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
    # risk of matching those letters in something needed, unless I see a variation
    # of 'nlp' used at a later date.
    nlp_reg_exp = '^nlp$'

    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
    # matching those letters in something needed.
    mva_or_mvb_reg_exp = '^mv[ab]$'

    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
    mvtb_reg_exp = '^mvtb$'

    # Remove 'blq-toplink', class to provide a link to the top of the page.
    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'

    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
    # use two reg expressions to make removing this (and variants) robust.
    prods_services_01_reg_exp = '^.*servicev4.*$'
    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'

    # Remove -what I think is- some kind of navigation tools helper class, though I am
    # not sure; it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
    # frequently and it is not wanted. Have decided to use two reg expressions to make
    # removing this (and variants) robust.
    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'

    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
    # need removing - I have no clue what it does other than it contains links.
    # Whatever it is - it is not part of the article and is not wanted.
    puffbox_reg_exp = '^.*puffbox.*$'

    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
    sibtbg_reg_exp = '^.*sibtbg.*$'

    # Remove 'storyextra' - links to relevant articles and external sites.
    storyextra_reg_exp = '^.*story[_ -]*extra.*$'


    remove_tags = [ dict(name='div', attrs={'class':re.compile(story_feature_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(share_help_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(video_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(audio_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(slideshow_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(story_actions_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(featured_content_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(navigation_reg_exp, re.IGNORECASE)}),
                    dict(name='form', attrs={'id':re.compile(form_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(quote_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(hidden_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(social_links_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(comment_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(skip_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'id':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='map', attrs={'name':re.compile(map_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'id':re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(sharesb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(o_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_top_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(nlp_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(mvtb_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
                    dict(name='div', attrs={'class':re.compile(puffbox_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
                    dict(attrs={'class':re.compile(storyextra_reg_exp, re.IGNORECASE)})
                  ]

    # Uses url to create and return the 'printer friendly' version of the url.
    # In other words, the 'print this page' address of the page.
    #
    # There are 3 types of urls used in the BBC site's rss feeds. There is just
    # 1 type for the standard news, while there are 2 used for sports feed urls.
    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
    # there is a major story of interest to 'everyone'. So even if no BBC sports
    # feeds are added to 'feeds' the logic of this method is still needed to avoid
    # blank / missing / empty articles which have an index title and then no body.
    def print_version(self, url):

        # Handle sports page urls type 01:
        if (url.find("go/rss/-/sport1/") != -1):
            temp_url = url.replace("go/rss/-/", "")

        # Handle sports page urls type 02:
        elif (url.find("go/rss/int/news/-/sport1/") != -1):
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Handle regular news page urls:
        else:
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Always add "?print=true" to the end of the url.
        print_url = temp_url + "?print=true"

        return print_url


    # Remove articles in feeds based on a string in the article title or url.
    #
    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
    # thread, in post with title: "Remove articles from feed", see url:
    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
    # Many thanks and all credit to Starson17.
    #
    # Starson17's code has obviously been altered to suit my requirements.
    def parse_feeds(self):

        # Call parent's method.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through all feeds.
        for feed in feeds:

            # Loop through all articles in feed.
            for article in feed.articles[:]:

                # Match key words and remove article if there's a match.

                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
                # as a title prefix. Just match upper case 'VIDEO', so that
                # articles like 'Video game banned' won't be matched and removed.
                if 'VIDEO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
                # as a title prefix. Just match upper case 'AUDIO', so that
                # articles like 'Hi-Def audio...' won't be matched and removed.
                elif 'AUDIO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
                # 'In pictures', and 'in pictures', somewhere in their title.
                # Match any case of that phrase.
                elif 'IN PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # As above, but user contributed pictures. Match any case.
                elif 'YOUR PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # 'Sportsday Live' are articles which contain a constantly and
                # dynamically updated 'running commentary' during a live sporting
                # event. Match any case.
                elif 'SPORTSDAY LIVE' in article.title.upper():
                    feed.articles.remove(article)

                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
                # These are being matched below using 'Live - ' because removing all
                # articles with 'live' in their titles would remove some articles
                # that are in fact not live sports pages. Match any case.
                elif 'LIVE - ' in article.title.upper():
                    feed.articles.remove(article)

                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
                # the 'Quiz of the' part in anticipation of monthly and yearly
                # variants. Match any case.
                elif 'QUIZ OF THE' in article.title.upper():
                    feed.articles.remove(article)

                # Remove articles with 'scorecards' in the url. These are BBC sports
                # pages which just display a cricket scorecard. The pages have a mass
                # of table and css entries to display the scorecards nicely. Probably
                # could make them work with this recipe, but might take a whole day
                # of work to sort out all the css - basically a formatting nightmare.
                elif 'scorecards' in article.url:
                    feed.articles.remove(article)

        return feeds

# End of class and file.
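One detail worth noting in the parse_feeds override above: it iterates over feed.articles[:] rather than feed.articles. The slice takes a shallow copy, so the remove() calls can mutate the real list without the iterator skipping elements. A standalone sketch with invented stand-in titles:

titles = ['VIDEO: clip one', 'VIDEO: clip two', 'Plain story']

broken = list(titles)
for t in broken:          # iterating the same list that is being mutated
    if 'VIDEO' in t:
        broken.remove(t)
print(broken)             # ['VIDEO: clip two', 'Plain story'] - one VIDEO item survives

fixed = list(titles)
for t in fixed[:]:        # iterate over a copy, mutate the original
    if 'VIDEO' in t:
        fixed.remove(t)
print(fixed)              # ['Plain story']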
@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''

class SportsIllustratedRecipe(BasicNewsRecipe) :
    __author__ = 'ape'
    __copyright__ = 'ape'
    __author__ = 'a.peter'
    __copyright__ = 'a.peter'
    __license__ = 'GPL v3'
    language = 'de'
    description = 'Berliner Zeitung'
    version = 2
    description = 'Berliner Zeitung RSS'
    version = 4
    title = u'Berliner Zeitung'
    timefmt = ' [%d.%m.%Y]'

    #oldest_article = 7.0
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    publication_type = 'newspaper'

    keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
    remove_tags_after = [dict(id='article_text')]

    INDEX = 'http://www.berlinonline.de/berliner-zeitung/'

    def parse_index(self):
        base = 'http://www.berlinonline.de'
        answer = []
        articles = {}
        more = 1

        soup = self.index_to_soup(self.INDEX)

        # Get list of links to ressorts from index page
        ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
        for ressort in ressort_list[0].findAll('a'):
            feed_title = ressort.string
            print 'Analyzing', feed_title
            if not articles.has_key(feed_title):
                articles[feed_title] = []
                answer.append(feed_title)
            # Load ressort page.
            feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
            # find mainbar div which contains the list of all articles
            for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
                # iterate over all articles
                for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
                    # extract title of article
                    if article_teaser.h3 != None:
                        article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
                        articles[feed_title].append(article)
                    else:
                        # Skip teasers for missing photos
                        if article_teaser.div.p.contents[0].find('Foto:') > -1:
                            continue
                        article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
                        articles[feed_title].append(article)
                        more += 1
        answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
        return answer
    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]

    def get_masthead_url(self):
        return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'

    def print_version(self, url):
        return url.replace('.html', ',view,printVersion.html')
@@ -1,4 +1,3 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
@@ -18,11 +17,17 @@ class Berlingske_dk(BasicNewsRecipe):
    no_stylesheets = True
    remove_empty_feeds = True
    use_embedded_content = False
    remove_javascript = True
    publication_type = 'newspaper'
    encoding = 'utf8'
    language = 'da'
    masthead_url = 'http://www.berlingske.dk/sites/all/themes/bm/img/layout/masthead_bg.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } h1,.manchet,.byline{font-family: Cambria,Georgia,Times,"Times New Roman",serif } '
    auto_cleanup = True
    extra_css = '''
                .manchet {color:#888888;}
                .dateline {font-size: x-small; color:#444444;}
                .manchet,.dateline { font-family: Cambria,Georgia,Times,"Times New Roman",serif }
                .body {font-family: Arial,Helvetica,sans-serif }
                '''

    conversion_options = {
        'comment' : description
@@ -32,18 +37,14 @@ class Berlingske_dk(BasicNewsRecipe):
    }

    feeds = [
        (u'Breaking news' , u'http://www.berlingske.dk/breaking/rss' )
        ,(u'Seneste nyt'  , u'http://www.berlingske.dk/seneste/rss' )
        ,(u'Topnyheder'   , u'http://www.berlingske.dk/top/rss' )
        ,(u'Danmark'      , u'http://www.berlingske.dk/danmark/seneste/rss' )
        ,(u'Verden'       , u'http://www.berlingske.dk/verden/seneste/rss' )
        ,(u'Klima'        , u'http://www.berlingske.dk/klima/seneste/rss' )
        ,(u'Debat'        , u'http://www.berlingske.dk/debat/seneste/rss' )
        ,(u'Koebenhavn'   , u'http://www.berlingske.dk/koebenhavn/seneste/rss')
        ,(u'Politik'      , u'http://www.berlingske.dk/politik/seneste/rss' )
        ,(u'Kultur'       , u'http://www.berlingske.dk/kultur/seneste/rss' )
        (u'Breaking news' , u'http://www.b.dk/breaking/rss' )
        ,(u'Seneste nyt'  , u'http://www.b.dk/seneste/rss' )
        ,(u'Topnyheder'   , u'http://www.b.dk/top/rss' )
        ,(u'Danmark'      , u'http://www.b.dk/danmark/seneste/rss' )
        ,(u'Verden'       , u'http://www.b.dk/verden/seneste/rss' )
        ,(u'Klima'        , u'http://www.b.dk/klima/seneste/rss' )
        ,(u'Debat'        , u'http://www.b.dk/debat/seneste/rss' )
        ,(u'Koebenhavn'   , u'http://www.b.dk/koebenhavn/seneste/rss')
        ,(u'Politik'      , u'http://www.b.dk/politik/seneste/rss' )
        ,(u'Kultur'       , u'http://www.b.dk/kultur/seneste/rss' )
    ]

    keep_only_tags = [dict(attrs={'class':['first','pt-article']})]
    remove_tags = [dict(name=['object','link','base','iframe','embed'])]
38  recipes/biamag.recipe  (new file)
@@ -0,0 +1,38 @@

__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
    title = 'BiaMag'
    __author__ = 'Osman Kaysan'
    description = 'Independent News from Turkey'
    publisher = 'BiaMag'
    category = 'news, politics, Turkey'
    oldest_article = 15
    max_articles_per_feed = 120
    masthead_url = 'http://bianet.org/images/biamag_logo.gif'
    language = 'tr'
    no_stylesheets = True

    conversion_options = {
        'comments'  : description
        ,'tags'     : category
        ,'language' : language
        ,'publisher': publisher
        ,'linearize_tables': True
        ,'remove_paragraph_spacing': True,
    }

    remove_tags_before = dict(name='div', attrs={'class':'manset'})
    remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
    remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

    feeds = [(u'BiaMag', u'http://www.bianet.org/biamag.rss')]

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
38
recipes/biamag_en.recipe
Normal file
@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
title = 'Bianet-English'
__author__ = 'Osman Kaysan'
description = 'Independent News Network from Turkey(English)'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 150
masthead_url = 'http://bianet.org/images/english_logo.gif'
language = 'en_TR'
no_stylesheets = True

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

feeds = [(u'Bianet-English', u'http://www.bianet.org/english.rss')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
38
recipes/bianet.recipe
Normal file
@ -0,0 +1,38 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
bianet.com.tr
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Radikal_tr(BasicNewsRecipe):
title = 'Bianet'
__author__ = 'Osman Kaysan'
description = 'Independent News from Turkey'
publisher = 'Bianet'
category = 'news, politics, Turkey'
oldest_article = 7
max_articles_per_feed = 120
masthead_url = 'http://bianet.org/images/bianet_logo.gif'
language = 'tr'
no_stylesheets = True

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

remove_tags_before = dict(name='div', attrs={'class':'manset'})
remove_tags = [ dict(name='ul', attrs={'class':['altul']}), dict(name='div', attrs={'id':['habermenu']}), dict(name='div', attrs={'class':['mail']}), dict(name='div', attrs={'class':['from']})]
remove_tags_after = dict(name='div', attrs={'id':'habermenu'})

feeds = [(u'Bianet', u'http://bianet.org/bianet.rss')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
19
recipes/biolog_pl.recipe
Normal file
@ -0,0 +1,19 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Biolog_pl(BasicNewsRecipe):
title = u'Biolog.pl'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds=True
__author__ = 'fenuks'
description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
category = 'biology'
language = 'pl'
cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
no_stylesheets = True
#keeps_only_tags=[dict(id='main')]
remove_tags_before=dict(id='main')
remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
50
recipes/birgun_gazetesi.recipe
Normal file
@ -0,0 +1,50 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe

class Birgun (BasicNewsRecipe):

title = u'Birgün Gazetesi'
__author__ = u'Osman Kaysan'
oldest_article = 7
max_articles_per_feed =150
use_embedded_content = False
description = 'Birgun gazatesi haberleri, kose yazarlari'
publisher = 'Birgün'
category = 'news,haberler,turkce,gazete,birgun'
language = 'tr'
no_stylesheets = True
publication_type = 'newspaper'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
,'linearize_tables': True
,'remove_paragraph_spacing': True,
}

cover_img_url = 'http://www.birgun.net/i/birgun.png'
masthead_url = 'http://www.birgun.net/i/birgun.png'

remove_attributes = ['width','height']

remove_tags_before = dict(name='h2', attrs={'class':'storyHeadline'})
#remove_tags_after = dict(name='div', attrs={'class':'toollinks'})
remove_tags_after = dict(name='tr', attrs={'valign':'top'})
remove_tags = [ dict(name='div', attrs={'id':'byLine'}), dict(name='div', attrs={'class':'toollinks'})
, dict(name='div', attrs={'class':'main-lead'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})
, dict(name='a', attrs={'class':'addthis_button'})]

remove_empty_feeds= True

feeds = [
( u'Güncel', u'http://www.birgun.net/actuels.xml')
,( u'Köşe Yazarları', u'http://www.birgun.net/writer.xml')
,( u'Politika', u'http://www.birgun.net/politics.xml')
,( u'Ekonomi', u'http://www.birgun.net/economic.xml')
,( u'Çalışma Yaşamı', u'http://www.birgun.net/workers.xml')
,( u'Dünya', u'http://www.birgun.net/worlds.xml')
,( u'Yaşam', u'http://www.birgun.net/lifes.xml')
]
44
recipes/birmingham_post.recipe
Normal file
@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Birmingham post'
description = 'News for Birmingham UK'
timefmt = ''
__author__ = 'Dave Asbury'
cover_url = 'http://1.bp.blogspot.com/_GwWyq5eGw9M/S9BHPHxW55I/AAAAAAAAB6Q/iGCWl0egGzg/s320/Birmingham+post+Lite+front.JPG'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
auto_cleanup = True
language = 'en_GB'

masthead_url = 'http://www.pressgazette.co.uk/Pictures/web/t/c/g/birmingham_post.jpg'

keep_only_tags = [
#dict(name='h1',attrs={'id' : 'article-headline'}),
#dict(attrs={'class':['article-meta-author','article-meta-date','article main','art-o art-align-center otm-1 ']}),
#dict(name='p')
#dict(attrs={'id' : 'three-col'})
]
remove_tags = [
# dict(name='div',attrs={'class' : 'span-33 last header-links'})
]
feeds = [
#(u'News',u'http://www.birminghampost.net/news/rss.xml'),
(u'Local News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
(u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
(u'Sports',u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
(u'Bloggs & Comments',u'http://www.birminghampost.net/comment/rss.xml')
]
extra_css = '''
body {font: sans-serif medium;}
h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
span{ font-size:9.5px; font-weight:bold;font-style:italic}
p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
26
recipes/blues.recipe
Normal file
@ -0,0 +1,26 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Oskar Kunicki <rakso at interia.pl>'
'''
Changelog:
2011-11-27
News from BluesRSS.info
'''

from calibre.web.feeds.news import BasicNewsRecipe

class BluesRSS(BasicNewsRecipe):
title = 'Blues News'
__author__ = 'Oskar Kunicki'
description ='Blues news from around the world'
publisher = 'BluesRSS.info'
category = 'news, blues, USA,UK'
oldest_article = 5
max_articles_per_feed = 100
language = 'en'
cover_url = 'http://bluesrss.info/cover.jpg'
masthead_url = 'http://bluesrss.info/cover.jpg'
no_stylesheets = True

remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]

feeds = [(u'News', u'http://bluesrss.info/feed/')]
@ -10,30 +10,19 @@ http://www.buffalonews.com/RSS/
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1298680852(BasicNewsRecipe):
class BuffaloNews(BasicNewsRecipe):
title = u'Buffalo News'
oldest_article = 2
language = 'en'
__author__ = 'ChappyOnIce'
__author__ = 'ChappyOnIce, Krittika Goyal'
max_articles_per_feed = 20
encoding = 'utf-8'
masthead_url = 'http://www.buffalonews.com/buffalonews/skins/buffalonews/images/masthead/the_buffalo_news_logo.png'
remove_javascript = True
extra_css = 'body {text-align: justify;}\n \
p {text-indent: 20px;}'
auto_cleanup = True
remove_empty_feeds = True

keep_only_tags = [
dict(name='div', attrs={'class':['main-content-left']})
]

remove_tags = [
dict(name='div', attrs={'id':['commentCount']}),
dict(name='div', attrs={'class':['story-list-links']})
]

remove_tags_after = dict(name='div', attrs={'class':['body storyContent']})

feeds = [(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
feeds = [
(u'City of Buffalo', u'http://www.buffalonews.com/city/communities/buffalo/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Erie County', u'http://www.buffalonews.com/city/communities/southern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Eastern Erie County', u'http://www.buffalonews.com/city/communities/eastern-erie/?widget=rssfeed&view=feed&contentId=77944'),
(u'Southern Tier', u'http://www.buffalonews.com/city/communities/southern-tier/?widget=rssfeed&view=feed&contentId=77944'),
@ -56,3 +45,4 @@ class AdvancedUserRecipe1298680852(BasicNewsRecipe):
(u'Off Main Street', u'http://www.buffalonews.com/city/columns/off-main-street/?widget=rssfeed&view=feed&contentId=77944'),
(u'Editorials', u'http://www.buffalonews.com/editorial-page/buffalo-news-editorials/?widget=rssfeed&view=feed&contentId=77944')
]
51
recipes/catavencii.recipe
Normal file
@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencii.ro
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Catavencii(BasicNewsRecipe):
title = u'Ca\u0163avencii'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Ca\u0163avencii'
description = u'Ca\u0163avencii'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.simonatache.ro/wp-content/uploads/2011/06/catavencii-logo.png'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}

keep_only_tags = [
dict(name='div', attrs={'id':'content'})
]

remove_tags = [
dict(name='div', attrs={'id':'breadcrumbs'})
, dict(name='span', attrs={'class':'info'})
, dict(name='div', attrs={'id':'social-media-article'})
]

remove_tags_after = [
dict(name='div', attrs={'id':'social-media-article'})
]
feeds = [
(u'\u0218tiri', u'http://www.catavencii.ro/rss')
]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -4,16 +4,16 @@
__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
catavencu.ro
academiacatavencu.info
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Catavencu(BasicNewsRecipe):
class AcademiaCatavencu(BasicNewsRecipe):
title = u'Academia Ca\u0163avencu'
__author__ = u'Silviu Cotoar\u0103'
description = 'Tagma cum laude'
publisher = 'Catavencu'
publisher = u'Ca\u0163avencu'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
@ -21,7 +21,7 @@ class Catavencu(BasicNewsRecipe):
use_embedded_content = False
category = 'Ziare'
encoding = 'utf-8'
cover_url = 'http://upload.wikimedia.org/wikipedia/en/1/1e/Academia_Catavencu.jpg'
cover_url = 'http://www.academiacatavencu.info/images/logo.png'

conversion_options = {
'comments' : description
@ -31,22 +31,21 @@ class Catavencu(BasicNewsRecipe):
}

keep_only_tags = [
dict(name='ul', attrs={'class':'articles'})
dict(name='h1', attrs={'class':'art_title'}),
dict(name='div', attrs={'class':'art_text'})
]

remove_tags = [
dict(name='div', attrs={'class':['tools']})
, dict(name='div', attrs={'class':['share']})
, dict(name='div', attrs={'class':['category']})
, dict(name='div', attrs={'id':['comments']})
dict(name='div', attrs={'class':['desp_m']})
, dict(name='div', attrs={'id':['tags']})
]

remove_tags_after = [
dict(name='div', attrs={'id':'comments'})
dict(name='div', attrs={'class':['desp_m']})
]

feeds = [
(u'Feeds', u'http://catavencu.ro/feed/rss')
(u'Feeds', u'http://www.academiacatavencu.info/rss.xml')
]

def preprocess_html(self, soup):
@ -27,7 +27,7 @@ class CGM(BasicNewsRecipe):
del item['style']
ad=soup.findAll('a')
for r in ad:
if 'http://www.hustla.pl' in r['href']:
if 'http://www.hustla.pl' in r['href'] or 'http://www.ebilet.pl' in r['href']:
r.extract()
gallery=soup.find('div', attrs={'class':'galleryFlash'})
if gallery:
@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True

preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]

def print_version(self, url):
if url.find('news/article.php') >= 0:
@ -46,13 +48,15 @@ class TheCND(BasicNewsRecipe):
title = self.tag_to_string(a)
self.log('\tFound article: ', title, 'at', url)
date = a.nextSibling
if re.search('cm', date):
continue
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)

self.log('log articles', articles)
#self.log('log articles', articles)
mostCurrent = sorted(articles).pop()
self.title = 'CND ' + mostCurrent
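# The daily CND recipe above now skips index entries whose trailing date string
# matches 'cm', while the weekly recipe below keeps only those entries, so the
# two recipes split the same cnd.org index page between them.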
72
recipes/cnd_weekly.recipe
Normal file
@ -0,0 +1,72 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class TheCND(BasicNewsRecipe):

title = 'CND Weekly'
__author__ = 'Derek Liang'
description = ''
INDEX = 'http://cnd.org'
language = 'zh'
conversion_options = {'linearize_tables':True}

remove_tags_before = dict(name='div', id='articleHead')
remove_tags_after = dict(id='copyright')
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True

preprocess_regexps = [ (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
(re.compile('<table width.*?</table>', re.DOTALL), lambda m: ''),
]

def print_version(self, url):
if url.find('news/article.php') >= 0:
return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
else:
return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)

def parse_index(self):
soup = self.index_to_soup(self.INDEX)

feeds = []
articles = {}

for a in soup.findAll('a', attrs={'target':'_cnd'}):
url = a['href']
if url.find('article.php') < 0 :
continue
if url.startswith('/'):
url = 'http://cnd.org'+url
title = self.tag_to_string(a)
date = a.nextSibling
if not re.search('cm', date):
continue
self.log('\tFound article: ', title, 'at', url, '@', date)
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)

sorted_articles = sorted(articles)
while sorted_articles:
mostCurrent = sorted_articles.pop()
self.title = 'CND ' + mostCurrent
feeds.append((self.title, articles[mostCurrent]))

return feeds

def populate_article_metadata(self, article, soup, first):
header = soup.find('h3')
self.log('header: ' + self.tag_to_string(header))
pass
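# parse_index() above follows the standard BasicNewsRecipe contract: it returns
# a list of (feed_title, article_list) tuples, where each article is a dict with
# at least 'title' and 'url' keys. A minimal sketch of that shape (the URL is
# illustrative only):
#
#     def parse_index(self):
#         return [('Section', [{'title': 'Headline',
#                               'url': 'http://cnd.org/my/article.php?id=1',
#                               'description': '', 'date': ''}])]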
22
recipes/computerworld_pl.recipe
Normal file
@ -0,0 +1,22 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Computerworld_pl(BasicNewsRecipe):
title = u'Computerworld.pl'
__author__ = 'fenuks'
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'s'})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]

def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return getattr(self, 'cover_url', self.cover_url)
52
recipes/cosmopolitan_uk.recipe
Normal file
@ -0,0 +1,52 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
#from calibre import __appname__
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Cosmopolitan UK'
description = 'Fashion, beauty and Gossip for women from COSMOPOLITAN -UK'

__author__ = 'Dave Asbury'
#last update 21/12/11
# greyscale code by Starson
cover_url = 'http://www.cosmopolitan.magazine.co.uk/files/4613/2085/8988/Cosmo_Cover3.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True

preprocess_regexps = [
(re.compile(r'<!-- Begin tmpl module_competition_offer -->.*?<!-- End tmpl module_competition_offer-->', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'

masthead_url = 'http://www.cosmopolitan.co.uk/cm/cosmopolitanuk/site_images/header/cosmouk_logo_home.gif'

keep_only_tags = [
dict(attrs={'class' : ['dateAuthor', 'publishDate']}),
dict(name='div',attrs ={'id' : ['main_content']})
]
remove_tags = [
dict(name='div',attrs={'class' : ['blogInfo','viral_toolbar','comment_number','prevEntry nav']}),
dict(name='div',attrs={'class' : 'blog_module_about_the_authors'}),
dict(attrs={'id': ['breadcrumbs','comment','related_links_list','right_rail','content_sec_fb_more','content_sec_mostpopularstories','content-sec_fb_frame_viewfb_bot']}),
dict(attrs={'class' : ['read_liked_that_header','fb_back_next_area']}),
dict(name='li',attrs={'class' : 'thumb'})
]

feeds = [
(u'Love & Sex', u'http://www.cosmopolitan.co.uk/love-sex/rss/'), (u'Men', u'http://cosmopolitan.co.uk/men/rss/'), (u'Fashion', u'http://cosmopolitan.co.uk/fashion/rss/'), (u'Hair & Beauty', u'http://cosmopolitan.co.uk/beauty-hair/rss/'), (u'LifeStyle', u'http://cosmopolitan.co.uk/lifestyle/rss/'), (u'Cosmo On Campus', u'http://cosmopolitan.co.uk/campus/rss/'), (u'Celebrity Gossip', u'http://cosmopolitan.co.uk/celebrity-gossip/rss/')]

def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
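# postprocess_html() runs after calibre has downloaded the images and rewritten
# each tag['src'] to a local file path, which is why img.save(iurl) can simply
# overwrite the file in place with its greyscale version.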
18
recipes/daily_writing_tips.recipe
Normal file
@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class DailyWritingTips(BasicNewsRecipe):
title = u'Daily Writing Tips'
language = 'en_GB'
__author__ = 'NotTaken'
oldest_article = 7 #days
max_articles_per_feed = 40
use_embedded_content = True
no_stylesheets = True
auto_cleanup = False
encoding = 'utf-8'

feeds = [
('Latest tips',
'http://feeds2.feedburner.com/DailyWritingTips'),
]
15
recipes/datasport.recipe
Normal file
@ -0,0 +1,15 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Italian soccer news website - v1.00 (17, December 2011)'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324114272(BasicNewsRecipe):
title = u'Datasport'
language = 'it'
__author__ = 'faber1971'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True

feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]
27
recipes/descopera_org.recipe
Normal file
@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
'''
descopera.org
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Descopera(BasicNewsRecipe):
title = u'Descoperă.org'
__author__ = 'Marius Ignătescu'
description = 'Descoperă. Placerea de a cunoaște'
publisher = 'descopera.org'
category = 'science, technology, culture, history, earth'
language = 'ro'
oldest_article = 14
max_articles_per_feed = 100
encoding = 'utf8'
no_stylesheets = True
extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
keep_only_tags = [dict(name='div', attrs={'class':['post']})]
remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
remove_attributes = ['width','height']
cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
feeds = [(u'Articles', u'http://www.descopera.org/feed/')]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -46,7 +46,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
dict(name = 'div', attrs = {'class' : 'uniBox'}),
dict(name = 'object', attrs = {}),
dict(name = 'h3', attrs = {})
dict(name = 'h3', attrs = {}),
dict(attrs={'class':'twitter-share-button'})
]

preprocess_regexps = [
@ -58,3 +59,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
(r'\s*</', lambda match: '</'),
]
]

def skip_ad_pages(self, soup):
if 'Advertisement' in soup.title:
nexturl=soup.find('a')['href']
return self.index_to_soup(nexturl, raw=True)
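# skip_ad_pages() is a BasicNewsRecipe hook: returning a non-None value makes
# calibre use it in place of the downloaded page, so the recipe follows the
# first link on the interstitial ad page and returns that target's raw HTML
# instead.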
58
recipes/dziennik_pl.recipe
Normal file
@ -0,0 +1,58 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
import re
class Dziennik_pl(BasicNewsRecipe):
title = u'Dziennik.pl'
__author__ = 'fenuks'
description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
category = 'newspaper'
language = 'pl'
cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_javascript=True
remove_empty_feeds=True
preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
keep_only_tags=[dict(id='article')]
remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
(u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
(u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
(u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
(u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
(u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
(u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
(u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
(u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
(u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
(u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
(u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]

def append_page(self, soup, appendtag):
tag=soup.find('a', attrs={'class':'page_next'})
if tag:
appendtag.find('div', attrs={'class':'article_paginator'}).extract()
while tag:
soup2= self.index_to_soup(tag['href'])
tag=soup2.find('a', attrs={'class':'page_next'})
if not tag:
for r in appendtag.findAll('div', attrs={'class':'art_src'}):
r.extract()
pagetext = soup2.find(name='div', attrs={'class':'article_body'})
for dictionary in self.remove_tags:
v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
for delete in v:
delete.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'article_paginator'}):
appendtag.find('div', attrs={'class':'article_paginator'}).extract()

def preprocess_html(self, soup):
self.append_page(soup, soup.body)
return soup
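# append_page() above is the usual multi-page stitching pattern: follow the
# 'page_next' link, pull the 'article_body' div from each page, strip anything
# matched by remove_tags, and graft it onto the first page. The core loop,
# condensed (names taken from the recipe above):
#
#     while tag:
#         soup2 = self.index_to_soup(tag['href'])
#         tag = soup2.find('a', attrs={'class':'page_next'})
#         pagetext = soup2.find('div', attrs={'class':'article_body'})
#         appendtag.insert(len(appendtag.contents), pagetext)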
47
recipes/echo_online.recipe
Normal file
@ -0,0 +1,47 @@
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid at kovidgoyal.net>, Armin Geller'
'''
Fetch echo-online.de
'''

from calibre.web.feeds.recipes import BasicNewsRecipe
class Echo_Online(BasicNewsRecipe):
title = u' Echo Online'
description = '-Echo Online-'
publisher = 'Echo Online GmbH'
category = 'News, Germany'
__author__ = 'Armin Geller' # 2011-12-17
language = 'de'
lang = 'de-DE'
encoding = 'iso-8859-1'
timefmt = ' [%a, %d %b %Y]'

oldest_article = 7
max_articles_per_feed = 2
no_stylesheets = True
auto_cleanup = True
remove_javascript = True

feeds = [
(u'Topnews', u'http://www.echo-online.de/storage/rss/rss/topnews.xml'),
(u'Darmstadt', u'http://www.echo-online.de/rss/darmstadt.xml'),
(u'Darmstadt-Dieburg', u'http://www.echo-online.de/rss/darmstadtdieburg.xml'),
(u'Kreis Gro\xdf-Gerau', u'http://www.echo-online.de/rss/kreisgrossgerau.xml'),
(u'R\xfcsselsheim', u'http://www.echo-online.de/rss/ruesselsheim.xml'),
(u'Kreis Bergstra\xdfe', u'http://www.echo-online.de/rss/bergstrasse.xml'),
(u'Odenwaldkreis', u'http://www.echo-online.de/rss/odenwald.xml'),
(u'SV 98', u'http://www.echo-online.de/rss/sv98.xml'),
(u'Kino', u'http://www.echo-online.de/rss/kino.xml'),
(u'Ausstellungen', u'http://www.echo-online.de/rss/ausstellungen.xml'),
(u'Ausflug & Reise', u'http://www.echo-online.de/rss/ausflugreise.xml'),
]

def print_version(self, url):
return self.browser.open_novisit(url).geturl() + '?_FRAME=33&_FORMAT=PRINT'

remove_tags = [dict(name='div', attrs={'class':["header", "name"]}),]
auto_cleanup_keep = '//div[@class="bild_gross w270"]'

# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-ash2/41801_145340745513489_893927_n.jpg' # 2011-12-16 AGe
cover_url = 'http://adcounter.darmstaedter-echo.de/webdav/files/config/gui/images/Zeitungsfaecher.gif' # 2011-12-16 AGe
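# print_version() first resolves the final story URL (open_novisit follows any
# redirects) and then appends the site's print-view switches, producing URLs of
# the form <story-url>?_FRAME=33&_FORMAT=PRINT.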
@ -55,12 +55,17 @@ class Economist(BasicNewsRecipe):
'''

def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url
soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')

def parse_index(self):
return self.economist_parse_index()
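# The replacement get_cover_url() walks two pages: the covers index (the div
# whose class contains 'print-cover-links') to locate the current issue, then
# that issue page's 'cover-content' div for the actual <img src>. The lambda in
# attrs is the BeautifulSoup idiom for "class attribute contains this token".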
@ -39,13 +39,17 @@ class Economist(BasicNewsRecipe):
delay = 1

def get_cover_url(self):
br = self.browser
br.open(self.INDEX)
issue = br.geturl().split('/')[4]
self.log('Fetching cover for issue: %s'%issue)
cover_url = "http://media.economist.com/sites/default/files/imagecache/print-cover-full/print-covers/%s_CNA400.jpg" %(issue.translate(None,'-'))
return cover_url

soup = self.index_to_soup('http://www.economist.com/printedition/covers')
div = soup.find('div', attrs={'class':lambda x: x and
'print-cover-links' in x})
a = div.find('a', href=True)
url = a.get('href')
if url.startswith('/'):
url = 'http://www.economist.com' + url
soup = self.index_to_soup(url)
div = soup.find('div', attrs={'class':'cover-content'})
img = div.find('img', src=True)
return img.get('src')

def parse_index(self):
try:
@ -5,12 +5,11 @@ __license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.07'
__date__ = '06, February 2011'
__version__ = 'v0.08'
__date__ = '13, November 2011'
'''
elperiodicodearagon.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

@ -20,13 +19,13 @@ class elperiodicodearagon(BasicNewsRecipe):
description = u'Noticias desde Aragon'
publisher = u'elperiodicodearagon.com'
category = u'news, politics, Spain, Aragon'
oldest_article = 2
oldest_article = 1
delay = 0
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
encoding = 'utf8'
encoding = 'iso-8859-1'
remove_empty_feeds = True
remove_javascript = True

@ -39,61 +38,30 @@ class elperiodicodearagon(BasicNewsRecipe):
}

feeds = [
(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
(u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
(u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
(u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
(u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
(u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
(u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
(u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
(u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
(u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
(u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
(u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
(u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
(u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
(u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
(u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
(u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
(u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
(u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
(u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
(u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
(u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
(u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
(u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
(u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
(u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml')
]

extra_css = '''
h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
.columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
img{margin-bottom: 0.4em}
'''

remove_attributes = ['height','width']

keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
keep_only_tags = [dict(name='div', attrs={'id':'Noticia'})]

# Strip out all the page clutter

remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
dict(name='span', attrs={'class':'MasInformacion '}),
dict(name='span', attrs={'class':'MasInformacion'}),
dict(name='div', attrs={'class':'Middle'}),
dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
dict(name='div', attrs={'class':'MenuEquipo'}),
dict(name='div', attrs={'class':'TemasRelacionados'}),
dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
dict(name='div', attrs={'class':'Recorte'}),
dict(name='div', attrs={'id':'NoticiasenRecursos'}),
dict(name='div', attrs={'id':'NoticiaEnPapel'}),
dict(name='p', attrs={'class':'RecorteEnNoticias'}),
dict(name='div', attrs={'id':'Comparte'}),
dict(name='div', attrs={'id':'CajaComparte'}),
dict(name='a', attrs={'class':'EscribirComentario'}),
dict(name='a', attrs={'class':'AvisoComentario'}),
dict(name='div', attrs={'class':'CajaAvisoComentario'}),
dict(name='div', attrs={'class':'navegaNoticias'}),
dict(name='div', attrs={'class':'Mensaje'}),
dict(name='div', attrs={'id':'PaginadorDiCom'}),
dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
dict(name='div', attrs={'id':'CintilloComentario'}),
dict(name='div', attrs={'id':'EscribeComentario'}),
dict(name='div', attrs={'id':'FormularioComentario'}),
dict(name='div', attrs={'id':'FormularioNormas'})]

# Fetch the print edition front page (the format=1 image has the higher resolution)

def get_cover_url(self):
@ -104,23 +72,7 @@ class elperiodicodearagon(BasicNewsRecipe):
return image['src'].rstrip('format=2') + 'format=1'
return None

# Remove the blank space between the story and the comments (lines 1 and 2)
# The index did not point correctly at the start of the story (line 3)
# Use the mobile version of the site

preprocess_regexps = [
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
]

# Replace embedded YouTube videos with a still image

def preprocess_html(self, soup):
for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
if video_yt:
video_yt.name = 'img'
fuente = video_yt['src']
fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
video_yt['src'] = fuente2 + '/0.jpg'

return soup
def print_version(self, url):
return url.replace('http://www.elperiodicodearagon.com/', 'http://www.elperiodicodearagon.com/m/')
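# print_version() above rewrites story URLs onto the mobile site
# (elperiodicodearagon.com/m/...), whose pages carry far less markup, so the
# long remove_tags list has much less left to clean up.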
48
recipes/elet_es_irodalom.recipe
Normal file
@ -0,0 +1,48 @@
################################################################################
#Description: http://es.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2010.12.01. - V1.0
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class elet_es_irodalom(BasicNewsRecipe):
title = u'Elet es Irodalom'
__author__ = 'Bigpapa'
oldest_article = 7
max_articles_per_feed = 20 # Maximum number of articles per feed to store in the generated e-book.
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'iso-8859-2'
category = 'Cikkek'
language = 'hu'
publication_type = 'newsportal'
extra_css = '.doc_title { font: bold 30px } .doc_author {font: bold 14px} '

keep_only_tags = [
dict(name='div', attrs={'class':['doc_author', 'doc_title', 'doc']})
]

remove_tags = [
dict(name='a', attrs={'target':['_TOP']}),
dict(name='div', attrs={'style':['float: right; margin-left: 5px; margin-bottom: 5px;', 'float: right; margin-left: 5px; margin-bottom: 5px;']}),
]

feeds = [
(u'Publicisztika', 'http://www.feed43.com/4684235031168504.xml'),
(u'Interj\xfa', 'http://www.feed43.com/4032465460040618.xml'),
(u'Visszhang', 'http://www.feed43.com/3727375706873086.xml'),
(u'P\xe1ratlan oldal', 'http://www.feed43.com/2525784782475057.xml'),
(u'Feuilleton', 'http://www.feed43.com/7216025082703073.xml'),
(u'Pr\xf3za', 'http://www.feed43.com/8760248802326384.xml'),
(u'Vers', 'http://www.feed43.com/1737324675134275.xml'),
(u'K\xf6nyvkritika', 'http://www.feed43.com/1281156550717082.xml'),
(u'M\u0171b\xedr\xe1lat', 'http://www.feed43.com/1851854623681044.xml')
]
@ -4,7 +4,8 @@ __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
elmundo.es
'''

import re
import time
from calibre.web.feeds.news import BasicNewsRecipe

class ElMundo(BasicNewsRecipe):
@ -18,12 +19,15 @@ class ElMundo(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
encoding = 'iso8859_15'
remove_javascript = True
remove_empty_feeds = True
language = 'es'
masthead_url = 'http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
publication_type = 'newspaper'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.metadata_noticia{font-size: small}
.pestana_GDP{font-size: small; font-weight:bold}
h1,h2,h3,h4,h5,h6,.subtitulo {color: #3F5974}
.hora{color: red}
.update{color: gray}
@ -41,8 +45,11 @@ class ElMundo(BasicNewsRecipe):
remove_tags_after = dict(name='div' , attrs={'id':['desarrollo_noticia','tamano']})
remove_attributes = ['lang','border']
remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' })
dict(name='div', attrs={'class':['herramientas','publicidad_google','comenta','col col-2b','apoyos','no-te-pierdas']})
,dict(name='div', attrs={'class':['publicidad publicidad_cuerpo_noticia','comentarios_nav','mensaje_privado','interact']})
,dict(name='div', attrs={'class':['num_comentarios estirar']})
,dict(name='span', attrs={'class':['links_comentar']})
,dict(name='div', attrs={'id':['comentar']})
,dict(name='ul', attrs={'class':'herramientas' })
,dict(name=['object','link','embed','iframe','base','meta'])
]
@ -50,13 +57,31 @@ class ElMundo(BasicNewsRecipe):
feeds = [
(u'Portada' , u'http://estaticos.elmundo.es/elmundo/rss/portada.xml' )
,(u'Deportes' , u'http://estaticos.elmundo.es/elmundodeporte/rss/portada.xml')
,(u'Economia' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espana' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Econom\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/economia.xml' )
,(u'Espa\xf1a' , u'http://estaticos.elmundo.es/elmundo/rss/espana.xml' )
,(u'Internacional' , u'http://estaticos.elmundo.es/elmundo/rss/internacional.xml' )
,(u'Cultura' , u'http://estaticos.elmundo.es/elmundo/rss/cultura.xml' )
,(u'Ciencia/Ecologia', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicacion' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Television' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )
,(u'Ciencia/Ecolog\xeda', u'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml' )
,(u'Comunicaci\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/comunicacion.xml' )
,(u'Televisi\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/television.xml' )

,(u'Salud' , u'http://estaticos.elmundo.es/elmundosalud/rss/portada.xml' )
,(u'Solidaridad' , u'http://estaticos.elmundo.es/elmundo/rss/solidaridad.xml' )
,(u'Su vivienda' , u'http://estaticos.elmundo.es/elmundo/rss/suvivienda.xml' )
,(u'Motor' , u'http://estaticos.elmundo.es/elmundomotor/rss/portada.xml' )

,(u'Madrid' , u'http://estaticos.elmundo.es/elmundo/rss/madrid.xml' )
,(u'Barcelona' , u'http://estaticos.elmundo.es/elmundo/rss/barcelona.xml' )
,(u'Pa\xeds Vasco' , u'http://estaticos.elmundo.es/elmundo/rss/paisvasco.xml' )
,(u'Baleares' , u'http://estaticos.elmundo.es/elmundo/rss/baleares.xml' )
,(u'Castilla y Le\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castillayleon.xml' )
,(u'Valladolid' , u'http://estaticos.elmundo.es/elmundo/rss/valladolid.xml' )
,(u'Valencia' , u'http://estaticos.elmundo.es/elmundo/rss/valencia.xml' )
,(u'Alicante' , u'http://estaticos.elmundo.es/elmundo/rss/alicante.xml' )
,(u'Castell\xf3n' , u'http://estaticos.elmundo.es/elmundo/rss/castellon.xml' )
,(u'Andaluc\xeda' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia.xml' )
,(u'Sevilla' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_sevilla.xml' )
,(u'M\xe1laga' , u'http://estaticos.elmundo.es/elmundo/rss/andalucia_malaga.xml' )
]

def preprocess_html(self, soup):
@ -67,3 +92,34 @@ class ElMundo(BasicNewsRecipe):
def get_article_url(self, article):
return article.get('guid', None)

preprocess_regexps = [
# Show a still image for embedded videos

(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var video=', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),

# Suppress the comment numbering: 1, 2, 3 ...

(re.compile(r'<ol>\n<li style="z-index:', re.DOTALL|re.IGNORECASE), lambda match: '<ul><li style="z-index:'),
(re.compile(r'</ol>\n<div class="num_comentarios estirar">', re.DOTALL|re.IGNORECASE), lambda match: '</ul><div class="num_comentarios estirar">'),
]

# Fetch the front-page cover image

def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
#http://img.kiosko.net/2011/11/19/es/elmundo.750.jpg
cover='http://img.kiosko.net/'+ year + '/' + month + '/' + day +'/es/elmundo.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://estaticos03.elmundo.es/elmundo/iconos/v4.x/v4.01/bg_h1.png'
return cover
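# The cover comes from kiosko.net, which archives scanned front pages under
# /YYYY/MM/DD/<country>/<paper>.750.jpg (e.g. the 2011/11/19 URL in the comment
# above); the try/except falls back to the masthead image when the current
# day's scan is not available yet.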
16
recipes/emuzica_pl.recipe
Normal file
@ -0,0 +1,16 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class eMuzyka(BasicNewsRecipe):
title = u'eMuzyka'
__author__ = 'fenuks'
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
@ -1,35 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'Gerardo Diez'
|
||||
__copyright__ = 'Gerardo Diez<gerardo.diez.garcia@gmail.com>'
|
||||
description = 'Main daily newspaper from Spain - v1.00 (05, Enero 2011)'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
__copyright__ = '5, January 2011 Gerardo Diez<gerardo.diez.garcia@gmail.com> & desUBIKado'
|
||||
__author__ = 'desUBIKado, based on an earlier version by Gerardo Diez'
|
||||
__version__ = 'v1.01'
|
||||
__date__ = '13, November 2011'
|
||||
|
||||
'''
|
||||
expansion.es
|
||||
[url]http://www.expansion.com/[/url]
|
||||
'''
|
||||
|
||||
import time
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
class Publico(BasicNewsRecipe):
|
||||
title =u'Expansion.com'
|
||||
__author__ ='Gerardo Diez'
|
||||
publisher =u'Unidad Editorial Información Económica, S.L.'
|
||||
category ='finances, catalunya'
|
||||
oldest_article =1
|
||||
max_articles_per_feed =100
|
||||
|
||||
class expansion_spanish(BasicNewsRecipe):
|
||||
__author__ ='Gerardo Diez & desUBIKado'
|
||||
description ='Financial news from Spain'
|
||||
title =u'Expansion'
|
||||
publisher =u'Unidad Editorial Internet, S.L.'
|
||||
category ='news, finances, Spain'
|
||||
oldest_article = 2
|
||||
simultaneous_downloads = 10
|
||||
cover_url =u'http://estaticos01.expansion.com/iconos/v2.x/v2.0/cabeceras/logo_expansion.png'
|
||||
timefmt ='[%A, %d %B, %Y]'
|
||||
encoding ='latin'
|
||||
max_articles_per_feed =100
|
||||
timefmt = '[%a, %d %b, %Y]'
|
||||
encoding ='iso-8859-15'
|
||||
language ='es'
|
||||
use_embedded_content = False
|
||||
remove_javascript = True
|
||||
no_stylesheets = True
|
||||
remove_empty_feeds = True
|
||||
|
||||
keep_only_tags =dict(name='div', attrs={'class':['noticia primer_elemento']})
|
||||
|
||||
remove_tags =[
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia']}),
|
||||
dict(name='div', attrs={'class':['compartir', 'metadata_desarrollo_noticia', 'relacionadas', 'mas_info','publicidad publicidad_textlink', 'ampliarfoto','tit_relacionadas','interact','paginacion estirar','sumario derecha']}),
|
||||
dict(name='ul', attrs={'class':['bolos_desarrollo_noticia','not_logged']}),
|
||||
dict(name='span', attrs={'class':['comentarios']}),
|
||||
dict(name='p', attrs={'class':['cintillo_comentarios', 'cintillo_comentarios formulario']}),
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado']})
|
||||
dict(name='div', attrs={'id':['comentarios_lectores_listado','comentar']})
|
||||
]
|
||||
feeds =[
|
||||
(u'Portada', u'http://estaticos.expansion.com/rss/portada.xml'),
|
||||
@ -38,42 +46,112 @@ class Publico(BasicNewsRecipe):
|
||||
(u'Euribor', u'http://estaticos.expansion.com/rss/mercadoseuribor.xml'),
|
||||
(u'Materias Primas', u'http://estaticos.expansion.com/rss/mercadosmateriasprimas.xml'),
|
||||
(u'Renta Fija', u'http://estaticos.expansion.com/rss/mercadosrentafija.xml'),
|
||||
|
||||
(u'Portada: Mi Dinero', u'http://estaticos.expansion.com/rss/midinero.xml'),
|
||||
(u'Hipotecas', u'http://estaticos.expansion.com/rss/midinerohipotecas.xml'),
|
||||
(u'Créditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Cr\xe9ditos', u'http://estaticos.expansion.com/rss/midinerocreditos.xml'),
|
||||
(u'Pensiones', u'http://estaticos.expansion.com/rss/midineropensiones.xml'),
|
||||
(u'Fondos de Inversión', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Fondos de Inversi\xf3n', u'http://estaticos.expansion.com/rss/midinerofondos.xml'),
|
||||
(u'Motor', u'http://estaticos.expansion.com/rss/midineromotor.xml'),
|
||||
|
||||
(u'Portada: Empresas', u'http://estaticos.expansion.com/rss/empresas.xml'),
|
||||
(u'Banca', u'http://estaticos.expansion.com/rss/empresasbanca.xml'),
|
||||
(u'TMT', u'http://estaticos.expansion.com/rss/empresastmt.xml'),
|
||||
(u'Energía', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcción', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Energ\xeda', u'http://estaticos.expansion.com/rss/empresasenergia.xml'),
|
||||
(u'Inmobiliario y Construcci\xf3n', u'http://estaticos.expansion.com/rss/empresasinmobiliario.xml'),
|
||||
(u'Transporte y Turismo', u'http://estaticos.expansion.com/rss/empresastransporte.xml'),
|
||||
(u'Automoción e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribución', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' http://estaticos.expansion.com/rss/empresasdeporte.xml'),
|
||||
(u'Automoci\xf3n e Industria', u'http://estaticos.expansion.com/rss/empresasauto-industria.xml'),
|
||||
(u'Distribuci\xf3n', u'http://estaticos.expansion.com/rss/empresasdistribucion.xml'),
|
||||
(u'Deporte y Negocio', u' [url]http://estaticos.expansion.com/rss/empresasdeporte.xml[/url]'),
|
||||
(u'Mi Negocio', u'http://estaticos.expansion.com/rss/empresasminegocio.xml'),
|
||||
(u'Interiores', u'http://estaticos.expansion.com/rss/empresasinteriores.xml'),
|
||||
(u'Digitech', u'http://estaticos.expansion.com/rss/empresasdigitech.xml'),
|
||||
|
||||
(u'Portada: Economía y Política', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Política', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Econom\xeda y Pol\xedtica', u'http://estaticos.expansion.com/rss/economiapolitica.xml'),
|
||||
(u'Pol\xedtica', u'http://estaticos.expansion.com/rss/economia.xml'),
|
||||
(u'Portada: Sociedad', u'http://estaticos.expansion.com/rss/entorno.xml'),
|
||||
|
||||
(u'Portada: Opinión', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Portada: Opini\xf3n', u'http://estaticos.expansion.com/rss/opinion.xml'),
|
||||
(u'Llaves y editoriales', u'http://estaticos.expansion.com/rss/opinioneditorialyllaves.xml'),
|
||||
(u'Tribunas', u'http://estaticos.expansion.com/rss/opiniontribunas.xml'),
|
||||
|
||||
(u'Portada: Jurídico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Portada: Jur\xeddico', u'http://estaticos.expansion.com/rss/juridico.xml'),
|
||||
(u'Entrevistas', u'http://estaticos.expansion.com/rss/juridicoentrevistas.xml'),
|
||||
(u'Opinión', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Opini\xf3n', u'http://estaticos.expansion.com/rss/juridicoopinion.xml'),
|
||||
(u'Sentencias', u'http://estaticos.expansion.com/rss/juridicosentencias.xml'),
|
||||
|
||||
(u'Mujer', u'http://estaticos.expansion.com/rss/mujer-empresa.xml'),
|
||||
(u'Cataluña', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Función pública', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
(u'Catalu\xf1a', u'http://estaticos.expansion.com/rss/catalunya.xml'),
|
||||
(u'Funci\xf3n p\xfablica', u'http://estaticos.expansion.com/rss/funcion-publica.xml')
|
||||
]
|
||||
|
||||
# Obtener la imagen de portada

def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
# e.g. http://img5.kiosko.net/2011/11/14/es/expansion.750.jpg
cover='http://img5.kiosko.net/'+ year + '/' + month + '/' + day +'/es/expansion.750.jpg'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nPortada no disponible")
cover ='http://www.aproahp.org/enlaces/images/diario_expansion.gif'
return cover

# So that the ad does not pop up when fetching an article, and the live web
# page is always retrieved, send the variable "t" set to the current "linux"
# or "epoch" time, making the website believe the ad has just been viewed

def print_version(self, url):
st = time.time()
segundos = str(int(st))
parametros = '.html?t=' + segundos
return url.replace('.html', parametros)
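
# A minimal standalone sketch of the cache-busting trick above (the article
# URL is hypothetical; the exact epoch value does not matter, only that it is fresh):
import time

def add_cache_buster(url):
    # append ?t=<current epoch seconds> so the site thinks the ad was just shown
    return url.replace('.html', '.html?t=%d' % int(time.time()))

print add_cache_buster('http://www.expansion.com/2011/12/21/portada.html')
# -> http://www.expansion.com/2011/12/21/portada.html?t=1324512000 (for example)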

_processed_links = []

def get_article_url(self, article):

# Obtain the article's original URL from the "feedsportal" one

link = article.get('link', None)
if link is None:
return article
if link.split('/')[-1]=="story01.htm":
link=link.split('/')[-2]
a=['0B','0C','0D','0E','0F','0G','0N' ,'0L0S','0A']
b=['.' ,'/' ,'?' ,'-' ,'=' ,'&' ,'.com','www.','0']
for i in range(0,len(a)):
link=link.replace(a[i],b[i])
link="http://"+link

# Drop articles duplicated in other feeds

if not (link in self._processed_links):
self._processed_links.append(link)
else:
link = None

return link
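
# A standalone sketch of the feedsportal de-obfuscation above; the sample
# fragment is hypothetical, the token table mirrors the recipe's a/b lists:
def decode_feedsportal(fragment):
    # '0L0S' must be handled as a unit, and '0A' -> '0' is applied last
    for token, plain in [('0B', '.'), ('0C', '/'), ('0D', '?'), ('0E', '-'),
                         ('0F', '='), ('0G', '&'), ('0N', '.com'),
                         ('0L0S', 'www.'), ('0A', '0')]:
        fragment = fragment.replace(token, plain)
    return 'http://' + fragment

print decode_feedsportal('0L0Sexpansion0N0C20110C120C210Cportada0Bhtml')
# -> http://www.expansion.com/2011/12/21/portada.html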
# A little CSS to improve the presentation of the articles

extra_css = '''
.entradilla {font-family:Arial,Helvetica,sans-serif; font-weight:bold; font-style:italic; font-size:16px;}
.fecha_publicacion,.autor {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
'''

# To display the image of embedded videos

preprocess_regexps = [
(re.compile(r'var imagen', re.DOTALL|re.IGNORECASE), lambda match: '--></script><img src'),
(re.compile(r'.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
(re.compile(r'var id_reproductor', re.DOTALL|re.IGNORECASE), lambda match: '<script language="Javascript" type="text/javascript"><!--'),
]
18
recipes/fisco_oggi.recipe
Normal file
@ -0,0 +1,18 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
description = 'Website of the Italian Government Income Agency (about revenue, taxation, taxes) - v1.00 (17 December 2011)'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324112023(BasicNewsRecipe):
title = u'Fisco Oggi'
language = 'it'
__author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
remove_javascript = True
no_stylesheets = True

feeds = [(u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'), (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'), (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'), (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'), (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'), (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'), (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'), (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')]
@ -1,57 +1,68 @@
# -*- coding: utf-8 -*-
import re

from calibre.web.feeds.news import BasicNewsRecipe

class Focus_pl(BasicNewsRecipe):
title = u'Focus.pl'
oldest_article = 15
max_articles_per_feed = 100
__author__ = 'fenuks'
class FocusRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = u'intromatyk <intromatyk@gmail.com>'
language = 'pl'
description ='polish scientific monthly magazine'
version = 1

title = u'Focus'
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Newspaper'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
remove_tags_after=dict(name='div', attrs={'class':'clear'})
feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
(u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
(u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
(u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),

oldest_article = 7
max_articles_per_feed = 100000
recursions = 0

no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
# Seems to work best, but YMMV
simultaneous_downloads = 5

r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
keep_only_tags =[]
keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))

remove_tags =[]
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))

extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
h1{text-align: left;}
h2{font-size: medium; font-weight: bold;}
p.lead {font-weight: bold; text-align: left;}
.authordate {font-size: small; color: #696969;}
.fot{font-size: x-small; color: #666666;}
'''

feeds = [
('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
]

def skip_ad_pages(self, soup):
tag=soup.find(name='a')
if tag:
new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
return new_soup

def append_page(self, appendtag):
tag=appendtag.find(name='div', attrs={'class':'arrows'})
if tag:
nexturl='http://www.focus.pl/'+tag.a['href']
for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
while nexturl:
soup2=self.index_to_soup(nexturl)
nexturl=None
pagetext=soup2.find(name='div', attrs={'class':'txt'})
tag=pagetext.find(name='div', attrs={'class':'arrows'})
for r in tag.findAll(name='a'):
if u'Następne' in r.string:
nexturl='http://www.focus.pl/'+r['href']
for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if ('advertisement' in soup.find('title').string.lower()):
href = soup.find('a').get('href')
return self.index_to_soup(href, raw=True)
else:
return None

def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
self.cover_url='http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)

def preprocess_html(self, soup):
self.append_page(soup.body)
return soup
def print_version(self, url):
if url.count ('focus.pl.feedsportal.com'):
u = url.find('focus0Bpl')
u = 'http://www.focus.pl/' + url[u + 11:]
u = u.replace('0C', '/')
u = u.replace('A', '')
u = u.replace ('0E','-')
u = u.replace('/nc/1//story01.htm', '/do-druku/1')
else:
u = url.replace('/nc/1','/do-druku/1')
return u
@ -8,31 +8,35 @@ class FSP(BasicNewsRecipe):
__author__ = 'fluzao'
description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
INDEX = 'http://www1.folha.uol.com.br/fsp/indices/'

#found this to be the easiest place to find the index page (13-Nov-2011).
# searching for the "Indice Geral" link
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

language = 'pt'
no_stylesheets = True
max_articles_per_feed = 40
remove_javascript = True
needs_subscription = True
remove_tags_before = dict(name='b')

remove_tags_before = dict(name='p')
remove_tags = [dict(name='td', attrs={'align':'center'})]
remove_attributes = ['height','width']
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

# fixes the problem with the section names
section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio'}
'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}

# this solves the problem with truncated content in Kindle
conversion_options = {'linearize_tables' : True}

# this bit removes the footer where there are links for Proximo Texto, Texto Anterior,
# Indice e Comunicar Erros
preprocess_regexps = [(re.compile(r'<BR><BR>Texto Anterior:.*<!--/NOTICIA-->',
re.DOTALL|re.IGNORECASE), lambda match: r''),
(re.compile(r'<BR><BR>Próximo Texto:.*<!--/NOTICIA-->',
preprocess_regexps = [(re.compile(r'<!--/NOTICIA-->.*Comunicar Erros</a>',
re.DOTALL|re.IGNORECASE), lambda match: r'')]

def get_browser(self):
@ -49,7 +53,25 @@ class FSP(BasicNewsRecipe):

def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#Searching for the index page on the HOMEPAGE
hpsoup = self.index_to_soup(self.HOMEPAGE)
indexref = hpsoup.find('a', href=re.compile('^indices.*'))
self.log('--> tag containing the today s index: ', indexref)
INDEX = indexref['href']
INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX
self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
# ... and taking the opportunity to get the cover image link
coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
if coverurl:
self.log('--> tag containing the today s cover: ', coverurl)
coverurl = coverurl.replace('htm', 'jpg')
coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl
self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
self.cover_url = coverurl

#soup = self.index_to_soup(self.INDEX)
soup = self.index_to_soup(INDEX)

feeds = []
articles = []
section_title = "Preambulo"
@ -68,8 +90,12 @@ class FSP(BasicNewsRecipe):
self.log('--> new section title: ', section_title)
if strpost.startswith('<a href'):
url = post['href']
#this bit is kept if they ever go back to the old format (pre Nov-2011)
if url.startswith('/fsp'):
url = 'http://www1.folha.uol.com.br'+url
#
if url.startswith('http://www1.folha.uol.com.br/fsp'):
#url = 'http://www1.folha.uol.com.br'+url
title = self.tag_to_string(post)
self.log()
self.log('--> post: ', post)
@ -82,15 +108,11 @@ class FSP(BasicNewsRecipe):
# keeping the front page url
minha_capa = feeds[0][1][1]['url']

# removing the 'Preambulo' section
# removing the first section (now called 'top')
del feeds[0]

# creating the url for the cover image
coverurl = feeds[0][1][0]['url']
coverurl = coverurl.replace('/opiniao/fz', '/images/cp')
coverurl = coverurl.replace('01.htm', '.jpg')
self.cover_url = coverurl

# inserting the cover page as the first article (nicer for kindle users)
feeds.insert(0,(u'primeira p\xe1gina', [{'title':u'Primeira p\xe1gina' , 'url':minha_capa}]))
return feeds

50
recipes/formulaas.recipe
Normal file
@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2011, Silviu Cotoar\u0103'
'''
formula-as.ro
'''

from calibre.web.feeds.news import BasicNewsRecipe

class FormulaAS(BasicNewsRecipe):
title = u'Formula AS'
__author__ = u'Silviu Cotoar\u0103'
publisher = u'Formula AS'
description = u'Formula AS'
oldest_article = 5
language = 'ro'
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
category = 'Ziare,Romania'
encoding = 'utf-8'
cover_url = 'http://www.formula-as.ro/_client/img/header_logo.png'

conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}

keep_only_tags = [
dict(name='div', attrs={'class':'item padded'})
]

remove_tags = [
dict(name='ul', attrs={'class':'subtitle lower'})
]

remove_tags_after = [
dict(name='ul', attrs={'class':'subtitle lower'}),
dict(name='div', attrs={'class':'item-brief-options'})
]
feeds = [
(u'\u0218tiri', u'http://www.formula-as.ro/rss/articole.xml')
]

def preprocess_html(self, soup):
return self.adeify_images(soup)
@ -1,35 +1,61 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe(BasicNewsRecipe):
#!/usr/bin/env python

title = u'Frankfurter Rundschau'
__author__ = 'schuster'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'de'
remove_javascript = True
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823538/-/logo.png'
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Christian Schmitt'

'''
fr-online.de
'''

feeds = [(u'Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'),
(u'Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'),
(u'Meinungen', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'),
(u'Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'),
(u'Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'),
(u'Kultur', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'),
(u'Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'),
(u'Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'),
(u'Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml')
]
from calibre.web.feeds.recipes import BasicNewsRecipe

class FROnlineRecipe(BasicNewsRecipe):
title = 'Frankfurter Rundschau'
__author__ = 'maccs'
description = 'Nachrichten aus D und aller Welt'
encoding = 'utf-8'
masthead_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
publisher = 'Druck- und Verlagshaus Frankfurt am Main GmbH'
category = 'news, germany, world'
language = 'de'
publication_type = 'newspaper'
use_embedded_content = False
remove_javascript = True
no_stylesheets = True
oldest_article = 1 # Increase this number if you're interested in older articles
max_articles_per_feed = 50 # Seems a reasonable number to me
extra_css = '''
body { font-family: "arial", "verdana", "geneva", sans-serif; font-size: 12px; margin: 0px; background-color: #ffffff;}
.imgSubline{background-color: #f4f4f4; font-size: 0.8em;}
.p--heading-1 {font-weight: bold;}
.calibre_navbar {font-size: 0.8em; font-family: "arial", "verdana", "geneva", sans-serif;}
'''
keep_only_tags = [{'class':'ArticleHeadlineH1'}, {'class':'article_text'}]
cover_url = 'http://www.fr-online.de/image/view/-/1474018/data/823552/-/logo.png'
cover_margins = (100, 150, '#ffffff')

feeds = []
feeds.append(('Startseite', u'http://www.fr-online.de/home/-/1472778/1472778/-/view/asFeed/-/index.xml'))
feeds.append(('Politik', u'http://www.fr-online.de/politik/-/1472596/1472596/-/view/asFeed/-/index.xml'))
feeds.append(('Meinung', u'http://www.fr-online.de/politik/meinung/-/1472602/1472602/-/view/asFeed/-/index.xml'))
feeds.append(('Wirtschaft', u'http://www.fr-online.de/wirtschaft/-/1472780/1472780/-/view/asFeed/-/index.xml'))
feeds.append(('Sport', u'http://www.fr-online.de/sport/-/1472784/1472784/-/view/asFeed/-/index.xml'))
feeds.append(('Eintracht Frankfurt', u'http://www.fr-online.de/sport/eintracht-frankfurt/-/1473446/1473446/-/view/asFeed/-/index.xml'))
feeds.append(('Kultur und Medien', u'http://www.fr-online.de/kultur/-/1472786/1472786/-/view/asFeed/-/index.xml'))
feeds.append(('Panorama', u'http://www.fr-online.de/panorama/-/1472782/1472782/-/view/asFeed/-/index.xml'))
feeds.append(('Frankfurt', u'http://www.fr-online.de/frankfurt/-/1472798/1472798/-/view/asFeed/-/index.xml'))
feeds.append(('Rhein-Main', u'http://www.fr-online.de/rhein-main/-/1472796/1472796/-/view/asFeed/-/index.xml'))
feeds.append(('Hanau', u'http://www.fr-online.de/rhein-main/hanau/-/1472866/1472866/-/view/asFeed/-/index.xml'))
feeds.append(('Darmstadt', u'http://www.fr-online.de/rhein-main/darmstadt/-/1472858/1472858/-/view/asFeed/-/index.xml'))
feeds.append(('Wiesbaden', u'http://www.fr-online.de/rhein-main/wiesbaden/-/1472860/1472860/-/view/asFeed/-/index.xml'))
feeds.append(('Offenbach', u'http://www.fr-online.de/rhein-main/offenbach/-/1472856/1472856/-/view/asFeed/-/index.xml'))
feeds.append(('Bad Homburg', u'http://www.fr-online.de/rhein-main/bad-homburg/-/1472864/1472864/-/view/asFeed/-/index.xml'))
feeds.append(('Digital', u'http://www.fr-online.de/digital/-/1472406/1472406/-/view/asFeed/-/index.xml'))
feeds.append(('Wissenschaft', u'http://www.fr-online.de/wissenschaft/-/1472788/1472788/-/view/asFeed/-/index.xml'))

def print_version(self, url):
return url.replace('index.html', 'view/printVersion/-/index.html')

@ -18,7 +18,7 @@ class FrazPC(BasicNewsRecipe):
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True

cover_url='http://www.frazpc.pl/images/logo.png'
feeds = [
(u'Aktualno\u015bci', u'http://www.frazpc.pl/feed/aktualnosci'),
(u'Artyku\u0142y', u'http://www.frazpc.pl/feed/artykuly')
@ -33,6 +33,7 @@ class FrazPC(BasicNewsRecipe):
dict(name='div', attrs={'class':'comments_box'})
]

remove_tags_after=dict(name='div', attrs={'class':'content'})
preprocess_regexps = [(re.compile(r'\| <a href="#comments">Komentarze \([0-9]*\)</a>'), lambda match: '')]

remove_attributes = [ 'width', 'height' ]
35
recipes/gazeta_pl_szczecin.recipe
Normal file
@ -0,0 +1,35 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import re
import string
from calibre.web.feeds.news import BasicNewsRecipe

class GazetaPlSzczecin(BasicNewsRecipe):
title = u'Gazeta.pl Szczecin'
description = u'Wiadomości ze Szczecina na portalu Gazeta.pl.'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Agora S.A.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
remove_tags = [ { "name" : "a", "attrs" : { "href" : "http://szczecin.gazeta.pl/szczecin/www.gazeta.pl" }}]
cover_url = "http://bi.gazeta.pl/i/hp/hp2009/logo.gif"
feeds = [(u'Wszystkie', u'http://rss.feedsportal.com/c/32739/f/530434/index.rss')]

def get_article_url(self, article):
s = re.search("""/0L(szczecin.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_"}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s

def print_version(self, url):
s = re.search("""/(\d*),(\d*),(\d*),.*\.html""", url)
no1 = s.group(2)
no2 = s.group(3)
return """http://szczecin.gazeta.pl/szczecin/2029020,%s,%s.html""" % (no1, no2)
90
recipes/givemesomethingtoread.recipe
Normal file
@ -0,0 +1,90 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class GiveMeSomethingToRead(BasicNewsRecipe):
title = u'Give Me Something To Read'
description = 'Curation / aggregation of articles on diverse topics'
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://givemesomethingtoread.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('The Arts','arts',25),
('Science','science',30),
('Technology','technology',30),
('Politics','politics',20),
('Media','media',30),
('Crime','crime',15),
('Other articles','',10)
]

def parse_index(self):
self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category

tagurl = '' if tag=='' else '/tagged/'+tag
self.log('Reading category:', cat_name)

articles = []
pageno = 1

while len(articles) < max_articles and pageno < 100:

page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1

self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break

headers = soup.findAll('h2')
if len(headers) == 0:
break

for header in headers:
atag = header.find('a')
url = atag['href']
# skip promotionals and duplicates
if url.startswith('http://givemesomethingtoread') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(header)
self.log('\tFound article:', title)
#self.log('\t', url)
desc = header.parent.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = header.parent.previousSibling
# navigate up to find h3, which contains the date
while p:
if hasattr(p,'name') and p.name == 'h3':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds
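
# A minimal illustration of the previousSibling walk above that locates the
# date heading; the markup here is a hypothetical stand-in for the real page:
from calibre.ebooks.BeautifulSoup import BeautifulSoup

demo = BeautifulSoup('<h3>Dec 21, 2011</h3><div><h2><a href="#">A story</a></h2></div>')
p = demo.find('h2').parent.previousSibling
while p:
    if hasattr(p, 'name') and p.name == 'h3':
        print p.string  # -> Dec 21, 2011
        break
    p = p.previousSibling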
@ -1,4 +1,3 @@

from calibre.web.feeds.news import BasicNewsRecipe

class GlasgowHerald(BasicNewsRecipe):
@ -9,12 +8,16 @@ class GlasgowHerald(BasicNewsRecipe):
language = 'en_GB'

__author__ = 'Kovid Goyal'
use_embedded_content = False

keep_only_tags = [dict(attrs={'class':'article'})]
remove_tags = [
dict(id=['pic-nav']),
dict(attrs={'class':['comments-top']})
]
no_stylesheets = True
auto_cleanup = True

#keep_only_tags = [dict(attrs={'class':'article'})]
#remove_tags = [
#dict(id=['pic-nav']),
#dict(attrs={'class':['comments-top']})
#]

feeds = [
@ -26,4 +29,3 @@ class GlasgowHerald(BasicNewsRecipe):
u'http://www.heraldscotland.com/cmlink/1.768',),
(u'Columnists', u'http://www.heraldscotland.com/cmlink/1.658574')]

@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
{'class':['articleTools', 'pagination', 'Ads', 'topad',
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

#Use the mobile version rather than the web version
def print_version(self, url):
return url.rpartition('?')[0] + '?service=mobile'

@ -12,7 +12,6 @@ class GN(BasicNewsRecipe):
EDITION = 0

__author__ = 'Piotr Kontek'
title = u'Gość niedzielny'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
@ -20,6 +19,8 @@ class GN(BasicNewsRecipe):
remove_javascript = True
temp_files = []
simultaneous_downloads = 1
masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
title = u'Gość niedzielny'

articles_are_obfuscated = True

@ -64,7 +65,6 @@ class GN(BasicNewsRecipe):
if img != None:
a = img.parent
self.EDITION = a['href']
self.title = img['alt']
self.cover_url = 'http://www.gosc.pl' + img['src']
if not first:
break
96
recipes/grantland.recipe
Normal file
@ -0,0 +1,96 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class GrantLand(BasicNewsRecipe):
title = u"Grantland"
description = 'Writings on Sports & Pop Culture'
language = 'en'
__author__ = 'Barty'
max_articles_per_feed = 100
no_stylesheets = False
# auto_cleanup is too aggressive sometimes and we end up with blank articles
auto_cleanup = False
timefmt = ' [%a, %d %b %Y]'
oldest_article = 365

cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

INDEX = 'http://www.grantland.com'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, url suffix, max number of articles to load)
('Today in Grantland','',20),
('In Case You Missed It','incaseyoumissedit',35),
]

remove_tags = [
{'name':['head','style','script']},
{'id':['header']},
{'class':re.compile(r'\bside|\bad\b|floatright|tags')}
]
remove_tags_before = {'class':'wrapper'}
remove_tags_after = [{'id':'content'}]

preprocess_regexps = [
# <header> tags with an img inside are just blog banners, don't need them
# note: there are other useful <header> tags so we don't want to just strip all of them
(re.compile(r'<header class.+?<img .+?>.+?</header>', re.DOTALL|re.IGNORECASE),lambda m: ''),
# delete everything between the *last* <hr class="small" /> and </article>
(re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL|re.IGNORECASE),lambda m: '<hr class="small" /></article>'),
]
extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; }
img + cite { display:block; text-align:right}"""

def parse_index(self):
feeds = []
seen_urls = set([])

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category
self.log('Reading category:', cat_name)
articles = []

page = "%s/%s" % (self.INDEX, tag)
soup = self.index_to_soup(page)
headers = soup.findAll('h2' if tag=='' else 'h3')

for header in headers:
tag = header.find('a')
if tag is None or not hasattr(tag,'href'):
continue
url = tag['href']
if url.startswith('/'):
url = self.INDEX + url
if url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(tag)
if 'Podcast:' in title or 'In Case You Missed It' in title:
continue
desc = dt = ''
par = header.parent
#tag = par.find('cite')
#if tag is not None:
# desc = '['+self.tag_to_string(tag) + '] '
tag = par.find('div')
if tag is not None:
desc = desc + self.tag_to_string(tag)
tag = tag.find('time')
if tag is not None:
dt = self.tag_to_string( tag)

self.log('\tFound article:', title)
self.log('\t', url)
articles.append({'title':title,'url':url,'description':desc,'date':dt})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds

def print_version(self, url):
return url+'?view=print'
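
# A small demonstration of the "up to the *last* occurrence" trick in the
# preprocess_regexps above: the tempered dot (?:(?!X).)+ matches a run that
# contains no further X, so the match starts at the final <hr class="small">
# (the sample HTML is hypothetical):
import re

demo_html = '<hr class="small" />keep<hr class="small" />junk</article>'
pat = re.compile(r'<hr class="small"(?:(?!<hr class="small").)+</article>', re.DOTALL)
print pat.sub('<hr class="small" /></article>', demo_html)
# -> <hr class="small" />keep<hr class="small" /></article>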
43
recipes/gs24_pl.recipe
Normal file
@ -0,0 +1,43 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import re
import string
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1322322819(BasicNewsRecipe):
title = u'GS24.pl (Głos Szczeciński)'
description = u'Internetowy serwis Głosu Szczecińskiego'
__author__ = u'Michał Szkutnik'
__license__ = u'GPL v3'
language = 'pl'
publisher = 'Media Regionalne sp. z o.o.'
category = 'news, szczecin'
oldest_article = 2
max_articles_per_feed = 100
auto_cleanup = True
cover_url = "http://www.gs24.pl/images/top_logo.png"

feeds = [
# (u'Wszystko', u'http://www.gs24.pl/rss.xml'),
(u'Szczecin', u'http://www.gs24.pl/szczecin.xml'),
(u'Stargard', u'http://www.gs24.pl/stargard.xml'),
(u'Świnoujście', u'http://www.gs24.pl/swinoujscie.xml'),
(u'Goleniów', u'http://www.gs24.pl/goleniow.xml'),
(u'Gryfice', u'http://www.gs24.pl/gryfice.xml'),
(u'Kamień Pomorski', u'http://www.gs24.pl/kamienpomorski.xml'),
(u'Police', u'http://www.gs24.pl/police.xml'),
(u'Region', u'http://www.gs24.pl/region.xml'),
(u'Sport', u'http://www.gs24.pl/sport.xml'),
]

def get_article_url(self, article):
s = re.search("""/0L0S(gs24.*)/story01.htm""", article.link)
s = s.group(1)
replacements = { "0B" : ".", "0C" : "/", "0H" : ",", "0I" : "_", "0D" : "?", "0F" : "="}
for (a, b) in replacements.iteritems():
s = string.replace(s, a, b)
s = string.replace(s, "0A", "0")
return "http://"+s

def print_version(self, url):
return url + "&Template=printpicart"
@ -9,6 +9,7 @@ www.guardian.co.uk
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class Guardian(BasicNewsRecipe):

@ -16,9 +17,11 @@ class Guardian(BasicNewsRecipe):
if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver"
cover_pic = 'Observer digital edition'
masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
else:
base_url = "http://www.guardian.co.uk/theguardian"
cover_pic = 'Guardian digital edition'
masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'

__author__ = 'Seabound and Sujata Raman'
language = 'en_GB'
@ -26,6 +29,7 @@ class Guardian(BasicNewsRecipe):
oldest_article = 7
max_articles_per_feed = 100
remove_javascript = True
encoding = 'utf-8'

# List of section titles to ignore
# For example: ['Sport']
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
dict(name='ul', attrs={'class':["pagination"]}),
dict(name='ul', attrs={'id':["content-actions"]}),
# article history link
dict(name='a', attrs={'class':["rollover history-link"]}),
# "a version of this article ..." speil
dict(name='div' , attrs = { 'class' : ['section']}),
# "about this article" js dialog
dict(name='div', attrs={'class':["share-top",]}),
# author picture
dict(name='img', attrs={'class':["contributor-pic-small"]}),
# embedded videos/captions
dict(name='span',attrs={'class' : ['inline embed embed-media']}),
#dict(name='img'),
]
use_embedded_content = False
@ -65,8 +79,21 @@ class Guardian(BasicNewsRecipe):
url = None
return url

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

def preprocess_html(self, soup):

# multiple html sections in soup, useful stuff in the first
html = soup.find('html')
soup2 = BeautifulSoup()
soup2.insert(0,html)

soup = soup2

for item in soup.findAll(style=True):
del item['style']

@ -75,6 +102,17 @@ class Guardian(BasicNewsRecipe):
for tag in soup.findAll(name=['ul','li']):
tag.name = 'div'

# removes number next to rating stars
items_to_remove = []
rating_container = soup.find('div', attrs = {'class': ['rating-container']})
if rating_container:
for item in rating_container:
if isinstance(item, Tag) and str(item.name) == 'span':
items_to_remove.append(item)

for item in items_to_remove:
item.extract()

return soup

def find_sections(self):
@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
import re

class HackerNews(BasicNewsRecipe):
title = 'Hacker News'
__author__ = 'Tom Scholl'
class HNWithCommentsLink(BasicNewsRecipe):
title = 'HN With Comments Link'
__author__ = 'Tom Scholl & David Kerschner'
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator'
category = 'news, programming, it, technology'
@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
body = body + comments
return u'<html><title>' + title + u'</title><body>' + body + '</body></html>'

def parse_feeds(self):
a = super(HNWithCommentsLink, self).parse_feeds()
self.hn_articles = a[0].articles
return a

def get_obfuscated_article(self, url):
if url.startswith('http://news.ycombinator.com'):
content = self.get_hn_content(url)
@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
else:
content = self.get_readable_content(url)

article = 0
for a in self.hn_articles:
if a.url == url:
article = a

content = re.sub(r'</body>\s*</html>\s*$', '', content) + article.summary + '</body></html>'

self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(content)
self.temp_files[-1].close()
@ -1,11 +1,11 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re

class AdvancedUserRecipe(BasicNewsRecipe):

title = 'heise online'
title = 'Heise-online'
description = 'News vom Heise-Verlag'
__author__ = 'schuster'
masthead_url = 'http://www.heise.de/icons/ho/heise_online_logo.gif'
publisher = 'Heise Zeitschriften Verlag GmbH & Co. KG'
use_embedded_content = False
language = 'de'
oldest_article = 2
@ -14,11 +14,10 @@ class AdvancedUserRecipe(BasicNewsRecipe):
remove_empty_feeds = True
timeout = 5
no_stylesheets = True
encoding = 'utf-8'

remove_tags_after = dict(name ='p', attrs={'class':'editor'})
remove_tags = [{'class':'navi_top_container'},
remove_tags = [dict(id='navi_top_container'),
dict(id='navi_bottom'),
dict(id='mitte_rechts'),
dict(id='navigation'),
@ -29,27 +28,31 @@ class AdvancedUserRecipe(BasicNewsRecipe):
dict(id='seiten_navi'),
dict(id='adbottom'),
dict(id='sitemap'),
dict(name='a', href=re.compile(r'^/([a-zA-Z]+/)?')),
]
dict(name='div', attrs={'id':'sitemap'}),
dict(name='ul', attrs={'class':'erste_zeile'}),
dict(name='ul', attrs={'class':'zweite_zeile'}),
dict(name='div', attrs={'class':'navi_top_container'})]

feeds = [
('Newsticker', 'http://www.heise.de/newsticker/heise.rdf'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Technology Review', 'http://www.heise.de/tr/news-atom.xml'),
('mobil', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Security', 'http://www.heise.de/security/news/news-atom.xml'),
('Netze', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open Source', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Auto', 'http://www.heise.de/autos/rss/news.rdf'),
('Foto ', 'http://www.heise.de/foto/rss/news-atom.xml'),
('Autos', 'http://www.heise.de/autos/rss/news.rdf'),
('Mac&i', 'http://www.heise.de/mac-and-i/news.rdf'),
('Mobile ', 'http://www.heise.de/mobil/newsticker/heise-atom.xml'),
('Netz ', 'http://www.heise.de/netze/rss/netze-atom.xml'),
('Open ', 'http://www.heise.de/open/news/news-atom.xml'),
('Resale ', 'http://www.heise.de/resale/rss/resale.rdf'),
('Security ', 'http://www.heise.de/security/news/news-atom.xml'),
('C`t', 'http://www.heise.de/ct/rss/artikel-atom.xml'),
('iX', 'http://www.heise.de/ix/news/news.rdf'),
('Mach-flott', 'http://www.heise.de/mach-flott/rss/mach-flott-atom.xml'),
('Blog: Babel-Bulletin', 'http://www.heise.de/developer/rss/babel-bulletin/blog.rdf'),
('Blog: Der Dotnet-Doktor', 'http://www.heise.de/developer/rss/dotnet-doktor/blog.rdf'),
('Blog: Bernds Management-Welt', 'http://www.heise.de/developer/rss/bernds-management-welt/blog.rdf'),
('Blog: The World of IT', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')
]
('Blog: IT conversation', 'http://www.heise.de/developer/rss/world-of-it/blog.rdf'),
('Blog: Kais bewegtes Web', 'http://www.heise.de/developer/rss/kais-bewegtes-web/blog.rdf')]

def print_version(self, url):
return url + '?view=print'

@ -1,4 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
import urllib, re

class HindustanTimes(BasicNewsRecipe):
title = u'Hindustan Times'
@ -26,4 +27,24 @@ class HindustanTimes(BasicNewsRecipe):
'http://feeds.hindustantimes.com/HT-Homepage-LifestyleNews'),
]

def get_article_url(self, article):
'''
HT uses a variant of the feedsportal RSS ad display mechanism
'''
try:
s = article.summary
return urllib.unquote(
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
except:
pass
url = BasicNewsRecipe.get_article_url(self, article)
res = self.browser.open_novisit(url)
url = res.geturl().split('/')[-2]
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://', '0S':
'www.'}
for k, v in encoding.iteritems():
url = url.replace(k, v)
return url
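
# A standalone sketch of the first branch above: pull the real target out of
# the bookmark link embedded in the feed summary (the sample markup and the
# ads.example host are hypothetical):
import re, urllib

summary = ('<a href="http://ads.example/bookmark.cfm?view=1'
           '&link=http%3A%2F%2Fwww.hindustantimes.com%2Fstory.aspx">share</a>')
m = re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', summary)
if m:
    print urllib.unquote(m.group(1))
    # -> http://www.hindustantimes.com/story.aspx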
@ -4,56 +4,20 @@ __license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re

class Histmag(BasicNewsRecipe):

title = u'Histmag'
oldest_article = 7
max_articles_per_feed = 100
cover_url='http://histmag.org/grafika/loga/histmag-logo-2-300px.png'
__author__ = 'matek09'
description = u"Artykuly historyczne i publicystyczne"
encoding = 'utf-8'
#preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),(re.compile(r'<span>'), lambda match: '<br><br><span>')]
no_stylesheets = True
language = 'pl'
remove_javascript = True
#max_articles_per_feed = 1
remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'article'}))
remove_tags_after = dict(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
#keep_only_tags =[]
#keep_only_tags.append(dict(name = 'h2'))
#keep_only_tags.append(dict(name = 'p'))

remove_tags =[]
remove_tags.append(dict(name = 'p', attrs = {'class' : 'podpis'}))
remove_tags.append(dict(name = 'h2', attrs = {'class' : 'komentarze'}))
remove_tags.append(dict(name = 'img', attrs = {'src' : 'style/buttons/wesprzyjnas-1.jpg'}))

preprocess_regexps = [(re.compile(r'</span>'), lambda match: '</span><br><br>'),
(re.compile(r'<span>'), lambda match: '<br><br><span>')]
extra_css = '''
.left {font-size: x-small}
.right {font-size: x-small}
'''

def find_articles(self, soup):
articles = []
for div in soup.findAll('div', attrs={'class' : 'text'}):
articles.append({
'title' : self.tag_to_string(div.h3.a),
'url' : 'http://www.histmag.org/' + div.h3.a['href'],
'date' : self.tag_to_string(div.next('p')).split('|')[0],
'description' : self.tag_to_string(div.next('p', podpis=False)),
})
return articles

def parse_index(self):
soup = self.index_to_soup('http://histmag.org/?arc=4&dx=0')
feeds = []
feeds.append((u"Artykuly historyczne", self.find_articles(soup)))
soup = self.index_to_soup('http://histmag.org/?arc=5&dx=0')
feeds.append((u"Artykuly publicystyczne", self.find_articles(soup)))
soup = self.index_to_soup('http://histmag.org/?arc=1&dx=0')
feeds.append((u"Wydarzenia", self.find_articles(soup)))

return feeds

keep_only_tags=[dict(id='article')]
remove_tags=[dict(name = 'p', attrs = {'class' : 'article-tags'})]

feeds = [(u'Wszystkie', u'http://histmag.org/rss/wszystkie.xml'), (u'Wydarzenia', u'http://histmag.org/rss/wydarzenia.xml'), (u'Recenzje', u'http://histmag.org/rss/recenzje.xml'), (u'Artykuły historyczne', u'http://histmag.org/rss/historia.xml'), (u'Publicystyka', u'http://histmag.org/rss/publicystyka.xml')]
@ -8,6 +8,15 @@ class Historia_org_pl(BasicNewsRecipe):
category = 'history'
language = 'pl'
oldest_article = 8
remove_empty_feeds=True
max_articles_per_feed = 100

feeds = [(u'Artykuły', u'http://www.historia.org.pl/index.php?format=feed&type=rss')]
feeds = [(u'Wszystkie', u'http://www.historia.org.pl/index.php?format=feed&type=rss'),
(u'Wiadomości', u'http://www.historia.org.pl/index.php/wiadomosci.feed?type=rss'),
(u'Publikacje', u'http://www.historia.org.pl/index.php/publikacje.feed?type=rss'),
(u'Publicystyka', u'http://www.historia.org.pl/index.php/publicystyka.feed?type=rss'),
(u'Recenzje', u'http://historia.org.pl/index.php/recenzje.feed?type=rss'),
(u'Kultura i sztuka', u'http://www.historia.org.pl/index.php/kultura-i-sztuka.feed?type=rss'),
(u'Rekonstrukcje', u'http://www.historia.org.pl/index.php/rekonstrukcje.feed?type=rss'),
(u'Projekty', u'http://www.historia.org.pl/index.php/projekty.feed?type=rss'),
(u'Konkursy', u'http://www.historia.org.pl/index.php/konkursy.feed?type=rss')]
@ -1,44 +1,58 @@
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
################################################################################
#Description: http://hvg.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.20. - V1.1
################################################################################

class HVG(BasicNewsRecipe):
title = 'HVG.HU'
__author__ = u'István Papp'
description = u'Friss hírek a HVG-től'
timefmt = ' [%Y. %b. %d., %a.]'
oldest_article = 4
from calibre.web.feeds.news import BasicNewsRecipe

class hvg(BasicNewsRecipe):
title = u'HVG'
__author__ = 'Bigpapa'
language = 'hu'

max_articles_per_feed = 100
oldest_article = 5 # How many days old the oldest fetched article may be
max_articles_per_feed = 5 # Maximum number of articles per feed stored in the generated e-book
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
publisher = 'HVG Online'
category = u'news, hírek, hvg'
extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
remove_tags_before = dict(id='pg-content')
remove_javascript = True
remove_empty_feeds = True
extra_css = ' h2 { font:bold 28px} '

feeds = [
(u'Itthon', u'http://hvg.hu/rss/itthon')
,(u'Világ', u'http://hvg.hu/rss/vilag')
,(u'Gazdaság', u'http://hvg.hu/rss/gazdasag')
,(u'IT | Tudomány', u'http://hvg.hu/rss/tudomany')
,(u'Panoráma', u'http://hvg.hu/rss/Panorama')
,(u'Karrier', u'http://hvg.hu/rss/karrier')
,(u'Gasztronómia', u'http://hvg.hu/rss/gasztronomia')
,(u'Helyi érték', u'http://hvg.hu/rss/helyiertek')
,(u'Kultúra', u'http://hvg.hu/rss/kultura')
,(u'Cégautó', u'http://hvg.hu/rss/cegauto')
,(u'Vállalkozó szellem', u'http://hvg.hu/rss/kkv')
,(u'Egészség', u'http://hvg.hu/rss/egeszseg')
,(u'Vélemény', u'http://hvg.hu/rss/velemeny')
,(u'Sport', u'http://hvg.hu/rss/sport')
remove_attributes = ['style','font', 'href']

keep_only_tags = [
dict(name='div', attrs={'id':['pg-content']})
]

def print_version(self, url):
return url.replace ('#rss', '/print')
remove_tags = [
dict(name='div', attrs={'class':['box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
dict(name='table', attrs={'class':['banner2', 'monocle']}),
dict(name='div', attrs={'id':['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
dict(name='div', attrs={'style':['float: right; margin-bottom: 5px;', 'display: none;']}),
dict(name='h3', attrs={'class':['hthree']}),
dict(name='ul', attrs={'class':['defaultul']}),
dict(name='form', attrs={'id':['commentForm']}),
dict(name='h6', attrs={'class':['hthree']}),
dict(name='h6', attrs={'class':['more2']}),
dict(name='img', attrs={'class':['framed']}),
dict(name='td', attrs={'class':['greyboxbody','embedvideobody','embedvideofooter','embedvideobottom']}),

]

feeds = [
# (u'\xd6sszes', 'http://hvg.hu/rss'),
(u'Itthon', 'http://hvg.hu/rss/itthon'),
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
(u'Karrier', 'http://hvg.hu/rss/karrier'),
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
(u'Sport', 'http://hvg.hu/rss/sport')
]
BIN
recipes/icons/b365realitatea.png
Normal file
BIN
recipes/icons/biolog_pl.png
Normal file
BIN
recipes/icons/blues.png
Normal file
BIN
recipes/icons/catavencii.png
Normal file
BIN
recipes/icons/computerworld_pl.png
Normal file
BIN
recipes/icons/descopera_org.png
Normal file
BIN
recipes/icons/dziennik_pl.png
Normal file
BIN
recipes/icons/formulaas.png
Normal file
BIN
recipes/icons/infra_pl.png
Normal file
BIN
recipes/icons/kosmonauta_pl.png
Normal file
BIN
recipes/icons/mlody_technik_pl.png
Normal file
BIN
recipes/icons/moneynews.png
Normal file
BIN
recipes/icons/skylife.png
Normal file
BIN
recipes/icons/zaman.png
Normal file
@ -1,8 +1,8 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Gabriele Marini, based on Darko Miletic'
__author__ = 'Gambarini, based on Darko Miletic'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
description = 'Italian daily newspaper - 19-04-2010'
description = 'Italian daily newspaper - 09-11-2011'

'''
http://www.ilgiornale.it/
@ -11,7 +11,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

class IlGiornale(BasicNewsRecipe):
__author__ = 'Marini Gabriele'
__author__ = 'GAMBARINI'
description = 'Italian daily newspaper'

cover_url = 'http://www.ilgiornale.it/img_v1/logo.gif'
@ -23,9 +23,8 @@ class IlGiornale(BasicNewsRecipe):
timefmt = '[%a, %d %b, %Y]'

oldest_article = 7
max_articles_per_feed = 50
max_articles_per_feed = 100
use_embedded_content = False
recursion = 100

no_stylesheets = True
conversion_options = {'linearize_tables':True}
@ -38,11 +37,11 @@ class IlGiornale(BasicNewsRecipe):
def print_version(self, url):
raw = self.browser.open(url).read()
soup = BeautifulSoup(raw.decode('utf8', 'replace'))
all_print_tags = soup.find('div', {'style':'float:left; width:35%;'})
print_link = all_print_tags.contents[1]
if all_print_tags is None:
all_print_tags = soup.find('div', {'id':'print_article'})
print_link = all_print_tags.a
if print_link is None:
return url
return print_link['href']
return 'http://www.ilgiornale.it' + print_link['href']

feeds = [
@ -1,33 +1,60 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
www.independent.co.uk
|
||||
'''
|
||||
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
|
||||
|
||||
class TheIndependent(BasicNewsRecipe):
|
||||
title = 'The Independent'
|
||||
__author__ = 'Darko Miletic'
|
||||
description = 'Independent News - Breaking news, comment and features from The Independent newspaper'
|
||||
|
||||
class TheIndependentNew(BasicNewsRecipe):
|
||||
|
||||
# flag to enable/disable article graphics on business pages/some others
|
||||
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
|
||||
# -max dimensions can be altered using the .pictureContainer img selector in the css
|
||||
_FETCH_ARTICLE_GRAPHICS = True
|
||||
|
||||
#Flag to enable/disable image fetching (not business)
|
||||
_FETCH_IMAGES = True
|
||||
|
||||
|
||||
#used for converting rating to stars
|
||||
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
|
||||
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
|
||||
|
||||
|
||||
title = u'The Independent'
|
||||
__author__ = 'Will'
|
||||
description = 'The latest in UK News and World News from The \
|
||||
Independent. Wide range of international and local news, sports \
|
||||
news, commentary and opinion pieces.Independent News - Breaking news \
|
||||
that matters. Your daily comprehensive news source - The \
|
||||
Independent Newspaper'
|
||||
publisher = 'The Independent'
|
||||
category = 'news, politics, UK'
|
||||
oldest_article = 2
|
||||
max_articles_per_feed = 200
|
||||
category = 'news, UK'
|
||||
no_stylesheets = True
|
||||
encoding = 'cp1252'
|
||||
use_embedded_content = False
|
||||
language = 'en_GB'
|
||||
remove_empty_feeds = True
|
||||
language = 'en_GB'
|
||||
publication_type = 'newspaper'
|
||||
masthead_url = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'
|
||||
extra_css = """
|
||||
h1{font-family: Georgia,serif }
|
||||
body{font-family: Verdana,Arial,Helvetica,sans-serif}
|
||||
img{margin-bottom: 0.4em; display:block}
|
||||
.info,.caption,.credits{font-size: x-small}
|
||||
"""
|
||||
masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
|
||||
encoding = 'utf-8'
|
||||
remove_tags =[
|
||||
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
|
||||
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
|
||||
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
|
||||
dict(attrs={'style' : re.compile('.*')}),
|
||||
]
|
||||
|
||||
keep_only_tags =[dict(attrs={'id':'main'})]
|
||||
recursions = 0
|
||||
|
||||
# fixes non compliant html nesting and 'marks' article graphics links
|
||||
preprocess_regexps = [
|
||||
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
|
||||
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
|
||||
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
|
||||
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
|
||||
]
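
As a quick illustration of the first rule (the sample HTML is invented), the misnested storyTop span is rewritten into a div with the same class before the parser ever sees it:

    import re

    sample = '<span class="storyTop ">lead paragraph</span>'
    fixed = re.sub(r'<span class="storyTop ">(?P<nested>.*?)</span>',
                   lambda m: '<div class="storyTop">' + m.group('nested') + '</div>',
                   sample, flags=re.DOTALL)
    print fixed  # -> <div class="storyTop">lead paragraph</div>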


conversion_options = {
'comment' : description
@ -36,51 +63,451 @@ class TheIndependent(BasicNewsRecipe):
, 'language' : language
}

remove_tags =[
dict(name=['meta','link','object','embed','iframe','base','style'])
,dict(attrs={'class':['related-articles','share','googleCols','article-tools','paging','googleArt']})
,dict(attrs={'id':['newsVideoPlayer','yahoobook','google-intext']})
]
keep_only_tags =[dict(attrs={'id':'article'})]
remove_attributes=['lang','onclick','width','xmlns:fb']
extra_css = """
h1{font-family: Georgia,serif }
body{font-family: Verdana,Arial,Helvetica,sans-serif}
img{margin-bottom: 0.4em; display:block}
.starRating img {float: left}
.starRating {margin-top:0.4em; display: block}
.image {clear:left; font-size: x-small; color:#888888;}
.articleByTimeLocation {font-size: x-small; color:#888888;
margin-bottom:0.2em ; margin-top:0.2em ; display:block}
.subtitle {clear:left}
.column-1 h1 { color: #191919}
.column-1 h2 { color: #333333}
.column-1 h3 { color: #444444}
.column-1 p { color: #777777}
.column-1 p,a,h1,h2,h3 { margin: 0; }
.column-1 div{color:#888888; margin: 0;}
.articleContent {display: block; clear:left;}
.storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;}
"""

oldest_article = 1
max_articles_per_feed = 100

_processed_urls = []

feeds = [
(u'UK' , u'http://www.independent.co.uk/news/uk/rss' )
,(u'World' , u'http://www.independent.co.uk/news/world/rss' )
,(u'Business' , u'http://www.independent.co.uk/news/business/rss' )
,(u'People' , u'http://www.independent.co.uk/news/people/rss' )
,(u'Science' , u'http://www.independent.co.uk/news/science/rss' )
,(u'Media' , u'http://www.independent.co.uk/news/media/rss' )
,(u'Education' , u'http://www.independent.co.uk/news/education/rss' )
,(u'Leading Articles' , u'http://www.independent.co.uk/opinion/leading-articles/rss')
,(u'Commentators' , u'http://www.independent.co.uk/opinion/commentators/rss' )
,(u'Columnists' , u'http://www.independent.co.uk/opinion/columnists/rss' )
,(u'Letters' , u'http://www.independent.co.uk/opinion/letters/rss' )
,(u'Big Question' , u'http://www.independent.co.uk/extras/big-question/rss' )
,(u'Sport' , u'http://www.independent.co.uk/sport/rss' )
,(u'Life&Style' , u'http://www.independent.co.uk/life-style/rss' )
,(u'Arts&Entertainment' , u'http://www.independent.co.uk/arts-entertainment/rss' )
,(u'Travel' , u'http://www.independent.co.uk/travel/rss' )
,(u'Money' , u'http://www.independent.co.uk/money/rss' )
]

def get_article_url(self, article):
return article.get('guid', None)
url = super(self.__class__,self).get_article_url(article)

title = article.get('title', None)
if title and re.search("^Video:",title):
return None

#remove duplicates
if not (url in self._processed_urls):
self._processed_urls.append(url)
else:
url = None
return url
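
The duplicate filter above keeps _processed_urls as a list, so each membership test is a linear scan; a set gives the same behaviour in O(1). A minimal sketch (the class and method names here are my own, not part of the recipe):

    class UrlDeduper(object):
        def __init__(self):
            self._seen = set()

        def first_sighting(self, url):
            # return the url only the first time it is offered
            if url is None or url in self._seen:
                return None
            self._seen.add(url)
            return url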

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])

def preprocess_html(self, soup):
for item in soup.body.findAll(style=True):
del item['style']
for item in soup.body.findAll(['author','preform']):
item.name='span'
for item in soup.body.findAll('img'):
if not item.has_key('alt'):
item['alt'] = 'image'
for item in soup.body.findAll('div', attrs={'class':['clear-o','body','photoCaption']}):
item.name = 'p'
for item in soup.body.findAll('div'):
if not item.attrs and not item.contents:
item.extract()

#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None

items_to_extract = []
slideshow_elements = []

for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
pattern = re.compile('((articleContent)|(title))$')
if (pattern.search(item['class'])) is not None:
remove = False

# corrections
# story content always good
pattern = re.compile('storyContent')
if (pattern.search(item['class'])) is not None:
remove = False

#images
pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True

#social widgets always bad
pattern = re.compile('socialwidget')
if (pattern.search(item['class'])) is not None:
remove = True

if remove:
items_to_extract.append(item)

for item in items_to_extract:
item.extract()
soup2 = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
soup2.body.replaceWith(soup.body)
return soup2

items_to_extract = []

if self._FETCH_IMAGES:
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
img = item.findNext('img')

img['src'] = item['href']

#insert caption if available
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3')
text = NavigableString(img['title'])
tag.insert(0,text)

#picture before text
img.extract()
item.insert(0,img)
item.insert(1,tag)

# remove link
item.name = "div"
item["class"]='image'
del item["href"]


#remove empty subtitles
"""
currently the subtitle is located in first paragraph after
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
some point.
"""
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None:
subtitleText = subtitle.findNext('p')
if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract()
subtitle.extract()


#replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
if item is not None:
soup2 = self._insertRatingStars(soup,item)
if soup2 is not None:
soup = soup2


#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)

for item in items_to_extract:
item.extract()

items_to_extract = []


#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)


#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))

for item in items_to_insert:
item[0].replaceWith(item[1])

for item in items_to_extract:
item.extract()

return soup


def _get_article_graphic(self,old_item,url,soup):

items_to_insert = []

if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert

soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert


def _insertRatingStars(self,soup,item):
if item.contents is None or len(item.contents) < 1:
return
rating = item.contents[0]

try:
rating = float(item.contents[0])
except:
print 'Could not convert decimal rating to star: malformatted float.'
return
for i in range(1,6):
star = Tag(soup,'img')
if i <= rating:
star['src'] = self._STAR_URL
else:
star['src'] = self._NO_STAR_URL
star['alt'] = 'star number ' + str(i)
item.insert(i,star)
#item.contents[0] = NavigableString('(' + str(rating) + ')')
item.contents[0] = ''
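
The star substitution above inserts one <img> per point on a five-point scale; the same i <= rating comparison can be sanity-checked with plain characters (a toy stand-in for the two image URLs, not recipe code):

    def stars_for(rating, total=5):
        # one filled star per whole point at or below the rating
        return ''.join('*' if i <= rating else '.' for i in range(1, total + 1))

    print stars_for(3.5)  # -> '***..', i.e. three _STAR_URL images, two grey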

def postprocess_html(self,soup, first_fetch):
#find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3')
if caption is not None:
items_to_extract.append(caption)
items_to_extract.append(img)

for item in items_to_extract:
item.extract()
return soup

def _recurisvely_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
return linearised
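
The walk above appends each node and then descends into it, so every ancestor appears immediately before its children in the flat list. A toy analogue over nested lists (an invented example, not recipe code) shows the resulting order:

    def linearise(node, out=None):
        # append every child, then recurse into it, as the method above does
        out = out if out is not None else []
        if not isinstance(node, list):
            return out
        for child in node:
            out.append(child)
            linearise(child, out)
        return out

    print linearise(['a', ['b', 'c']])  # -> ['a', ['b', 'c'], 'b', 'c']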


def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]


def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None


def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False

def _remove_undesired_line_breaks_from_tag(self,parent,soup):

if parent is None:
return


tag_tree = self._recurisvely_linearise_tag_tree(parent)
items_to_remove = []


for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue

for item in items_to_remove:
tag_tree.remove(item)


spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)

if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False

next_tag = self._get_next_tag(i, tag_tree)

if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False

if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))


for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
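
In short: a <br /> that touches a block-level ("spaced") tag, or sits at either end of the tree, is extracted outright; one between inline runs is collapsed to a single space. A toy model over (name, text) tuples (my own simplification, not the recipe's BeautifulSoup code):

    def drop_redundant_breaks(tags, spaced=('p', 'h3', 'blockquote')):
        out = []
        for i, (name, text) in enumerate(tags):
            if name != 'br':
                out.append((name, text))
                continue
            prev_spaced = i > 0 and tags[i - 1][0] in spaced
            next_spaced = i < len(tags) - 1 and tags[i + 1][0] in spaced
            if prev_spaced or next_spaced or i == 0 or i == len(tags) - 1:
                continue                  # drop the break entirely
            out.append((None, ' '))       # otherwise keep it as a space
        return out

    sample = [('p', 'one'), ('br', ''), ('span', 'two'), ('br', ''), ('span', 'three')]
    # the first break touches a <p> and is dropped; the second becomes a space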

feeds = [
(u'News - UK',
u'http://www.independent.co.uk/news/uk/?service=rss'),
(u'News - World',
u'http://www.independent.co.uk/news/world/?service=rss'),
(u'News - Business',
u'http://www.independent.co.uk/news/business/?service=rss'),
(u'News - People',
u'http://www.independent.co.uk/news/people/?service=rss'),
(u'News - Science',
u'http://www.independent.co.uk/news/science/?service=rss'),
(u'News - Media',
u'http://www.independent.co.uk/news/media/?service=rss'),
(u'News - Education',
u'http://www.independent.co.uk/news/education/?service=rss'),
(u'News - Obituaries',
u'http://www.independent.co.uk/news/obituaries/?service=rss'),
(u'News - Corrections',
u'http://www.independent.co.uk/news/corrections/?service=rss'
),
(u'Opinion',
u'http://www.independent.co.uk/opinion/?service=rss'),
(u'Environment',
u'http://www.independent.co.uk/environment/?service=rss'),
(u'Sport - Athletics',
u'http://www.independent.co.uk/sport/general/athletics/?service=rss'
),
(u'Sport - Cricket',
u'http://www.independent.co.uk/sport/cricket/?service=rss'),
(u'Sport - Football',
u'http://www.independent.co.uk/sport/football/?service=rss'),
(u'Sport - Golf',
u'http://www.independent.co.uk/sport/golf/?service=rss'),
(u'Sport - Motor racing',
u'http://www.independent.co.uk/sport/motor-racing/?service=rss'
),
(u'Sport - Olympics',
u'http://www.independent.co.uk/sport/olympics/?service=rss'),
(u'Sport - Racing',
u'http://www.independent.co.uk/sport/racing/?service=rss'),
(u'Sport - Rugby League',
u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
(u'Sport - Rugby Union',
u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'
),
(u'Sport - Sailing',
u'http://www.independent.co.uk/sport/general/sailing/?service=rss'
),
(u'Sport - Tennis',
u'http://www.independent.co.uk/sport/tennis/?service=rss'),
(u'Sport - Others',
u'http://www.independent.co.uk/sport/general/others/?service=rss'
),
(u'Life & Style - Fashion',
u'http://www.independent.co.uk/life-style/fashion/?service=rss'
),
(u'Life & Style - Food & Drink',
u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'
),
(u'Life & Style - Health and Families',
u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'
),
(u'Life & Style - House & Home',
u'http://www.independent.co.uk/life-style/house-and-home/'),
(u'Life & Style - History',
u'http://www.independent.co.uk/life-style/history/?service=rss'
),
(u'Life & Style - Gadgets & Tech',
u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'
),
(u'Life & Style - Motoring',
u'http://www.independent.co.uk/life-style/motoring/?service=rss'
),
(u'Arts & Ents - Art',
u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'
),
(u'Arts & Ents - Architecture',
u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'
),
(u'Arts & Ents - Music',
u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'
),
(u'Arts & Ents - Classical',
u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'
),
(u'Arts & Ents - Films',
u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'
),
(u'Arts & Ents - TV',
u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'
),
(u'Arts & Ents - Theatre and Dance',
u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'
),
(u'Arts & Ents - Comedy',
u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'
),
(u'Arts & Ents - Books',
u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'
),
(u'Travel', u'http://www.independent.co.uk/travel/?service=rss'
),
(u'Money', u'http://www.independent.co.uk/money/?service=rss'),
(u'IndyBest',
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
]


17
recipes/infra_pl.recipe
Normal file
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class INFRA(BasicNewsRecipe):
title = u'INFRA'
oldest_article = 7
max_articles_per_feed = 100
__author__ = 'fenuks'
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
language = 'pl'
max_articles_per_feed = 100
no_stylesheets = True
remove_tags_before=dict(name='h2', attrs={'class':'contentheading'})
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
18
recipes/japan_news.recipe
Normal file
@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe

class NewsOnJapan(BasicNewsRecipe):
title = u'News On Japan'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
use_embedded_content = False

no_stylesheets = True
auto_cleanup = True


feeds = [
('News',
'http://newsonjapan.com/rss/top.xml'),
]
14
recipes/kosmonauta_pl.recipe
Normal file
@ -0,0 +1,14 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Kosmonauta(BasicNewsRecipe):
title = u'Kosmonauta.net'
__author__ = 'fenuks'
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy'
language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/index.php/feed/rss.html')]
@ -11,7 +11,7 @@ __description__ = 'Italian weekly magazine'
from calibre.web.feeds.news import BasicNewsRecipe

class Espresso(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, Gabriele Marini'
__author__ = 'Lorenzo Vigentini, Gabriele Marini, Krittika Goyal'
description = 'Italian weekly magazine'

cover_url = 'http://espresso.repubblica.it/images/logo_espresso.gif'
@ -26,10 +26,9 @@ class Espresso(BasicNewsRecipe):
oldest_article = 16
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10

remove_javascript = True
no_stylesheets = True
auto_cleanup = True


feeds = [
@ -42,36 +41,3 @@ class Espresso(BasicNewsRecipe):
(u'Chiesa: HomePage', u'http://data.kataweb.it/rss/chiesa/homepage/it'),
(u'Chiesa: Speciali e Focus', u'http://data.kataweb.it/rss/chiesa/speciali_e_focus/it')
]


def print_version(self,url):
print url[7:25]
if url[7:25] == 'temi.repubblica.it':
return url + '/?printpage=undefined'
elif url[7:25] == 'www.chiesa.espress':
return url
return url + '/&print=true'
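
The url[7:25] slice silently assumes an 'http://' prefix and a host of exactly the right length; an equivalent check on the parsed hostname (a sketch of the same logic, not the committed code) is more self-explanatory:

    from urlparse import urlparse  # Python 2, as used by these recipes

    def print_version(url):
        host = urlparse(url).netloc
        if host == 'temi.repubblica.it':
            return url + '/?printpage=undefined'
        if host.startswith('www.chiesa.espress'):
            return url
        return url + '/&print=true'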


keep_only_tags = [
dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
dict(name='div', attrs={'id':['content-second-right','content2']})
]

remove_tags = [
dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
dict(name='ul',attrs={'id':'user-utility'}),
dict(name=['script','noscript','iframe'])
]
# extra_css = '''
# h1 {font-family:Times New Roman,"Trebuchet MS",Arial,Helvetica,sans-serif; font-size:24px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:18px;}
# h2 {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:18px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:16px; }
# h3 {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
# h4 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:16px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
# h5 {color:#333333; font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
# .firma {color:#333333;font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif;font-size:12px; font-size-adjust:none; font-stretch:normal; font-style:italic; font-variant:normal; font-weight:bold; line-height:15px; text-decoration:none;}
# .testo {font-family:Times New Roman, "Trebuchet MS",Arial,Helvetica,sans-serif; font-size:10px;}
# '''


@ -1,13 +1,12 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic, Gabriele Marini'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version'
description = 'Italian daily newspaper - v1.01 (04, January 2010); 16.05.2010 new version; 17.10.2011 new version; 14.12.2011 new version'

'''
http://www.repubblica.it/
'''

import re
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

@ -33,12 +32,6 @@ class LaRepubblica(BasicNewsRecipe):

remove_attributes = ['width','height','lang','xmlns:og','xmlns:fb']

preprocess_regexps = [
(re.compile(r'.*?<head>', re.DOTALL|re.IGNORECASE), lambda match: '<head>'),
(re.compile(r'<head>.*?<title>', re.DOTALL|re.IGNORECASE), lambda match: '<head><title>'),
(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE), lambda match: '</title></head>')
]

def get_article_url(self, article):
link = BasicNewsRecipe.get_article_url(self, article)
if link and not '.repubblica.it/' in link:
@ -73,15 +66,15 @@ class LaRepubblica(BasicNewsRecipe):
remove_tags = [
dict(name=['object','link','meta','iframe','embed']),
dict(name='span',attrs={'class':'linkindice'}),
dict(name='div', attrs={'class':'bottom-mobile'}),
dict(name='div', attrs={'id':['rssdiv','blocco']}),
dict(name='div', attrs={'class':'utility'}),
dict(name='div', attrs={'class':['bottom-mobile','adv adv-middle-inline']}),
dict(name='div', attrs={'id':['rssdiv','blocco','fb-like-head']}),
dict(name='div', attrs={'class':['utility','fb-like-button','archive-button']}),
dict(name='div', attrs={'class':'generalbox'}),
dict(name='ul', attrs={'id':'hystory'})
]

feeds = [
(u'Rilievo', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Homepage', u'http://www.repubblica.it/rss/homepage/rss2.0.xml'),
(u'Cronaca', u'http://www.repubblica.it/rss/cronaca/rss2.0.xml'),
(u'Esteri', u'http://www.repubblica.it/rss/esteri/rss2.0.xml'),
(u'Economia', u'http://www.repubblica.it/rss/economia/rss2.0.xml'),
@ -110,3 +103,5 @@ class LaRepubblica(BasicNewsRecipe):
del item['style']
return soup

def preprocess_raw_html(self, raw, url):
return '<html><head>'+raw[raw.find('</head>'):]

94
recipes/letsgetcritical.recipe
Normal file
@ -0,0 +1,94 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe

class LetsGetCritical(BasicNewsRecipe):
title = u"Let's Get Critical"
description = 'Curation / aggregation of criticisms of the arts and culture '
language = 'en'
__author__ = 'barty on mobileread.com forum'
max_articles_per_feed = 100
no_stylesheets = False
timefmt = ' [%a, %d %b, %Y]'
oldest_article = 365
auto_cleanup = True
INDEX = 'http://www.letsgetcritical.org'
CATEGORIES = [
# comment out categories you don't want
# (user friendly name, system name, max number of articles to load)
('Architecture','architecture',30),
('Art','art',30),
('Books','books',30),
('Design','design',30),
('Digital','digital',30),
('Food','food',30),
('Movies','movies',30),
('Music','music',30),
('Television','television',30),
('Other articles','',10)
]

def parse_index(self):
self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
feeds = []
seen_urls = set([])
regex = re.compile( r'http://(www\.)?([^/:]+)', re.I)

for category in self.CATEGORIES:

(cat_name, tag, max_articles) = category

tagurl = '' if tag=='' else '/category/'+tag.lower()
self.log('Reading category:', cat_name)

articles = []
pageno = 1

while len(articles) < max_articles and pageno < 100:

page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
pageno += 1

self.log('\tReading page:', page)
try:
soup = self.index_to_soup(page)
except:
break

posts = soup.findAll('div',attrs={'class':'post_multi'})
if len(posts) == 0:
break

for post in posts:
dt = post.find('div',attrs={'class':'title'})
atag = dt.find('a')
url = atag['href']
# skip promotionals and duplicates
if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
continue
seen_urls.add(url)
title = self.tag_to_string(atag)
self.log('\tFound article:', title)
self.log('\t', url)
desc = post.find('blockquote')
desc = self.tag_to_string(desc) if desc else ''
m = regex.match( url)
if m:
desc = "[%s] %s" % (m.group(2), desc)
#self.log('\t', desc)
date = ''
p = post.previousSibling
# navigate up sibling to find date
while p:
if hasattr(p,'class') and p['class'] == 'singledate':
date = self.tag_to_string(p)
break
p = p.previousSibling
articles.append({'title':title,'url':url,'description':desc,'date':date})
if len(articles) >= max_articles:
break

if articles:
feeds.append((cat_name, articles))

return feeds
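
For reference, each entry appended above follows the dictionary shape calibre expects back from parse_index (the concrete values here are invented):

    article = {
        'title': 'Sample review',
        'url': 'http://example.com/sample-review',
        'description': '[example.com] one-line teaser from the blockquote',
        'date': 'January 1, 2012',
    }
    feeds_shape = [('Movies', [article])]  # list of (category, articles) pairs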

@ -1,95 +1,117 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
try:
from calibre_plugins.drMerry.debug import debuglogger as mlog
print 'drMerry debuglogger found, debug options can be used'
from calibre_plugins.drMerry.stats import statslogger as mstat
print 'drMerry stats tracker found, stats can be tracked'
mlog.setLoglevel(1) #-1 == no log; 0 for normal output
mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0)
KEEPSTATS = mstat.keepmystats()
SHOWDEBUG0 = mlog.showdebuglevel(0)
SHOWDEBUG1 = mlog.showdebuglevel(1)
SHOWDEBUG2 = mlog.showdebuglevel(2)
except:
#print 'drMerry debuglogger not found, skipping debug options'
SHOWDEBUG0 = False
SHOWDEBUG1 = False
SHOWDEBUG2 = False
KEEPSTATS = False

#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))

''' Version 1.2, updated cover image to match the changed website.
added info date on title
version 1.4 Updated tags, delay and added autoclean 22-09-2011
version 1.5 Changes due to changes in site
version 1.6 Added css, removed auto cleanup, added buitenland section, added use_embedded_content, added remove_attributes
Added som processing on pictures
Added some processing on pictures
Removed links in html
Removed extra white characters
changed handling of self closing span
Version 1.7 11-11-2011 Changed oldest_article back to 1.5
changed &egrave; into è
updated remove tags
removed keep_only tags
Version 1.8 26-11-2011
added remove tag: article-slideshow
'''

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 2
max_articles_per_feed = 100
oldest_article = 10
max_articles_per_feed = 15
__author__ = u'DrMerry'
description = u'Metro Nederland'
language = u'nl'
simultaneous_downloads = 5
#delay = 1
#auto_cleanup = True
#auto_cleanup_keep = '//div[@class="article-image-caption-2column"]/*|//div[@id="date"]/*|//div[@class="article-image-caption-3column"]/*'
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 2
center_navbar = True
timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
remove_tags_before = dict(name='div', attrs={'id':'date'})
remove_tags_after = dict(name='div', attrs={'class':'article-body'})
encoding = 'utf-8'
remove_attributes = ['style', 'font', 'width', 'height']
use_embedded_content = False
conversion_options = {
'authors' : 'Metro Nederland & calibre & DrMerry',
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
#date {clear: both;margin-left: 19px;font-size: 11px;font-weight: 300;color: #616262;height: 15px;}\
.article-box-fact.module-title {clear:both;border-top:1px solid black;border-bottom:4px solid black;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;line-height: 1.15;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{float: left;display: inline;width: 567px;margin-left: 19px;border-right: 1px solid #CACACA;padding-right: 9px;}\
div.column-1-2 {float: left;display: inline;width: 373px;padding-right: 7px;border-right: 1px solid #CACACA;}\
p.article-image-caption {font-size: 12px;font-weight: 300;line-height: 1.4;color: #616262;margin-top: 5px;} \
#date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
.article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
.article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
div.column-1-2 {display: inline;padding-right: 7px;}\
p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
img {border:0px;} .img-mask {position:absolute;top:0px;left:0px;}'
img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'

keep_only_tags = [dict(name='div', attrs={'class':[ 'article-image-caption-2column', 'article-image-caption-3column', 'article-body', 'article-box-fact']}),
dict(name='div', attrs={'id':['date']}),
dict(name='h1', attrs={'class':['title']}),
dict(name='h2', attrs={'class':['subtitle']})]

remove_tags = [dict(name='div', attrs={'class':[ 'metroCommentFormWrap',
'commentForm', 'metroCommentInnerWrap', 'article-slideshow-counter-container', 'article-slideshow-control', 'ad', 'header-links',
'art-rgt','pluck-app pluck-comm', 'share-and-byline', 'article-tools-below-title', 'col-179 ', 'related-links', 'clear padding-top-15', 'share-tools', 'article-page-auto-pushes', 'footer-edit']}),
dict(name='div', attrs={'id':['article-2', 'article-4', 'article-1', 'navigation', 'footer', 'header', 'comments', 'sidebar', 'share-and-byline']}),
dict(name='iframe')]

preprocess_regexps = [(re.compile(r'(<p>(&nbsp;|\s)*</p>|<a[^>]*>Tweet</a>|<a[^>]*>|</a>|<!--.*?-->)', re.DOTALL|re.IGNORECASE),lambda match: ''),
(re.compile(r'(&nbsp;|\s\s)+\s*', re.DOTALL|re.IGNORECASE),lambda match: ' '),
(re.compile(r'([\s>])([^\s>]+)(<span[^>]+) />', re.DOTALL|re.IGNORECASE),
lambda match: match.group(1) + match.group(3) + '>' + match.group(2) + '</span>'),
preprocess_regexps = [
(re.compile(r'<img[^>]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '<hr class="merryhr" />'),
(re.compile(r'(<img[^>]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
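
The first new rule swaps the site's 'top-line' spacer image for the green <hr class="merryhr"> styled in extra_css above. A one-line check (the sample markup is invented):

    import re

    sample = '<img src="http://www.metronieuws.nl/img/top-line.gif" />'
    print re.sub(r'<img[^>]+top-line[^>]+>', '<hr class="merryhr" />',
                 sample, flags=re.DOTALL | re.IGNORECASE)
    # -> <hr class="merryhr" />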

def preprocess_html(self, soup):
if SHOWDEBUG0 == True:
mlog.setdefaults()
mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
if KEEPSTATS == True:
mlog.addDebug('Stats will be calculated')
else:
mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
mlog.showDebug()
myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup)
return soup

def postprocess_html(self, soup, first):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
#width, height = img.size
#print '***img is: ', iurl, '\n****width is: ', width, 'height is: ', height
img.trim(0)
img.save(iurl)
'''
#width, height = img.size
#print '***TRIMMED img width is: ', width, 'height is: ', height
left=0
top=0
border_color='#ffffff'
width, height = img.size
#print '***retrieved img width is: ', width, 'height is: ', height
height_correction = 1.17
canvas = create_canvas(width, height*height_correction,border_color)
canvas.compose(img, left, top)
#img = canvas
canvas.save(iurl)
#width, height = canvas.size
#print '***NEW img width is: ', width, 'height is: ', height
'''
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
if SHOWDEBUG0 == True:
if KEEPSTATS == True:
statinfo = 'generated stats:'
statinfo += str(mstat.stats(mstat.statslist))
print statinfo
statinfo = 'generated stats (for removed tags):'
statinfo += str(mstat.stats(mstat.removedtagslist))
print statinfo
#show all Debug info we forgot to report
#Using print to be sure that this text will not be added at the end of the log.
print '\n!!!!!unreported messages:\n(should be empty)\n'
mlog.showDebug()
return soup

feeds = [
@ -105,6 +127,291 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
(u'Carri&egrave;re', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]

class MerryPreProcess():
def replacePictures(self, soup):
#to be implemented
return soup

def optimizePicture(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
if SHOWDEBUG0 == True:
mlog.addDebug('Images optimized')
mlog.showDebug()
return soup

class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
if SHOWDEBUG1 == True:
mlog.addDebug('tag extracted')
mlog.showDebug()
if KEEPSTATS == True:
try:
mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
except:
mstat.addstat(mstat.removedtagslist,'unknown')
except:
if SHOWDEBUG1 == True:
mlog.addDebug('tag extraction failed')
mlog.showDebug()
if KEEPSTATS == True:
mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup

class MerryReplace():
myKiller = MerryExtract()
def replaceATag(self, soup):
anchors = []
anchors = soup.findAll('a')
if anchors and not (anchors == None or anchors == []):
try:
for link in anchors:
# print str(link)
if link and not link == None:
# print ('type: %s'%(str(type(link))))
# print ('link: %s' % (link))
myParent = link.parent
# print str('parent: %s'%(myParent))
try:
myIndex = link.parent.index(link)
hasIndex = True
except:
myIndex = 0
hasIndex = False
# print str('index %s'%(myIndex))
if not link.string == None:
# print 'link=notnone'
if hasIndex == True:
myParent.insert(myIndex, link.string)
else:
myParent.append(link.string)
else:
# print 'link=none'
myParent.insert(myIndex, link.contents)
self.myKiller.safeRemovePart(link, False)
else:
notshown = 'tag received is empty' # print
except:
notshown = 'tag received is empty' # print
notshown
return soup

class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
myReplacer = MerryReplace()
myPrepare = MerryPreProcess()

def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
if SHOWDEBUG0 == True:
mlog.addDebug('End of Optimize Layout')
mlog.showDebug()
return soup

def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfacts'],[allfacts])
mlog.showDebug()
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
mlog.showDebug()
for part in allfactsparent:
if not part in allfacts:
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['FOUND A non-fact'],[part])
mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['New All Facts'],[allfacts])
mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
if SHOWDEBUG0 == True:
mlog.addTextAndTag(['curcontag'],[contenttag])
mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
if SHOWDEBUG0 == True:
if errorOccured == False:
mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
else:
mlog.addDebug('Could not find right parent tag. Error Occured')
mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
if SHOWDEBUG2 == True:
mlog.addTextAndTag(['added parent'],[soup.prettify()])
mlog.showDebug()
except:
errorOccured=True
mlog.addTrace()
else:
errorOccured=True
if SHOWDEBUG0 == True and errorOccured == True:
mlog.addTextAndTag(['no articlefacts'],[articlefacts])
mlog.showDebug()
return soup

def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
findsibsof = soup
firstpart = previous
if findsibsof and not findsibsof == None:
if soupIsArray == True:
for foundsib in findsibsof:
self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
else:
if firstpart == True and soupIsArray == False:
sibs = findsibsof.previousSiblingGenerator()
else:
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('Not any sib found')
return

def removeUnwantedTags(self,soup):
if SHOWDEBUG1 == True:
mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
mlog.showDebug()
self.removeTagsByName(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedParts(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.removeEmptyTags(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup

def removeUnwantedParts(self, soup):
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByID(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
if SHOWDEBUG1 == True:
mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup

def removeUnwantedTagsByStyle(self,soup):
self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
if SHOWDEBUG0 == True:
mlog.addDebug('end remove by style')
return soup

def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)

def removeUnwantedTagsByClass(self,soup):
if SHOWDEBUG0 == True:
mlog.addDebug('start remove by class')
self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
return soup

def removeUnwantedTagsByID(self,soup):
defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
for removeid in defaultids:
if SHOWDEBUG1 == True:
mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup

# def safeRemoveTag(self, subtree):
# return self.myKiller.safeRemovePart(subtree, True)


def removeTagsByName(self, soup):
self.myKiller.safeRemovePart(soup.script, True)
self.myKiller.safeRemovePart(soup.iframe, True)
self.myKiller.safeRemovePart(soup.style, True)
self.myKiller.safeRemovePart(soup.noscript, True)
return soup

def removeEmptyTags(self,soup,run=0):
if SHOWDEBUG0 == True:
mlog.addDebug('starting removeEmptyTags')
if SHOWDEBUG1 == True:
run += 1
mlog.addDebug(run)
if SHOWDEBUG2 == True:
mlog.addDebug(str(soup.prettify()))
mlog.showDebug()
emptymatches = re.compile('^(&nbsp;|\s|\n|\r|\t)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or emptymatches.match(tag.string.strip()) is not None) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
if SHOWDEBUG1 == True:
mlog.addDebug('tags found')
mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
else:
if SHOWDEBUG1 == True:
mlog.addDebug('no empty tags found')
mlog.showDebug()
if SHOWDEBUG0 == True:
if SHOWDEBUG2 == True:
mlog.addDebug('new soup:')
mlog.addDebug(str(soup.prettify()))
mlog.addDebug('RemoveEmptyTags Completed')
mlog.showDebug()
return soup
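
The recursion matters because stripping an empty tag can leave its parent empty in turn; a flat regex version of the same idea (an illustration only, not the recipe's BeautifulSoup logic) needs a second pass for exactly that reason:

    import re

    html = '<div><span></span></div>'
    empty = re.compile(r'<(\w+)(\s[^>]*)?>\s*</\1>')
    while empty.search(html):
        html = empty.sub('', html)  # pass 1 removes <span>, pass 2 the now-empty <div>
    print repr(html)  # -> ''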

def removeFirstAndLastPart(self,soup):
def findparenttag(lookuptag):
if lookuptag and not lookuptag == None:
return lookuptag.findParents()
findtag = soup.find(id="date")
self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
return soup

@ -5,8 +5,8 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
description = 'News as provided by The Metro - UK'

__author__ = 'Dave Asbury'
#last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'

no_stylesheets = True
oldest_article = 1
max_articles_per_feed = 20
@ -32,9 +32,11 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
dict(name='div', attrs={'class':'art-lft'}),
dict(name='p')
]
remove_tags = [dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r' ]}),
dict(attrs={'class':[ 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime']})
remove_tags = [
dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
]
feeds = [

@ -10,6 +10,10 @@ __MakePeriodical__ = True
__UseChineseTitle__ = False
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) It is to disable premium content (Default: False)
@ -24,6 +28,10 @@ __Date__ = ''

'''
Change Log:
2011/12/18: update the overridden create_odf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_odf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
@ -52,6 +60,7 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
@ -59,10 +68,14 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
@ -108,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -126,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -160,9 +179,9 @@ class MPRecipe(BasicNewsRecipe):
def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
|
||||
@ -185,6 +204,18 @@ class MPRecipe(BasicNewsRecipe):
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y-%m-%d")
|
||||
|
||||
def get_fetchyear(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[0:4]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%Y")
|
||||
|
||||
def get_fetchmonth(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[4:6]
|
||||
else:
|
||||
return self.get_dtlocal().strftime("%m")
|
||||
|
||||
def get_fetchday(self):
|
||||
if __Date__ <> '':
|
||||
return __Date__[6:8]
|
||||
@ -533,12 +564,22 @@ class MPRecipe(BasicNewsRecipe):
|
||||
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
if item.startswith(u'\u3010'):
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
@ -643,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use custom date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -710,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -728,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -751,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -787,3 +907,5 @@ class MPRecipe(BasicNewsRecipe):

with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


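[Editor's note, not part of the patch: the hunks above keep adjusting one mechanism. The recipe picks which day's edition to fetch by shifting UTC to the region's local time and then stepping back past the hour at which a complete edition is available (moved from 5.30am to 4.30am HKT for Hong Kong), with an optional __Date__ string overriding the result. A minimal standalone sketch of that arithmetic, assuming only the offsets shown in the hunks; the helper name and the rules dict are illustrative, not names from the recipes.]

import datetime

# Region rules taken from the hunks above: (tz offset in hours, cutoff in hours).
# Hong Kong is UTC+8 with a 4:30am cutoff; Vancouver is UTC-8 (PST) with 5:30am.
EDITION_RULES = {'Hong Kong': (8.0, 4.5), 'Vancouver': (-8.0, 5.5)}

def edition_date(region, override=''):
    # An explicit YYYYMMDD string (the recipes' __Date__) wins outright.
    if override != '':
        return override
    tz_hours, cutoff_hours = EDITION_RULES[region]
    dt_utc = datetime.datetime.utcnow()
    # Shift to local time, then subtract the cutoff: before the cutoff the
    # shifted datetime still falls on the previous day, so yesterday's
    # (complete) edition is the one fetched.
    dt_local = dt_utc + datetime.timedelta(tz_hours / 24) - datetime.timedelta(cutoff_hours / 24)
    return dt_local.strftime('%Y%m%d')

print(edition_date('Hong Kong'))              # today's or yesterday's date
print(edition_date('Vancouver', '20111201'))  # override wins: '20111201'
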
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Toronto'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you want hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''


'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''

def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url

def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
return dt_local

def get_fetchdate(self):
if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")

def get_fetchformatteddate(self):
if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")

def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")

def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")

def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")

def get_cover_url(self):
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))

# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))

# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
feeds.append((title, articles))


# special- columns
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
if col_articles:
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
elif __Region__ == 'Vancouver':
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(a)
url = a.get('href', False)
url = 'http://news.mingpao.com/' + dateStr + '/' +url
# replace the url with the print-friendly version
if __ParsePFF__ == True:
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
url = re.sub('%2F.*%2F', '/', url)
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
url = url.replace('%2Etxt', '_print.htm')
url = url.replace('%5F', '_')
else:
url = url.replace('.htm', '_print.htm')
if url not in included_urls and url.rfind('Redirect') == -1:
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
included_urls.append(url)
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):

# parse from life.mingpao.com
def parse_section2(self, url, keystr):
br = mechanize.Browser()
br.set_handle_redirect(False)
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
try:
br.open_novisit(url)
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
except:
print 'skipping a premium article'
current_articles.reverse()
return current_articles

# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles

@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles

# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split on newlines
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html

def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use custom date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


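[Editor's note, not part of the patch: both file diffs above add the same populate_article_metadata() override. Its summary logic has two behaviors: extract the first non-empty paragraph as the Kindle article-view summary, stripping the boilerplate lead-in 【明報專訊】 once, or display a character count rendered with fullwidth punctuation as （N字）. A condensed sketch of the two behaviors, assuming plain strings instead of BeautifulSoup tags; the helper name is illustrative, and the recipe actually chooses between the branches on its __IncludeSummary__ flag and whether a summary already exists, whereas here the count is simply a fallback when no paragraph has text.]

# -*- coding: utf-8 -*-

LEAD_IN = u'\u3010\u660e\u5831\u5c08\u8a0a\u3011'  # boilerplate prefix the recipe strips

def summarize(paragraphs):
    # First behavior: the first non-empty paragraph becomes the summary.
    for p in paragraphs:
        text = p.strip().replace(LEAD_IN, '', 1)
        if len(text) > 0:
            return text
    # Second behavior: report a character count, rendered as
    # u'\uff08' + count + u'\u5b57\uff09', i.e. fullwidth parentheses around N + '字'.
    counts = sum(len(p.strip()) for p in paragraphs)
    return u'\uff08' + str(counts) + u'\u5b57\uff09'

print(summarize([u'', LEAD_IN + u'\u65b0\u805e\u5167\u5bb9']))  # -> 新聞內容
print(summarize([u'', u'  ']))                                  # -> （0字）
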
@ -4,18 +4,41 @@ __copyright__ = '2010-2011, Eddie Lau'
# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support
# please replace the following "True" with "False".
# please replace the following "True" with "False". (Default: True)
__MakePeriodical__ = True
# Turn below to true if your device supports display of CJK titles
# Turn below to True if your device supports display of CJK titles (Default: False)
__UseChineseTitle__ = False
# Set it to False if you want to skip images
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# (HK only) Turn below to true if you wish to use life.mingpao.com as the main article source
# Set it to True if you want to include a summary in Kindle's article view (Default: False)
__IncludeSummary__ = False
# Set it to True if you want thumbnail images in Kindle's article view (Default: True)
__IncludeThumbnails__ = True
# (HK only) Turn below to True if you wish to use life.mingpao.com as the main article source (Default: True)
__UseLife__ = True
# (HK only) Set it to True to include premium content (Default: False)
__InclPremium__ = False
# (HK only) Turn below to True if you wish to parse articles in news.mingpao.com with their printer-friendly formats (Default: True)
__ParsePFF__ = True
# (HK only) Turn below to True if you want hi-res images (Default: False)
__HiResImg__ = False
# Override the date returned by the program if specifying a YYYYMMDD below
__Date__ = ''


'''
Change Log:
2011/12/18: update the overridden create_opf(.) routine with the one from Calibre version 0.8.31. Move __UseChineseTitle__ usage away
from create_opf(.). Optional support of text_summary and thumbnail images in Kindle's article view. Start new day
download of Hong Kong Mingpao at 4.30am. Set the actual publication date shown on kindle device.
2011/12/01: take care of situation that in txt source parsing, the article content does not start with special character u'\u3010'
2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt
2011/10/19: fix a bug in txt source parsing
2011/10/17: disable fetching of premium content, also improved txt source parsing
2011/10/04: option to get hi-res photos for the articles
2011/09/21: fetching "column" section is made optional.
2011/09/18: parse "column" section stuff from source text file directly.
2011/09/07: disable "column" section as it is no longer offered free.
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
@ -37,30 +60,38 @@ Change Log:
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.utils.date import now as nowf
import os, datetime, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
if __Region__ == 'Hong Kong':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u9999\u6e2f)'
else:
title = 'Ming Pao - Hong Kong'
description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
category = 'Chinese, News, Hong Kong'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} font>b {font-size:200%; font-weight:bold;} div[class=heading] {font-size:200%; font-weight:bold;} div[class=images] {font-size:50%;}'
masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
keep_only_tags = [dict(name='h1'),
dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
dict(name='font', attrs={'color':['AA0000']}), # for column articles title
dict(attrs={'class':['heading']}), # for heading from txt
dict(attrs={'id':['newscontent']}), # entertainment and column page content
dict(attrs={'id':['newscontent01','newscontent02']}),
dict(attrs={'class':['content']}), # for content from txt
dict(attrs={'class':['photo']}),
dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}), # content in printed version of life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}), # images for source from life.mingpao.com
dict(attrs={'class':['images']}) # for images from txt
]
if __KeepImages__:
remove_tags = [dict(name='style'),
@ -90,6 +121,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: "</b>")
]
elif __Region__ == 'Vancouver':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
else:
title = 'Ming Pao - Vancouver'
description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
category = 'Chinese, News, Vancouver'
@ -108,6 +142,9 @@ class MPRecipe(BasicNewsRecipe):
lambda match: ''),
]
elif __Region__ == 'Toronto':
if __UseChineseTitle__ == True:
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = 'Ming Pao - Toronto'
description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
category = 'Chinese, News, Toronto'
@ -139,49 +176,12 @@ class MPRecipe(BasicNewsRecipe):
conversion_options = {'linearize_tables':True}
timefmt = ''

def image_url_processor(cls, baseurl, url):
# trick: break the url at the first occurrence of a digit, add an additional
# '_' at the front
# not working, may need to move this to preprocess_html() method
# minIdx = 10000
# i0 = url.find('0')
# if i0 >= 0 and i0 < minIdx:
# minIdx = i0
# i1 = url.find('1')
# if i1 >= 0 and i1 < minIdx:
# minIdx = i1
# i2 = url.find('2')
# if i2 >= 0 and i2 < minIdx:
# minIdx = i2
# i3 = url.find('3')
# if i3 >= 0 and i0 < minIdx:
# minIdx = i3
# i4 = url.find('4')
# if i4 >= 0 and i4 < minIdx:
# minIdx = i4
# i5 = url.find('5')
# if i5 >= 0 and i5 < minIdx:
# minIdx = i5
# i6 = url.find('6')
# if i6 >= 0 and i6 < minIdx:
# minIdx = i6
# i7 = url.find('7')
# if i7 >= 0 and i7 < minIdx:
# minIdx = i7
# i8 = url.find('8')
# if i8 >= 0 and i8 < minIdx:
# minIdx = i8
# i9 = url.find('9')
# if i9 >= 0 and i9 < minIdx:
# minIdx = i9
return url

def get_dtlocal(self):
dt_utc = datetime.datetime.utcnow()
if __Region__ == 'Hong Kong':
# convert UTC to local hk time - at HKT 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
# convert UTC to local hk time - at HKT 4.30am, all news are available
dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(4.5/24)
# dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(4.5/24)
elif __Region__ == 'Vancouver':
# convert UTC to local Vancouver time - at PST time 5.30am, all news are available
dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
@ -193,12 +193,33 @@ class MPRecipe(BasicNewsRecipe):
return dt_local

def get_fetchdate(self):
if __Date__ <> '':
return __Date__
else:
return self.get_dtlocal().strftime("%Y%m%d")

def get_fetchformatteddate(self):
if __Date__ <> '':
return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8]
else:
return self.get_dtlocal().strftime("%Y-%m-%d")

def get_fetchyear(self):
if __Date__ <> '':
return __Date__[0:4]
else:
return self.get_dtlocal().strftime("%Y")

def get_fetchmonth(self):
if __Date__ <> '':
return __Date__[4:6]
else:
return self.get_dtlocal().strftime("%m")

def get_fetchday(self):
if __Date__ <> '':
return __Date__[6:8]
else:
return self.get_dtlocal().strftime("%d")

def get_cover_url(self):
@ -230,12 +251,23 @@ class MPRecipe(BasicNewsRecipe):
(u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
(u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
if __InclPremium__ == True:
articles = self.parse_section2_txt(url, keystr)
else:
articles = self.parse_section2(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
articles = self.parse_section(url)
@ -244,15 +276,16 @@ class MPRecipe(BasicNewsRecipe):
else:
for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
(u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
(u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
(u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm')]:
articles = self.parse_section(url)
if articles:
feeds.append((title, articles))

# special- editorial
ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
if ed_articles:
feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))
#ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
#if ed_articles:
# feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
(u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
@ -263,20 +296,39 @@ class MPRecipe(BasicNewsRecipe):

# special - finance
#fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
if fin_articles:
feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
#fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
#if fin_articles:
# feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
(u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
articles = self.parse_section(url)
for title, url, keystr in [(u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal')]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

#for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
# (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
# articles = self.parse_section(url)
# if articles:
# feeds.append((title, articles))

# special - entertainment
ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
if ent_articles:
feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))
#ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
#if ent_articles:
# feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

for title, url, keystr in [(u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

if __InclPremium__ == True:
# parse column section articles directly from .txt files
for title, url, keystr in [(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')
]:
articles = self.parse_section2_txt(url, keystr)
if articles:
feeds.append((title, articles))

for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
(u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
@ -284,11 +336,6 @@ class MPRecipe(BasicNewsRecipe):
if articles:
|
||||
feeds.append((title, articles))
|
||||
|
||||
|
||||
# special- columns
|
||||
col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
|
||||
if col_articles:
|
||||
feeds.append((u'\u5c08\u6b04 Columns', col_articles))
|
||||
elif __Region__ == 'Vancouver':
|
||||
for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
|
||||
(u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
|
||||
@ -332,6 +379,16 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(a)
|
||||
url = a.get('href', False)
|
||||
url = 'http://news.mingpao.com/' + dateStr + '/' +url
|
||||
# replace the url to the print-friendly version
|
||||
if __ParsePFF__ == True:
|
||||
if url.rfind('Redirect') <> -1 and __InclPremium__ == True:
|
||||
url = re.sub(dateStr + '.*' + dateStr, dateStr, url)
|
||||
url = re.sub('%2F.*%2F', '/', url)
|
||||
title = title.replace(u'\u6536\u8cbb\u5167\u5bb9', '')
|
||||
url = url.replace('%2Etxt', '_print.htm')
|
||||
url = url.replace('%5F', '_')
|
||||
else:
|
||||
url = url.replace('.htm', '_print.htm')
|
||||
if url not in included_urls and url.rfind('Redirect') == -1:
|
||||
current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
|
||||
included_urls.append(url)
|
||||
@ -340,6 +397,8 @@ class MPRecipe(BasicNewsRecipe):
|
||||
|
||||
# parse from life.mingpao.com
|
||||
def parse_section2(self, url, keystr):
|
||||
br = mechanize.Browser()
|
||||
br.set_handle_redirect(False)
|
||||
self.get_fetchdate()
|
||||
soup = self.index_to_soup(url)
|
||||
a = soup.findAll('a', href=True)
|
||||
@ -350,9 +409,31 @@ class MPRecipe(BasicNewsRecipe):
|
||||
title = self.tag_to_string(i)
|
||||
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
|
||||
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
|
||||
try:
|
||||
br.open_novisit(url)
|
||||
url = url.replace('dailynews3.cfm', 'dailynews3a.cfm') # use printed version of the article
|
||||
current_articles.append({'title': title, 'url': url, 'description': ''})
|
||||
included_urls.append(url)
|
||||
except:
|
||||
print 'skipping a premium article'
|
||||
current_articles.reverse()
|
||||
return current_articles
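
The premium filter in parse_section2 above hinges on one trick: with HTTP redirects disabled, opening a paywalled Ming Pao URL (which answers with a redirect) raises an exception, so only freely readable articles survive the try block. A minimal standalone sketch of that probe, assuming calibre's bundled mechanize (which provides open_novisit); the URL list is hypothetical:

import mechanize

def free_articles_only(urls):
    # With redirects disabled, the 3xx a premium page returns
    # becomes an exception, so only free URLs are kept.
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    free = []
    for url in urls:
        try:
            br.open_novisit(url)
            free.append(url)
        except Exception:
            print 'skipping a premium article'
    return free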

# parse from text file of life.mingpao.com
def parse_section2_txt(self, url, keystr):
self.get_fetchdate()
soup = self.index_to_soup(url)
a = soup.findAll('a', href=True)
a.reverse()
current_articles = []
included_urls = []
for i in a:
title = self.tag_to_string(i)
url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
url = url.replace('cfm/dailynews3.cfm?File=', 'ftp/Life3/') # use printed version of the article
current_articles.append({'title': title, 'url': url, 'description': ''})
included_urls.append(url)
current_articles.reverse()
return current_articles

@ -438,6 +519,162 @@ class MPRecipe(BasicNewsRecipe):
current_articles.reverse()
return current_articles

# preprocess those .txt and javascript based files
def preprocess_raw_html(self, raw_html, url):
new_html = raw_html
if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
new_raw_html = '<html><head><title>Untitled</title></head>'
new_raw_html = new_raw_html + '<body>'
for item in splitter.split(raw_html):
if item.startswith('var heading1 ='):
heading = item.replace('var heading1 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
new_raw_html = new_raw_html + '<div class="heading">' + heading
if item.startswith('var heading2 ='):
heading = item.replace('var heading2 = \'', '')
heading = heading.replace('\'', '')
heading = heading.replace(';', '')
if heading <> '':
new_raw_html = new_raw_html + '<br>' + heading + '</div>'
else:
new_raw_html = new_raw_html + '</div>'
if item.startswith('var content ='):
content = item.replace("var content = ", '')
content = content.replace('\'', '')
content = content.replace(';', '')
new_raw_html = new_raw_html + '<div class="content">' + content + '</div>'
if item.startswith('var photocontent ='):
photo = item.replace('var photocontent = \'', '')
photo = photo.replace('\'', '')
photo = photo.replace(';', '')
photo = photo.replace('<tr>', '')
photo = photo.replace('<td>', '')
photo = photo.replace('</tr>', '')
photo = photo.replace('</td>', '<br>')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '<div class="images">' + photo + '</div>'
new_html = new_raw_html + '</body></html>'
else:
# .txt based file
splitter = re.compile(r'\n') # split on line breaks
new_raw_html = '<html><head><title>Untitled</title></head><body><div class="images">'
next_is_img_txt = False
title_started = False
title_break_reached = False
met_article_start_char = False
for item in splitter.split(raw_html):
item = item.strip()
# if title already reached but break between title and content not yet found, record title_break_reached
if title_started == True and title_break_reached == False and item == '':
title_break_reached = True
# if title reached and title_break_reached and met_article_start_char == False and item is not empty
# start content
elif title_started == True and title_break_reached == True and met_article_start_char == False:
if item <> '':
met_article_start_char = True
new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
#if item.startswith(u'\u3010'):
# met_article_start_char = True
# new_raw_html = new_raw_html + '</div><div class="content"><p>' + item + '<p>\n'
else:
if next_is_img_txt == False:
if item.startswith("=@"):
print 'skip movie link'
elif item.startswith("=?"):
next_is_img_txt = True
new_raw_html += '<img src="' + str(item)[2:].strip() + '.gif" /><p>\n'
elif item.startswith('=='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[2:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[2:].strip() + '.jpg" /><p>\n'
elif item.startswith('='):
next_is_img_txt = True
if False:
# TODO: check existence of .gif first
newimg = '_' + item[1:].strip() + '.jpg'
new_raw_html += '<img src="' + newimg + '" /><p>\n'
else:
new_raw_html += '<img src="' + str(item)[1:].strip() + '.jpg" /><p>\n'
else:
if next_is_img_txt == False and met_article_start_char == False:
if item <> '':
if title_started == False:
#print 'Title started at ', item
new_raw_html = new_raw_html + '</div><div class="heading">' + item + '\n'
title_started = True
else:
new_raw_html = new_raw_html + item + '\n'
else:
new_raw_html = new_raw_html + item + '<p>\n'
else:
next_is_img_txt = False
new_raw_html = new_raw_html + item + '\n'
new_html = new_raw_html + '</div></body></html>'
#raw_html = raw_html.replace(u'<p>\u3010', u'\u3010')
if __HiResImg__ == True:
# TODO: add a _ in front of an image url
if url.rfind('news.mingpao.com') > -1:
imglist = re.findall('src="?.*?jpg"', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
for img in imglist:
gifimg = img.replace('jpg"', 'gif"')
try:
br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
# find the location of the first _
pos = img.find('_')
if pos > -1:
# if found, insert _ after the first _
newimg = img[0:pos] + '_' + img[pos:]
new_html = new_html.replace(img, newimg)
else:
# if not found, insert _ after "
new_html = new_html.replace(img[1:], '"_' + img[1:])
elif url.rfind('life.mingpao.com') > -1:
imglist = re.findall('src=\'?.*?jpg\'', new_html)
br = mechanize.Browser()
br.set_handle_redirect(False)
#print 'Img list: ', imglist, '\n'
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg\'', 'gif\'')
try:
gifurl = re.sub(r'dailynews.*txt', '', url)
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.rfind('/')
newimg = img[0:pos+1] + '_' + img[pos+1:]
new_html = new_html.replace(img, newimg)
# repeat with src quoted by double quotes, for text parsed from src txt
imglist = re.findall('src="?.*?jpg"', new_html)
for img in imglist:
#print 'Found img: ', img
gifimg = img.replace('jpg"', 'gif"')
try:
#print 'url', url
pos = url.rfind('/')
gifurl = url[:pos+1]
#print 'try it:', gifurl + gifimg[5:len(gifimg)-1]
br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
new_html = new_html.replace(img, gifimg)
except:
pos = img.find('"')
newimg = img[0:pos+1] + '_' + img[pos+1:]
#print 'Use hi-res img', newimg
new_html = new_html.replace(img, newimg)
return new_html
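
The hi-res image handling above applies the same fallback three times: for each <img src="...jpg"> found by regex, first try the .gif sibling on the server, and if that fails request the high-resolution .jpg, which on these servers is the same file name with an extra leading underscore. A condensed sketch of that fallback; fetch_ok is a hypothetical stand-in for the br.open_novisit probe used above:

import re

def use_hires_images(html, fetch_ok):
    for img in re.findall(r'src="?.*?jpg"', html):
        gifimg = img.replace('jpg"', 'gif"')
        if fetch_ok(gifimg[5:len(gifimg)-1]):  # strip src=" and trailing quote
            # a .gif sibling exists; prefer it
            html = html.replace(img, gifimg)
        else:
            # otherwise ask for the hi-res jpg: same name prefixed with _
            pos = img.find('"')
            html = html.replace(img, img[0:pos+1] + '_' + img[pos+1:])
    return html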

def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
@ -447,38 +684,112 @@ class MPRecipe(BasicNewsRecipe):
del item['absmiddle']
return soup

def populate_article_metadata(self, article, soup, first):
# thumbnails shouldn't be available if using hi-res images
if __IncludeThumbnails__ and __HiResImg__ == False and first and hasattr(self, 'add_toc_thumbnail'):
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

try:
if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
# look for content
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
textFound = False
for p in paras:
if not textFound:
summary_candidate = self.tag_to_string(p).strip()
summary_candidate = summary_candidate.replace(u'\u3010\u660e\u5831\u5c08\u8a0a\u3011', '', 1)
if len(summary_candidate) > 0:
article.summary = article.text_summary = summary_candidate
textFound = True
else:
# display a simple text
#article.summary = article.text_summary = u'\u66f4\u591a......'
# display word counts
counts = 0
articlebodies = soup.findAll('div',attrs={'id':'newscontent'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'id':'newscontent01'})
if not articlebodies:
articlebodies = soup.findAll('div',attrs={'class':'content'})
if not articlebodies:
articlebodies = soup.findAll('div', attrs={'id':'font'})
if articlebodies:
for articlebody in articlebodies:
# the text may or may not be enclosed in <p></p> tag
paras = articlebody.findAll('p')
if not paras:
paras = articlebody
for p in paras:
summary_candidate = self.tag_to_string(p).strip()
counts += len(summary_candidate)
article.summary = article.text_summary = u'\uff08' + str(counts) + u'\u5b57\uff09'
except:
self.log("Error creating article descriptions")
return
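
When no text summary is produced, the fallback branch above publishes a character count instead, rendered as a full-width Chinese parenthetical such as (123\u5b57). A trimmed sketch of just that counting step, under the assumption that soup is the parsed article page and tag_to_string is the usual BasicNewsRecipe helper passed in by the caller:

def char_count_summary(soup, tag_to_string):
    counts = 0
    # same candidate containers the recipe probes, shortened to two
    bodies = soup.findAll('div', attrs={'id': 'newscontent'}) or \
             soup.findAll('div', attrs={'class': 'content'})
    for body in bodies:
        # text may or may not be wrapped in <p> tags
        paras = body.findAll('p') or [body]
        for p in paras:
            counts += len(tag_to_string(p).strip())
    return u'\uff08' + str(counts) + u'\u5b57\uff09'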

# override from the one in version 0.8.31
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
if __UseChineseTitle__ == True:
if __Region__ == 'Hong Kong':
title = u'\u660e\u5831 (\u9999\u6e2f)'
elif __Region__ == 'Vancouver':
title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
elif __Region__ == 'Toronto':
title = u'\u660e\u5831 (\u591a\u502b\u591a)'
else:
title = self.short_title()
# if not generating a periodical, force date to apply in title
if __MakePeriodical__ == False:
# change 1: allow our own flag to tell if a periodical is to be generated
# also use customed date instead of current time
if __MakePeriodical__ == False or self.output_profile.periodical_date_in_title:
title = title + ' ' + self.get_fetchformatteddate()
if True:
mi = MetaInformation(title, [self.publisher])
mi.publisher = self.publisher
mi.author_sort = self.publisher
# end of change 1
# change 2: __appname__ replaced by newspaper publisher
__appname__ = self.publisher
mi = MetaInformation(title, [__appname__])
mi.publisher = __appname__
mi.author_sort = __appname__
# change 3: use __MakePeriodical__ flag to tell if a periodical should be generated
if __MakePeriodical__ == True:
mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
else:
mi.publication_type = self.publication_type+':'+self.short_title()
#mi.timestamp = nowf()
mi.timestamp = self.get_dtlocal()
mi.comments = self.description
if not isinstance(mi.comments, unicode):
mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
# change 4: in the following, all the nowf() are changed to adjusted time
# This one doesn't matter
mi.timestamp = nowf()
# change 5: skip listing the articles
#article_titles, aseen = [], set()
#for f in feeds:
# for a in f:
# if a.title and a.title not in aseen:
# aseen.add(a.title)
# article_titles.append(force_unicode(a.title, 'utf-8'))

#mi.comments = self.description
#if not isinstance(mi.comments, unicode):
# mi.comments = mi.comments.decode('utf-8', 'replace')
#mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
# '\n\n'.join(article_titles))

language = canonicalize_lang(self.language)
if language is not None:
mi.language = language
# This one affects the pub date shown in kindle title
#mi.pubdate = nowf()
mi.pubdate = self.get_dtlocal()
# now appears to need the time field to be > 12.00noon as well
mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')

opf = OPFCreator(dir, mi)
# Add mastheadImage entry to <guide> section
mp = getattr(self, 'masthead_path', None)
@ -514,11 +825,13 @@ class MPRecipe(BasicNewsRecipe):
mani.id = 'ncx'
if mani.path.endswith('mastheadImage.jpg'):
mani.id = 'masthead-image'

entries = ['index.html']
toc = TOC(base_path=dir)
self.play_order_counter = 0
self.play_order_map = {}


def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
@ -532,13 +845,16 @@ class MPRecipe(BasicNewsRecipe):
desc = None
else:
desc = self.description_limiter(desc)
tt = a.toc_thumbnail if a.toc_thumbnail else None
entries.append('%sindex.html'%adir)
po = self.play_order_map.get(entries[-1], None)
if po is None:
self.play_order_counter += 1
po = self.play_order_counter
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
play_order=po, author=auth, description=desc)
parent.add_item('%sindex.html'%adir, None,
a.title if a.title else _('Untitled Article'),
play_order=po, author=auth,
description=desc, toc_thumbnail=tt)
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
@ -555,7 +871,7 @@ class MPRecipe(BasicNewsRecipe):
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, self.publisher, prefix=prefix,
a.orig_url, __appname__, prefix=prefix,
center=self.center_navbar)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
@ -592,3 +908,4 @@ class MPRecipe(BasicNewsRecipe):
with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
opf.render(opf_file, ncx_file)


15
recipes/mlody_technik_pl.recipe
Normal file
@ -0,0 +1,15 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from calibre.web.feeds.news import BasicNewsRecipe
class Mlody_technik(BasicNewsRecipe):
title = u'Mlody technik'
__author__ = 'fenuks'
description = u'Młody technik'
category = 'science'
language = 'pl'
cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
#keep_only_tags=[dict(id='container')]
feeds = [(u'Artyku\u0142y', u'http://www.mt.com.pl/feed')]
@ -1,9 +1,7 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
moneynews.newsmax.com
www.moneynews.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
@ -12,40 +10,40 @@ class MoneyNews(BasicNewsRecipe):
title = 'Moneynews.com'
__author__ = 'Darko Miletic'
description = 'Financial news worldwide'
publisher = 'moneynews.com'
publisher = 'Newsmax.com'
language = 'en'

category = 'news, finances, USA, business'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
encoding = 'utf8'
extra_css = 'img{display: block} body{font-family: Arial, Helvetica, sans-serif}'

html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]

html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
, 'linearize_tables' : True
}

feeds = [
(u'Street Talk' , u'http://moneynews.newsmax.com/xml/streettalk.xml' )
,(u'Finance News' , u'http://moneynews.newsmax.com/xml/FinanceNews.xml' )
,(u'Economy' , u'http://moneynews.newsmax.com/xml/economy.xml' )
,(u'Companies' , u'http://moneynews.newsmax.com/xml/companies.xml' )
,(u'Markets' , u'http://moneynews.newsmax.com/xml/Markets.xml' )
,(u'Investing & Analysis' , u'http://moneynews.newsmax.com/xml/investing.xml' )
(u'Street Talk' , u'http://www.moneynews.com/rss/StreetTalk/8.xml' )
,(u'Finance News' , u'http://www.moneynews.com/rss/FinanceNews/4.xml' )
,(u'Economy' , u'http://www.moneynews.com/rss/Economy/2.xml' )
,(u'Companies' , u'http://www.moneynews.com/rss/Companies/6.xml' )
,(u'Markets' , u'http://www.moneynews.com/rss/Markets/7.xml' )
,(u'Investing & Analysis' , u'http://www.moneynews.com/rss/InvestingAnalysis/17.xml')
]


keep_only_tags = [dict(name='table', attrs={'class':'copy'})]
keep_only_tags = [dict(name='div', attrs={'class':'copy'})]

remove_tags = [
dict(name='td' , attrs={'id':'article_fontsize'})
,dict(name='table', attrs={'id':'toolbox' })
,dict(name='tr' , attrs={'id':'noprint3' })
dict(attrs={'class':['MsoNormal', 'MsoNoSpacing']}),
dict(name=['object','link','embed','form','meta'])
]

def print_version(self, url):
nodeid = url.rpartition('/')[2]
return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid
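
The new print_version above just splices the trailing node id of an article URL into Newsmax's print template. A quick check with a made-up URL (the id 421530 is hypothetical):

def print_version(url):
    nodeid = url.rpartition('/')[2]  # text after the last '/'
    return 'http://www.moneynews.com/PrintTemplate?nodeid=' + nodeid

# print_version('http://www.moneynews.com/StreetTalk/example-story/id/421530')
# -> 'http://www.moneynews.com/PrintTemplate?nodeid=421530'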

@ -7,6 +7,7 @@ class naczytniki(BasicNewsRecipe):
language = 'pl'
description ='everything about e-readers'
category='readers'
no_stylesheets=True
oldest_article = 7
max_articles_per_feed = 100
remove_tags_after= dict(name='div', attrs={'class':'sociable'})

@ -6,11 +6,7 @@ www.nin.co.rs
'''

import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from contextlib import closing
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import entity_to_unicode

class Nin(BasicNewsRecipe):
title = 'NIN online'
@ -80,59 +76,11 @@ class Nin(BasicNewsRecipe):
return self.PREFIX + item.img['src']
return cover_url

def parse_index(self):
articles = []
count = 0
soup = self.index_to_soup(self.INDEX)
for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
count = count +1
if self.test and count > 2:
return articles
section = self.tag_to_string(item)
feedlink = self.PREFIX + item['href']
feedpage = self.index_to_soup(feedlink)
self.report_progress(0, _('Fetching feed')+' %s...'%(section))
inarts = []
for art in feedpage.findAll('span',attrs={'class':'artTitle'}):
alink = art.parent
url = self.PREFIX + alink['href']
title = self.tag_to_string(art)
sparent = alink.parent
alink.extract()
description = self.tag_to_string(sparent)
date = strftime(self.timefmt)
inarts.append({
'title' :title
,'date' :date
,'url' :url
,'description':description
})
articles.append((section,inarts))
return articles
feeds = [(u'NIN Online', u'http://www.nin.co.rs/misc/rss.php?feed=RSS2.0')]

def index_to_soup(self, url_or_raw, raw=False):
if re.match(r'\w+://', url_or_raw):
open_func = getattr(self.browser, 'open_novisit', self.browser.open)
with closing(open_func(url_or_raw)) as f:
_raw = f.read()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
if callable(self.encoding):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
entity_to_unicode(match, encoding=enc)))
massage.append((re.compile(r'[\x00-\x08]+'), lambda match:
''))
return BeautifulSoup(_raw, markupMassage=massage)
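
Context for the override above: BeautifulSoup 3 accepts markupMassage, a list of (regex, substitution) pairs applied to the raw markup before parsing. The override extends calibre's default massage so stray entities are resolved through entity_to_unicode and control bytes are stripped. The same idea in isolation, using only the calls that appear in the code above:

import re
from calibre import entity_to_unicode
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def soup_with_entity_fixes(raw, enc='cp1252'):
    massage = list(BeautifulSoup.MARKUP_MASSAGE)
    # resolve &entity; sequences using the page encoding
    massage.append((re.compile(r'&(\S+?);'),
        lambda match: entity_to_unicode(match, encoding=enc)))
    # drop control bytes that would choke the parser
    massage.append((re.compile(r'[\x00-\x08]+'), lambda match: ''))
    return BeautifulSoup(raw, markupMassage=massage)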
def get_article_url(self, article):
url = BasicNewsRecipe.get_article_url(self, article)
return url.replace('.co.yu', '.co.rs')

def preprocess_html(self, soup):
for item in soup.findAll(style=True):

54
recipes/nol.recipe
Normal file
@ -0,0 +1,54 @@
################################################################################
#Description: http://nol.hu/ RSS channel
#Author: Bigpapa (bigpapabig@hotmail.com)
#Date: 2011.12.18. - V1.1
################################################################################

from calibre.web.feeds.recipes import BasicNewsRecipe

class NOL(BasicNewsRecipe):
title = u'NOL'
__author__ = 'Bigpapa'
oldest_article = 5
max_articles_per_feed = 5 # maximum number of articles per feed to keep in the generated e-book
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
language = 'hu'
publication_type = 'newsportal'

conversion_options ={
'linearize_tables' : True,
}

keep_only_tags = [
dict(name='table', attrs={'class':['article-box']})
]

remove_tags = [

dict(name='div', attrs={'class':['h','ad-container-outer','tags noborder','ad-container-inner','image-container-lead','tags','related-container']}),
dict(name='h4'),
dict(name='tfoot'),
dict(name='td', attrs={'class':['foot']}),
dict(name='span', attrs={'class':['image-container-caption']}),
]


feeds = [
# (u'V\xe1logat\xe1s', 'http://nol.hu/feed/valogatas.rss'),
(u'Belf\xf6ld', 'http://nol.hu/feed/belfold.rss'),
(u'K\xfclf\xf6ld', 'http://nol.hu/feed/kulfold.rss'),
(u'Gazdas\xe1g', 'http://nol.hu/feed/gazdasag.rss'),
(u'V\xe9lem\xe9ny', 'http://nol.hu/feed/velemeny.rss'),
(u'Kult\xfara', 'http://nol.hu/feed/kult.rss'),
(u'Tud/Tech', 'http://nol.hu/feed/tud-tech.rss'),
(u'Sport', 'http://nol.hu/feed/sport.rss'),
(u'Noller', 'http://nol.hu/feed/noller.rss'),
(u'Mozaik', 'http://nol.hu/feed/mozaik.rss'),
(u'Utaz\xe1s', 'http://nol.hu/feed/utazas.rss'),
(u'Aut\xf3', 'http://nol.hu/feed/auto.rss'),
(u'Voks', 'http://nol.hu/feed/voks.rss'),

]
@ -1,20 +1,21 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class Nowa_Fantastyka(BasicNewsRecipe):
title = u'Nowa Fantastyka'
oldest_article = 7
__author__ = 'fenuks'
language = 'pl'
encoding='latin2'
description ='site for fantasy readers'
category='fantasy'
max_articles_per_feed = 100
INDEX='http://www.fantastyka.pl/'
no_stylesheets=True
needs_subscription = 'optional'
remove_tags_before=dict(attrs={'class':'belka1-tlo-md'})
#remove_tags_after=dict(name='span', attrs={'class':'naglowek-oceny'})
remove_tags_after=dict(name='td', attrs={'class':'belka1-bot'})
remove_tags=[dict(attrs={'class':'avatar2'})]
feeds = []
remove_tags=[dict(attrs={'class':'avatar2'}), dict(name='span', attrs={'class':'alert-oceny'}), dict(name='img', attrs={'src':['obrazki/sledz1.png', 'obrazki/print.gif', 'obrazki/mlnf.gif']}), dict(name='b', text='Dodaj komentarz'),dict(name='a', attrs={'href':'http://www.fantastyka.pl/10,1727.html'})]

def find_articles(self, url):
articles = []
@ -45,3 +46,13 @@ class Nowa_Fantastyka(BasicNewsRecipe):
cover=soup.find(name='img', attrs={'class':'okladka'})
self.cover_url=self.INDEX+ cover['src']
return getattr(self, 'cover_url', self.cover_url)

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.fantastyka.pl/')
br.select_form(nr=0)
br['login'] = self.username
br['pass'] = self.password
br.submit()
return br
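
The optional-subscription login added above is the stock mechanize pattern: fetch the front page, select the first form, fill the credential fields and submit. Shown standalone (the 'login' and 'pass' field names are the site's own, taken from the code above; get_browser is called off the class exactly as the recipe does):

from calibre.web.feeds.news import BasicNewsRecipe

def fantastyka_browser(username, password):
    br = BasicNewsRecipe.get_browser()
    br.open('http://www.fantastyka.pl/')
    br.select_form(nr=0)   # the login form is the first on the page
    br['login'] = username
    br['pass'] = password
    br.submit()
    return br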

@ -1,5 +1,5 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
@ -707,6 +707,16 @@ class NYTimes(BasicNewsRecipe):
return soup

def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None:
if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src'])
else:
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:

@ -855,6 +855,16 @@ class NYTimes(BasicNewsRecipe):

return soup
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
idxdiv = soup.find('div',attrs={'class':'articleSpanImage'})
if idxdiv is not None:
if idxdiv.img:
self.add_toc_thumbnail(article, idxdiv.img['src'])
else:
img = soup.find('img')
if img is not None:
self.add_toc_thumbnail(article, img['src'])

shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:

@ -23,7 +23,7 @@ class OSNewsRecipe(BasicNewsRecipe):

oldest_article = 7
max_articles_per_feed = 100

cover_url='http://osnews.pl/wp-content/themes/osnews/img/logo.png'
extra_css = '''
.news-heading {font-size:150%}
.newsinformations li {display:inline;}
@ -44,7 +44,9 @@ class OSNewsRecipe(BasicNewsRecipe):
dict(name = 'div', attrs = {'class' : 'sociable'}),
dict(name = 'div', attrs = {'class' : 'post_prev'}),
dict(name = 'div', attrs = {'class' : 'post_next'}),
dict(name = 'div', attrs = {'class' : 'clr'})
dict(name = 'div', attrs = {'class' : 'clr'}),
dict(name = 'div', attrs = {'class' : 'tw_button'}),
dict(name = 'div', attrs = {'style' : 'width:56px;height:60px;float:left;margin-right:10px'})
]

preprocess_regexps = [(re.compile(u'</span>Komentarze: \(?[0-9]+\)? ?<span'), lambda match: '</span><span')]

79
recipes/prospectmaguk.recipe
Normal file
@ -0,0 +1,79 @@
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

__license__ = 'GPL v3'

'''
calibre recipe for prospectmagazine.co.uk (subscription)
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class ProspectMagUK(BasicNewsRecipe):
title = u'Prospect Magazine'
description = 'A general-interest publication offering analysis and commentary about politics, news and business.'
__author__ = 'barty, duluoz'
timefmt = ' [%d %B %Y]'
no_stylesheets = True
publication_type = 'magazine'
masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg'
category = 'news, UK'
language = 'en_GB'
max_articles_per_feed = 100
auto_cleanup = True
needs_subscription = True

auto_cleanup_keep = '//div[@class="lead_image"]'
remove_tags = [{'class':['shareinpost','postutils','postinfo']}]

INDEX = 'http://www.prospectmagazine.co.uk/current-issue'

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.prospectmagazine.co.uk/wp-login.php')
br.select_form(name='loginform')
br['log'] = self.username
br['pwd'] = self.password
br.submit()
return br

def parse_index(self):
soup = self.index_to_soup(self.INDEX)
#div = soup.find('h1',text=re.compile(r'Issue \d+'))
#fname = self.tag_to_string( div) if div is not None else 'Current Issue'
div = soup.find('div', id='cover_image')
if div is not None:
img = div.find('img', src=True)
if img is not None:
src = img['src']
if src.startswith('/'):
src = 'http://www.prospectmagazine.co.uk' + src
self.cover_url = src
feeds = []
# loop through sections
for sect in soup.findAll('div',attrs={'class':'sectionheading'}):
fname = self.tag_to_string( sect).replace('>','').strip()
self.log('Found section', fname)
articles = []

# note: can't just find siblings with class='post' because that will also
# grab all the articles belonging to the sections that follow.
for item in sect.findNextSiblings('div',attrs={'class':True}):
if not 'post' in item['class']: break
a = item.find('a', href=True)
if a is None: continue
url = a['href']
title = self.tag_to_string(a)
p = item.find('p')
desc = self.tag_to_string( p) if p is not None else ''
art = {'title':title, 'description':desc,'date':' ', 'url':url}
p = item.find(attrs={'class':re.compile('author')})
self.log('\tFound article:', title, '::', url)
if p is not None:
art['author'] = self.tag_to_string( p).strip()
articles.append(art)

feeds.append((fname, articles))
return feeds
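The sibling walk is the delicate part of the parse_index above: Prospect's article divs are not nested under their section heading, they follow it, so the loop must stop at the first sibling whose class is not 'post' or it would swallow the next section's articles too. Reduced to its core (helper name is illustrative, not from the recipe):

def posts_for_section(sect):
    posts = []
    for item in sect.findNextSiblings('div', attrs={'class': True}):
        if 'post' not in item['class']:
            break  # reached the next section heading
        posts.append(item)
    return posts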
@ -42,6 +42,9 @@ class Radikal_tr(BasicNewsRecipe):
,(u'Politika' , u'http://www.radikal.com.tr/d/rss/Rss_98.xml' )
,(u'Dis Haberler', u'http://www.radikal.com.tr/d/rss/Rss_100.xml' )
,(u'Ekonomi' , u'http://www.radikal.com.tr/d/rss/Rss_101.xml' )
,(u'Radikal Iki' , u'http://www.radikal.com.tr/d/rss/Rss_42.xml')
,(u'Radikal Hayat' , u'http://www.radikal.com.tr/d/rss/Rss_41.xml' )
,(u'Radikal Kitap' , u'http://www.radikal.com.tr/d/rss/Rss_40.xml' )
]

def print_version(self, url):

@ -29,22 +29,7 @@ class RollingStones(BasicNewsRecipe):
max_articles_per_feed = 25
use_embedded_content = False
no_stylesheets = True

remove_javascript = True
#####################################################################################
# cleanup section #
#####################################################################################
keep_only_tags = [
dict(name='div', attrs={'class':['c65l']}),
dict(name='div', attrs={'id':['col1']}),


]
remove_tags = [
dict(name='div', attrs={'class': ['storyActions upper','storyActions lowerArticleNav']}),
dict(name='div', attrs={'id': ['comments','related']}),
]

auto_cleanup = True

feeds = [
(u'News', u'http://www.rollingstone.com/siteServices/rss/allNews'),
@ -58,25 +43,7 @@ class RollingStones(BasicNewsRecipe):



def get_article_url(self, article):
return article.get('guid', None)


def append_page(self, soup, appendtag, position):
'''
Some of the articles are multipage so the below function
will get the articles that have <next>
'''
pager = soup.find('li',attrs={'class':'next'})
if pager:
nexturl = pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'storyTextContainer'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
def print_version(self, url):
return url +'?print=true'



21
recipes/rynek_zdrowia.recipe
Normal file
@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe

class rynekzdrowia(BasicNewsRecipe):
title = u'Rynek Zdrowia'
__author__ = u'spi630'
language = 'pl'
masthead_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
cover_url = 'http://k.rynekzdrowia.pl/images/headerLogo.png'
oldest_article = 3
max_articles_per_feed = 25
no_stylesheets = True
auto_cleanup = True
remove_empty_feeds=True

remove_tags_before = dict(name='h3')

feeds = [(u'Finanse i Zarz\u0105dzanie', u'http://www.rynekzdrowia.pl/Kanal/finanse.html'), (u'Inwestycje', u'http://www.rynekzdrowia.pl/Kanal/inwestycje.html'), (u'Aparatura i wyposa\u017cenie', u'http://www.rynekzdrowia.pl/Kanal/aparatura.html'), (u'Informatyka', u'http://www.rynekzdrowia.pl/Kanal/informatyka.html'), (u'Prawo', u'http://www.rynekzdrowia.pl/Kanal/prawo.html'), (u'Polityka zdrowotna', u'http://www.rynekzdrowia.pl/Kanal/polityka_zdrowotna.html'), (u'Ubezpieczenia Zdrowotne', u'http://www.rynekzdrowia.pl/Kanal/ubezpieczenia.html'), (u'Farmacja', u'http://www.rynekzdrowia.pl/Kanal/farmacja.html'), (u'Badania i rozw\xf3j', u'http://www.rynekzdrowia.pl/Kanal/badania.html'), (u'Nauka', u'http://www.rynekzdrowia.pl/Kanal/nauka.html'), (u'Po godzinach', u'http://www.rynekzdrowia.pl/Kanal/godziny.html'), (u'Us\u0142ugi medyczne', u'http://www.rynekzdrowia.pl/Kanal/uslugi.html')]

def print_version(self, url):
url = url.replace('.html', ',drukuj.html')
return url

@ -11,17 +11,16 @@ from calibre.web.feeds.news import BasicNewsRecipe

class Salon_com(BasicNewsRecipe):
title = 'Salon.com'
__author__ = 'cix3'
__author__ = 'Kovid Goyal'
description = 'Salon.com - Breaking news, opinion, politics, entertainment, sports and culture.'
timefmt = ' [%b %d, %Y]'
language = 'en'

oldest_article = 7
max_articles_per_feed = 100

remove_tags = [dict(name='div', attrs={'class':['ad_content', 'clearfix']}), dict(name='hr'), dict(name='img')]

remove_tags_before = dict(name='h2')
auto_cleanup = True
auto_cleanup_keep = '//div[@class="art"]'
remove_empty_feeds = True

feeds = [
('News & Politics', 'http://feeds.salon.com/salon/news'),
@ -40,5 +39,5 @@ class Salon_com(BasicNewsRecipe):
]

def print_version(self, url):
return url.replace('/index.html', '/print.html')
return url + '/print/'


17
recipes/salonica_press_news.recipe
Normal file
@ -0,0 +1,17 @@
from calibre.web.feeds.news import BasicNewsRecipe

class spn(BasicNewsRecipe):
title = u'Salonica Press News'
language = 'gr'
__author__ = "SteliosGero"
oldest_article = 3
max_articles_per_feed = 100
auto_cleanup = True
category = 'news, GR'
language = 'el'


feeds = [(u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae', u'http://www.spnews.gr/politiki?format=feed&type=rss'), (u'\u039f\u03b9\u03ba\u03bf\u03bd\u03bf\u03bc\u03af\u03b1', u'http://www.spnews.gr/oikonomia?format=feed&type=rss'), (u'\u0391\u03c5\u03c4\u03bf\u03b4\u03b9\u03bf\u03af\u03ba\u03b7\u03c3\u03b7', u'http://www.spnews.gr/aftodioikisi?format=feed&type=rss'), (u'\u039a\u03bf\u03b9\u03bd\u03c9\u03bd\u03af\u03b1', u'http://www.spnews.gr/koinonia?format=feed&type=rss'), (u'\u0391\u03b8\u03bb\u03b7\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/sports?format=feed&type=rss'), (u'\u0394\u03b9\u03b5\u03b8\u03bd\u03ae', u'http://www.spnews.gr/diethni?format=feed&type=rss'), (u'\u03a0\u03bf\u03bb\u03b9\u03c4\u03b9\u03c3\u03bc\u03cc\u03c2', u'http://www.spnews.gr/politismos?format=feed&type=rss'), (u'Media', u'http://www.spnews.gr/media-news?format=feed&type=rss'), (u'\u0396\u03c9\u03ae', u'http://www.spnews.gr/zoi?format=feed&type=rss'), (u'\u03a4\u03b5\u03c7\u03bd\u03bf\u03bb\u03bf\u03b3\u03af\u03b1', u'http://spnews.gr/texnologia?format=feed&type=rss'), (u'\u03a0\u03b5\u03c1\u03b9\u03b2\u03ac\u03bb\u03bb\u03bf\u03bd', u'http://spnews.gr/periballon?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parapolitika?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03b4\u03b7\u03bc\u03bf\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/paradimotika?format=feed&type=rss'), (u'\u03a0\u03b1\u03c1\u03b1\u03b1\u03b8\u03bb\u03b7\u03c4\u03b9\u03ba\u03ac', u'http://spnews.gr/parathlitika?format=feed&type=rss'), (u'\u0391\u03c0\u03cc\u03c8\u03b5\u03b9\u03c2', u'http://spnews.gr/apopseis?format=feed&type=rss'), (u'\u03a3\u03c5\u03bd\u03b5\u03cd\u03be\u03b5\u03b9\u03c2', u'http://spnews.gr/synenteykseis?format=feed&type=rss'), (u'Alert!', u'http://spnews.gr/alert?format=feed&type=rss')]

def print_version(self, url):
return url+'?tmpl=component&print=1&layout=default&page='