start experiment with chm

James Ralston 2010-01-31 12:05:56 -08:00
commit f348c6235e
32 changed files with 1837 additions and 116 deletions

Binary file not shown (new image, 569 B).

Binary file not shown (new image, 253 B).

View File

@@ -0,0 +1,86 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ADRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'nl'
country = 'NL'
version = 1
title = u'AD'
publisher = u'de Persgroep Publishing Nederland NV'
category = u'News, Sports, the Netherlands'
description = u'News and Sports from the Netherlands'
oldest_article = 1.2
max_articles_per_feed = 100
use_embedded_content = False
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))
remove_tags = []
remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))
remove_attributes = ['style']
# feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
feeds = []
feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
.gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
'''
conversion_options = {'comments': description, 'tags': category, 'language': language,
'publisher': publisher}
def print_version(self, url):
parts = url.split('/')
print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
+ parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
return print_url
def preprocess_html(self, soup):
for br in soup.findAll('br'):
prev = br.findPreviousSibling(True)
if hasattr(prev, 'name') and prev.name == 'br':
next = br.findNextSibling(True)
if hasattr(next, 'name') and next.name == 'br':
br.extract()
return soup
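A worked example of the print_version shuffle above. The method assumes an article path with at least 14 slash-separated segments; the URL below is hypothetical and only shows how the split('/') indices map around the inserted 'print' segment:

url = 'http://www.ad.nl/ad/nl/1401/home/article/1234/some-title/seg10/seg11/seg12/article1234.dhtml'
parts = url.split('/')   # parts[2] = 'www.ad.nl', parts[7] = 'article', parts[13] = 'article1234.dhtml'
print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
            + parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
# -> 'http://www.ad.nl/ad/nl/1401/seg10/article/print/1234/some-title/article1234.dhtml'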

View File

@@ -1,7 +1,5 @@
-#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
'''

@@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe):
    title = 'The American Spectator'
    __author__ = 'Darko Miletic'
+   language = 'en'
    description = 'News from USA'
+   category = 'news, politics, USA, world'
+   publisher = 'The American Spectator'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
-   language = 'en'
    INDEX = 'http://spectator.org'

-   html2lrf_options = [
-                        '--comment' , description
-                      , '--category' , 'news, politics, USA'
-                      , '--publisher' , title
-                      ]
+   conversion_options = {
+                          'comments' : description
+                         ,'tags' : category
+                         ,'language' : language
+                         ,'publisher' : publisher
+                         }

    keep_only_tags = [
                       dict(name='div', attrs={'class':'post inner'})

@@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe):
    remove_tags = [
                    dict(name='object')
-                  ,dict(name='div', attrs={'class':'col3' })
-                  ,dict(name='div', attrs={'class':'post-options' })
-                  ,dict(name='p' , attrs={'class':'letter-editor'})
-                  ,dict(name='div', attrs={'class':'social' })
+                  ,dict(name='div', attrs={'class':['col3','post-options','social']})
+                  ,dict(name='p' , attrs={'class':['letter-editor','meta']})
                   ]

-   feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
+   feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]

    def get_cover_url(self):
        cover_url = None

@@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe):
    def print_version(self, url):
        return url + '/print'
+
+   def get_article_url(self, article):
+       return article.get('guid', None)

View File

@@ -0,0 +1,60 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
news.bbc.co.uk
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBC(BasicNewsRecipe):
title = 'BBC News (fast)'
__author__ = 'Darko Miletic'
description = 'News from UK. A much faster version that does not download pictures'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'BBC'
category = 'news, UK, world'
language = 'en'
extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
remove_tags_before = dict(name='div',attrs={'class':'headline'})
remove_tags_after = dict(name='div', attrs={'class':'footer'})
remove_tags = [
dict(name=['object','link','script','iframe'])
,dict(name='div', attrs={'class':'footer'})
]
feeds = [
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
]
def print_version(self, url):
emp,sep,rstrip = url.partition('http://')
return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip
def get_article_url(self, article):
return article.get('guid', None)
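The print_version above is easier to read with a concrete input; the article URL below is hypothetical (the recipe names the pieces emp, sep, rstrip):

url = 'http://news.bbc.co.uk/2/hi/technology/1234567.stm'
head, sep, tail = url.partition('http://')   # ('', 'http://', 'news.bbc.co.uk/2/hi/technology/1234567.stm')
print_url = 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + tail
# -> 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/2/hi/technology/1234567.stm'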

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
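Every CanWest recipe in this commit reuses this parse_index verbatim. For reference, its return value has the shape calibre expects from parse_index: a list of (section, article-list) tuples. A sketch with illustrative values, not a real article:

ans = [
    ('News', [
        {'title': 'Sample headline',
         'url': 'http://www.calgaryherald.com/news/todays-paper/story.html',
         'date': '',
         'description': 'One-line teaser',
         'author': 'A. Reporter',
         'content': ''},
    ]),
]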

View File

@@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
digitaljournal.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DigitalJournal(BasicNewsRecipe):
title = 'Digital Journal'
__author__ = 'Darko Miletic'
description = 'A Global Citizen Journalism News Network'
category = 'news, politics, USA, world'
publisher = 'Digital Journal'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
language = 'en'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [dict(name='div', attrs={'class':['article','body']})]
remove_tags = [dict(name=['object','table'])]
feeds = [
(u'Latest News' , u'http://digitaljournal.com/rss/?feed=latest_news' )
,(u'Business' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Business' )
,(u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment')
,(u'Environment' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment' )
,(u'Food' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Food' )
,(u'Health' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Health' )
,(u'Internet' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet' )
,(u'Politics' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics' )
,(u'Religion' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion' )
,(u'Science' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Science' )
,(u'Sports' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports' )
,(u'Technology' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology' )
,(u'World' , u'http://digitaljournal.com/rss/?feed=top_news&depname=World' )
,(u'Arts' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts' )
]
def print_version(self, url):
return url.replace('digitaljournal.com/','digitaljournal.com/print/')
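print_version here is a plain substring replacement; with a hypothetical article URL:

url = 'http://digitaljournal.com/article/123456'
print_url = url.replace('digitaljournal.com/', 'digitaljournal.com/print/')
# -> 'http://digitaljournal.com/print/article/123456'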

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette
title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class PajamasMedia(BasicNewsRecipe):
title = u'Pajamas Media'
description = u'Provides exclusive news and opinion for forty countries.'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
recursions = 1
match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
#encoding = 'latin1'
remove_stylesheets = True
#remove_tags_before = dict(name='h1', attrs={'class':'heading'})
remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':['pages']}),
#dict(name='div', attrs={'id':['bookmark']}),
#dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
#dict(name='ul', attrs={'class':'articleTools'}),
]
feeds = [
('pajamas Media',
'http://feeds.feedburner.com/PajamasMedia'),
]
def preprocess_html(self, soup):
story = soup.find(name='div', attrs={'id':'innerpage-content'})
#td = heading.findParent(name='td')
#td.extract()
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body')
body.insert(0, story)
return soup
def postprocess_html(self, soup, first):
if not first:
h = soup.find(attrs={'class':'innerpage-header'})
if h: h.extract()
auth = soup.find(attrs={'class':'author'})
if auth: auth.extract()
return soup
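preprocess_html above uses an extract-and-rewrap pattern: lift the story div out of the downloaded page and graft it into an empty HTML shell, so everything else is dropped. A minimal sketch of the same idea on a toy document (the markup is hypothetical):

from calibre.ebooks.BeautifulSoup import BeautifulSoup
page = BeautifulSoup('<html><body><div id="innerpage-content">story</div><div id="sidebar">ads</div></body></html>')
story = page.find(name='div', attrs={'id': 'innerpage-content'})
shell = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
shell.find(name='body').insert(0, story)
# str(shell) now contains only the story div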

View File

@@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe):
    description = u'Physics Today magazine'
    publisher = 'American Institute of Physics'
    category = 'Physics'
    language = 'en'
    cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg')
    oldest_article = 30
    max_articles_per_feed = 100

@@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe):
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
-           br.open('http://www.physicstoday.org/pt/sso_login.jsp')
-           br.select_form(name='login')
+           br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f')
+           br.select_form(name='login_form')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')]

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
class ReadersDigest(BasicNewsRecipe):
title = 'Readers Digest'
__author__ = 'BrianG'
language = 'en'
description = 'Readers Digest Feeds'
no_stylesheets = True
use_embedded_content = False
oldest_article = 60
max_articles_per_feed = 200
remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
remove_tags = [
dict(name='h4', attrs={'class':'close'}),
dict(name='div', attrs={'class':'fromLine'}),
dict(name='img', attrs={'class':'colorTag'}),
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
dict(name='div', attrs={'class':'horizontalAd'}),
dict(name='div', attrs={'id':'imageCounterLeft'}),
dict(name='div', attrs={'id':'commentsPrint'})
]
feeds = [
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
]
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
#-------------------------------------------------------------------------------------------------
def print_version(self, url):
# Get the identity number of the current article and append it to the root print URL
if url.find('/article') > 0:
ident = url[url.find('/article')+8:url.find('.html?')-4]
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
elif url.find('/post') > 0:
# in this case, have to get the page itself to derive the Print page.
soup = self.index_to_soup(url)
newsoup = soup.find('ul',attrs={'class':'printBlock'})
url = 'http://www.rd.com' + newsoup('a')[0]['href']
url = url[0:url.find('&Keep')]
return url
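# Worked example of the slicing above (the URL shape is an assumption for
# illustration, not taken from rd.com): for a URL of the form
#   http://www.rd.com/advice/article<ID>xxxx.html?trkid=rss
# url.find('/article') + 8 points just past the literal '/article', and
# url.find('.html?') - 4 stops four characters short of '.html?', so ident is
# the text between those two points and the print page is fetched from
#   http://www.rd.com/content/printContent.do?contentId=<ident>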
#-------------------------------------------------------------------------------------------------
def parse_index(self):
pages = [
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
]
feeds = []
for page in pages:
section, url, divider, attrList = page
newArticles = self.page_parse(url, divider, attrList)
feeds.append((section,newArticles))
# after the pages of the site have been processed, parse several RSS feeds for additional sections
newfeeds = self.parse_rss()
# The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable
# for this module (parse_index).
for feed in newfeeds:
newArticles = []
for article in feed.articles:
newArt = {
'title' : article.title,
'url' : article.url,
'date' : article.date,
'description' : article.text_summary
}
newArticles.append(newArt)
# New and Blogs should be the first two feeds.
if feed.title == 'New in RD':
feeds.insert(0,(feed.title,newArticles))
elif feed.title == 'Blogs':
feeds.insert(1,(feed.title,newArticles))
else:
feeds.append((feed.title,newArticles))
return feeds
#-------------------------------------------------------------------------------------------------
def page_parse(self, mainurl, divider, attrList):
articles = []
mainsoup = self.index_to_soup(mainurl)
for item in mainsoup.findAll(attrs=attrList):
newArticle = {
'title' : item('img')[0]['alt'],
'url' : 'http://www.rd.com'+item('a')[0]['href'],
'date' : '',
'description' : ''
}
articles.append(newArticle)
return articles
#-------------------------------------------------------------------------------------------------
def parse_rss (self):
# Do the "official" parse_feeds first
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop thru the articles in all feeds to find articles with "recipe" in it
recipeArticles = []
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if curarticle.title.upper().find('RECIPE') >= 0:
recipeArticles.append(curarticle)
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
# If there are any recipes found, create a new Feed object and append.
if len(recipeArticles) > 0:
pfeed = Feed()
pfeed.title = 'Recipes'
pfeed.description = 'Recipe Feed (Virtual)'
pfeed.image_url = None
pfeed.oldest_article = 30
pfeed.id_counter = len(recipeArticles)
# Create a new Feed, add the recipe articles, and then append
# to "official" list of feeds
pfeed.articles = recipeArticles[:]
feeds.append(pfeed)
return feeds
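The slice assignment used in parse_rss above is an old in-place deletion idiom; a self-contained illustration:

items = ['a', 'b', 'c']
index = 1
items[index:index+1] = []   # same effect as: del items[index]
# items == ['a', 'c']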

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Regina Leader-Post
title = u'Regina Leader-Post'
url_prefix = 'http://www.leaderpost.com'
description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Saskatoon Star-Phoenix
title = u'Saskatoon Star-Phoenix'
url_prefix = 'http://www.thestarphoenix.com'
description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Province
title = u'Vancouver Province'
url_prefix = 'http://www.theprovince.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Sun
title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Victoria Times Colonist
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
# un-comment the following three lines for the Vancouver Province
#title = u'Vancouver Province'
#url_prefix = 'http://www.theprovince.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p');
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Windsor Star
title = u'Windsor Star'
url_prefix = 'http://www.windsorstar.com'
description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="section_title02" or class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
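As the header comments note, pointing this recipe at a different CanWest paper is just a matter of which block is un-commented; for example, for the Ottawa Citizen:

#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'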

View File

@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html
@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe):
def parse_index(self):
soup = self.wsj_get_index()
year = strftime('%Y')
for x in soup.findAll('td', attrs={'class':'b14'}):
txt = self.tag_to_string(x).strip()
if year in txt:
self.timefmt = ' [%s]'%txt
break
left_column = soup.find(
text=lambda t: 'begin ITP Left Column' in str(t))
@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe):
url = url.partition('#')[0]
desc = ''
d = x.findNextSibling(True)
if d is not None and d.get('class', None) == 'arialResize':
desc = self.tag_to_string(d)
desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title)

View File

@ -3,47 +3,122 @@
__license__ = 'GPL v3'
'''
online.wsj.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, datetime, date

class WSJ(BasicNewsRecipe):
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman
title = u'Wall Street Journal (free)'
__author__ = 'Nick Redding'
language = 'en'
description = ('All the free content from the Wall Street Journal (business, financial and political news)')
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.tagline { ont-size:xx-small;}
.dateStamp {font-family:Arial,Helvetica,sans-serif;}
h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;}
h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;}
.paperLocation{font-size:xx-small;}'''

# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article snippets
# set oldest_article to the maximum number of days back from today to include articles
sectionlist = [
['/home-page','Front Page'],
['/public/page/news-opinion-commentary.html','Commentary'],
['/public/page/news-global-world.html','World News'],
['/public/page/news-world-business.html','US News'],
['/public/page/news-business-us.html','Business'],
['/public/page/news-financial-markets-stock.html','Markets'],
['/public/page/news-tech-technology.html','Technology'],
['/public/page/news-personal-finance.html','Personal Finance'],
['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
['/public/page/news-real-estate-homes.html','Real Estate'],
['/public/page/news-career-jobs.html','Careers'],
['/public/page/news-small-business-marketing.html','Small Business']
]
oldest_article = 2
omit_paid_content = True
extra_css = '''h1{font-size:large; font-family:Times,serif;}
h2{font-family:Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Times,serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
.article{font-family:Times,serif; font-size:x-small;}
.tagline { font-size:xx-small;}
.dateStamp {font-family:Times,serif;}
h3{font-family:Times,serif; font-size:xx-small;}
.byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;}
h6{font-family:Times,serif; font-size:small; font-style:italic;}
.paperLocation{font-size:xx-small;}'''
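Per the customization notes above, trimming coverage means editing these class attributes; a hypothetical slimmed-down setup:

# keep two sections, look back a week, and include paid-content snippets
sectionlist = [
['/home-page','Front Page'],
['/public/page/news-tech-technology.html','Technology']
]
oldest_article = 7
omit_paid_content = False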
remove_tags_before = dict(name='h1')
remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
"articleTabs_tab_interactive","articleTabs_tab_video",
"articleTabs_tab_map","articleTabs_tab_slideshow"]),
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict(rel='shortcut icon'),
]

remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
#dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
# "articleTabs_tab_interactive","articleTabs_tab_video",
# "articleTabs_tab_map","articleTabs_tab_slideshow"]),
{'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
'insettip','insetClose','more_in', "insetContent",
# 'articleTools_bottom','articleTools_bottom mjArticleTools',
'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict({'class':re.compile('^articleTools_bottom')}),
dict(rel='shortcut icon')
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
return br
def preprocess_html(self,soup):
# check if article is too old
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
if datetag:
dateline_string = self.tag_to_string(datetag,False)
date_items = dateline_string.split(',')
datestring = date_items[0]+date_items[1]
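# e.g. an assumed dateline "JANUARY 31, 2010, 7:04 P.M. ET" yields
# datestring "JANUARY 31 2010", which .title() turns into "January 31 2010"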
article_date = datetime.strptime(datestring.title(),"%B %d %Y")
earliest_date = date.today() - timedelta(days=self.oldest_article)
if article_date.date() < earliest_date:
self.log("Skipping article dated %s" % datestring)
return None
datetag.parent.extract()
# place dateline in article heading
bylinetag = soup.find('h3','byline')
if bylinetag:
h3bylinetag = bylinetag
else:
bylinetag = soup.find('li','byline')
if bylinetag:
h3bylinetag = bylinetag.h3
if not h3bylinetag:
h3bylinetag = bylinetag
bylinetag = bylinetag.parent
if bylinetag:
if h3bylinetag.a:
bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
else:
bylinetext = self.tag_to_string(h3bylinetag,False)
h3byline = Tag(soup,'h3',[('class','byline')])
if bylinetext.isspace() or (bylinetext == ''):
h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
else:
h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
bylinetag.replaceWith(h3byline)
else:
headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
if headlinetag:
dateline = Tag(soup,'h3', [('class','byline')])
dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
headlinetag.insert(len(headlinetag),dateline)
else: # if no date tag, don't process this page--it's not a news item
return None
# This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
if ultag:
@ -58,7 +133,7 @@ class WSJ(BasicNewsRecipe):
key = None
ans = []
def parse_index_page(page_name,page_title):
def article_title(tag):
atag = tag.find('h2') # title is usually in an h2 tag
@ -119,7 +194,6 @@ class WSJ(BasicNewsRecipe):
soup = self.index_to_soup(pageurl)
# Find each instance of div with class including "headlineSummary"
for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
# divtag contains all article data as ul's and li's
# first, check if there is an h3 tag which provides a section name
stag = divtag.find('h3')
@ -162,7 +236,7 @@ class WSJ(BasicNewsRecipe):
# now skip paid subscriber articles if desired
subscriber_tag = litag.find(text="Subscriber Content")
if subscriber_tag:
if self.omit_paid_content:
continue
# delete the tip div so it doesn't get in the way
tiptag = litag.find("div", { "class" : "tipTargetBox" })
@ -185,7 +259,7 @@ class WSJ(BasicNewsRecipe):
continue
if url.startswith("/article"):
url = mainurl+url
if not url.startswith("http://online.wsj.com"):
continue
if not url.endswith(".html"):
continue
@ -214,48 +288,10 @@ class WSJ(BasicNewsRecipe):
articles[page_title] = []
articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article previews
sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
omit_paid_content = True
if 'Front Page' in sectionlist:
parse_index_page('/home-page','Front Page',omit_paid_content)
ans.append('Front Page')
if 'Commentary' in sectionlist:
parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
ans.append('Commentary')
if 'World News' in sectionlist:
parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
ans.append('World News')
if 'US News' in sectionlist:
parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
ans.append('US News')
if 'Business' in sectionlist:
parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
ans.append('Business')
if 'Markets' in sectionlist:
parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
ans.append('Markets')
if 'Technology' in sectionlist:
parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
ans.append('Technology')
if 'Personal Finance' in sectionlist:
parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
ans.append('Personal Finance')
if 'Life & Style' in sectionlist:
parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
ans.append('Life & Style')
if 'Real Estate' in sectionlist:
parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
ans.append('Real Estate')
if 'Careers' in sectionlist:
parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
ans.append('Careers')
if 'Small Business' in sectionlist:
parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
ans.append('Small Business')
for page_name,page_title in self.sectionlist:
parse_index_page(page_name,page_title)
ans.append(page_title)

ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@ -0,0 +1,23 @@
/*
* images management
* Copyright 2008 Kovid Goyal
* License: GNU GPL v3
*/
function scale_images() {
$("img:visible").each(function() {
var offset = $(this).offset();
//window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width'));
$(this).css("max-width", (window.innerWidth-offset.left-5)+"px");
$(this).css("max-height", (window.innerHeight-5)+"px");
});
}
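// the viewer's Document.fit_images() invokes setup_image_scaling_handlers()
// below, so images are rescaled on load and again on every window resize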
function setup_image_scaling_handlers() {
scale_images();
$(window).resize(function(){
scale_images();
});
}

View File

@ -382,6 +382,7 @@ from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.tcr.input import TCRInput
from calibre.ebooks.txt.input import TXTInput
from calibre.ebooks.lrf.input import LRFInput
from calibre.ebooks.chm.input import CHMInput # XXMODIFIED
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.fb2.output import FB2Output
@ -440,6 +441,7 @@ plugins += [
TCRInput,
TXTInput,
LRFInput,
CHMInput, # XXMODIFIED
]
plugins += [
EPUBOutput,

View File

@ -563,6 +563,16 @@ class MobiReader(object):
recindex = attrib.pop(attr, None) or recindex
if recindex is not None:
attrib['src'] = 'images/%s.jpg' % recindex
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
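# assumed reading: 1em == 16px at the 72 dpi base, so scale by 16 * (profile dpi / 72)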
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval)
except:
del attrib[attr]
elif tag.tag == 'pre':
if not tag.text:
tag.tag = 'div'

View File

@ -411,6 +411,7 @@ class Style(object):
return result
def _unit_convert(self, value, base=None, font=None):
' Return value in pts'
if isinstance(value, (int, long, float)):
return value
try:
@ -447,6 +448,9 @@ class Style(object):
result = value * 0.40
return result
def pt_to_px(self, value):
return (self._profile.dpi / 72.0) * value
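# e.g. with the Kindle profile's 168.451 dpi, pt_to_px(12) ~= 28.1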
@property
def fontSize(self):
def normalize_fontsize(value, base):

View File

@ -20,6 +20,10 @@ class Font(object):
class Column(object):
# A column contains an element if the element bulges out to
# the left or the right by at most HFUZZ*col width.
HFUZZ = 0.2
def __init__(self):
self.left = self.right = self.top = self.bottom = 0
self.width = self.height = 0
@ -41,6 +45,10 @@ class Column(object):
for x in self.elements:
yield x
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
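# e.g. with HFUZZ = 0.2, a column 100 units wide still "contains" an
# element that protrudes up to 20 units past either edge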
class Element(object):
def __eq__(self, other):
@ -238,11 +246,10 @@ class Page(object):
return columns
def find_elements_in_row_of(self, x):
interval = Interval(x.top,
x.top + self.YFUZZ*(1+self.average_text_height))
h_interval = Interval(x.left, x.right)
for y in self.elements[x.idx:x.idx+15]:
if y is not x:
y_interval = Interval(y.top, y.bottom)
x_interval = Interval(y.left, y.right)

View File

@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
if self.cover_fetcher.exception is not None:
err = self.cover_fetcher.exception
error_dialog(self, _('Cannot fetch cover'),
_('<b>Could not fetch cover.</b><br/>')+unicode(err)).exec_()
return
pix = QPixmap()

View File

@ -1361,7 +1361,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
def generate_catalog(self):
rows = self.library_view.selectionModel().selectedRows()
if not rows or len(rows) < 3:
rows = xrange(self.library_view.model().rowCount(QModelIndex()))
ids = map(self.library_view.model().id, rows)
dbspec = None

View File

@ -7,14 +7,14 @@
<x>0</x>
<y>0</y>
<width>479</width>
<height>606</height>
</rect>
</property>
<property name="windowTitle">
<string>Configure Ebook viewer</string>
</property>
<property name="windowIcon">
<iconset>
<normaloff>:/images/config.svg</normaloff>:/images/config.svg</iconset>
</property>
<layout class="QGridLayout" name="gridLayout_4">
@ -164,7 +164,7 @@
</item>
</widget>
</item>
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remember_window_size">
<property name="text">
<string>Remember last used &amp;window size</string>
@ -218,6 +218,13 @@
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fit_images">
<property name="text">
<string>&amp;Resize images larger than the viewer window (needs restart)</string>
</property>
</widget>
</item>
</layout>
</item>
<item row="3" column="0">

View File

@ -10,7 +10,7 @@ from base64 import b64encode
from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
QColor, QPoint, QImage, QRegion, QVariant, QIcon, \
QFont, pyqtSignature, QAction
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
from calibre.utils.config import Config, StringConfig
@ -21,7 +21,7 @@ from calibre.constants import iswindows
from calibre import prints, guess_type
from calibre.gui2.viewer.keys import SHORTCUTS
bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images = None
def load_builtin_fonts():
base = P('fonts/liberation/*.ttf')
@ -42,6 +42,8 @@ def config(defaults=None):
help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
c.add_opt('max_view_width', default=6000,
help=_('Maximum width of the viewer window, in pixels.'))
c.add_opt('fit_images', default=True,
help=_('Resize images larger than the viewer window to fit inside it'))
c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))
c.add_opt('hyphenate_default_lang', default='en',
help=_('Default language for hyphenation rules'))
@ -59,20 +61,6 @@ def config(defaults=None):
return c
class PythonJS(QObject):
def __init__(self, callback):
QObject.__init__(self, QApplication.instance())
self.setObjectName("py_bridge")
self._callback = callback
@pyqtSignature("QString")
def callback(self, msg):
print "callback called"
self._callback(msg)
class ConfigDialog(QDialog, Ui_Dialog):
def __init__(self, shortcuts, parent=None):
@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
self.shortcut_config = ShortcutConfig(shortcuts, parent=self)
p = self.tabs.widget(1)
p.layout().addWidget(self.shortcut_config)
self.opt_fit_images.setChecked(opts.fit_images)
def accept(self, *args):
@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()])
c.set('user_css', unicode(self.css.toPlainText()))
c.set('remember_window_size', self.opt_remember_window_size.isChecked())
c.set('fit_images', self.opt_fit_images.isChecked())
c.set('max_view_width', int(self.max_view_width.value()))
c.set('hyphenate', self.hyphenate.isChecked())
idx = self.hyphenate_default_lang.currentIndex()
@ -157,7 +147,6 @@ class Document(QWebPage):
self.setObjectName("py_bridge")
self.debug_javascript = False
self.current_language = None
#self.js_bridge = PythonJS(self.js_callback)
self.setLinkDelegationPolicy(self.DelegateAllLinks)
self.scroll_marks = []
@ -197,9 +186,14 @@ class Document(QWebPage):
opts = config().parse()
self.hyphenate = opts.hyphenate
self.hyphenate_default_lang = opts.hyphenate_default_lang
self.do_fit_images = opts.fit_images
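# fit_images() calls into viewer/images.js (loaded below in
# load_javascript_libraries) to shrink oversized images to the window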
def fit_images(self):
if self.do_fit_images:
self.javascript('setup_image_scaling_handlers()')
def load_javascript_libraries(self):
global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images
self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
if jquery is None:
jquery = P('content_server/jquery.js', data=True)
@ -215,6 +209,9 @@ class Document(QWebPage):
if referencing is None:
referencing = P('viewer/referencing.js', data=True)
self.javascript(referencing)
if images is None:
images = P('viewer/images.js', data=True)
self.javascript(images)
if hyphenation is None:
hyphenation = P('viewer/hyphenation.js', data=True)
self.javascript(hyphenation)
@ -541,6 +538,7 @@ class DocumentView(QWebView):
return
self.loading_url = None
self.document.set_bottom_padding(0)
self.document.fit_images()
self._size_hint = self.document.mainFrame().contentsSize()
scrolled = False
if self.to_bottom:

View File

@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase):
for i in iter(self):
yield i[x]
def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
'''
Return all metadata stored in the database as a dict. Includes paths to
the cover and each format.
:param prefix: The prefix for all paths. By default, the prefix is the absolute path
to the library folder.
:param ids: Set of ids to return the data for. If None return data for
all entries in database.
'''
if prefix is None:
prefix = self.library_path
@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase):
data = []
for record in self.data:
if record is None: continue
db_id = record[FIELD_MAP['id']]
if ids is not None and db_id not in ids:
continue
x = {}
for field in FIELDS:
x[field] = record[FIELD_MAP[field]]
data.append(x)
x['id'] = db_id
x['formats'] = []
if not x['authors']:
x['authors'] = _('Unknown')
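A sketch of calling the new ids filter (the database handle name is hypothetical):

# export metadata for just two books, with authors flattened to one string
data = db.get_data_as_dict(authors_as_string=True, ids=set([1, 5]))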

View File

@ -524,6 +524,7 @@ class DynamicConfig(dict):
pass
except:
import traceback
print 'Failed to unpickle stored object:'
traceback.print_exc()
d = {}
self.clear()