Sync to trunk.

This commit is contained in:
John Schember 2010-01-21 17:13:09 -05:00
commit a0d1670e6f
67 changed files with 2577 additions and 486 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 569 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 253 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 531 B

View File

@ -0,0 +1,86 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ADRecipe(BasicNewsRecipe):
    """Calibre news recipe for AD (ad.nl), a Dutch-language news site.

    Articles are discovered through the RSS feeds listed in ``feeds`` and
    are downloaded from the print URL computed by ``print_version``.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 1

    title = u'AD'
    publisher = u'de Persgroep Publishing Nederland NV'
    category = u'News, Sports, the Netherlands'
    description = u'News and Sports from the Netherlands'

    oldest_article = 1.2
    max_articles_per_feed = 100
    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Only the article box and the footnote paragraph are kept.
    keep_only_tags = [
        dict(name = 'div', attrs = {'id': 'art_box2'}),
        dict(name = 'p', attrs = {'class': 'gen_footnote3'}),
    ]

    remove_tags = [
        dict(name = 'div', attrs = {'class': 'gen_clear'}),
        dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}),
    ]

    remove_attributes = ['style']

    # feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
    feeds = [
        (u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'),
        (u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'),
        (u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'),
        (u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'),
        (u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'),
        (u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'),
        (u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'),
        (u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'),
        (u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'),
        (u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'),
        (u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'),
        (u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'),
        (u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'),
        (u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'),
        (u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'),
        (u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'),
        (u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'),
        (u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'),
        (u'Consument', u'http://www.ad.nl/consument/rss.xml'),
        (u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'),
        (u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'),
        (u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'),
        (u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'),
        (u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'),
        (u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'),
    ]

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
        .gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
        '''

    # The language entry reuses the class-level value so the generated book's
    # metadata agrees with the recipe (it was previously hard-coded to 'en').
    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher}

    def print_version(self, url):
        """Return the print URL assembled from path segments of *url*.

        *url* is split on '/'; segments 2, 3, 4, 5, 10 and 7 are followed by
        a literal ``print`` segment and then segments 8, 9 and 13.  An
        IndexError is raised when *url* has fewer than 14 segments.
        """
        parts = url.split('/')
        segments = [parts[i] for i in (2, 3, 4, 5, 10, 7)]
        segments.append('print')
        segments.extend([parts[i] for i in (8, 9, 13)])
        return 'http://' + '/'.join(segments)

    def preprocess_html(self, soup):
        """Thin out runs of <br> tags and return the modified *soup*.

        A <br> is removed when both its previous and its next sibling tag
        are <br> tags as well.
        """
        for br in soup.findAll('br'):
            prev = br.findPreviousSibling(True)
            # hasattr() also covers the "no sibling" case, where None is returned.
            if hasattr(prev, 'name') and prev.name == 'br':
                following = br.findNextSibling(True)
                if hasattr(following, 'name') and following.name == 'br':
                    br.extract()
        return soup

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
spectator.org spectator.org
''' '''
@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe): class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator' title = 'The American Spectator'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = 'en'
description = 'News from USA' description = 'News from USA'
category = 'news, politics, USA, world'
publisher = 'The American Spectator'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
language = 'en'
INDEX = 'http://spectator.org' INDEX = 'http://spectator.org'
html2lrf_options = [ conversion_options = {
'--comment' , description 'comments' : description
, '--category' , 'news, politics, USA' ,'tags' : category
, '--publisher' , title ,'language' : language
] ,'publisher' : publisher
}
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'post inner'}) dict(name='div', attrs={'class':'post inner'})
@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe):
remove_tags = [ remove_tags = [
dict(name='object') dict(name='object')
,dict(name='div', attrs={'class':'col3' }) ,dict(name='div', attrs={'class':['col3','post-options','social']})
,dict(name='div', attrs={'class':'post-options' }) ,dict(name='p' , attrs={'class':['letter-editor','meta']})
,dict(name='p' , attrs={'class':'letter-editor'})
,dict(name='div', attrs={'class':'social' })
] ]
feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')] feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url + '/print' return url + '/print'
def get_article_url(self, article):
return article.get('guid', None)

View File

@ -0,0 +1,60 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
news.bbc.co.uk
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBC(BasicNewsRecipe):
    """Calibre news recipe for BBC News that downloads print-view pages.

    The article link is taken from each feed entry's ``guid`` and rewritten
    by ``print_version`` to point at the newsvote.bbc.co.uk print tool.
    """

    title                 = 'BBC News (fast)'
    __author__            = 'Darko Miletic'
    description           = 'News from UK. A much faster version that does not download pictures'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    #delay                = 1
    use_embedded_content  = False
    encoding              = 'utf8'
    publisher             = 'BBC'
    category              = 'news, UK, world'
    language              = 'en'
    extra_css             = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '

    conversion_options = {
                             'comments'  : description
                            ,'tags'      : category
                            ,'language'  : language
                            ,'publisher' : publisher
                         }

    # Everything before the headline div and after the footer div is dropped;
    # the footer div itself is removed through remove_tags.
    remove_tags_before = dict(name='div',attrs={'class':'headline'})
    remove_tags_after  = dict(name='div', attrs={'class':'footer'})
    remove_tags        = [
                             dict(name=['object','link','script','iframe'])
                            ,dict(name='div', attrs={'class':'footer'})
                         ]

    feeds = [
              ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
              ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
              ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
              ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
              ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
              ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
              ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
              ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
              ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
              ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
              ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
              ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
              ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
            ]

    def print_version(self, url):
        """Return the print-tool URL for *url*.

        The text following the first ``http://`` in *url* is appended to the
        print-tool prefix.  When *url* contains no ``http://`` it is returned
        unchanged, because the bare prefix alone does not identify an article.
        """
        _head, sep, remainder = url.partition('http://')
        if not sep:
            return url
        return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + remainder

    def get_article_url(self, article):
        """Return the feed entry's ``guid``, or None when it has none."""
        return article.get('guid', None)

View File

@ -0,0 +1,121 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Calibre recipe that builds an issue from a paper's "today's paper" page.

    The index page at ``url_prefix + '/news/todays-paper/index.html'`` is
    parsed by ``parse_index``.  ``title``, ``url_prefix`` and ``description``
    select which paper is fetched.
    """

    # Active configuration: Calgary Herald.  To build one of the other papers,
    # comment these three lines out and un-comment one group below.
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        """Strip empty ``id`` attributes from every div and return *soup*."""
        # Empty id attributes screw up the TOC for unknown reasons.
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the section titles were collected; sections that received no article
        are left out.  Each article is a dict with the keys ``title``,
        ``url``, ``date``, ``description``, ``author`` and ``content``.
        """
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        # Articles that appear before the first section title are filed
        # under the default 'News' section.
        key = 'News'
        ans = ['News']

        # Walk every class="section_title02" and class="featurecontent" div.
        for divtag in soup.findAll('div', attrs={'class' : ["section_title02","featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title; it becomes the current section
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data; skip it unless it has a linked headline
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            # `in` / setdefault replace dict.has_key(), which is Python 2 only.
            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate, description=description,
                     author=author, content=''))

        return [(section, articles[section]) for section in ans if section in articles]

View File

@ -0,0 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CJR(BasicNewsRecipe):
    """Calibre news recipe for the Columbia Journalism Review feed."""

    # Metadata
    title = u'Columbia Journalism Review'
    __author__ = u'Xanthan Gum'
    description = 'News about journalism.'
    language = 'en'

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [
        (u'News Stories', u'http://www.cjr.org/index.xml'),
    ]

    def print_version(self, url):
        """Return *url* with the ``?page=all&print=true`` query string appended."""
        query = '?page=all&print=true'
        return url + query

View File

@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
digitaljournal.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DigitalJournal(BasicNewsRecipe):
    """Calibre news recipe for digitaljournal.com, fetched via its print pages."""

    # Metadata
    title = 'Digital Journal'
    __author__ = 'Darko Miletic'
    description = 'A Global Citizen Journalism News Network'
    category = 'news, politics, USA, world'
    publisher = 'Digital Journal'
    language = 'en'

    # Download behaviour
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Page clean-up rules
    keep_only_tags = [dict(name='div', attrs={'class': ['article', 'body']})]
    remove_tags = [dict(name=['object', 'table'])]

    feeds = [
        (u'Latest News', u'http://digitaljournal.com/rss/?feed=latest_news'),
        (u'Business', u'http://digitaljournal.com/rss/?feed=top_news&depname=Business'),
        (u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment'),
        (u'Environment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment'),
        (u'Food', u'http://digitaljournal.com/rss/?feed=top_news&depname=Food'),
        (u'Health', u'http://digitaljournal.com/rss/?feed=top_news&depname=Health'),
        (u'Internet', u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet'),
        (u'Politics', u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics'),
        (u'Religion', u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion'),
        (u'Science', u'http://digitaljournal.com/rss/?feed=top_news&depname=Science'),
        (u'Sports', u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports'),
        (u'Technology', u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology'),
        (u'World', u'http://digitaljournal.com/rss/?feed=top_news&depname=World'),
        (u'Arts', u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts'),
    ]

    def print_version(self, url):
        """Return *url* with ``print/`` inserted after every ``digitaljournal.com/``."""
        site_root = 'digitaljournal.com/'
        print_root = 'digitaljournal.com/print/'
        return url.replace(site_root, print_root)

View File

@ -0,0 +1,126 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Calibre recipe that builds an issue from a paper's "today's paper" page.

    The index page at ``url_prefix + '/news/todays-paper/index.html'`` is
    parsed by ``parse_index``.  ``title``, ``url_prefix`` and ``description``
    select which paper is fetched.
    """

    # Active configuration: Edmonton Journal.  To build one of the other
    # papers, comment these three lines out and un-comment one group below.
    title = u'Edmonton Journal'
    url_prefix = 'http://www.edmontonjournal.com'
    description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        """Strip empty ``id`` attributes from every div and return *soup*."""
        # Empty id attributes screw up the TOC for unknown reasons.
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the section titles were collected; sections that received no article
        are left out.  Each article is a dict with the keys ``title``,
        ``url``, ``date``, ``description``, ``author`` and ``content``.
        """
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        # Articles that appear before the first section title are filed
        # under the default 'News' section.
        key = 'News'
        ans = ['News']

        # Walk every class="section_title02" and class="featurecontent" div.
        for divtag in soup.findAll('div', attrs={'class' : ["section_title02","featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title; it becomes the current section
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data; skip it unless it has a linked headline
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            # `in` / setdefault replace dict.has_key(), which is Python 2 only.
            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate, description=description,
                     author=author, content=''))

        return [(section, articles[section]) for section in ans if section in articles]

View File

@ -9,27 +9,33 @@ from calibre.web.feeds.news import BasicNewsRecipe
class FTDe(BasicNewsRecipe): class FTDe(BasicNewsRecipe):
title = 'FTD' title = 'FTD'
description = 'Financial Times Deutschland' description = 'Financial Times Deutschland'
__author__ = 'Oliver Niesner' __author__ = 'Oliver Niesner'
use_embedded_content = False use_embedded_content = False
timefmt = ' [%d %b %Y]' timefmt = ' [%d %b %Y]'
language = 'de' language = _('German')
max_articles_per_feed = 40 max_articles_per_feed = 40
no_stylesheets = True no_stylesheets = True
remove_tags = [dict(id='navi_top'), remove_tags = [dict(id='navi_top'),
dict(id='topbanner'), dict(id='topbanner'),
dict(id='seitenkopf'), dict(id='seitenkopf'),
dict(id='BoxA-0-0-0'), dict(id='BoxA-0-0-0'),
#dict(id='BoxA-2-0-0'),
dict(id='footer'), dict(id='footer'),
dict(id='rating_open'), dict(id='rating_open'),
dict(id='ADS_Top'), dict(id='ADS_Top'),
dict(id='spinner'), dict(id='spinner'),
dict(id='ftd-contentad'), dict(id='ftd-contentad'),
dict(id='ftd-promo'),
dict(id='nava-50009007-1-0'), dict(id='nava-50009007-1-0'),
dict(id='navli-50009007-1-0'), dict(id='navli-50009007-1-0'),
dict(id='Box5000534-0-0-0'),
dict(id='ExpV-1-0-0-1'),
dict(id='ExpV-1-0-0-0'),
dict(id='PollExpV-2-0-0-0'),
dict(id='starRating'), dict(id='starRating'),
dict(id='saveRating'), dict(id='saveRating'),
dict(id='yLayer'), dict(id='yLayer'),
@ -44,14 +50,20 @@ class FTDe(BasicNewsRecipe):
dict(name='ul', attrs={'class':'nav'}), dict(name='ul', attrs={'class':'nav'}),
dict(name='p', attrs={'class':'articleOptionHead'}), dict(name='p', attrs={'class':'articleOptionHead'}),
dict(name='p', attrs={'class':'articleOptionFoot'}), dict(name='p', attrs={'class':'articleOptionFoot'}),
dict(name='p', attrs={'class':'moreInfo'}),
dict(name='div', attrs={'class':'chartBox'}), dict(name='div', attrs={'class':'chartBox'}),
dict(name='div', attrs={'class':'ratingOpt starRatingContainer articleOptionFootFrame'}), dict(name='div', attrs={'class':'ratingOpt starRatingContainer articleOptionFootFrame'}),
dict(name='div', attrs={'class':'box boxArticleBasic boxComments boxTransparent'}), dict(name='div', attrs={'class':'box boxArticleBasic boxComments boxTransparent'}),
dict(name='div', attrs={'class':'box boxNavTabs '}), dict(name='div', attrs={'class':'box boxNavTabs'}),
dict(name='div', attrs={'class':'boxMMRgtLow'}),
dict(name='span', attrs={'class':'vote_455857'}), dict(name='span', attrs={'class':'vote_455857'}),
dict(name='div', attrs={'class':'relatedhalb'}), dict(name='div', attrs={'class':'relatedhalb'}),
dict(name='div', attrs={'class':'box boxListScrollOutline'}), dict(name='div', attrs={'class':'box boxListScrollOutline'}),
dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}),
dict(name='div', attrs={'class':'box boxTeaser boxPhotoshow boxImgWide'}),
dict(name='div', attrs={'class':'box boxTeaser'}),
dict(name='div', attrs={'class':'tagCloud'}), dict(name='div', attrs={'class':'tagCloud'}),
dict(name='div', attrs={'class':'pollView'}),
dict(name='div', attrs={'class':'box boxArticleBasic boxNavTabsOutline'}), dict(name='div', attrs={'class':'box boxArticleBasic boxNavTabsOutline'}),
dict(name='div', attrs={'class':'ftdHpNav'}), dict(name='div', attrs={'class':'ftdHpNav'}),
dict(name='div', attrs={'class':'ftdHead'}), dict(name='div', attrs={'class':'ftdHead'}),
@ -67,11 +79,12 @@ class FTDe(BasicNewsRecipe):
dict(name='div', attrs={'class':'wertungoben'}), dict(name='div', attrs={'class':'wertungoben'}),
dict(name='div', attrs={'class':'artikelfuss'}), dict(name='div', attrs={'class':'artikelfuss'}),
dict(name='a', attrs={'class':'rating'}), dict(name='a', attrs={'class':'rating'}),
dict(name='a', attrs={'href':'#rt'}),
dict(name='div', attrs={'class':'articleOptionFootFrame'}), dict(name='div', attrs={'class':'articleOptionFootFrame'}),
dict(name='div', attrs={'class':'artikelsplitfaq'})] dict(name='div', attrs={'class':'artikelsplitfaq'})]
remove_tags_after = [dict(name='a', attrs={'class':'more'})] #remove_tags_after = [dict(name='a', attrs={'class':'more'})]
feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'), feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'), ('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'),
('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'), ('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'),
('Politik', 'http://www.ftd.de/rss2/politik'), ('Politik', 'http://www.ftd.de/rss2/politik'),
@ -82,8 +95,8 @@ class FTDe(BasicNewsRecipe):
('Auto', 'http://www.ftd.de/rss2/auto'), ('Auto', 'http://www.ftd.de/rss2/auto'),
('Lifestyle', 'http://www.ftd.de/rss2/lifestyle') ('Lifestyle', 'http://www.ftd.de/rss2/lifestyle')
] ]
def print_version(self, url): def print_version(self, url):
return url + '?mode=print' return url.replace('.html', '.html?mode=print')

View File

@ -32,7 +32,7 @@ class GlobeAndMail(BasicNewsRecipe):
'gallery-controls', 'video', 'galleryLoading','deck','header', 'gallery-controls', 'video', 'galleryLoading','deck','header',
'toolsBottom'] }, 'toolsBottom'] },
{'class':['credit','inline-img-caption','tab-pointer'] }, {'class':['credit','inline-img-caption','tab-pointer'] },
dict(name='div', attrs={'id':'lead-photo'}), dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}),
dict(name='div', attrs={'class':'right'}), dict(name='div', attrs={'class':'right'}),
dict(name='div', attrs={'id':'footer'}), dict(name='div', attrs={'id':'footer'}),
dict(name='div', attrs={'id':'beta-msg'}), dict(name='div', attrs={'id':'beta-msg'}),

View File

@ -0,0 +1,44 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.kitsapsun.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Kitsapsun(BasicNewsRecipe):
    """Calibre news recipe for the Kitsap Sun, fetched via ``?print=1`` pages."""

    # Metadata
    title = 'Kitsap Sun'
    __author__ = 'Darko Miletic'
    description = 'News from Kitsap County'
    publisher = 'Scripps Interactive Newspapers Group'
    category = 'news, Kitsap county, USA'
    language = 'en'

    # Download behaviour
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Page clean-up rules
    keep_only_tags = [dict(name='div', attrs={'id': ['story_meta', 'story_content']})]
    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]

    feeds = [
        (u'News', u'http://www.kitsapsun.com/rss/headlines/news/'),
        (u'Business', u'http://www.kitsapsun.com/rss/headlines/business/'),
        (u'Communities', u'http://www.kitsapsun.com/rss/headlines/communities/'),
        (u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/'),
        (u'Lifestyles', u'http://www.kitsapsun.com/rss/headlines/lifestyles/'),
    ]

    def print_version(self, url):
        """Return *url* cut at its last '/' with ``/?print=1`` appended."""
        head, _sep, _tail = url.rpartition('/')
        return head + '/?print=1'

View File

@ -1,79 +1,79 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini' __author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>' __copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01' __version__ = 'v1.01'
__date__ = '14, January 2010' __date__ = '14, January 2010'
__description__ = 'Canadian Paper ' __description__ = 'Canadian Paper '
''' '''
http://www.ledevoir.com/ http://www.ledevoir.com/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ledevoir(BasicNewsRecipe): class ledevoir(BasicNewsRecipe):
author = 'Lorenzo Vigentini' author = 'Lorenzo Vigentini'
description = 'Canadian Paper' description = 'Canadian Paper'
cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif' cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
title = u'Le Devoir' title = u'Le Devoir'
publisher = 'leDevoir.com' publisher = 'leDevoir.com'
category = 'News, finance, economy, politics' category = 'News, finance, economy, politics'
language = 'fr' language = 'fr'
encoding = 'utf-8' encoding = 'utf-8'
timefmt = '[%a, %d %b, %Y]' timefmt = '[%a, %d %b, %Y]'
max_articles_per_feed = 50 max_articles_per_feed = 50
use_embedded_content = False use_embedded_content = False
recursion = 10 recursion = 10
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'id':'article'}), dict(name='div', attrs={'id':'article'}),
dict(name='ul', attrs={'id':'ariane'}) dict(name='ul', attrs={'id':'ariane'})
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'dialog'}), dict(name='div', attrs={'id':'dialog'}),
dict(name='div', attrs={'class':['interesse_actions','reactions']}), dict(name='div', attrs={'class':['interesse_actions','reactions']}),
dict(name='ul', attrs={'class':'mots_cles'}), dict(name='ul', attrs={'class':'mots_cles'}),
dict(name='a', attrs={'class':'haut'}), dict(name='a', attrs={'class':'haut'}),
dict(name='h5', attrs={'class':'interesse_actions'}) dict(name='h5', attrs={'class':'interesse_actions'})
] ]
feeds = [ feeds = [
(u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'), (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
(u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'), (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
(u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'), (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
(u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'), (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
(u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'), (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
(u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'), (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'),
(u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'), (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'),
(u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'), (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'),
(u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'), (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
(u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'), (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
(u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'), (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
(u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50') (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50')
] ]
extra_css = ''' extra_css = '''
h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;} h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;}
h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;} h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;}
h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;} h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; } h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;} h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
.specs {line-height:1em;margin:1px 0;} .specs {line-height:1em;margin:1px 0;}
.specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} .specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
.specs span.auteur a, .specs span.auteur a,
.specs span.auteur span {text-transform:uppercase;color:#787878;} .specs span.auteur span {text-transform:uppercase;color:#787878;}
.specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;} .specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;} ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;}
ul#ariane li {display:inline;} ul#ariane li {display:inline;}
ul#ariane a {color:#2E2E2E;text-decoration:underline;} ul#ariane a {color:#2E2E2E;text-decoration:underline;}
.credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;} .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
.texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;} .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
''' '''

View File

@ -0,0 +1,96 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Montreal Gazette (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Montreal Gazette'
    url_prefix = 'http://www.montrealgazette.com'
    description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,101 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Ottawa Citizen (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Ottawa Citizen'
    url_prefix = 'http://www.ottawacitizen.com'
    description = u'News from Ottawa, ON'

    # Alternative settings for a sibling paper (swap in for the three
    # values above; columns are title, url_prefix, description):
    #   u'Montreal Gazette'       'http://www.montrealgazette.com'  u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class PajamasMedia(BasicNewsRecipe):
    """Recipe for the Pajamas Media feed, including second article pages.

    Links matching ``match_regexps`` (URLs ending in ``/2/``) are followed
    one level deep (``recursions = 1``) so that the continuation page of an
    article is downloaded along with the first page.
    """

    title = u'Pajamas Media'
    description = u'Provides exclusive news and opinion for forty countries.'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    recursions = 1
    match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']

    #encoding = 'latin1'

    # The original set ``remove_stylesheets``, a name not used by any other
    # recipe in this change set; ``no_stylesheets`` is the attribute the
    # sibling recipes use to drop site CSS.
    no_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after = dict(name='div', attrs={'class': 'paged-nav'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': ['pages']}),
        #dict(name='div', attrs={'id':['bookmark']}),
        #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
        #dict(name='ul', attrs={'class':'articleTools'}),
    ]

    feeds = [
        ('pajamas Media',
         'http://feeds.feedburner.com/PajamasMedia'),
    ]

    def preprocess_html(self, soup):
        """Reduce the page to its ``div#innerpage-content`` element.

        The story div is moved into a fresh, minimal HTML document which is
        returned in place of the original soup. When the page has no such
        div, the original soup is returned unchanged instead of failing.
        """
        story = soup.find(name='div', attrs={'id': 'innerpage-content'})
        if story is None:
            # Nothing to isolate; inserting None into the new body would fail.
            return soup
        #td = heading.findParent(name='td')
        #td.extract()
        skeleton = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = skeleton.find(name='body')
        body.insert(0, story)
        return skeleton

    def postprocess_html(self, soup, first):
        """Drop the repeated header and author blocks on follow-up pages.

        When ``first`` is false, the first element with class
        ``innerpage-header`` and the first with class ``author`` are removed
        so they do not appear again on the continuation page.
        """
        if not first:
            for css_class in ('innerpage-header', 'author'):
                tag = soup.find(attrs={'class': css_class})
                if tag:
                    tag.extract()
        return soup

View File

@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe):
description = u'Physics Today magazine' description = u'Physics Today magazine'
publisher = 'American Institute of Physics' publisher = 'American Institute of Physics'
category = 'Physics' category = 'Physics'
language = 'en' language = 'en'
cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg') cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg')
oldest_article = 30 oldest_article = 30
max_articles_per_feed = 100 max_articles_per_feed = 100
@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe):
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
br.open('http://www.physicstoday.org/pt/sso_login.jsp') br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f')
br.select_form(name='login') br.select_form(name='login_form')
br['username'] = self.username br['username'] = self.username
br['password'] = self.password br['password'] = self.password
br.submit() br.submit()
return br return br
feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')] feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')]

View File

@ -0,0 +1,188 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
class ReadersDigest(BasicNewsRecipe):
    """Recipe for Readers Digest combining scraped site pages and RSS feeds.

    :meth:`parse_index` builds sections from selected rd.com landing pages
    and then adds the RSS feeds listed in ``feeds``; articles whose title
    mentions "recipe" are pulled out of the RSS feeds into a separate
    virtual "Recipes" feed by :meth:`parse_rss`.
    """

    title = 'Readers Digest'
    __author__ = 'BrianG'
    language = 'en'
    description = 'Readers Digest Feeds'
    no_stylesheets = True
    use_embedded_content = False
    oldest_article = 60
    max_articles_per_feed = 200
    remove_javascript = True

    extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
                    .mainHd{font-family:georgia,serif;color:#000000;}
                    h2 {font-family:Arial,Sans-serif;}
                    .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
                    .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
                    .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
                    .photoBkt{ font-size:x-small ;}
                    .vertPhoto{font-size:x-small ;}
                    .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                    .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                    .artTxt{font-family:georgia,serif;}
                    .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
                    .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
                    a:link{color:#CC0000;}
                    .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
                    '''

    remove_tags = [
        dict(name='h4', attrs={'class': 'close'}),
        dict(name='div', attrs={'class': 'fromLine'}),
        dict(name='img', attrs={'class': 'colorTag'}),
        dict(name='div', attrs={'id': 'sponsorArticleHeader'}),
        dict(name='div', attrs={'class': 'horizontalAd'}),
        dict(name='div', attrs={'id': 'imageCounterLeft'}),
        dict(name='div', attrs={'id': 'commentsPrint'}),
    ]

    feeds = [
        ('New in RD', 'http://feeds.rd.com/ReadersDigest'),
        ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
        ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
        ('Blogs', 'http://feeds.rd.com/ReadersDigestBlogs'),
    ]

    cover_url = 'http://www.rd.com/images/logo-main-rd.gif'

    #-------------------------------------------------------------------------------------------------

    def print_version(self, url):
        """Map an article URL to its print-friendly URL.

        * ``.../article<id>....html?...`` URLs are rewritten to the
          ``printContent.do`` endpoint using the identity number taken from
          the URL.
        * ``.../post...`` URLs require fetching the page and reading the
          first link inside ``ul.printBlock``.

        Any URL that does not match, or for which the needed marker or
        print link cannot be found, is returned unchanged.
        """
        article_at = url.find('/article')
        if article_at > 0:
            suffix_at = url.find('.html?')
            if suffix_at < 0:
                # Without the '.html?' marker the slice below would use -1
                # and produce a meaningless id; keep the original URL.
                return url
            # Skip the 8 characters of '/article'. The last 4 characters
            # before '.html?' are dropped exactly as the original code did
            # (presumably a fixed-length suffix -- TODO confirm).
            ident = url[article_at + 8:suffix_at - 4]
            return 'http://www.rd.com/content/printContent.do?contentId=' + ident

        if url.find('/post') > 0:
            # in this case, have to get the page itself to derive the Print page.
            soup = self.index_to_soup(url)
            print_block = soup.find('ul', attrs={'class': 'printBlock'})
            links = print_block('a') if print_block is not None else []
            if not links:
                return url
            url = 'http://www.rd.com' + links[0]['href']
            keep_at = url.find('&Keep')
            # Only truncate when the marker exists; slicing with find()'s -1
            # would silently drop the last character of the URL.
            if keep_at >= 0:
                url = url[:keep_at]
        return url

    #-------------------------------------------------------------------------------------------------

    def parse_index(self):
        """Return the list of ``(section_title, articles)`` for the issue.

        Sections scraped from the site pages come first in page order; the
        RSS feeds from :meth:`parse_rss` are then merged in, with
        'New in RD' forced to position 0 and 'Blogs' to position 1.
        """
        # (section title, page URL, divider, attrs used to locate teasers)
        pages = [
            ('Your America', 'http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer', {'class': 'moreLeft'}),
            # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
            ('Advice and Know-How', 'http://www.rd.com/advice-and-know-how', 'channelLeftContainer', {'class': 'moreLeft'}),
        ]

        feeds = []
        for section, url, divider, attrList in pages:
            feeds.append((section, self.page_parse(url, divider, attrList)))

        # after the pages of the site have been processed, parse several RSS feeds for additional sections
        # parse_rss returns Feed objects; convert each feed/article
        # combination into the plain structure parse_index must return.
        for feed in self.parse_rss():
            newArticles = [
                {
                    'title': article.title,
                    'url': article.url,
                    'date': article.date,
                    'description': article.text_summary,
                }
                for article in feed.articles
            ]

            # New and Blogs should be the first two feeds.
            if feed.title == 'New in RD':
                feeds.insert(0, (feed.title, newArticles))
            elif feed.title == 'Blogs':
                feeds.insert(1, (feed.title, newArticles))
            else:
                feeds.append((feed.title, newArticles))

        return feeds

    #-------------------------------------------------------------------------------------------------

    def page_parse(self, mainurl, divider, attrList):
        """Scrape article teasers from one site landing page.

        ``mainurl`` is the page to fetch and ``attrList`` the attribute
        filter that selects teaser elements. ``divider`` is accepted for
        interface compatibility but is not used. The title is taken from
        the ``alt`` text of the teaser's first image and the URL from its
        first link; teasers lacking either are skipped.
        """
        articles = []
        mainsoup = self.index_to_soup(mainurl)
        for item in mainsoup.findAll(attrs=attrList):
            images = item('img')
            links = item('a')
            if not images or not links:
                # Incomplete teaser: indexing [0] would raise IndexError
                # and abort the whole download.
                continue
            articles.append({
                'title': images[0]['alt'],
                'url': 'http://www.rd.com' + links[0]['href'],
                'date': '',
                'description': '',
            })
        return articles

    #-------------------------------------------------------------------------------------------------

    def parse_rss(self):
        """Parse the RSS feeds and split recipe articles into their own feed.

        Every article whose title contains "recipe" (case-insensitive) is
        removed from its feed and collected into a new virtual feed titled
        'Recipes', which is appended to the result when non-empty.
        Returns the list of Feed objects.
        """
        # Do the "official" parse_feeds first
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop thru the articles in all feeds to find articles with "recipe" in it
        recipeArticles = []
        for curfeed in feeds:
            kept = []
            for curarticle in curfeed.articles:
                if 'RECIPE' in curarticle.title.upper():
                    recipeArticles.append(curarticle)
                else:
                    kept.append(curarticle)
            # Mutate the existing list in place, as the original did.
            curfeed.articles[:] = kept

        # If there are any recipes found, create a new Feed object and append.
        if recipeArticles:
            pfeed = Feed()
            pfeed.title = 'Recipes'
            # The original assigned to the misspelled attribute 'descrition'.
            pfeed.description = 'Recipe Feed (Virtual)'
            pfeed.image_url = None
            pfeed.oldest_article = 30
            pfeed.id_counter = len(recipeArticles)
            # Create a new Feed, add the recipe articles, and then append
            # to "official" list of feeds
            pfeed.articles = recipeArticles[:]
            feeds.append(pfeed)

        return feeds

View File

@ -0,0 +1,116 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Regina Leader-Post (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Regina Leader-Post'
    url_prefix = 'http://www.leaderpost.com'
    description = u'News from Regina, SK'

    # Alternative settings for sibling papers (swap in for the three
    # values above; columns are title, url_prefix, description):
    #   u'Saskatoon Star-Phoenix' 'http://www.thestarphoenix.com'   u'News from Saskatoon, SK'
    #   u'Windsor Star'           'http://www.windsorstar.com'      u'News from Windsor, ON'
    #   u'Ottawa Citizen'         'http://www.ottawacitizen.com'    u'News from Ottawa, ON'
    #   u'Montreal Gazette'       'http://www.montrealgazette.com'  u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,111 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Saskatoon Star-Phoenix (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'

    # Alternative settings for sibling papers (swap in for the three
    # values above; columns are title, url_prefix, description):
    #   u'Windsor Star'           'http://www.windsorstar.com'      u'News from Windsor, ON'
    #   u'Ottawa Citizen'         'http://www.ottawacitizen.com'    u'News from Ottawa, ON'
    #   u'Montreal Gazette'       'http://www.montrealgazette.com'  u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Vancouver Province (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Vancouver Province'
    url_prefix = 'http://www.theprovince.com'
    description = u'News from Vancouver, BC'

    # Alternative settings for sibling papers (swap in for the three
    # values above; columns are title, url_prefix, description):
    #   u'Vancouver Sun'          'http://www.vancouversun.com'     u'News from Vancouver, BC'
    #   u'Edmonton Journal'       'http://www.edmontonjournal.com'  u'News from Edmonton, AB'
    #   u'Calgary Herald'         'http://www.calgaryherald.com'    u'News from Calgary, AB'
    #   u'Regina Leader-Post'     'http://www.leaderpost.com'       u'News from Regina, SK'
    #   u'Saskatoon Star-Phoenix' 'http://www.thestarphoenix.com'   u'News from Saskatoon, SK'
    #   u'Windsor Star'           'http://www.windsorstar.com'      u'News from Windsor, ON'
    #   u'Ottawa Citizen'         'http://www.ottawacitizen.com'    u'News from Ottawa, ON'
    #   u'Montreal Gazette'       'http://www.montrealgazette.com'  u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,131 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the Vancouver Sun (www.canada.com family of papers).

    The list of sections and articles is scraped from the paper's
    ``/news/todays-paper/index.html`` page by :meth:`parse_index`.
    """

    # Paper-specific settings.
    title = u'Vancouver Sun'
    url_prefix = 'http://www.vancouversun.com'
    description = u'News from Vancouver, BC'

    # Alternative settings for sibling papers (swap in for the three
    # values above; columns are title, url_prefix, description):
    #   u'Edmonton Journal'       'http://www.edmontonjournal.com'  u'News from Edmonton, AB'
    #   u'Calgary Herald'         'http://www.calgaryherald.com'    u'News from Calgary, AB'
    #   u'Regina Leader-Post'     'http://www.leaderpost.com'       u'News from Regina, SK'
    #   u'Saskatoon Star-Phoenix' 'http://www.thestarphoenix.com'   u'News from Saskatoon, SK'
    #   u'Windsor Star'           'http://www.windsorstar.com'      u'News from Windsor, ON'
    #   u'Ottawa Citizen'         'http://www.ottawacitizen.com'    u'News from Ottawa, ON'
    #   u'Montreal Gazette'       'http://www.montrealgazette.com'  u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    # Navigation, sharing tools and other page furniture removed from articles.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from all ``div`` tags.

        Empty ids were observed to break the generated table of contents
        (the cause is unknown). Returns the same soup object.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Scrape the "today's paper" index into sections of articles.

        Returns a list of ``(section_title, articles)`` tuples, ordered by
        the first appearance of each section on the page. Sections without
        any article are left out. Every article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        # Articles found before any section heading are filed under 'News'.
        section = 'News'
        section_order = [section]

        # Section headings (class="section_title02") and article teasers
        # (class="featurecontent") are visited in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                section = self.tag_to_string(divtag.h3, False)
                # Record each section once; a repeated heading would
                # otherwise make the same section appear twice in the result.
                if section not in section_order:
                    section_order.append(section)
                self.log("Section name %s" % section)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(section, []).append(dict(
                title=self.tag_to_string(atag, False),
                url=base_url + atag['href'],
                date='',
                description=description,
                author=author,
                content=''))

        return [(name, articles[name]) for name in section_order if name in articles]

View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    # Recipe that builds an issue from the "today's paper" index page of a
    # CanWest newspaper site. The paper is selected by which of the
    # title/url_prefix/description triples below is left un-commented.

    # un-comment the following three lines for the Victoria Times Colonist
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'

    # un-comment the following three lines for the Vancouver Province
    #title = u'Vancouver Province'
    #url_prefix = 'http://www.theprovince.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Vancouver Sun
    #title = u'Vancouver Sun'
    #url_prefix = 'http://www.vancouversun.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id':'storyheader'}),
        dict(name='div', attrs={'id':'storycontent'}),
    ]

    # Navigation, sharing tools and other page furniture stripped from articles.
    remove_tags = [
        {'class':'comments'},
        dict(name='div', attrs={'class':'navbar'}),
        dict(name='div', attrs={'class':'morelinks'}),
        dict(name='div', attrs={'class':'viewmore'}),
        dict(name='li', attrs={'class':'email'}),
        dict(name='div', attrs={'class':'story_tool_hr'}),
        dict(name='div', attrs={'class':'clear'}),
        dict(name='div', attrs={'class':'story_tool'}),
        dict(name='div', attrs={'class':'copyright'}),
        dict(name='div', attrs={'class':'rule_grey_solid'}),
        dict(name='li', attrs={'class':'print'}),
        dict(name='li', attrs={'class':'share'}),
        dict(name='ul', attrs={'class':'bullet'}),
    ]

    def preprocess_html(self, soup):
        # Delete empty id attributes--they screw up the TOC for unknown reasons.
        # Returns the same soup object, modified in place.
        for div in soup.findAll('div', attrs={'id':''}):
            del div['id']
        return soup

    def parse_index(self):
        # Build the feed list from the "today's paper" index page.
        # Returns a list of (section title, [article dict, ...]) tuples in the
        # order the sections first appear; sections without articles are omitted.
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        # Articles found before the first section heading are filed under 'News'.
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class' : ["section_title02","featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A heading that repeats on the page must not produce the
                # same feed twice in the result.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        # `in` instead of dict.has_key(), which no longer exists in Python 3.
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

View File

@ -0,0 +1,106 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    # Recipe that builds an issue from the "today's paper" index page of a
    # CanWest newspaper site. The paper is selected by which of the
    # title/url_prefix/description triples below is left un-commented.

    # un-comment the following three lines for the Windsor Star
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id':'storyheader'}),
        dict(name='div', attrs={'id':'storycontent'}),
    ]

    # Navigation, sharing tools and other page furniture stripped from articles.
    remove_tags = [
        {'class':'comments'},
        dict(name='div', attrs={'class':'navbar'}),
        dict(name='div', attrs={'class':'morelinks'}),
        dict(name='div', attrs={'class':'viewmore'}),
        dict(name='li', attrs={'class':'email'}),
        dict(name='div', attrs={'class':'story_tool_hr'}),
        dict(name='div', attrs={'class':'clear'}),
        dict(name='div', attrs={'class':'story_tool'}),
        dict(name='div', attrs={'class':'copyright'}),
        dict(name='div', attrs={'class':'rule_grey_solid'}),
        dict(name='li', attrs={'class':'print'}),
        dict(name='li', attrs={'class':'share'}),
        dict(name='ul', attrs={'class':'bullet'}),
    ]

    def preprocess_html(self, soup):
        # Delete empty id attributes--they screw up the TOC for unknown reasons.
        # Returns the same soup object, modified in place.
        for div in soup.findAll('div', attrs={'id':''}):
            del div['id']
        return soup

    def parse_index(self):
        # Build the feed list from the "today's paper" index page.
        # Returns a list of (section title, [article dict, ...]) tuples in the
        # order the sections first appear; sections without articles are omitted.
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        # Articles found before the first section heading are filed under 'News'.
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class' : ["section_title02","featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A heading that repeats on the page must not produce the
                # same feed twice in the result.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        # `in` instead of dict.has_key(), which no longer exists in Python 3.
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

View File

@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html # http://online.wsj.com/page/us_in_todays_paper.html
@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
soup = self.wsj_get_index() soup = self.wsj_get_index()
year = strftime('%Y')
for x in soup.findAll('td', attrs={'class':'b14'}):
txt = self.tag_to_string(x).strip()
if year in txt:
self.timefmt = ' [%s]'%txt
break
left_column = soup.find( left_column = soup.find(
text=lambda t: 'begin ITP Left Column' in str(t)) text=lambda t: 'begin ITP Left Column' in str(t))
@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe):
url = url.partition('#')[0] url = url.partition('#')[0]
desc = '' desc = ''
d = x.findNextSibling(True) d = x.findNextSibling(True)
if d.get('class', None) == 'arialResize': if d is not None and d.get('class', None) == 'arialResize':
desc = self.tag_to_string(d) desc = self.tag_to_string(d)
desc = desc.partition(u'\u2022')[0] desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title) self.log('\t\tFound article:', title)

View File

@ -3,47 +3,139 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
online.wsj.com.com online.wsj.com
''' '''
import re import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, date
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
title = u'Wall Street Journal (free)' title = u'Wall Street Journal (free)'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
language = 'en' language = 'en'
description = ('All the free content from the Wall Street Journal (business' description = ('All the free content from the Wall Street Journal (business, financial and political news)')
', financial and political news)')
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} # customization notes: delete sections you are not interested in
.subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} # set omit_paid_content to False if you want the paid content article snippets
.insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;} # set oldest_article to the maximum number of days back from today to include articles
.targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;} sectionlist = [
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} ['/home-page','Front Page'],
.tagline { ont-size:xx-small;} ['/public/page/news-opinion-commentary.html','Commentary'],
.dateStamp {font-family:Arial,Helvetica,sans-serif;} ['/public/page/news-global-world.html','World News'],
h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} ['/public/page/news-world-business.html','US News'],
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;} ['/public/page/news-business-us.html','Business'],
['/public/page/news-financial-markets-stock.html','Markets'],
['/public/page/news-tech-technology.html','Technology'],
['/public/page/news-personal-finance.html','Personal Finnce'],
['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
['/public/page/news-real-estate-homes.html','Real Estate'],
['/public/page/news-career-jobs.html','Careers'],
['/public/page/news-small-business-marketing.html','Small Business']
]
oldest_article = 2
omit_paid_content = True
extra_css = '''h1{font-size:large; font-family:Times,serif;}
h2{font-family:Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Times,serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
.article{font-family:Times,serif; font-size:x-small;}
.tagline { font-size:xx-small;}
.dateStamp {font-family:Times,serif;}
h3{font-family:Times,serif; font-size:xx-small;}
.byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;} .metadataType-articleCredits {list-style-type: none;}
h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;} h6{font-family:Times,serif; font-size:small; font-style:italic;}
.paperLocation{font-size:xx-small;}''' .paperLocation{font-size:xx-small;}'''
remove_tags_before = dict(name='h1')
remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
"articleTabs_tab_interactive","articleTabs_tab_video", remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
"articleTabs_tab_map","articleTabs_tab_slideshow"]), #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map', # "articleTabs_tab_interactive","articleTabs_tab_video",
'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip', # "articleTabs_tab_map","articleTabs_tab_slideshow"]),
'adSummary', 'nav-inline','insetFullBracket']}, {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
dict(rel='shortcut icon'), 'insettip','insetClose','more_in', "insetContent",
# 'articleTools_bottom','articleTools_bottom mjArticleTools',
'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict({'class':re.compile('^articleTools_bottom')}),
dict(rel='shortcut icon')
] ]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}] remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
return br
def preprocess_html(self,soup): def preprocess_html(self,soup):
def decode_us_date(datestr):
    # Turn a "Month D YYYY" string (any letter case, surrounding whitespace
    # ignored) into a datetime.date. The month must be a full English name.
    month_names = ['january','february','march','april','may','june','july','august','september','october','november','december']
    tokens = datestr.strip().lower().split()
    month = month_names.index(tokens[0]) + 1
    day = int(tokens[1])
    year = int(tokens[2])
    return date(year, month, day)
# check if article is paid content
if self.omit_paid_content:
divtags = soup.findAll('div','tooltip')
if divtags:
for divtag in divtags:
if divtag.find(text="Subscriber Content"):
return None
# check if article is too old
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
if datetag:
dateline_string = self.tag_to_string(datetag,False)
date_items = dateline_string.split(',')
datestring = date_items[0]+date_items[1]
article_date = decode_us_date(datestring)
earliest_date = date.today() - timedelta(days=self.oldest_article)
if article_date < earliest_date:
self.log("Skipping article dated %s" % datestring)
return None
datetag.parent.extract()
# place dateline in article heading
bylinetag = soup.find('h3','byline')
if bylinetag:
h3bylinetag = bylinetag
else:
bylinetag = soup.find('li','byline')
if bylinetag:
h3bylinetag = bylinetag.h3
if not h3bylinetag:
h3bylinetag = bylinetag
bylinetag = bylinetag.parent
if bylinetag:
if h3bylinetag.a:
bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
else:
bylinetext = self.tag_to_string(h3bylinetag,False)
h3byline = Tag(soup,'h3',[('class','byline')])
if bylinetext.isspace() or (bylinetext == ''):
h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
else:
h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
bylinetag.replaceWith(h3byline)
else:
headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
if headlinetag:
dateline = Tag(soup,'h3', [('class','byline')])
dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
headlinetag.insert(len(headlinetag),dateline)
else: # if no date tag, don't process this page--it's not a news item
return None
# This gets rid of the annoying superfluous bullet symbol preceding columnist bylines # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'}) ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
if ultag: if ultag:
@ -58,7 +150,7 @@ class WSJ(BasicNewsRecipe):
key = None key = None
ans = [] ans = []
def parse_index_page(page_name,page_title,omit_paid_content): def parse_index_page(page_name,page_title):
def article_title(tag): def article_title(tag):
atag = tag.find('h2') # title is usually in an h2 tag atag = tag.find('h2') # title is usually in an h2 tag
@ -119,7 +211,6 @@ class WSJ(BasicNewsRecipe):
soup = self.index_to_soup(pageurl) soup = self.index_to_soup(pageurl)
# Find each instance of div with class including "headlineSummary" # Find each instance of div with class including "headlineSummary"
for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}): for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
# divtag contains all article data as ul's and li's # divtag contains all article data as ul's and li's
# first, check if there is an h3 tag which provides a section name # first, check if there is an h3 tag which provides a section name
stag = divtag.find('h3') stag = divtag.find('h3')
@ -162,7 +253,7 @@ class WSJ(BasicNewsRecipe):
# now skip paid subscriber articles if desired # now skip paid subscriber articles if desired
subscriber_tag = litag.find(text="Subscriber Content") subscriber_tag = litag.find(text="Subscriber Content")
if subscriber_tag: if subscriber_tag:
if omit_paid_content: if self.omit_paid_content:
continue continue
# delete the tip div so it doesn't get in the way # delete the tip div so it doesn't get in the way
tiptag = litag.find("div", { "class" : "tipTargetBox" }) tiptag = litag.find("div", { "class" : "tipTargetBox" })
@ -185,7 +276,7 @@ class WSJ(BasicNewsRecipe):
continue continue
if url.startswith("/article"): if url.startswith("/article"):
url = mainurl+url url = mainurl+url
if not url.startswith("http"): if not url.startswith("http://online.wsj.com"):
continue continue
if not url.endswith(".html"): if not url.endswith(".html"):
continue continue
@ -214,48 +305,10 @@ class WSJ(BasicNewsRecipe):
articles[page_title] = [] articles[page_title] = []
articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article previews
sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
omit_paid_content = True
if 'Front Page' in sectionlist: for page_name,page_title in self.sectionlist:
parse_index_page('/home-page','Front Page',omit_paid_content) parse_index_page(page_name,page_title)
ans.append('Front Page') ans.append(page_title)
if 'Commentary' in sectionlist:
parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
ans.append('Commentary')
if 'World News' in sectionlist:
parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
ans.append('World News')
if 'US News' in sectionlist:
parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
ans.append('US News')
if 'Business' in sectionlist:
parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
ans.append('Business')
if 'Markets' in sectionlist:
parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
ans.append('Markets')
if 'Technology' in sectionlist:
parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
ans.append('Technology')
if 'Personal Finance' in sectionlist:
parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
ans.append('Personal Finance')
if 'Life & Style' in sectionlist:
parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
ans.append('Life & Style')
if 'Real Estate' in sectionlist:
parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
ans.append('Real Estate')
if 'Careers' in sectionlist:
parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
ans.append('Careers')
if 'Small Business' in sectionlist:
parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
ans.append('Small Business')
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans return ans

View File

@ -0,0 +1,125 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class YemenTimesRecipe(BasicNewsRecipe):
    # Recipe for the Yemen Times. The section pages listed in `feeds` are
    # scraped for article links (parse_index), and every article page is
    # rebuilt into a minimal document (preprocess_html).
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en_YE'
    country = 'YE'
    version = 1

    title = u'Yemen Times'
    publisher = u'yementimes.com'
    category = u'News, Opinion, Yemen'
    description = u'Award winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    encoding = 'utf-8'

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1',
                                                      'class': 'DMAIN2'}))
    remove_attributes = ['style']

    # Prefix prepended to the hrefs found on the section pages.
    INDEX = 'http://www.yementimes.com/'

    # (section title, section page URL) pairs; these are HTML pages that
    # parse_index scrapes, not RSS feeds.
    feeds = []
    feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT'))
    feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news'))
    feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News'))
    feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report'))
    feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health'))
    feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview'))
    feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion'))
    feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business'))
    feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed'))
    feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture'))
    feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View'))
    feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety'))
    feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education'))

    extra_css = '''
body {font-family:verdana, arial, helvetica, geneva, sans-serif;}
div.yemen_byline {font-size: medium; font-weight: bold;}
div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;}
.yemen_caption {font-size: x-small; font-style: italic; color: #696969;}
'''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher, 'linearize_tables': True}

    def get_browser(self):
        # Use the stock browser, with gzip handling switched on.
        br = BasicNewsRecipe.get_browser()
        br.set_handle_gzip(True)
        return br

    def parse_index(self):
        # Scrape every section page for its article links.
        # Returns a list of (section title, [article dict, ...]) tuples.
        answer = []
        for feed_title, feed in self.feeds:
            soup = self.index_to_soup(feed)

            # The article list is the first <table> following div.newsbox.
            # If either is missing, skip this section rather than abort the
            # whole download with an AttributeError.
            newsbox = soup.find('div', 'newsbox')
            main = None
            if newsbox is not None:
                main = newsbox.findNextSibling('table')
            if main is None:
                self.log('No article list found for section %s' % feed_title)
                continue

            articles = []
            for li in main.findAll('li'):
                link = li.a
                # list items without a usable link carry no article
                if link is None or not link.get('href', None):
                    continue
                title = self.tag_to_string(link)
                url = self.INDEX + link['href']
                articles.append({'title': title, 'date': None, 'url': url, 'description': '<br/>&nbsp;'})

            answer.append((feed_title, articles))

        return answer

    def preprocess_html(self, soup):
        # Rebuild the article as a fresh document containing, in order: the
        # headline (as <h1>), the lead image, the publication date, the
        # byline and the story body. Returns the new soup.
        freshSoup = self.getFreshSoup(soup)

        headline = soup.find('div', attrs = {'id': 'DVMTIT'})
        if headline:
            div = headline.findNext('div', attrs = {'id': 'DVTOP'})
            img = None
            if div:
                img = div.find('img')

            headline.name = 'h1'
            freshSoup.body.append(headline)
            if img is not None:
                freshSoup.body.append(img)

        byline = soup.find('div', attrs = {'id': 'DVTIT'})
        if byline:
            # Must start out as None: a byline without a <span> has no date,
            # and the check further down would otherwise read an unbound local.
            date_div = None
            date_el = byline.find('span')
            if date_el:
                pub_date = self.tag_to_string(date_el)
                date_div = Tag(soup, 'div', attrs = [('class', 'yemen_date')])
                date_div.append(pub_date)
                # take the date out so it is not repeated in the byline text
                date_el.extract()

            raw = '<br/>'.join(['%s' % (part) for part in byline.findAll(text = True)])
            author = BeautifulSoup('<div class="yemen_byline">' + raw + '</div>')

            if date_div is not None:
                freshSoup.body.append(date_div)
            freshSoup.body.append(author)

        story = soup.find('div', attrs = {'id': 'DVDET'})
        if story:
            # tables that hold an image get the caption style from extra_css
            for table in story.findAll('table'):
                if table.find('img'):
                    table['class'] = 'yemen_caption'

            freshSoup.body.append(story)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        # Return an empty HTML document that carries over the title text of
        # oldSoup, when it has one.
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        # a page without <head> must not raise here
        if oldSoup.head and oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
        return freshSoup

View File

@ -0,0 +1,23 @@
/*
* images management
* Copyright 2008 Kovid Goyal
* License: GNU GPL v3
*/
function scale_images() {
    // Cap every visible image so that it fits inside the part of the window
    // to the right of its own left edge (width) and inside the window
    // height, each with a 5px margin.
    $("img:visible").each(function() {
        var img = $(this);
        var left = img.offset().left;
        img.css({
            "max-width": (window.innerWidth - left - 5) + "px",
            "max-height": (window.innerHeight - 5) + "px"
        });
    });
}
function setup_image_scaling_handlers() {
    // Scale once for the current window size, then again on every resize.
    scale_images();
    $(window).resize(scale_images);
}

View File

@ -2,10 +2,11 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import sys import atexit, os, shutil, sys, tempfile, zipfile
from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import numeric_version from calibre.constants import numeric_version
from calibre.ptempfile import PersistentTemporaryFile
class Plugin(object): class Plugin(object):
''' '''
@ -225,12 +226,14 @@ class MetadataWriterPlugin(Plugin):
''' '''
pass pass
class CatalogPlugin(Plugin): class CatalogPlugin(Plugin):
''' '''
A plugin that implements a catalog generator. A plugin that implements a catalog generator.
''' '''
resources_path = None
#: Output file type for which this plugin should be run #: Output file type for which this plugin should be run
#: For example: 'epub' or 'xml' #: For example: 'epub' or 'xml'
file_types = set([]) file_types = set([])
@ -248,15 +251,19 @@ class CatalogPlugin(Plugin):
#: '%default' + "'"))] #: '%default' + "'"))]
cli_options = [] cli_options = []
def search_sort_db(self, db, opts): def search_sort_db(self, db, opts):
if opts.search_text:
# If declared, --ids overrides any declared search criteria
if not opts.ids and opts.search_text:
db.search(opts.search_text) db.search(opts.search_text)
if opts.sort_by: if opts.sort_by:
# 2nd arg = ascending # 2nd arg = ascending
db.sort(opts.sort_by, True) db.sort(opts.sort_by, True)
return db.get_data_as_dict() return db.get_data_as_dict(ids=opts.ids)
def get_output_fields(self, opts): def get_output_fields(self, opts):
# Return a list of requested fields, with opts.sort_by first # Return a list of requested fields, with opts.sort_by first
@ -272,11 +279,40 @@ class CatalogPlugin(Plugin):
fields = list(all_fields & requested_fields) fields = list(all_fields & requested_fields)
else: else:
fields = list(all_fields) fields = list(all_fields)
fields.sort() fields.sort()
fields.insert(0,fields.pop(int(fields.index(opts.sort_by)))) if opts.sort_by:
fields.insert(0,fields.pop(int(fields.index(opts.sort_by))))
return fields return fields
def run(self, path_to_output, opts, db): def initialize(self):
'''
If plugin is not a built-in, copy the plugin's .ui and .py files from
the zip file to $TMPDIR.
Tab will be dynamically generated and added to the Catalog Options dialog in
calibre.gui2.dialogs.catalog.py:Catalog
'''
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.customize.ui import config
from calibre.ptempfile import PersistentTemporaryDirectory
if not type(self) in builtin_plugins and \
not self.name in config['disabled_plugins']:
files_to_copy = ["%s.%s" % (self.name.lower(),ext) for ext in ["ui","py"]]
resources = zipfile.ZipFile(self.plugin_path,'r')
if self.resources_path is None:
self.resources_path = PersistentTemporaryDirectory('_plugin_resources', prefix='')
for file in files_to_copy:
try:
resources.extract(file, self.resources_path)
except:
print " customize:__init__.initialize(): %s not found in %s" % (file, os.path.basename(self.plugin_path))
continue
resources.close()
def run(self, path_to_output, opts, db, ids):
''' '''
Run the plugin. Must be implemented in subclasses. Run the plugin. Must be implemented in subclasses.
It should generate the catalog in the format specified It should generate the catalog in the format specified

View File

@ -18,7 +18,7 @@ class BLACKBERRY(USBMS):
VENDOR_ID = [0x0fca] VENDOR_ID = [0x0fca]
PRODUCT_ID = [0x8004, 0x0004] PRODUCT_ID = [0x8004, 0x0004]
BCD = [0x0200, 0x0107] BCD = [0x0200, 0x0107, 0x0201]
VENDOR_NAME = 'RIM' VENDOR_NAME = 'RIM'
WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD'

View File

@ -86,4 +86,5 @@ class NOOK(USBMS):
return drives return drives
def sanitize_path_components(self, components):
    '''
    Return a new list in which every '#' in each path component has
    been replaced by '_'.
    '''
    cleaned = []
    for part in components:
        cleaned.append(part.replace('#', '_'))
    return cleaned

View File

@ -782,6 +782,13 @@ class Device(DeviceConfig, DevicePlugin):
''' '''
return default return default
def sanitize_path_components(self, components):
    '''
    Hook for device specific clean-up of the path components of a file
    that is about to be uploaded to the device.

    This default implementation changes nothing: it returns `components`
    itself.
    '''
    return components
def create_upload_path(self, path, mdata, fname): def create_upload_path(self, path, mdata, fname):
path = os.path.abspath(path) path = os.path.abspath(path)
extra_components = [] extra_components = []
@ -834,6 +841,7 @@ class Device(DeviceConfig, DevicePlugin):
extra_components = list(map(remove_trailing_periods, extra_components)) extra_components = list(map(remove_trailing_periods, extra_components))
components = shorten_components_to(250 - len(path), extra_components) components = shorten_components_to(250 - len(path), extra_components)
components = self.sanitize_path_components(components)
filepath = os.path.join(path, *components) filepath = os.path.join(path, *components)
filedir = os.path.dirname(filepath) filedir = os.path.dirname(filepath)

View File

@ -132,7 +132,8 @@ class FB2MLizer(object):
href = self.oeb_book.guide['titlepage'].href href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href] item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None: if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output return output
@ -152,7 +153,7 @@ class FB2MLizer(object):
text = [] text = []
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href) self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append(self.add_page_anchor(item)) text.append(self.add_page_anchor(item))
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
return ''.join(text) return ''.join(text)

View File

@ -32,7 +32,7 @@ class LITOutput(OutputFormatPlugin):
mangler(oeb, opts) mangler(oeb, opts)
rasterizer = SVGRasterizer() rasterizer = SVGRasterizer()
rasterizer(oeb, opts) rasterizer(oeb, opts)
lit = LitWriter() lit = LitWriter(self.opts)
lit(oeb, output_path) lit(oeb, output_path)

View File

@ -134,7 +134,7 @@ def warn(x):
class ReBinary(object): class ReBinary(object):
NSRMAP = {'': None, XML_NS: 'xml'} NSRMAP = {'': None, XML_NS: 'xml'}
def __init__(self, root, item, oeb, map=HTML_MAP): def __init__(self, root, item, oeb, opts, map=HTML_MAP):
self.item = item self.item = item
self.logger = oeb.logger self.logger = oeb.logger
self.manifest = oeb.manifest self.manifest = oeb.manifest
@ -143,7 +143,7 @@ class ReBinary(object):
self.anchors = [] self.anchors = []
self.page_breaks = [] self.page_breaks = []
self.is_html = is_html = map is HTML_MAP self.is_html = is_html = map is HTML_MAP
self.stylizer = Stylizer(root, item.href, oeb) if is_html else None self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None
self.tree_to_binary(root) self.tree_to_binary(root)
self.content = self.buf.getvalue() self.content = self.buf.getvalue()
self.ahc = self.build_ahc() if is_html else None self.ahc = self.build_ahc() if is_html else None
@ -295,9 +295,8 @@ def preserve(function):
return wrapper return wrapper
class LitWriter(object): class LitWriter(object):
def __init__(self): def __init__(self, opts):
# Wow, no options self.opts = opts
pass
def _litize_oeb(self): def _litize_oeb(self):
oeb = self._oeb oeb = self._oeb
@ -469,7 +468,7 @@ class LitWriter(object):
secnum = 0 secnum = 0
if isinstance(data, etree._Element): if isinstance(data, etree._Element):
self._add_folder(name) self._add_folder(name)
rebin = ReBinary(data, item, self._oeb, map=HTML_MAP) rebin = ReBinary(data, item, self._oeb, self.opts, map=HTML_MAP)
self._add_file(name + '/ahc', rebin.ahc, 0) self._add_file(name + '/ahc', rebin.ahc, 0)
self._add_file(name + '/aht', rebin.aht, 0) self._add_file(name + '/aht', rebin.aht, 0)
item.page_breaks = rebin.page_breaks item.page_breaks = rebin.page_breaks
@ -562,7 +561,7 @@ class LitWriter(object):
meta.attrib['ms--minimum_level'] = '0' meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1' meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper() meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP) rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP)
meta = rebin.content meta = rebin.content
self._meta = meta self._meta = meta
self._add_file('/meta', meta) self._add_file('/meta', meta)

View File

@ -128,6 +128,10 @@ def do_set_metadata(opts, mi, stream, stream_type):
mi.title_sort = title_sort(opts.title) mi.title_sort = title_sort(opts.title)
if getattr(opts, 'tags', None) is not None: if getattr(opts, 'tags', None) is not None:
mi.tags = [t.strip() for t in opts.tags.split(',')] mi.tags = [t.strip() for t in opts.tags.split(',')]
if getattr(opts, 'series', None) is not None:
mi.series = opts.series.strip()
if getattr(opts, 'series_index', None) is not None:
mi.series_index = float(opts.series_index.strip())
if getattr(opts, 'cover', None) is not None: if getattr(opts, 'cover', None) is not None:
ext = os.path.splitext(opts.cover)[1].replace('.', '').upper() ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()

View File

@ -134,7 +134,10 @@ def metadata_from_filename(name, pat=None):
mi.authors = aus mi.authors = aus
if prefs['swap_author_names'] and mi.authors: if prefs['swap_author_names'] and mi.authors:
def swap(a): def swap(a):
parts = a.split() if ',' in a:
parts = a.split(',', 1)
else:
parts = a.split(None, 1)
if len(parts) > 1: if len(parts) > 1:
t = parts[-1] t = parts[-1]
parts = parts[:-1] parts = parts[:-1]

View File

@ -92,6 +92,7 @@ class MobiMLizer(object):
def __call__(self, oeb, context): def __call__(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...') oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb self.oeb = oeb
self.opts = context
self.profile = profile = context.dest self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
@ -114,7 +115,7 @@ class MobiMLizer(object):
def mobimlize_spine(self): def mobimlize_spine(self):
'Iterate over the spine and convert it to MOBIML' 'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine: for item in self.oeb.spine:
stylizer = Stylizer(item.data, item.href, self.oeb, self.profile) stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
body = item.data.find(XHTML('body')) body = item.data.find(XHTML('body'))
nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
nbody = etree.SubElement(nroot, XHTML('body')) nbody = etree.SubElement(nroot, XHTML('body'))

View File

@ -563,6 +563,16 @@ class MobiReader(object):
recindex = attrib.pop(attr, None) or recindex recindex = attrib.pop(attr, None) or recindex
if recindex is not None: if recindex is not None:
attrib['src'] = 'images/%s.jpg' % recindex attrib['src'] = 'images/%s.jpg' % recindex
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval)
except:
del attrib[attr]
elif tag.tag == 'pre': elif tag.tag == 'pre':
if not tag.text: if not tag.text:
tag.tag = 'div' tag.tag = 'div'

View File

@ -1,99 +0,0 @@
'''
Registry associating file extensions with Reader classes.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
from itertools import chain
import calibre
from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config
# Public API of this module.  Every name listed here has to be defined in
# this file, otherwise ``from <module> import *`` fails with AttributeError
# (the previous value named a function that does not exist here).
__all__ = ['ReaderFactory', 'WriterFactory', 'option_parser', 'main']

# Maps a lower-cased file extension to a ``(Reader, Writer)`` pair of classes.
# ``None`` in a slot means no class is registered for that direction; the
# factories below turn that into an OEBError.
REGISTRY = {
    '.opf': (OEBReader, None),
    '.lit': (LitReader, LitWriter),
    '.mobi': (MobiReader, MobiWriter),
}
def ReaderFactory(path):
    """Return the Reader class to use for the e-book at *path*.

    A directory is always handled by ``OEBReader``.  For anything else the
    class is looked up in ``REGISTRY`` by the lower-cased file extension.

    Raises ``OEBError`` when no Reader is registered for the extension.
    """
    if os.path.isdir(path):
        return OEBReader
    suffix = os.path.splitext(path)[1].lower()
    entry = REGISTRY.get(suffix, (None, None))
    reader_cls = entry[0]
    if reader_cls is not None:
        return reader_cls
    raise OEBError('Unknown e-book file extension %r' % suffix)
def WriterFactory(path):
    """Return the Writer class to use for the output location *path*.

    ``OEBWriter`` is chosen when *path* is an existing directory, or when it
    does not exist yet and carries no file extension.  Otherwise the class is
    looked up in ``REGISTRY`` by the lower-cased file extension.

    Raises ``OEBError`` when no Writer is registered for the extension.
    """
    if os.path.isdir(path):
        return OEBWriter
    suffix = os.path.splitext(path)[1].lower()
    # A not-yet-existing path without an extension is treated like a directory.
    if not (os.path.exists(path) or suffix):
        return OEBWriter
    entry = REGISTRY.get(suffix, (None, None))
    writer_cls = entry[1]
    if writer_cls is not None:
        return writer_cls
    raise OEBError('Unknown e-book file extension %r' % suffix)
def option_parser(Reader, Writer):
    """Build the command-line option parser for one Reader/Writer pair.

    The Reader, every transform listed in ``Reader.TRANSFORMS`` and
    ``Writer.TRANSFORMS`` (in that order), and finally the Writer each get to
    register their settings on a shared ``Config``.  The generic options
    (--encoding, --output, --pretty-print, --verbose) are then added to the
    parser created from that config, which is returned.
    """
    config = Config('ebook-convert', _('Options to control e-book conversion.'))
    Reader.config(config)
    for transform_cls in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
        transform_cls.config(config)
    Writer.config(config)

    # Options shared by every conversion, as (flags, keyword settings) pairs.
    generic_options = (
        (('--encoding',),
         dict(default=None,
              help=_('Character encoding for input. Default is to auto detect.'))),
        (('-o', '--output'),
         dict(default=None,
              help=_('Output file. Default is derived from input filename.'))),
        (('-p', '--pretty-print'),
         dict(action='store_true', default=False,
              help=_('Produce more human-readable XML output.'))),
        (('-v', '--verbose'),
         dict(default=0, action='count',
              help=_('Useful for debugging.'))),
    )
    parser = config.option_parser()
    for flags, settings in generic_options:
        parser.add_option(*flags, **settings)
    return parser
def main(argv=sys.argv):
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Any further arguments are parsed as options.  Returns 1 after printing
    usage/help when fewer than two paths are given or when stray positional
    arguments remain after option parsing, and 0 once the conversion
    pipeline (reader, transforms, writer) has run.
    """
    if len(argv) < 3:
        # Parenthesised single argument: prints the same text whether
        # ``print`` is a statement or a function.
        print(_("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]"))
        return 1

    inpath = argv[1]
    outpath = argv[2]
    Reader = ReaderFactory(inpath)
    Writer = WriterFactory(outpath)

    parser = option_parser(Reader, Writer)
    opts, leftovers = parser.parse_args(argv[3:])
    if leftovers:
        parser.print_help()
        return 1

    logger = logging.getLogger('ebook-convert')
    calibre.setup_cli_handlers(logger, logging.DEBUG)
    oeb = OEBBook(encoding=opts.encoding, pretty_print=opts.pretty_print,
                  logger=logger)
    context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE)

    # Instantiate every stage before running any of them.
    reader = Reader.generate(opts)
    writer = Writer.generate(opts)
    transforms = [transform_cls.generate(opts) for transform_cls
                  in chain(Reader.TRANSFORMS, Writer.TRANSFORMS)]

    reader(oeb, inpath)
    for transform in transforms:
        transform(oeb, context)
    writer(oeb, outpath)
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -110,9 +110,9 @@ class CSSSelector(etree.XPath):
class Stylizer(object): class Stylizer(object):
STYLESHEETS = WeakKeyDictionary() STYLESHEETS = WeakKeyDictionary()
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], def __init__(self, tree, path, oeb, opts, profile=PROFILES['PRS505'],
extra_css='', user_css=''): extra_css='', user_css=''):
self.oeb = oeb self.oeb, self.opts = oeb, opts
self.profile = profile self.profile = profile
self.logger = oeb.logger self.logger = oeb.logger
item = oeb.manifest.hrefs[path] item = oeb.manifest.hrefs[path]
@ -249,6 +249,8 @@ class Stylizer(object):
style.update(self._normalize_font(prop.cssValue)) style.update(self._normalize_font(prop.cssValue))
elif name == 'list-style': elif name == 'list-style':
style.update(self._normalize_list_style(prop.cssValue)) style.update(self._normalize_list_style(prop.cssValue))
elif name == 'text-align':
style.update(self._normalize_text_align(prop.cssValue))
else: else:
style[name] = prop.value style[name] = prop.value
if 'font-size' in style: if 'font-size' in style:
@ -306,6 +308,19 @@ class Stylizer(object):
return style return style
def _normalize_text_align(self, cssvalue):
style = {}
text = cssvalue.cssText
if text == 'inherit':
style['text-align'] = 'inherit'
else:
if text in ('left', 'justify'):
val = 'left' if self.opts.dont_justify else 'justify'
style['text-align'] = val
else:
style['text-align'] = text
return style
def _normalize_font(self, cssvalue): def _normalize_font(self, cssvalue):
composition = ('font-style', 'font-variant', 'font-weight', composition = ('font-style', 'font-variant', 'font-weight',
'font-size', 'line-height', 'font-family') 'font-size', 'line-height', 'font-family')
@ -411,6 +426,7 @@ class Style(object):
return result return result
def _unit_convert(self, value, base=None, font=None): def _unit_convert(self, value, base=None, font=None):
' Return value in pts'
if isinstance(value, (int, long, float)): if isinstance(value, (int, long, float)):
return value return value
try: try:
@ -447,6 +463,9 @@ class Style(object):
result = value * 0.40 result = value * 0.40
return result return result
def pt_to_px(self, value):
    """Convert *value* from points to pixels at ``self._profile.dpi``,
    using 72 points per inch."""
    pixels_per_point = self._profile.dpi / 72.0
    return pixels_per_point * value
@property @property
def fontSize(self): def fontSize(self):
def normalize_fontsize(value, base): def normalize_fontsize(value, base):

View File

@ -141,7 +141,7 @@ class CSSFlattener(object):
bs.append('text-align: '+ \ bs.append('text-align: '+ \
('left' if self.context.dont_justify else 'justify')) ('left' if self.context.dont_justify else 'justify'))
body.set('style', '; '.join(bs)) body.set('style', '; '.join(bs))
stylizer = Stylizer(html, item.href, self.oeb, profile, stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
user_css=self.context.extra_css, user_css=self.context.extra_css,
extra_css=css) extra_css=css)
self.stylizers[item] = stylizer self.stylizers[item] = stylizer

View File

@ -29,13 +29,14 @@ class CaseMangler(object):
@classmethod @classmethod
def generate(cls, opts): def generate(cls, opts):
return cls() return cls()
def __call__(self, oeb, context): def __call__(self, oeb, context):
oeb.logger.info('Applying case-transforming CSS...') oeb.logger.info('Applying case-transforming CSS...')
self.oeb = oeb self.oeb = oeb
self.opts = context
self.profile = context.source self.profile = context.source
self.mangle_spine() self.mangle_spine()
def mangle_spine(self): def mangle_spine(self):
id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css')
self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS)
@ -44,9 +45,9 @@ class CaseMangler(object):
relhref = item.relhref(href) relhref = item.relhref(href)
etree.SubElement(html.find(XHTML('head')), XHTML('link'), etree.SubElement(html.find(XHTML('head')), XHTML('link'),
rel='stylesheet', href=relhref, type=CSS_MIME) rel='stylesheet', href=relhref, type=CSS_MIME)
stylizer = Stylizer(html, item.href, self.oeb, self.profile) stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
self.mangle_elem(html.find(XHTML('body')), stylizer) self.mangle_elem(html.find(XHTML('body')), stylizer)
def text_transform(self, transform, text): def text_transform(self, transform, text):
if transform == 'capitalize': if transform == 'capitalize':
return text.title() return text.title()
@ -55,7 +56,7 @@ class CaseMangler(object):
elif transform == 'lowercase': elif transform == 'lowercase':
return text.lower() return text.lower()
return text return text
def split_text(self, text): def split_text(self, text):
results = [''] results = ['']
isupper = text[0].isupper() isupper = text[0].isupper()
@ -66,7 +67,7 @@ class CaseMangler(object):
isupper = not isupper isupper = not isupper
results.append(char) results.append(char)
return results return results
def smallcaps_elem(self, elem, attr): def smallcaps_elem(self, elem, attr):
texts = self.split_text(getattr(elem, attr)) texts = self.split_text(getattr(elem, attr))
setattr(elem, attr, None) setattr(elem, attr, None)
@ -90,7 +91,7 @@ class CaseMangler(object):
last.tail = tail last.tail = tail
child.tail = None child.tail = None
last = child last = child
def mangle_elem(self, elem, stylizer): def mangle_elem(self, elem, stylizer):
if not isinstance(elem.tag, basestring) or \ if not isinstance(elem.tag, basestring) or \
namespace(elem.tag) != XHTML_NS: namespace(elem.tag) != XHTML_NS:

View File

@ -44,6 +44,7 @@ class SVGRasterizer(object):
def __call__(self, oeb, context): def __call__(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...') oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb self.oeb = oeb
self.opts = context
self.profile = context.dest self.profile = context.dest
self.images = {} self.images = {}
self.dataize_manifest() self.dataize_manifest()
@ -102,7 +103,7 @@ class SVGRasterizer(object):
def rasterize_spine(self): def rasterize_spine(self):
for item in self.oeb.spine: for item in self.oeb.spine:
html = item.data html = item.data
stylizer = Stylizer(html, item.href, self.oeb, self.profile) stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
self.rasterize_item(item, stylizer) self.rasterize_item(item, stylizer)
def rasterize_item(self, item, stylizer): def rasterize_item(self, item, stylizer):

View File

@ -20,6 +20,10 @@ class Font(object):
class Column(object): class Column(object):
# A column contains an element if the element bulges out to
# the left or the right by at most HFUZZ*col width.
HFUZZ = 0.2
def __init__(self): def __init__(self):
self.left = self.right = self.top = self.bottom = 0 self.left = self.right = self.top = self.bottom = 0
self.width = self.height = 0 self.width = self.height = 0
@ -41,6 +45,10 @@ class Column(object):
for x in self.elements: for x in self.elements:
yield x yield x
def contains(self, elem):
    """Return True when *elem* lies horizontally inside this column.

    The element may stick out past either edge by less than
    ``HFUZZ * self.width``; both comparisons are strict.
    """
    slack = self.HFUZZ * self.width
    return elem.left > self.left - slack and \
            elem.right < self.right + slack
class Element(object): class Element(object):
def __eq__(self, other): def __eq__(self, other):
@ -132,6 +140,18 @@ class Interval(object):
def __hash__(self): def __hash__(self):
return hash('(%f,%f)'%self.left, self.right) return hash('(%f,%f)'%self.left, self.right)
class Region(object):
    """Holds a list of columns together with bounding-box fields.

    ``columns`` is kept ordered by each column's ``left`` attribute.  The
    geometry attributes (top, bottom, left, right, width, height) all start
    at 0; nothing in this class updates them.
    """

    def __init__(self):
        self.columns = []
        self.top = self.bottom = self.left = self.right = self.width = self.height = 0

    def add_columns(self, columns):
        """Store *columns*, sorted by their ``left`` edge.

        Only the first call on an empty region has an effect.  Adding to a
        region that already holds columns is not implemented and is
        silently ignored.
        """
        if self.columns:
            # Merging into an already populated region is not implemented.
            return
        # ``key=`` gives the same stable ordering as the old ``cmp=`` form
        # and also works on Python 3, where ``cmp=`` no longer exists.
        self.columns.extend(sorted(columns, key=lambda col: col.left))
class Page(object): class Page(object):
@ -238,11 +258,10 @@ class Page(object):
return columns return columns
def find_elements_in_row_of(self, x): def find_elements_in_row_of(self, x):
interval = Interval(x.top - self.YFUZZ * self.average_text_height, interval = Interval(x.top,
x.top + self.YFUZZ*(1+self.average_text_height)) x.top + self.YFUZZ*(1+self.average_text_height))
h_interval = Interval(x.left, x.right) h_interval = Interval(x.left, x.right)
m = max(0, x.idx-15) for y in self.elements[x.idx:x.idx+15]:
for y in self.elements[m:x.idx+15]:
if y is not x: if y is not x:
y_interval = Interval(y.top, y.bottom) y_interval = Interval(y.top, y.bottom)
x_interval = Interval(y.left, y.right) x_interval = Interval(y.left, y.right)

View File

@ -113,7 +113,8 @@ class PMLMLizer(object):
href = self.oeb_book.guide['titlepage'].href href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href] item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None: if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output return output

View File

@ -90,7 +90,8 @@ class RBMLizer(object):
href = self.oeb_book.guide['titlepage'].href href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href] item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None: if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output return output
@ -111,7 +112,7 @@ class RBMLizer(object):
output = [u''] output = [u'']
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to RocketBook HTML...' % item.href) self.log.debug('Converting %s to RocketBook HTML...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output.append(self.add_page_anchor(item)) output.append(self.add_page_anchor(item))
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
return ''.join(output) return ''.join(output)

View File

@ -111,12 +111,13 @@ class RTFMLizer(object):
href = self.oeb_book.guide['titlepage'].href href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href] item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None: if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output += '{\\page } ' output += '{\\page } '
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to RTF markup...' % item.href) self.log.debug('Converting %s to RTF markup...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output += self.footer() output += self.footer()
output = self.insert_images(output) output = self.insert_images(output)

View File

@ -54,7 +54,7 @@ class TXTMLizer(object):
output.append(self.get_toc()) output.append(self.get_toc())
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to TXT...' % item.href) self.log.debug('Converting %s to TXT...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content) content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer) output += self.dump_text(etree.fromstring(content), stylizer)

View File

@ -4,9 +4,14 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.ebooks.conversion.plumber import Plumber import os
from calibre.utils.logging import Log from optparse import OptionParser
from calibre.customize.conversion import OptionRecommendation, DummyReporter from calibre.customize.conversion import OptionRecommendation, DummyReporter
from calibre.ebooks.conversion.plumber import Plumber
from calibre.customize.ui import plugin_for_catalog_format
from calibre.utils.logging import Log
from calibre.gui2 import choose_dir, Application
def gui_convert(input, output, recommendations, notification=DummyReporter(), def gui_convert(input, output, recommendations, notification=DummyReporter(),
abort_after_input_dump=False, log=None): abort_after_input_dump=False, log=None):
@ -20,7 +25,7 @@ def gui_convert(input, output, recommendations, notification=DummyReporter(),
plumber.run() plumber.run()
def gui_catalog(fmt, title, dbspec, ids, out_file_name, def gui_catalog(fmt, title, dbspec, ids, out_file_name, fmt_options,
notification=DummyReporter(), log=None): notification=DummyReporter(), log=None):
if log is None: if log is None:
log = Log() log = Log()
@ -31,8 +36,28 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name,
db = LibraryDatabase2(dbpath) db = LibraryDatabase2(dbpath)
else: # To be implemented in the future else: # To be implemented in the future
pass pass
# Implement the interface to the catalog generating code here
db # Create a minimal OptionParser that we can append to
parser = OptionParser()
args = []
parser.add_option("--verbose", action="store_true", dest="verbose", default=True)
opts, args = parser.parse_args()
# Populate opts
opts.ids = ids
opts.search_text = None
opts.sort_by = None
# Extract the option dictionary to comma-separated lists
for option in fmt_options:
setattr(opts,option, ','.join(fmt_options[option]))
# Fetch and run the plugin for fmt
plugin = plugin_for_catalog_format(fmt)
plugin.run(out_file_name, opts, db)

View File

@ -6,39 +6,131 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from PyQt4.Qt import QDialog import os, shutil, sys, tempfile
from PyQt4.Qt import QDialog, QWidget
from calibre.customize.ui import config
from calibre.gui2.dialogs.catalog_ui import Ui_Dialog from calibre.gui2.dialogs.catalog_ui import Ui_Dialog
from calibre.gui2 import dynamic from calibre.gui2 import gprefs, dynamic
from calibre.customize.ui import available_catalog_formats from calibre.customize.ui import available_catalog_formats, catalog_plugins
from calibre.gui2.catalog.catalog_csv_xml import PluginWidget
class Catalog(QDialog, Ui_Dialog): class Catalog(QDialog, Ui_Dialog):
''' Catalog Dialog builder'''
widgets = []
def __init__(self, parent, dbspec, ids): def __init__(self, parent, dbspec, ids):
import re, cStringIO
from calibre import prints as info
from calibre.gui2 import dynamic
from PyQt4.uic import compileUi
QDialog.__init__(self, parent) QDialog.__init__(self, parent)
# Run the dialog setup generated from catalog.ui
self.setupUi(self) self.setupUi(self)
self.dbspec, self.ids = dbspec, ids self.dbspec, self.ids = dbspec, ids
# Display the number of books we've been passed
self.count.setText(unicode(self.count.text()).format(len(ids))) self.count.setText(unicode(self.count.text()).format(len(ids)))
# Display the last-used title
self.title.setText(dynamic.get('catalog_last_used_title', self.title.setText(dynamic.get('catalog_last_used_title',
_('My Books'))) _('My Books')))
fmts = sorted([x.upper() for x in available_catalog_formats()])
# GwR *** Add option tabs for built-in formats
# This code models #69 in calibre/gui2/dialogs/config/__init__.py
self.fmts = []
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.customize import CatalogPlugin
for plugin in catalog_plugins():
if plugin.name in config['disabled_plugins']:
continue
name = plugin.name.lower().replace(' ', '_')
if type(plugin) in builtin_plugins:
#info("Adding widget for builtin Catalog plugin %s" % plugin.name)
try:
catalog_widget = __import__('calibre.gui2.catalog.'+name,
fromlist=[1])
pw = catalog_widget.PluginWidget()
pw.initialize(name)
pw.ICON = I('forward.svg')
self.widgets.append(pw)
[self.fmts.append([file_type.upper(), pw.sync_enabled,pw]) for file_type in plugin.file_types]
except ImportError:
info("ImportError with %s" % name)
continue
else:
# Load dynamic tab
form = os.path.join(plugin.resources_path,'%s.ui' % name)
klass = os.path.join(plugin.resources_path,'%s.py' % name)
compiled_form = os.path.join(plugin.resources_path,'%s_ui.py' % name)
if os.path.exists(form) and os.path.exists(klass):
#info("Adding widget for user-installed Catalog plugin %s" % plugin.name)
# Compile the .ui form provided in plugin.zip
if not os.path.exists(compiled_form):
# info('\tCompiling form', form)
buf = cStringIO.StringIO()
compileUi(form, buf)
dat = buf.getvalue()
dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)',
re.DOTALL).sub(r'_("\1")', dat)
open(compiled_form, 'wb').write(dat)
# Import the dynamic PluginWidget() from .py file provided in plugin.zip
try:
sys.path.insert(0, plugin.resources_path)
catalog_widget = __import__(name, fromlist=[1])
pw = catalog_widget.PluginWidget()
pw.initialize(name)
pw.ICON = I('forward.svg')
self.widgets.append(pw)
[self.fmts.append([file_type.upper(), pw.sync_enabled,pw]) for file_type in plugin.file_types]
except ImportError:
info("ImportError with %s" % name)
continue
finally:
sys.path.remove(plugin.resources_path)
else:
info("No dynamic tab resources found for %s" % name)
self.widgets = sorted(self.widgets, key=lambda x:(x.TITLE, x.TITLE))
for pw in self.widgets:
page = self.tabs.addTab(pw,pw.TITLE)
# Generate a sorted list of installed catalog formats/sync_enabled pairs
fmts = sorted([x[0] for x in self.fmts])
self.sync_enabled_formats = []
for fmt in self.fmts:
if fmt[1]:
self.sync_enabled_formats.append(fmt[0])
# Callback when format changes
self.format.currentIndexChanged.connect(self.format_changed) self.format.currentIndexChanged.connect(self.format_changed)
# Add the installed catalog format list to the format QComboBox
self.format.addItems(fmts) self.format.addItems(fmts)
pref = dynamic.get('catalog_preferred_format', 'EPUB') pref = dynamic.get('catalog_preferred_format', 'CSV')
idx = self.format.findText(pref) idx = self.format.findText(pref)
if idx > -1: if idx > -1:
self.format.setCurrentIndex(idx) self.format.setCurrentIndex(idx)
if self.sync.isEnabled(): if self.sync.isEnabled():
self.sync.setChecked(dynamic.get('catalog_sync_to_device', True)) self.sync.setChecked(dynamic.get('catalog_sync_to_device', True))
def format_changed(self, idx): def format_changed(self, idx):
cf = unicode(self.format.currentText()) cf = unicode(self.format.currentText())
if cf in ('EPUB', 'MOBI'): if cf in self.sync_enabled_formats:
self.sync.setEnabled(True) self.sync.setEnabled(True)
else: else:
self.sync.setDisabled(True) self.sync.setDisabled(True)

View File

@ -6,105 +6,121 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>628</width> <width>611</width>
<height>503</height> <height>514</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">
<string>Generate catalog</string> <string>Generate catalog</string>
</property> </property>
<property name="windowIcon"> <property name="windowIcon">
<iconset resource="../../../work/calibre/resources/images.qrc"> <iconset>
<normaloff>:/images/library.png</normaloff>:/images/library.png</iconset> <normaloff>:/images/library.png</normaloff>:/images/library.png</iconset>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <widget class="QDialogButtonBox" name="buttonBox">
<item row="2" column="0"> <property name="geometry">
<widget class="QDialogButtonBox" name="buttonBox"> <rect>
<property name="orientation"> <x>430</x>
<enum>Qt::Horizontal</enum> <y>470</y>
</property> <width>164</width>
<property name="standardButtons"> <height>32</height>
<set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> </rect>
</property> </property>
</widget> <property name="orientation">
</item> <enum>Qt::Horizontal</enum>
<item row="1" column="0"> </property>
<widget class="QTabWidget" name="tabs"> <property name="standardButtons">
<property name="currentIndex"> <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
<number>0</number> </property>
</property> </widget>
<widget class="QWidget" name="tab"> <widget class="QTabWidget" name="tabs">
<attribute name="title"> <property name="geometry">
<string>Catalog options</string> <rect>
</attribute> <x>12</x>
<layout class="QGridLayout" name="gridLayout_2"> <y>39</y>
<item row="0" column="0"> <width>579</width>
<widget class="QLabel" name="label"> <height>411</height>
<property name="text"> </rect>
<string>Catalog &amp;format:</string> </property>
</property> <property name="currentIndex">
<property name="buddy"> <number>0</number>
<cstring>format</cstring> </property>
</property> <widget class="QWidget" name="tab">
</widget> <attribute name="title">
</item> <string>Catalog options</string>
<item row="0" column="2"> </attribute>
<widget class="QComboBox" name="format"/> <layout class="QGridLayout" name="gridLayout_2">
</item> <item row="0" column="0">
<item row="1" column="0"> <widget class="QLabel" name="label">
<widget class="QLabel" name="label_2"> <property name="text">
<property name="text"> <string>Catalog &amp;format:</string>
<string>Catalog &amp;title (existing catalog with the same title will be replaced):</string> </property>
</property> <property name="buddy">
<property name="wordWrap"> <cstring>format</cstring>
<bool>true</bool> </property>
</property> </widget>
<property name="buddy"> </item>
<cstring>title</cstring> <item row="0" column="2">
</property> <widget class="QComboBox" name="format"/>
</widget> </item>
</item> <item row="1" column="0">
<item row="2" column="1"> <widget class="QLabel" name="label_2">
<spacer name="verticalSpacer"> <property name="text">
<property name="orientation"> <string>Catalog &amp;title (existing catalog with the same title will be replaced):</string>
<enum>Qt::Vertical</enum> </property>
</property> <property name="wordWrap">
<property name="sizeHint" stdset="0"> <bool>true</bool>
<size> </property>
<width>20</width> <property name="buddy">
<height>299</height> <cstring>title</cstring>
</size> </property>
</property> </widget>
</spacer> </item>
</item> <item row="1" column="2">
<item row="3" column="0"> <widget class="QLineEdit" name="title"/>
<widget class="QCheckBox" name="sync"> </item>
<property name="text"> <item row="3" column="0">
<string>&amp;Send catalog to device automatically</string> <widget class="QCheckBox" name="sync">
</property> <property name="text">
</widget> <string>&amp;Send catalog to device automatically</string>
</item> </property>
<item row="1" column="2"> </widget>
<widget class="QLineEdit" name="title"/> </item>
</item> <item row="2" column="1">
</layout> <spacer name="verticalSpacer">
</widget> <property name="orientation">
</widget> <enum>Qt::Vertical</enum>
</item> </property>
<item row="0" column="0"> <property name="sizeHint" stdset="0">
<widget class="QLabel" name="count"> <size>
<property name="font"> <width>20</width>
<font> <height>299</height>
<weight>75</weight> </size>
<bold>true</bold> </property>
</font> </spacer>
</property> </item>
<property name="text"> </layout>
<string>Generate catalog for {0} books</string> </widget>
</property> </widget>
</widget> <widget class="QLabel" name="count">
</item> <property name="geometry">
</layout> <rect>
<x>12</x>
<y>12</y>
<width>205</width>
<height>17</height>
</rect>
</property>
<property name="font">
<font>
<weight>75</weight>
<bold>true</bold>
</font>
</property>
<property name="text">
<string>Generate catalog for {0} books</string>
</property>
</widget>
</widget> </widget>
<resources> <resources>
<include location="../../../work/calibre/resources/images.qrc"/> <include location="../../../work/calibre/resources/images.qrc"/>

View File

@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
if self.cover_fetcher.exception is not None: if self.cover_fetcher.exception is not None:
err = self.cover_fetcher.exception err = self.cover_fetcher.exception
error_dialog(self, _('Cannot fetch cover'), error_dialog(self, _('Cannot fetch cover'),
_('<b>Could not fetch cover.</b><br/>')+repr(err)).exec_() _('<b>Could not fetch cover.</b><br/>')+unicode(err)).exec_()
return return
pix = QPixmap() pix = QPixmap()

View File

@ -215,7 +215,7 @@ class TagsModel(QAbstractItemModel):
return QModelIndex() return QModelIndex()
child_item = index.internalPointer() child_item = index.internalPointer()
parent_item = child_item.parent parent_item = getattr(child_item, 'parent', None)
if parent_item is self.root_item or parent_item is None: if parent_item is self.root_item or parent_item is None:
return QModelIndex() return QModelIndex()

View File

@ -238,19 +238,36 @@ def fetch_scheduled_recipe(arg):
def generate_catalog(parent, dbspec, ids): def generate_catalog(parent, dbspec, ids):
from calibre.gui2.dialogs.catalog import Catalog from calibre.gui2.dialogs.catalog import Catalog
# Build the Catalog dialog in gui2.dialogs.catalog
d = Catalog(parent, dbspec, ids) d = Catalog(parent, dbspec, ids)
if d.exec_() != d.Accepted: if d.exec_() != d.Accepted:
return None return None
# Create the output file
out = PersistentTemporaryFile(suffix='_catalog_out.'+d.catalog_format.lower()) out = PersistentTemporaryFile(suffix='_catalog_out.'+d.catalog_format.lower())
# Retrieve plugin options
fmt_options = {}
for x in range(d.tabs.count()):
if str(d.tabs.tabText(x)).find(str(d.catalog_format)) > -1:
for fmt in d.fmts:
if fmt[0] == d.catalog_format:
fmt_options = fmt[2].options()
# print "gui2.tools:generate_catalog(): options for %s: %s" % (fmt[0], fmt_options)
args = [ args = [
d.catalog_format, d.catalog_format,
d.catalog_title, d.catalog_title,
dbspec, dbspec,
ids, ids,
out.name, out.name,
fmt_options
] ]
out.close() out.close()
# This calls gui2.convert.gui_conversion:gui_catalog()
return 'gui_catalog', args, _('Generate catalog'), out.name, d.catalog_sync, \ return 'gui_catalog', args, _('Generate catalog'), out.name, d.catalog_sync, \
d.catalog_title d.catalog_title

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
'''The main GUI''' '''The main GUI'''
import os, sys, textwrap, collections, time import atexit, os, shutil, sys, tempfile, textwrap, collections, time
from xml.parsers.expat import ExpatError from xml.parsers.expat import ExpatError
from Queue import Queue, Empty from Queue import Queue, Empty
from threading import Thread from threading import Thread
@ -357,7 +357,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
cm.addAction(_('Bulk convert')) cm.addAction(_('Bulk convert'))
cm.addSeparator() cm.addSeparator()
ac = cm.addAction( ac = cm.addAction(
_('Create catalog of the books in your calibre library')) _('Create catalog of books in your calibre library'))
ac.triggered.connect(self.generate_catalog) ac.triggered.connect(self.generate_catalog)
self.action_convert.setMenu(cm) self.action_convert.setMenu(cm)
self._convert_single_hook = partial(self.convert_ebook, bulk=False) self._convert_single_hook = partial(self.convert_ebook, bulk=False)
@ -1359,26 +1359,32 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
############################### Generate catalog ########################### ############################### Generate catalog ###########################
def generate_catalog(self): def generate_catalog(self):
rows = self.library_view.selectionModel().selectedRows() rows = self.library_view.selectionModel().selectedRows()
if not rows: if not rows or len(rows) < 2:
rows = xrange(self.library_view.model().rowCount(QModelIndex())) rows = xrange(self.library_view.model().rowCount(QModelIndex()))
ids = map(self.library_view.model().id, rows) ids = map(self.library_view.model().id, rows)
dbspec = None dbspec = None
if not ids: if not ids:
return error_dialog(self, _('No books selected'), return error_dialog(self, _('No books selected'),
_('No books selected to generate catalog for'), _('No books selected to generate catalog for'),
show=True) show=True)
# Calling gui2.tools:generate_catalog()
ret = generate_catalog(self, dbspec, ids) ret = generate_catalog(self, dbspec, ids)
if ret is None: if ret is None:
return return
func, args, desc, out, sync, title = ret func, args, desc, out, sync, title = ret
fmt = os.path.splitext(out)[1][1:].upper() fmt = os.path.splitext(out)[1][1:].upper()
job = self.job_manager.run_job( job = self.job_manager.run_job(
Dispatcher(self.catalog_generated), func, args=args, Dispatcher(self.catalog_generated), func, args=args,
description=desc) description=desc)
job.catalog_file_path = out job.catalog_file_path = out
job.catalog_sync, job.catalog_title = sync, title job.fmt = fmt
job.catalog_sync, job.catalog_title = sync, title
self.status_bar.showMessage(_('Generating %s catalog...')%fmt) self.status_bar.showMessage(_('Generating %s catalog...')%fmt)
def catalog_generated(self, job): def catalog_generated(self, job):
@ -1392,8 +1398,13 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
dynamic.set('catalogs_to_be_synced', sync) dynamic.set('catalogs_to_be_synced', sync)
self.status_bar.showMessage(_('Catalog generated.'), 3000) self.status_bar.showMessage(_('Catalog generated.'), 3000)
self.sync_catalogs() self.sync_catalogs()
if job.fmt in ['CSV','XML']:
export_dir = choose_dir(self, 'Export Catalog Directory',
'Select destination for %s.%s' % (job.catalog_title, job.fmt.lower()))
if export_dir:
destination = os.path.join(export_dir, '%s.%s' % (job.catalog_title, job.fmt.lower()))
shutil.copyfile(job.catalog_file_path, destination)
############################### Fetch news ################################# ############################### Fetch news #################################
def download_scheduled_recipe(self, arg): def download_scheduled_recipe(self, arg):

View File

@ -7,14 +7,14 @@
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>479</width> <width>479</width>
<height>574</height> <height>606</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">
<string>Configure Ebook viewer</string> <string>Configure Ebook viewer</string>
</property> </property>
<property name="windowIcon"> <property name="windowIcon">
<iconset resource="../../../../resources/images.qrc"> <iconset>
<normaloff>:/images/config.svg</normaloff>:/images/config.svg</iconset> <normaloff>:/images/config.svg</normaloff>:/images/config.svg</iconset>
</property> </property>
<layout class="QGridLayout" name="gridLayout_4"> <layout class="QGridLayout" name="gridLayout_4">
@ -164,7 +164,7 @@
</item> </item>
</widget> </widget>
</item> </item>
<item row="6" column="0" colspan="2"> <item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remember_window_size"> <widget class="QCheckBox" name="opt_remember_window_size">
<property name="text"> <property name="text">
<string>Remember last used &amp;window size</string> <string>Remember last used &amp;window size</string>
@ -218,6 +218,13 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fit_images">
<property name="text">
<string>&amp;Resize images larger than the viewer window (needs restart)</string>
</property>
</widget>
</item>
</layout> </layout>
</item> </item>
<item row="3" column="0"> <item row="3" column="0">

View File

@ -10,7 +10,7 @@ from base64 import b64encode
from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \ from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
QPainter, QPalette, QBrush, QFontDatabase, QDialog, \ QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
QColor, QPoint, QImage, QRegion, QVariant, QIcon, \ QColor, QPoint, QImage, QRegion, QVariant, QIcon, \
QFont, QObject, QApplication, pyqtSignature, QAction QFont, pyqtSignature, QAction
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
from calibre.utils.config import Config, StringConfig from calibre.utils.config import Config, StringConfig
@ -21,7 +21,7 @@ from calibre.constants import iswindows
from calibre import prints, guess_type from calibre import prints, guess_type
from calibre.gui2.viewer.keys import SHORTCUTS from calibre.gui2.viewer.keys import SHORTCUTS
bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = None bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images =None
def load_builtin_fonts(): def load_builtin_fonts():
base = P('fonts/liberation/*.ttf') base = P('fonts/liberation/*.ttf')
@ -42,6 +42,8 @@ def config(defaults=None):
help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.')) help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
c.add_opt('max_view_width', default=6000, c.add_opt('max_view_width', default=6000,
help=_('Maximum width of the viewer window, in pixels.')) help=_('Maximum width of the viewer window, in pixels.'))
c.add_opt('fit_images', default=True,
help=_('Resize images larger than the viewer window to fit inside it'))
c.add_opt('hyphenate', default=False, help=_('Hyphenate text')) c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))
c.add_opt('hyphenate_default_lang', default='en', c.add_opt('hyphenate_default_lang', default='en',
help=_('Default language for hyphenation rules')) help=_('Default language for hyphenation rules'))
@ -59,20 +61,6 @@ def config(defaults=None):
return c return c
class PythonJS(QObject):
def __init__(self, callback):
QObject.__init__(self, QApplication.instance())
self.setObjectName("py_bridge")
self._callback = callback
@pyqtSignature("QString")
def callback(self, msg):
print "callback called"
self._callback(msg)
class ConfigDialog(QDialog, Ui_Dialog): class ConfigDialog(QDialog, Ui_Dialog):
def __init__(self, shortcuts, parent=None): def __init__(self, shortcuts, parent=None):
@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
self.shortcut_config = ShortcutConfig(shortcuts, parent=self) self.shortcut_config = ShortcutConfig(shortcuts, parent=self)
p = self.tabs.widget(1) p = self.tabs.widget(1)
p.layout().addWidget(self.shortcut_config) p.layout().addWidget(self.shortcut_config)
self.opt_fit_images.setChecked(opts.fit_images)
def accept(self, *args): def accept(self, *args):
@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()]) c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()])
c.set('user_css', unicode(self.css.toPlainText())) c.set('user_css', unicode(self.css.toPlainText()))
c.set('remember_window_size', self.opt_remember_window_size.isChecked()) c.set('remember_window_size', self.opt_remember_window_size.isChecked())
c.set('fit_images', self.opt_fit_images.isChecked())
c.set('max_view_width', int(self.max_view_width.value())) c.set('max_view_width', int(self.max_view_width.value()))
c.set('hyphenate', self.hyphenate.isChecked()) c.set('hyphenate', self.hyphenate.isChecked())
idx = self.hyphenate_default_lang.currentIndex() idx = self.hyphenate_default_lang.currentIndex()
@ -157,7 +147,6 @@ class Document(QWebPage):
self.setObjectName("py_bridge") self.setObjectName("py_bridge")
self.debug_javascript = False self.debug_javascript = False
self.current_language = None self.current_language = None
#self.js_bridge = PythonJS(self.js_callback)
self.setLinkDelegationPolicy(self.DelegateAllLinks) self.setLinkDelegationPolicy(self.DelegateAllLinks)
self.scroll_marks = [] self.scroll_marks = []
@ -197,9 +186,14 @@ class Document(QWebPage):
opts = config().parse() opts = config().parse()
self.hyphenate = opts.hyphenate self.hyphenate = opts.hyphenate
self.hyphenate_default_lang = opts.hyphenate_default_lang self.hyphenate_default_lang = opts.hyphenate_default_lang
self.do_fit_images = opts.fit_images
def fit_images(self):
if self.do_fit_images:
self.javascript('setup_image_scaling_handlers()')
def load_javascript_libraries(self): def load_javascript_libraries(self):
global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images
self.mainFrame().addToJavaScriptWindowObject("py_bridge", self) self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
if jquery is None: if jquery is None:
jquery = P('content_server/jquery.js', data=True) jquery = P('content_server/jquery.js', data=True)
@ -215,6 +209,9 @@ class Document(QWebPage):
if referencing is None: if referencing is None:
referencing = P('viewer/referencing.js', data=True) referencing = P('viewer/referencing.js', data=True)
self.javascript(referencing) self.javascript(referencing)
if images is None:
images = P('viewer/images.js', data=True)
self.javascript(images)
if hyphenation is None: if hyphenation is None:
hyphenation = P('viewer/hyphenation.js', data=True) hyphenation = P('viewer/hyphenation.js', data=True)
self.javascript(hyphenation) self.javascript(hyphenation)
@ -353,7 +350,13 @@ class Document(QWebPage):
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
def set_bottom_padding(self, amount): def set_bottom_padding(self, amount):
self.javascript('$("body").css("padding-bottom", "%dpx")' % amount) padding = '%dpx'%amount
try:
old_padding = unicode(self.javascript('$("body").css("padding-bottom")').toString())
except:
old_padding = ''
if old_padding != padding:
self.javascript('$("body").css("padding-bottom", "%s")' % padding)
class EntityDeclarationProcessor(object): class EntityDeclarationProcessor(object):
@ -541,6 +544,7 @@ class DocumentView(QWebView):
return return
self.loading_url = None self.loading_url = None
self.document.set_bottom_padding(0) self.document.set_bottom_padding(0)
self.document.fit_images()
self._size_hint = self.document.mainFrame().contentsSize() self._size_hint = self.document.mainFrame().contentsSize()
scrolled = False scrolled = False
if self.to_bottom: if self.to_bottom:

View File

@ -40,8 +40,9 @@ class CSV_XML(CatalogPlugin):
from calibre.utils.logging import Log from calibre.utils.logging import Log
log = Log() log = Log()
self.fmt = path_to_output[path_to_output.rfind('.') + 1:] self.fmt = path_to_output.rpartition('.')[2]
if opts.verbose:
if False and opts.verbose:
log("%s:run" % self.name) log("%s:run" % self.name)
log(" path_to_output: %s" % path_to_output) log(" path_to_output: %s" % path_to_output)
log(" Output format: %s" % self.fmt) log(" Output format: %s" % self.fmt)
@ -53,7 +54,7 @@ class CSV_XML(CatalogPlugin):
log(" opts:") log(" opts:")
for key in keys: for key in keys:
log(" %s: %s" % (key, opts_dict[key])) log(" %s: %s" % (key, opts_dict[key]))
# Get the sorted, filtered database as a dictionary # Get the sorted, filtered database as a dictionary
data = self.search_sort_db(db, opts) data = self.search_sort_db(db, opts)

View File

@ -644,6 +644,10 @@ def catalog_option_parser(args):
output, fmt = validate_command_line(parser, args, log) output, fmt = validate_command_line(parser, args, log)
# Add options common to all catalog plugins # Add options common to all catalog plugins
parser.add_option('-i', '--ids', default=None, dest='ids',
help=_("Comma-separated list of database IDs to catalog.\n"
"If declared, --search is ignored.\n"
"Default: all"))
parser.add_option('-s', '--search', default=None, dest='search_text', parser.add_option('-s', '--search', default=None, dest='search_text',
help=_("Filter the results by the search query. " help=_("Filter the results by the search query. "
"For the format of the search query, please see " "For the format of the search query, please see "
@ -656,31 +660,6 @@ def catalog_option_parser(args):
# Add options specific to fmt plugin # Add options specific to fmt plugin
plugin = add_plugin_parser_options(fmt, parser, log) plugin = add_plugin_parser_options(fmt, parser, log)
# Merge options from GUI Preferences
'''
# Placeholder sample code until we implement GUI preferences
from calibre.library.save_to_disk import config
c = config()
for pref in ['asciiize', 'update_metadata', 'write_opf', 'save_cover']:
opt = c.get_option(pref)
switch = '--dont-'+pref.replace('_', '-')
parser.add_option(switch, default=True, action='store_false',
help=opt.help+' '+_('Specifying this switch will turn '
'this behavior off.'), dest=pref)
for pref in ['timefmt', 'template', 'formats']:
opt = c.get_option(pref)
switch = '--'+pref
parser.add_option(switch, default=opt.default,
help=opt.help, dest=pref)
for pref in ('replace_whitespace', 'to_lowercase'):
opt = c.get_option(pref)
switch = '--'+pref.replace('_', '-')
parser.add_option(switch, default=False, action='store_true',
help=opt.help)
'''
return parser, plugin, log return parser, plugin, log
def command_catalog(args, dbpath): def command_catalog(args, dbpath):
@ -693,6 +672,9 @@ def command_catalog(args, dbpath):
return 1 return 1
if opts.verbose: if opts.verbose:
log("library.cli:command_catalog dispatching to plugin %s" % plugin.name) log("library.cli:command_catalog dispatching to plugin %s" % plugin.name)
if opts.ids:
opts.ids = [int(id) for id in opts.ids.split(',')]
with plugin: with plugin:
plugin.run(args[1], opts, get_db(dbpath, opts)) plugin.run(args[1], opts, get_db(dbpath, opts))
return 0 return 0

View File

@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase):
for i in iter(self): for i in iter(self):
yield i[x] yield i[x]
def get_data_as_dict(self, prefix=None, authors_as_string=False): def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
''' '''
Return all metadata stored in the database as a dict. Includes paths to Return all metadata stored in the database as a dict. Includes paths to
the cover and each format. the cover and each format.
:param prefix: The prefix for all paths. By default, the prefix is the absolute path :param prefix: The prefix for all paths. By default, the prefix is the absolute path
to the library folder. to the library folder.
:param ids: Set of ids to return the data for. If None return data for
all entries in database.
''' '''
if prefix is None: if prefix is None:
prefix = self.library_path prefix = self.library_path
@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase):
data = [] data = []
for record in self.data: for record in self.data:
if record is None: continue if record is None: continue
db_id = record[FIELD_MAP['id']]
if ids is not None and db_id not in ids:
continue
x = {} x = {}
for field in FIELDS: for field in FIELDS:
x[field] = record[FIELD_MAP[field]] x[field] = record[FIELD_MAP[field]]
data.append(x) data.append(x)
x['id'] = record[FIELD_MAP['id']] x['id'] = db_id
x['formats'] = [] x['formats'] = []
if not x['authors']: if not x['authors']:
x['authors'] = _('Unknown') x['authors'] = _('Unknown')

View File

@ -524,6 +524,7 @@ class DynamicConfig(dict):
pass pass
except: except:
import traceback import traceback
print 'Failed to unpickle stored object:'
traceback.print_exc() traceback.print_exc()
d = {} d = {}
self.clear() self.clear()

View File

@ -104,6 +104,7 @@ _extra_lang_codes = {
'en_CY' : _('English (Cyprus)'), 'en_CY' : _('English (Cyprus)'),
'en_PK' : _('English (Pakistan)'), 'en_PK' : _('English (Pakistan)'),
'en_SG' : _('English (Singapore)'), 'en_SG' : _('English (Singapore)'),
'en_YE' : _('English (Yemen)'),
'de_AT' : _('German (AT)'), 'de_AT' : _('German (AT)'),
'nl' : _('Dutch (NL)'), 'nl' : _('Dutch (NL)'),
'nl_BE' : _('Dutch (BE)'), 'nl_BE' : _('Dutch (BE)'),

View File

@ -9,9 +9,22 @@ __docformat__ = 'restructuredtext en'
import __builtin__, sys, os import __builtin__, sys, os
_dev_path = os.environ.get('CALIBRE_DEVELOP_FROM', None)
if _dev_path is not None:
_dev_path = os.path.join(os.path.abspath(os.path.dirname(_dev_path)), 'resources')
if not os.path.exists(_dev_path):
_dev_path = None
def get_path(path, data=False): def get_path(path, data=False):
global _dev_path
path = path.replace(os.sep, '/') path = path.replace(os.sep, '/')
path = os.path.join(sys.resources_location, *path.split('/')) base = None
if _dev_path is not None:
if os.path.exists(os.path.join(_dev_path, *path.split('/'))):
base = _dev_path
if base is None:
base = sys.resources_location
path = os.path.join(base, *path.split('/'))
if data: if data:
return open(path, 'rb').read() return open(path, 'rb').read()
return path return path

View File

@ -357,9 +357,17 @@ class BasicNewsRecipe(Recipe):
Override in a subclass to customize extraction of the :term:`URL` that points Override in a subclass to customize extraction of the :term:`URL` that points
to the content for each article. Return the to the content for each article. Return the
article URL. It is called with `article`, an object representing a parsed article article URL. It is called with `article`, an object representing a parsed article
from a feed. See `feedsparser <http://www.feedparser.org/docs/>`_. from a feed. See `feedparser <http://www.feedparser.org/docs/>`_.
By default it returns `article.link <http://www.feedparser.org/docs/reference-entry-link.html>`_. By default it looks for the original link (for feeds syndicated via a
service like feedburner or pheedo) and if found,
returns that or else returns
`article.link <http://www.feedparser.org/docs/reference-entry-link.html>`_.
''' '''
for key in article.keys():
if key.endswith('_origlink'):
url = article[key]
if url and url.startswith('http://'):
return url
return article.get('link', None) return article.get('link', None)
def preprocess_html(self, soup): def preprocess_html(self, soup):