Sync to trunk.

John Schember 2010-01-21 17:13:09 -05:00
commit a0d1670e6f
67 changed files with 2577 additions and 486 deletions

4 binary image files added (not shown): 569 B, 253 B, 2.3 KiB, 531 B.
View File

@@ -0,0 +1,86 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ADRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'nl'
country = 'NL'
version = 1
title = u'AD'
publisher = u'de Persgroep Publishing Nederland NV'
category = u'News, Sports, the Netherlands'
description = u'News and Sports from the Netherlands'
oldest_article = 1.2
max_articles_per_feed = 100
use_embedded_content = False
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))
remove_tags = []
remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))
remove_attributes = ['style']
# feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
feeds = []
feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))
extra_css = '''
body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
.gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
'''
conversion_options = {'comments': description, 'tags': category, 'language': language,
'publisher': publisher}
def print_version(self, url):
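# Rebuild the print URL by reordering the path segments and splicing in /print/ (the segment positions are specific to ad.nl article URLs)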
parts = url.split('/')
print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
+ parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
return print_url
def preprocess_html(self, soup):
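# Collapse runs of three or more consecutive <br> tags down to two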
for br in soup.findAll('br'):
prev = br.findPreviousSibling(True)
if hasattr(prev, 'name') and prev.name == 'br':
next = br.findNextSibling(True)
if hasattr(next, 'name') and next.name == 'br':
br.extract()
return soup

View File

@@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
spectator.org
'''
@@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
class TheAmericanSpectator(BasicNewsRecipe):
title = 'The American Spectator'
__author__ = 'Darko Miletic'
language = 'en'
description = 'News from USA'
category = 'news, politics, USA, world'
publisher = 'The American Spectator'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'en'
INDEX = 'http://spectator.org'
html2lrf_options = [
'--comment' , description
, '--category' , 'news, politics, USA'
, '--publisher' , title
]
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [
dict(name='div', attrs={'class':'post inner'})
@@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe):
remove_tags = [
dict(name='object')
,dict(name='div', attrs={'class':'col3' })
,dict(name='div', attrs={'class':'post-options' })
,dict(name='p' , attrs={'class':'letter-editor'})
,dict(name='div', attrs={'class':'social' })
,dict(name='div', attrs={'class':['col3','post-options','social']})
,dict(name='p' , attrs={'class':['letter-editor','meta']})
]
feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]
def get_cover_url(self):
cover_url = None
@@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe):
def print_version(self, url):
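# The printer-friendly page lives at the article URL plus /print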
return url + '/print'
def get_article_url(self, article):
return article.get('guid', None)

View File

@@ -0,0 +1,60 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
news.bbc.co.uk
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class BBC(BasicNewsRecipe):
title = 'BBC News (fast)'
__author__ = 'Darko Miletic'
description = 'News from UK. A much faster version that does not download pictures'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
#delay = 1
use_embedded_content = False
encoding = 'utf8'
publisher = 'BBC'
category = 'news, UK, world'
language = 'en'
extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
remove_tags_before = dict(name='div',attrs={'class':'headline'})
remove_tags_after = dict(name='div', attrs={'class':'footer'})
remove_tags = [
dict(name=['object','link','script','iframe'])
,dict(name='div', attrs={'class':'footer'})
]
feeds = [
('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
]
def print_version(self, url):
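# Strip the scheme and route the remainder of the URL through the print service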
head, sep, rest = url.partition('http://')
return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rest
def get_article_url(self, article):
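# Use the feed entry's guid as the article link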
return article.get('guid', None)

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald
title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
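# Walk the Today's Paper index page, grouping article links under their section headings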
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,15 @@
from calibre.web.feeds.news import BasicNewsRecipe
class CJR(BasicNewsRecipe):
title = u'Columbia Journalism Review'
__author__ = u'Xanthan Gum'
description = 'News about journalism.'
language = 'en'
oldest_article = 7
max_articles_per_feed = 100
feeds = [(u'News Stories', u'http://www.cjr.org/index.xml')]
def print_version(self, url):
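# Request the whole article as a single printer-friendly page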
return url + '?page=all&print=true'

View File

@@ -0,0 +1,52 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
digitaljournal.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DigitalJournal(BasicNewsRecipe):
title = 'Digital Journal'
__author__ = 'Darko Miletic'
description = 'A Global Citizen Journalism News Network'
category = 'news, politics, USA, world'
publisher = 'Digital Journal'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
encoding = 'utf8'
language = 'en'
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher' : publisher
}
keep_only_tags = [dict(name='div', attrs={'class':['article','body']})]
remove_tags = [dict(name=['object','table'])]
feeds = [
(u'Latest News' , u'http://digitaljournal.com/rss/?feed=latest_news' )
,(u'Business' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Business' )
,(u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment')
,(u'Environment' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment' )
,(u'Food' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Food' )
,(u'Health' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Health' )
,(u'Internet' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet' )
,(u'Politics' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics' )
,(u'Religion' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion' )
,(u'Science' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Science' )
,(u'Sports' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports' )
,(u'Technology' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology' )
,(u'World' , u'http://digitaljournal.com/rss/?feed=top_news&depname=World' )
,(u'Arts' , u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts' )
]
def print_version(self, url):
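# The print view mirrors the article path under /print/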
return url.replace('digitaljournal.com/','digitaljournal.com/print/')

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -15,7 +15,7 @@ class FTDe(BasicNewsRecipe):
__author__ = 'Oliver Niesner'
use_embedded_content = False
timefmt = ' [%d %b %Y]'
language = 'de'
language = _('German')
max_articles_per_feed = 40
no_stylesheets = True
@@ -23,13 +23,19 @@ class FTDe(BasicNewsRecipe):
dict(id='topbanner'),
dict(id='seitenkopf'),
dict(id='BoxA-0-0-0'),
#dict(id='BoxA-2-0-0'),
dict(id='footer'),
dict(id='rating_open'),
dict(id='ADS_Top'),
dict(id='spinner'),
dict(id='ftd-contentad'),
dict(id='ftd-promo'),
dict(id='nava-50009007-1-0'),
dict(id='navli-50009007-1-0'),
dict(id='Box5000534-0-0-0'),
dict(id='ExpV-1-0-0-1'),
dict(id='ExpV-1-0-0-0'),
dict(id='PollExpV-2-0-0-0'),
dict(id='starRating'),
dict(id='saveRating'),
dict(id='yLayer'),
@@ -44,14 +50,20 @@ class FTDe(BasicNewsRecipe):
dict(name='ul', attrs={'class':'nav'}),
dict(name='p', attrs={'class':'articleOptionHead'}),
dict(name='p', attrs={'class':'articleOptionFoot'}),
dict(name='p', attrs={'class':'moreInfo'}),
dict(name='div', attrs={'class':'chartBox'}),
dict(name='div', attrs={'class':'ratingOpt starRatingContainer articleOptionFootFrame'}),
dict(name='div', attrs={'class':'box boxArticleBasic boxComments boxTransparent'}),
dict(name='div', attrs={'class':'box boxNavTabs'}),
dict(name='div', attrs={'class':'boxMMRgtLow'}),
dict(name='span', attrs={'class':'vote_455857'}),
dict(name='div', attrs={'class':'relatedhalb'}),
dict(name='div', attrs={'class':'box boxListScrollOutline'}),
dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}),
dict(name='div', attrs={'class':'box boxTeaser boxPhotoshow boxImgWide'}),
dict(name='div', attrs={'class':'box boxTeaser'}),
dict(name='div', attrs={'class':'tagCloud'}),
dict(name='div', attrs={'class':'pollView'}),
dict(name='div', attrs={'class':'box boxArticleBasic boxNavTabsOutline'}),
dict(name='div', attrs={'class':'ftdHpNav'}),
dict(name='div', attrs={'class':'ftdHead'}),
@@ -67,9 +79,10 @@ class FTDe(BasicNewsRecipe):
dict(name='div', attrs={'class':'wertungoben'}),
dict(name='div', attrs={'class':'artikelfuss'}),
dict(name='a', attrs={'class':'rating'}),
dict(name='a', attrs={'href':'#rt'}),
dict(name='div', attrs={'class':'articleOptionFootFrame'}),
dict(name='div', attrs={'class':'artikelsplitfaq'})]
remove_tags_after = [dict(name='a', attrs={'class':'more'})]
#remove_tags_after = [dict(name='a', attrs={'class':'more'})]
feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'),
('Meinungshungrige', 'http://www.ftd.de/rss2/meinungshungrige'),
@@ -86,4 +99,4 @@ class FTDe(BasicNewsRecipe):
def print_version(self, url):
return url + '?mode=print'
return url.replace('.html', '.html?mode=print')

View File

@@ -32,7 +32,7 @@ class GlobeAndMail(BasicNewsRecipe):
'gallery-controls', 'video', 'galleryLoading','deck','header',
'toolsBottom'] },
{'class':['credit','inline-img-caption','tab-pointer'] },
dict(name='div', attrs={'id':'lead-photo'}),
dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}),
dict(name='div', attrs={'class':'right'}),
dict(name='div', attrs={'id':'footer'}),
dict(name='div', attrs={'id':'beta-msg'}),

View File

@@ -0,0 +1,44 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.kitsapun.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Kitsapsun(BasicNewsRecipe):
title = 'Kitsap Sun'
__author__ = 'Darko Miletic'
description = 'News from Kitsap County'
publisher = 'Scripps Interactive Newspapers Group'
category = 'news, Kitsap county, USA'
language = 'en'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1252'
use_embedded_content = False
conversion_options = {
'comments' : description
,'tags' : category
,'language' : language
,'publisher': publisher
}
keep_only_tags = [dict(name='div', attrs={'id':['story_meta','story_content']})]
remove_tags = [dict(name=['object','link','embed','form','iframe'])]
feeds = [
(u'News' , u'http://www.kitsapsun.com/rss/headlines/news/' )
,(u'Business' , u'http://www.kitsapsun.com/rss/headlines/business/' )
,(u'Communities' , u'http://www.kitsapsun.com/rss/headlines/communities/' )
,(u'Entertainment', u'http://www.kitsapsun.com/rss/headlines/entertainment/')
,(u'Lifestyles' , u'http://www.kitsapsun.com/rss/headlines/lifestyles/' )
]
def print_version(self, url):
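# Trim the URL back to its last slash and request the print view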
return url.rpartition('/')[0] + '/?print=1'

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette
title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen
title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class PajamasMedia(BasicNewsRecipe):
title = u'Pajamas Media'
description = u'Provides exclusive news and opinion for forty countries.'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 1 #days
max_articles_per_feed = 25
recursions = 1
match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
#encoding = 'latin1'
no_stylesheets = True
#remove_tags_before = dict(name='h1', attrs={'class':'heading'})
remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':['pages']}),
#dict(name='div', attrs={'id':['bookmark']}),
#dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
#dict(name='ul', attrs={'class':'articleTools'}),
]
feeds = [
('Pajamas Media',
'http://feeds.feedburner.com/PajamasMedia'),
]
def preprocess_html(self, soup):
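# Lift the article body out of the page and graft it into a bare HTML skeleton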
story = soup.find(name='div', attrs={'id':'innerpage-content'})
#td = heading.findParent(name='td')
#td.extract()
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
body = soup.find(name='body')
body.insert(0, story)
return soup
def postprocess_html(self, soup, first):
if not first:
h = soup.find(attrs={'class':'innerpage-header'})
if h: h.extract()
auth = soup.find(attrs={'class':'author'})
if auth: auth.extract()
return soup

View File

@@ -9,7 +9,6 @@ class Physicstoday(BasicNewsRecipe):
publisher = 'American Institute of Physics'
category = 'Physics'
language = 'en'
cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg')
oldest_article = 30
max_articles_per_feed = 100
@@ -30,8 +29,8 @@ class Physicstoday(BasicNewsRecipe):
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.physicstoday.org/pt/sso_login.jsp')
br.select_form(name='login')
br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f')
br.select_form(name='login_form')
br['username'] = self.username
br['password'] = self.password
br.submit()

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed
class ReadersDigest(BasicNewsRecipe):
title = 'Readers Digest'
__author__ = 'BrianG'
language = 'en'
description = 'Readers Digest Feeds'
no_stylesheets = True
use_embedded_content = False
oldest_article = 60
max_articles_per_feed = 200
remove_javascript = True
extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
.mainHd{font-family:georgia,serif;color:#000000;}
h2 {font-family:Arial,Sans-serif;}
.name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
.date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
.byline{font-family:Arial,Sans-serif; font-size:x-small ;}
.photoBkt{ font-size:x-small ;}
.vertPhoto{font-size:x-small ;}
.credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
.artTxt{font-family:georgia,serif;}
.caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
a:link{color:#CC0000;}
.breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
'''
remove_tags = [
dict(name='h4', attrs={'class':'close'}),
dict(name='div', attrs={'class':'fromLine'}),
dict(name='img', attrs={'class':'colorTag'}),
dict(name='div', attrs={'id':'sponsorArticleHeader'}),
dict(name='div', attrs={'class':'horizontalAd'}),
dict(name='div', attrs={'id':'imageCounterLeft'}),
dict(name='div', attrs={'id':'commentsPrint'})
]
feeds = [
('New in RD', 'http://feeds.rd.com/ReadersDigest'),
('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
]
cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
#-------------------------------------------------------------------------------------------------
def print_version(self, url):
# Get the identity number of the current article and append it to the root print URL
if url.find('/article') > 0:
ident = url[url.find('/article')+8:url.find('.html?')-4]
url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
elif url.find('/post') > 0:
# in this case, have to get the page itself to derive the Print page.
soup = self.index_to_soup(url)
newsoup = soup.find('ul',attrs={'class':'printBlock'})
url = 'http://www.rd.com' + newsoup('a')[0]['href']
url = url[0:url.find('&Keep')]
return url
#-------------------------------------------------------------------------------------------------
def parse_index(self):
pages = [
('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
# useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
]
feeds = []
for page in pages:
section, url, divider, attrList = page
newArticles = self.page_parse(url, divider, attrList)
feeds.append((section,newArticles))
# after the pages of the site have been processed, parse several RSS feeds for additional sections
newfeeds = self.parse_rss()
# The utility code in parse_rss returns a list of Feed objects. Convert each feed/article
# combination into a form suitable for this module (parse_index).
for feed in newfeeds:
newArticles = []
for article in feed.articles:
newArt = {
'title' : article.title,
'url' : article.url,
'date' : article.date,
'description' : article.text_summary
}
newArticles.append(newArt)
# New and Blogs should be the first two feeds.
if feed.title == 'New in RD':
feeds.insert(0,(feed.title,newArticles))
elif feed.title == 'Blogs':
feeds.insert(1,(feed.title,newArticles))
else:
feeds.append((feed.title,newArticles))
return feeds
#-------------------------------------------------------------------------------------------------
def page_parse(self, mainurl, divider, attrList):
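# Scrape one channel landing page; each matching block yields a title (img alt text) and a link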
articles = []
mainsoup = self.index_to_soup(mainurl)
for item in mainsoup.findAll(attrs=attrList):
newArticle = {
'title' : item('img')[0]['alt'],
'url' : 'http://www.rd.com'+item('a')[0]['href'],
'date' : '',
'description' : ''
}
articles.append(newArticle)
return articles
#-------------------------------------------------------------------------------------------------
def parse_rss (self):
# Do the "official" parse_feeds first
feeds = BasicNewsRecipe.parse_feeds(self)
# Loop thru the articles in all feeds to find articles with "recipe" in it
recipeArticles = []
for curfeed in feeds:
delList = []
for a,curarticle in enumerate(curfeed.articles):
if curarticle.title.upper().find('RECIPE') >= 0:
recipeArticles.append(curarticle)
delList.append(curarticle)
if len(delList)>0:
for d in delList:
index = curfeed.articles.index(d)
curfeed.articles[index:index+1] = []
# If there are any recipes found, create a new Feed object and append.
if len(recipeArticles) > 0:
pfeed = Feed()
pfeed.title = 'Recipes'
pfeed.description = 'Recipe Feed (Virtual)'
pfeed.image_url = None
pfeed.oldest_article = 30
pfeed.id_counter = len(recipeArticles)
# Create a new Feed, add the recipe articles, and then append
# to "official" list of feeds
pfeed.articles = recipeArticles[:]
feeds.append(pfeed)
return feeds

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Regina Leader-Post
title = u'Regina Leader-Post'
url_prefix = 'http://www.leaderpost.com'
description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Saskatoon Star-Phoenix
title = u'Saskatoon Star-Phoenix'
url_prefix = 'http://www.thestarphoenix.com'
description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Province
title = u'Vancouver Province'
url_prefix = 'http://www.theprovince.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Sun
title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if not articles.has_key(key):
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Victoria Times Colonist
title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC'
# un-comment the following three lines for the Vancouver Province
#title = u'Vancouver Province'
#url_prefix = 'http://www.theprovince.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Vancouver Sun
#title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC'
# un-comment the following three lines for the Edmonton Journal
#title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB'
# un-comment the following three lines for the Calgary Herald
#title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB'
# un-comment the following three lines for the Regina Leader-Post
#title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK'
# un-comment the following three lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK'
# un-comment the following three lines for the Windsor Star
#title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if key not in articles:
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if key in articles]
return ans

View File

@ -0,0 +1,106 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
'''
www.canada.com
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Windsor Star
title = u'Windsor Star'
url_prefix = 'http://www.windsorstar.com'
description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''
.timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic }
#photocredit { font-size: xx-small; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def preprocess_html(self,soup):
# delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
return soup
def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {}
key = 'News'
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
#self.log(" div class = %s" % divtag['class'])
if divtag['class'].startswith('section_title'):
# div contains section title
if not divtag.h3:
continue
key = self.tag_to_string(divtag.h3,False)
ans.append(key)
self.log("Section name %s" % key)
continue
# div contains article data
h1tag = divtag.find('h1')
if not h1tag:
continue
atag = h1tag.find('a',href=True)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False)
#self.log("title %s" % title)
pubdate = ''
description = ''
ptag = divtag.find('p')
if ptag:
description = self.tag_to_string(ptag,False)
#self.log("description %s" % description)
author = ''
autag = divtag.find('h4')
if autag:
author = self.tag_to_string(autag,False)
#self.log("author %s" % author)
if key not in articles:
articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
ans = [(key, articles[key]) for key in ans if key in articles]
return ans

View File

@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
# http://online.wsj.com/page/us_in_todays_paper.html
@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe):
def parse_index(self):
soup = self.wsj_get_index()
year = strftime('%Y')
for x in soup.findAll('td', attrs={'class':'b14'}):
txt = self.tag_to_string(x).strip()
if year in txt:
self.timefmt = ' [%s]'%txt
break
left_column = soup.find(
text=lambda t: 'begin ITP Left Column' in str(t))
@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe):
url = url.partition('#')[0]
desc = ''
d = x.findNextSibling(True)
if d.get('class', None) == 'arialResize':
if d is not None and d.get('class', None) == 'arialResize':
desc = self.tag_to_string(d)
desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title)

View File

@ -3,47 +3,139 @@
__license__ = 'GPL v3'
'''
online.wsj.com.com
online.wsj.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from datetime import timedelta, date
class WSJ(BasicNewsRecipe):
# formatting adapted from original recipe by Kovid Goyal and Sujata Raman
title = u'Wall Street Journal (free)'
__author__ = 'Nick Redding'
language = 'en'
description = ('All the free content from the Wall Street Journal (business'
', financial and political news)')
description = ('All the free content from the Wall Street Journal (business, financial and political news)')
no_stylesheets = True
timefmt = ' [%b %d]'
extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.tagline { ont-size:xx-small;}
.dateStamp {font-family:Arial,Helvetica,sans-serif;}
h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
.byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;}
# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article snippets
# set oldest_article to the maximum number of days back from today to include articles
sectionlist = [
['/home-page','Front Page'],
['/public/page/news-opinion-commentary.html','Commentary'],
['/public/page/news-global-world.html','World News'],
['/public/page/news-world-business.html','US News'],
['/public/page/news-business-us.html','Business'],
['/public/page/news-financial-markets-stock.html','Markets'],
['/public/page/news-tech-technology.html','Technology'],
['/public/page/news-personal-finance.html','Personal Finance'],
['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
['/public/page/news-real-estate-homes.html','Real Estate'],
['/public/page/news-career-jobs.html','Careers'],
['/public/page/news-small-business-marketing.html','Small Business']
]
oldest_article = 2
omit_paid_content = True
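# example (hypothetical trimmed list): to fetch only the front page and the
# markets section, reduce sectionlist to
# sectionlist = [
#     ['/home-page','Front Page'],
#     ['/public/page/news-financial-markets-stock.html','Markets'],
# ]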
extra_css = '''h1{font-size:large; font-family:Times,serif;}
h2{font-family:Times,serif; font-size:small; font-style:italic;}
.subhead{font-family:Times,serif; font-size:small; font-style:italic;}
.insettipUnit {font-family:Times,serif;font-size:xx-small;}
.targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
.article{font-family:Times,serif; font-size:x-small;}
.tagline { font-size:xx-small;}
.dateStamp {font-family:Times,serif;}
h3{font-family:Times,serif; font-size:xx-small;}
.byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
.metadataType-articleCredits {list-style-type: none;}
h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;}
h6{font-family:Times,serif; font-size:small; font-style:italic;}
.paperLocation{font-size:xx-small;}'''
remove_tags_before = dict(name='h1')
remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
"articleTabs_tab_interactive","articleTabs_tab_video",
"articleTabs_tab_map","articleTabs_tab_slideshow"]),
remove_tags_before = {'class': re.compile('^articleHeadlineBox')}
remove_tags = [{'id': re.compile('^articleTabs_tab_')},
#dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
# "articleTabs_tab_interactive","articleTabs_tab_video",
# "articleTabs_tab_map","articleTabs_tab_slideshow"]),
{'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip',
'insettip','insetClose','more_in', "insetContent",
# 'articleTools_bottom','articleTools_bottom mjArticleTools',
'aTools', 'tooltip',
'adSummary', 'nav-inline','insetFullBracket']},
dict(rel='shortcut icon'),
{'class': re.compile('^articleTools_bottom')},
dict(rel='shortcut icon')
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
return br
def preprocess_html(self,soup):
def decode_us_date(datestr):
udate = datestr.strip().lower().split()
m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1
d = int(udate[1])
y = int(udate[2])
return date(y,m,d)
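# sanity check of the parser above, assuming the usual WSJ dateline format:
# decode_us_date('January 5 2010') -> date(2010, 1, 5)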
# check if article is paid content
if self.omit_paid_content:
divtags = soup.findAll('div','tooltip')
if divtags:
for divtag in divtags:
if divtag.find(text="Subscriber Content"):
return None
# check if article is too old
datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
if not datetag: # no date tag--not a news item, so skip this page
return None
dateline_string = self.tag_to_string(datetag,False)
date_items = dateline_string.split(',')
datestring = date_items[0]+date_items[1]
article_date = decode_us_date(datestring)
earliest_date = date.today() - timedelta(days=self.oldest_article)
if article_date < earliest_date:
self.log("Skipping article dated %s" % datestring)
return None
datetag.parent.extract()
# place dateline in article heading
bylinetag = soup.find('h3','byline')
if bylinetag:
h3bylinetag = bylinetag
else:
bylinetag = soup.find('li','byline')
if bylinetag:
h3bylinetag = bylinetag.h3
if not h3bylinetag:
h3bylinetag = bylinetag
bylinetag = bylinetag.parent
if bylinetag:
if h3bylinetag.a:
bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
else:
bylinetext = self.tag_to_string(h3bylinetag,False)
h3byline = Tag(soup,'h3',[('class','byline')])
if bylinetext.isspace() or (bylinetext == ''):
h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
else:
h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
bylinetag.replaceWith(h3byline)
else:
headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
if headlinetag:
dateline = Tag(soup,'h3', [('class','byline')])
dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
headlinetag.insert(len(headlinetag),dateline)
else: # no headline box to hold the dateline--skip this page
return None
# This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
if ultag:
@ -58,7 +150,7 @@ class WSJ(BasicNewsRecipe):
key = None
ans = []
def parse_index_page(page_name,page_title,omit_paid_content):
def parse_index_page(page_name,page_title):
def article_title(tag):
atag = tag.find('h2') # title is usually in an h2 tag
@ -119,7 +211,6 @@ class WSJ(BasicNewsRecipe):
soup = self.index_to_soup(pageurl)
# Find each instance of div with class including "headlineSummary"
for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
# divtag contains all article data as ul's and li's
# first, check if there is an h3 tag which provides a section name
stag = divtag.find('h3')
@ -162,7 +253,7 @@ class WSJ(BasicNewsRecipe):
# now skip paid subscriber articles if desired
subscriber_tag = litag.find(text="Subscriber Content")
if subscriber_tag:
if omit_paid_content:
if self.omit_paid_content:
continue
# delete the tip div so it doesn't get in the way
tiptag = litag.find("div", { "class" : "tipTargetBox" })
@ -185,7 +276,7 @@ class WSJ(BasicNewsRecipe):
continue
if url.startswith("/article"):
url = mainurl+url
if not url.startswith("http"):
if not url.startswith("http://online.wsj.com"):
continue
if not url.endswith(".html"):
continue
@ -214,48 +305,10 @@ class WSJ(BasicNewsRecipe):
articles[page_title] = []
articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
# customization notes: delete sections you are not interested in
# set omit_paid_content to False if you want the paid content article previews
sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
omit_paid_content = True
if 'Front Page' in sectionlist:
parse_index_page('/home-page','Front Page',omit_paid_content)
ans.append('Front Page')
if 'Commentary' in sectionlist:
parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
ans.append('Commentary')
if 'World News' in sectionlist:
parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
ans.append('World News')
if 'US News' in sectionlist:
parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
ans.append('US News')
if 'Business' in sectionlist:
parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
ans.append('Business')
if 'Markets' in sectionlist:
parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
ans.append('Markets')
if 'Technology' in sectionlist:
parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
ans.append('Technology')
if 'Personal Finance' in sectionlist:
parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
ans.append('Personal Finance')
if 'Life & Style' in sectionlist:
parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
ans.append('Life & Style')
if 'Real Estate' in sectionlist:
parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
ans.append('Real Estate')
if 'Careers' in sectionlist:
parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
ans.append('Careers')
if 'Small Business' in sectionlist:
parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
ans.append('Small Business')
for page_name,page_title in self.sectionlist:
parse_index_page(page_name,page_title)
ans.append(page_title)
ans = [(key, articles[key]) for key in ans if key in articles]
return ans

View File

@ -0,0 +1,125 @@
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class YemenTimesRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en_YE'
country = 'YE'
version = 1
title = u'Yemen Times'
publisher = u'yementimes.com'
category = u'News, Opinion, Yemen'
description = u'Award winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.'
oldest_article = 7
max_articles_per_feed = 100
use_embedded_content = False
encoding = 'utf-8'
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
keep_only_tags = []
keep_only_tags.append(dict(name = 'div', attrs = {'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1',
'class': 'DMAIN2'}))
remove_attributes = ['style']
INDEX = 'http://www.yementimes.com/'
feeds = []
feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT'))
feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news'))
feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News'))
feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report'))
feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health'))
feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview'))
feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion'))
feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business'))
feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed'))
feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture'))
feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View'))
feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety'))
feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education'))
extra_css = '''
body {font-family:verdana, arial, helvetica, geneva, sans-serif;}
div.yemen_byline {font-size: medium; font-weight: bold;}
div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;}
.yemen_caption {font-size: x-small; font-style: italic; color: #696969;}
'''
conversion_options = {'comments': description, 'tags': category, 'language': 'en',
'publisher': publisher, 'linearize_tables': True}
def get_browser(self):
br = BasicNewsRecipe.get_browser()
br.set_handle_gzip(True)
return br
def parse_index(self):
answer = []
for feed_title, feed in self.feeds:
soup = self.index_to_soup(feed)
newsbox = soup.find('div', 'newsbox')
main = newsbox.findNextSibling('table')
articles = []
for li in main.findAll('li'):
title = self.tag_to_string(li.a)
url = self.INDEX + li.a['href']
articles.append({'title': title, 'date': None, 'url': url, 'description': '<br/>&nbsp;'})
answer.append((feed_title, articles))
return answer
def preprocess_html(self, soup):
freshSoup = self.getFreshSoup(soup)
headline = soup.find('div', attrs = {'id': 'DVMTIT'})
if headline:
div = headline.findNext('div', attrs = {'id': 'DVTOP'})
img = None
if div:
img = div.find('img')
headline.name = 'h1'
freshSoup.body.append(headline)
if img is not None:
freshSoup.body.append(img)
byline = soup.find('div', attrs = {'id': 'DVTIT'})
if byline:
date = None
date_el = byline.find('span')
if date_el:
pub_date = self.tag_to_string(date_el)
date = Tag(soup, 'div', attrs = [('class', 'yemen_date')])
date.append(pub_date)
date_el.extract()
raw = '<br/>'.join(['%s' % (part) for part in byline.findAll(text = True)])
author = BeautifulSoup('<div class="yemen_byline">' + raw + '</div>')
if date is not None:
freshSoup.body.append(date)
freshSoup.body.append(author)
story = soup.find('div', attrs = {'id': 'DVDET'})
if story:
for table in story.findAll('table'):
if table.find('img'):
table['class'] = 'yemen_caption'
freshSoup.body.append(story)
return freshSoup
def getFreshSoup(self, oldSoup):
freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
if oldSoup.head.title:
freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
return freshSoup

View File

@ -0,0 +1,23 @@
/*
* images management
* Copyright 2008 Kovid Goyal
* License: GNU GPL v3
*/
function scale_images() {
$("img:visible").each(function() {
var offset = $(this).offset();
//window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width'));
$(this).css("max-width", (window.innerWidth-offset.left-5)+"px");
$(this).css("max-height", (window.innerHeight-5)+"px");
});
}
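// worked example: an image whose left edge sits at 100px in an 800px-wide
// window gets max-width 695px (800 - 100 - 5), so it can never overflow.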
function setup_image_scaling_handlers() {
scale_images();
$(window).resize(function(){
scale_images();
});
}

View File

@ -2,10 +2,11 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import sys
import atexit, os, shutil, sys, tempfile, zipfile
from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import numeric_version
from calibre.ptempfile import PersistentTemporaryFile
class Plugin(object):
'''
@ -231,6 +232,8 @@ class CatalogPlugin(Plugin):
A plugin that implements a catalog generator.
'''
resources_path = None
#: Output file type for which this plugin should be run
#: For example: 'epub' or 'xml'
file_types = set([])
@ -249,14 +252,18 @@ class CatalogPlugin(Plugin):
cli_options = []
def search_sort_db(self, db, opts):
if opts.search_text:
# If specified, --ids overrides any search criteria
if not opts.ids and opts.search_text:
db.search(opts.search_text)
if opts.sort_by:
# 2nd arg = ascending
db.sort(opts.sort_by, True)
return db.get_data_as_dict()
return db.get_data_as_dict(ids=opts.ids)
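# e.g. (hypothetical call): with opts.ids = [1, 5, 9] any --search text is
# ignored and get_data_as_dict returns only those three records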
def get_output_fields(self, opts):
# Return a list of requested fields, with opts.sort_by first
@ -272,11 +279,40 @@ class CatalogPlugin(Plugin):
fields = list(all_fields & requested_fields)
else:
fields = list(all_fields)
fields.sort()
if opts.sort_by:
fields.insert(0, fields.pop(fields.index(opts.sort_by)))
return fields
def run(self, path_to_output, opts, db):
def initialize(self):
'''
If plugin is not a built-in, copy the plugin's .ui and .py files from
the zip file to $TMPDIR.
Tab will be dynamically generated and added to the Catalog Options dialog in
calibre.gui2.dialogs.catalog.py:Catalog
'''
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.customize.ui import config
from calibre.ptempfile import PersistentTemporaryDirectory
if type(self) not in builtin_plugins and \
self.name not in config['disabled_plugins']:
files_to_copy = ["%s.%s" % (self.name.lower(),ext) for ext in ["ui","py"]]
resources = zipfile.ZipFile(self.plugin_path,'r')
if self.resources_path is None:
self.resources_path = PersistentTemporaryDirectory('_plugin_resources', prefix='')
for file in files_to_copy:
try:
resources.extract(file, self.resources_path)
except:
print " customize:__init__.initialize(): %s not found in %s" % (file, os.path.basename(self.plugin_path))
continue
resources.close()
def run(self, path_to_output, opts, db, ids):
'''
Run the plugin. Must be implemented in subclasses.
It should generate the catalog in the format specified

View File

@ -18,7 +18,7 @@ class BLACKBERRY(USBMS):
VENDOR_ID = [0x0fca]
PRODUCT_ID = [0x8004, 0x0004]
BCD = [0x0200, 0x0107]
BCD = [0x0200, 0x0107, 0x0201]
VENDOR_NAME = 'RIM'
WINDOWS_MAIN_MEM = 'BLACKBERRY_SD'

View File

@ -86,4 +86,5 @@ class NOOK(USBMS):
return drives
def sanitize_path_components(self, components):
return [x.replace('#', '_') for x in components]

View File

@ -782,6 +782,13 @@ class Device(DeviceConfig, DevicePlugin):
'''
return default
def sanitize_path_components(self, components):
'''
Perform any device specific sanitization on the path components
for files to be uploaded to the device
'''
return components
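# a driver can override this hook; e.g. the NOOK driver above simply does
# return [x.replace('#', '_') for x in components]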
def create_upload_path(self, path, mdata, fname):
path = os.path.abspath(path)
extra_components = []
@ -834,6 +841,7 @@ class Device(DeviceConfig, DevicePlugin):
extra_components = list(map(remove_trailing_periods, extra_components))
components = shorten_components_to(250 - len(path), extra_components)
components = self.sanitize_path_components(components)
filepath = os.path.join(path, *components)
filedir = os.path.dirname(filepath)

View File

@ -132,7 +132,8 @@ class FB2MLizer(object):
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output
@ -152,7 +153,7 @@ class FB2MLizer(object):
text = []
for item in self.oeb_book.spine:
self.log.debug('Converting %s to FictionBook2 XML' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append(self.add_page_anchor(item))
text += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
return ''.join(text)

View File

@ -32,7 +32,7 @@ class LITOutput(OutputFormatPlugin):
mangler(oeb, opts)
rasterizer = SVGRasterizer()
rasterizer(oeb, opts)
lit = LitWriter()
lit = LitWriter(self.opts)
lit(oeb, output_path)

View File

@ -134,7 +134,7 @@ def warn(x):
class ReBinary(object):
NSRMAP = {'': None, XML_NS: 'xml'}
def __init__(self, root, item, oeb, map=HTML_MAP):
def __init__(self, root, item, oeb, opts, map=HTML_MAP):
self.item = item
self.logger = oeb.logger
self.manifest = oeb.manifest
@ -143,7 +143,7 @@ class ReBinary(object):
self.anchors = []
self.page_breaks = []
self.is_html = is_html = map is HTML_MAP
self.stylizer = Stylizer(root, item.href, oeb) if is_html else None
self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None
self.tree_to_binary(root)
self.content = self.buf.getvalue()
self.ahc = self.build_ahc() if is_html else None
@ -295,9 +295,8 @@ def preserve(function):
return wrapper
class LitWriter(object):
def __init__(self):
# Wow, no options
pass
def __init__(self, opts):
self.opts = opts
def _litize_oeb(self):
oeb = self._oeb
@ -469,7 +468,7 @@ class LitWriter(object):
secnum = 0
if isinstance(data, etree._Element):
self._add_folder(name)
rebin = ReBinary(data, item, self._oeb, map=HTML_MAP)
rebin = ReBinary(data, item, self._oeb, self.opts, map=HTML_MAP)
self._add_file(name + '/ahc', rebin.ahc, 0)
self._add_file(name + '/aht', rebin.aht, 0)
item.page_breaks = rebin.page_breaks
@ -562,7 +561,7 @@ class LitWriter(object):
meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP)
rebin = ReBinary(meta, None, self._oeb, self.opts, map=OPF_MAP)
meta = rebin.content
self._meta = meta
self._add_file('/meta', meta)

View File

@ -128,6 +128,10 @@ def do_set_metadata(opts, mi, stream, stream_type):
mi.title_sort = title_sort(opts.title)
if getattr(opts, 'tags', None) is not None:
mi.tags = [t.strip() for t in opts.tags.split(',')]
if getattr(opts, 'series', None) is not None:
mi.series = opts.series.strip()
if getattr(opts, 'series_index', None) is not None:
mi.series_index = float(opts.series_index.strip())
if getattr(opts, 'cover', None) is not None:
ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()

View File

@ -134,7 +134,10 @@ def metadata_from_filename(name, pat=None):
mi.authors = aus
if prefs['swap_author_names'] and mi.authors:
def swap(a):
parts = a.split()
if ',' in a:
parts = a.split(',', 1)
else:
parts = a.split(None, 1)
if len(parts) > 1:
t = parts[-1]
parts = parts[:-1]

View File

@ -92,6 +92,7 @@ class MobiMLizer(object):
def __call__(self, oeb, context):
oeb.logger.info('Converting XHTML to Mobipocket markup...')
self.oeb = oeb
self.opts = context
self.profile = profile = context.dest
self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items())
self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys())
@ -114,7 +115,7 @@ class MobiMLizer(object):
def mobimlize_spine(self):
'Iterate over the spine and convert it to MOBIML'
for item in self.oeb.spine:
stylizer = Stylizer(item.data, item.href, self.oeb, self.profile)
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile)
body = item.data.find(XHTML('body'))
nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
nbody = etree.SubElement(nroot, XHTML('body'))

View File

@ -563,6 +563,16 @@ class MobiReader(object):
recindex = attrib.pop(attr, None) or recindex
if recindex is not None:
attrib['src'] = 'images/%s.jpg' % recindex
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
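# worked example (Kindle profile assumed): width="2em" ->
# 2 * 16 * (168.451/72) = 74.87 -> emitted as "74px"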
attrib[attr] = "%dpx"%int(nval)
except:
del attrib[attr]
elif tag.tag == 'pre':
if not tag.text:
tag.tag = 'div'

View File

@ -1,99 +0,0 @@
'''
Registry associating file extensions with Reader classes.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
from itertools import chain
import calibre
from calibre.ebooks.oeb.base import OEBError
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.mobi.reader import MobiReader
from calibre.ebooks.mobi.writer import MobiWriter
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.oeb.profile import Context
from calibre.utils.config import Config
__all__ = ['get_reader']
REGISTRY = {
'.opf': (OEBReader, None),
'.lit': (LitReader, LitWriter),
'.mobi': (MobiReader, MobiWriter),
}
def ReaderFactory(path):
if os.path.isdir(path):
return OEBReader
ext = os.path.splitext(path)[1].lower()
Reader = REGISTRY.get(ext, (None, None))[0]
if Reader is None:
raise OEBError('Unknown e-book file extension %r' % ext)
return Reader
def WriterFactory(path):
if os.path.isdir(path):
return OEBWriter
ext = os.path.splitext(path)[1].lower()
if not os.path.exists(path) and not ext:
return OEBWriter
Writer = REGISTRY.get(ext, (None, None))[1]
if Writer is None:
raise OEBError('Unknown e-book file extension %r' % ext)
return Writer
def option_parser(Reader, Writer):
cfg = Config('ebook-convert', _('Options to control e-book conversion.'))
Reader.config(cfg)
for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
Transform.config(cfg)
Writer.config(cfg)
parser = cfg.option_parser()
parser.add_option('--encoding', default=None,
help=_('Character encoding for input. Default is to auto detect.'))
parser.add_option('-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option('-p', '--pretty-print', action='store_true',
default=False, help=_('Produce more human-readable XML output.'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Useful for debugging.'))
return parser
def main(argv=sys.argv):
if len(argv) < 3:
print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]")
return 1
inpath, outpath = argv[1], argv[2]
Reader = ReaderFactory(inpath)
Writer = WriterFactory(outpath)
parser = option_parser(Reader, Writer)
opts, args = parser.parse_args(argv[3:])
if len(args) != 0:
parser.print_help()
return 1
logger = logging.getLogger('ebook-convert')
calibre.setup_cli_handlers(logger, logging.DEBUG)
encoding = opts.encoding
pretty_print = opts.pretty_print
oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger)
context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE)
reader = Reader.generate(opts)
writer = Writer.generate(opts)
transforms = []
for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS):
transforms.append(Transform.generate(opts))
reader(oeb, inpath)
for transform in transforms:
transform(oeb, context)
writer(oeb, outpath)
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -110,9 +110,9 @@ class CSSSelector(etree.XPath):
class Stylizer(object):
STYLESHEETS = WeakKeyDictionary()
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'],
def __init__(self, tree, path, oeb, opts, profile=PROFILES['PRS505'],
extra_css='', user_css=''):
self.oeb = oeb
self.oeb, self.opts = oeb, opts
self.profile = profile
self.logger = oeb.logger
item = oeb.manifest.hrefs[path]
@ -249,6 +249,8 @@ class Stylizer(object):
style.update(self._normalize_font(prop.cssValue))
elif name == 'list-style':
style.update(self._normalize_list_style(prop.cssValue))
elif name == 'text-align':
style.update(self._normalize_text_align(prop.cssValue))
else:
style[name] = prop.value
if 'font-size' in style:
@ -306,6 +308,19 @@ class Stylizer(object):
return style
def _normalize_text_align(self, cssvalue):
style = {}
text = cssvalue.cssText
if text == 'inherit':
style['text-align'] = 'inherit'
else:
if text in ('left', 'justify'):
val = 'left' if self.opts.dont_justify else 'justify'
style['text-align'] = val
else:
style['text-align'] = text
return style
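# e.g. with opts.dont_justify False, both 'left' and 'justify' normalize to
# 'justify'; with it True they collapse to 'left'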
def _normalize_font(self, cssvalue):
composition = ('font-style', 'font-variant', 'font-weight',
'font-size', 'line-height', 'font-family')
@ -411,6 +426,7 @@ class Style(object):
return result
def _unit_convert(self, value, base=None, font=None):
' Return value in pts'
if isinstance(value, (int, long, float)):
return value
try:
@ -447,6 +463,9 @@ class Style(object):
result = value * 0.40
return result
def pt_to_px(self, value):
return (self._profile.dpi / 72.0) * value
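# e.g. for the 168.451 dpi profile used elsewhere,
# pt_to_px(12) = (168.451/72.0) * 12 = 28.1 px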
@property
def fontSize(self):
def normalize_fontsize(value, base):

View File

@ -141,7 +141,7 @@ class CSSFlattener(object):
bs.append('text-align: '+ \
('left' if self.context.dont_justify else 'justify'))
body.set('style', '; '.join(bs))
stylizer = Stylizer(html, item.href, self.oeb, profile,
stylizer = Stylizer(html, item.href, self.oeb, self.context, profile,
user_css=self.context.extra_css,
extra_css=css)
self.stylizers[item] = stylizer

View File

@ -33,6 +33,7 @@ class CaseMangler(object):
def __call__(self, oeb, context):
oeb.logger.info('Applying case-transforming CSS...')
self.oeb = oeb
self.opts = context
self.profile = context.source
self.mangle_spine()
@ -44,7 +45,7 @@ class CaseMangler(object):
relhref = item.relhref(href)
etree.SubElement(html.find(XHTML('head')), XHTML('link'),
rel='stylesheet', href=relhref, type=CSS_MIME)
stylizer = Stylizer(html, item.href, self.oeb, self.profile)
stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
self.mangle_elem(html.find(XHTML('body')), stylizer)
def text_transform(self, transform, text):

View File

@ -44,6 +44,7 @@ class SVGRasterizer(object):
def __call__(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...')
self.oeb = oeb
self.opts = context
self.profile = context.dest
self.images = {}
self.dataize_manifest()
@ -102,7 +103,7 @@ class SVGRasterizer(object):
def rasterize_spine(self):
for item in self.oeb.spine:
html = item.data
stylizer = Stylizer(html, item.href, self.oeb, self.profile)
stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile)
self.rasterize_item(item, stylizer)
def rasterize_item(self, item, stylizer):

View File

@ -20,6 +20,10 @@ class Font(object):
class Column(object):
# A column contains an element if the element bulges out to
# the left or the right by at most HFUZZ*col width.
HFUZZ = 0.2
def __init__(self):
self.left = self.right = self.top = self.bottom = 0
self.width = self.height = 0
@ -41,6 +45,10 @@ class Column(object):
for x in self.elements:
yield x
def contains(self, elem):
return elem.left > self.left - self.HFUZZ*self.width and \
elem.right < self.right + self.HFUZZ*self.width
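# worked example: for a column spanning left=50..right=150 (width 100) and
# HFUZZ = 0.2, contains() accepts any element with left > 30 and right < 170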
class Element(object):
def __eq__(self, other):
@ -132,6 +140,18 @@ class Interval(object):
def __hash__(self):
return hash('(%f,%f)' % (self.left, self.right))
class Region(object):
def __init__(self):
self.columns = []
self.top = self.bottom = self.left = self.right = self.width = self.height = 0
def add_columns(self, columns):
if not self.columns:
for x in sorted(columns, key=lambda c: c.left):
self.columns.append(x)
else:
pass
class Page(object):
@ -238,11 +258,10 @@ class Page(object):
return columns
def find_elements_in_row_of(self, x):
interval = Interval(x.top - self.YFUZZ * self.average_text_height,
interval = Interval(x.top,
x.top + self.YFUZZ*(1+self.average_text_height))
h_interval = Interval(x.left, x.right)
m = max(0, x.idx-15)
for y in self.elements[m:x.idx+15]:
for y in self.elements[x.idx:x.idx+15]:
if y is not x:
y_interval = Interval(y.top, y.bottom)
x_interval = Interval(y.left, y.right)

View File

@ -113,7 +113,8 @@ class PMLMLizer(object):
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output

View File

@ -90,7 +90,8 @@ class RBMLizer(object):
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item))
return output
@ -111,7 +112,7 @@ class RBMLizer(object):
output = [u'']
for item in self.oeb_book.spine:
self.log.debug('Converting %s to RocketBook HTML...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output.append(self.add_page_anchor(item))
output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
return ''.join(output)

View File

@ -111,12 +111,13 @@ class RTFMLizer(object):
href = self.oeb_book.guide['titlepage'].href
item = self.oeb_book.manifest.hrefs[href]
if item.spine_position is None:
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book,
self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output += '{\\page } '
for item in self.oeb_book.spine:
self.log.debug('Converting %s to RTF markup...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
output += self.footer()
output = self.insert_images(output)

View File

@ -54,7 +54,7 @@ class TXTMLizer(object):
output.append(self.get_toc())
for item in self.oeb_book.spine:
self.log.debug('Converting %s to TXT...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer)

View File

@ -4,9 +4,14 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.conversion.plumber import Plumber
from calibre.utils.logging import Log
import os
from optparse import OptionParser
from calibre.customize.conversion import OptionRecommendation, DummyReporter
from calibre.ebooks.conversion.plumber import Plumber
from calibre.customize.ui import plugin_for_catalog_format
from calibre.utils.logging import Log
from calibre.gui2 import choose_dir, Application
def gui_convert(input, output, recommendations, notification=DummyReporter(),
abort_after_input_dump=False, log=None):
@ -20,7 +25,7 @@ def gui_convert(input, output, recommendations, notification=DummyReporter(),
plumber.run()
def gui_catalog(fmt, title, dbspec, ids, out_file_name,
def gui_catalog(fmt, title, dbspec, ids, out_file_name, fmt_options,
notification=DummyReporter(), log=None):
if log is None:
log = Log()
@ -31,8 +36,28 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name,
db = LibraryDatabase2(dbpath)
else: # To be implemented in the future
pass
# Implement the interface to the catalog generating code here
db
# Create a minimal OptionParser that we can append to
parser = OptionParser()
args = []
parser.add_option("--verbose", action="store_true", dest="verbose", default=True)
opts, args = parser.parse_args()
# Populate opts
opts.ids = ids
opts.search_text = None
opts.sort_by = None
# Flatten the format-specific option lists into comma-separated strings
for option in fmt_options:
setattr(opts, option, ','.join(fmt_options[option]))
# Fetch and run the plugin for fmt
plugin = plugin_for_catalog_format(fmt)
plugin.run(out_file_name, opts, db)

View File

@ -6,29 +6,121 @@ __license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from PyQt4.Qt import QDialog
import os, shutil, sys, tempfile
from PyQt4.Qt import QDialog, QWidget
from calibre.customize.ui import config
from calibre.gui2.dialogs.catalog_ui import Ui_Dialog
from calibre.gui2 import dynamic
from calibre.customize.ui import available_catalog_formats
from calibre.gui2 import gprefs, dynamic
from calibre.customize.ui import available_catalog_formats, catalog_plugins
from calibre.gui2.catalog.catalog_csv_xml import PluginWidget
class Catalog(QDialog, Ui_Dialog):
''' Catalog Dialog builder'''
widgets = []
def __init__(self, parent, dbspec, ids):
import re, cStringIO
from calibre import prints as info
from calibre.gui2 import dynamic
from PyQt4.uic import compileUi
QDialog.__init__(self, parent)
# Run the dialog setup generated from catalog.ui
self.setupUi(self)
self.dbspec, self.ids = dbspec, ids
# Display the number of books we've been passed
self.count.setText(unicode(self.count.text()).format(len(ids)))
# Display the last-used title
self.title.setText(dynamic.get('catalog_last_used_title',
_('My Books')))
fmts = sorted([x.upper() for x in available_catalog_formats()])
# GwR *** Add option tabs for built-in formats
# This code models #69 in calibre/gui2/dialogs/config/__init__.py
self.fmts = []
from calibre.customize.builtins import plugins as builtin_plugins
from calibre.customize import CatalogPlugin
for plugin in catalog_plugins():
if plugin.name in config['disabled_plugins']:
continue
name = plugin.name.lower().replace(' ', '_')
if type(plugin) in builtin_plugins:
#info("Adding widget for builtin Catalog plugin %s" % plugin.name)
try:
catalog_widget = __import__('calibre.gui2.catalog.'+name,
fromlist=[1])
pw = catalog_widget.PluginWidget()
pw.initialize(name)
pw.ICON = I('forward.svg')
self.widgets.append(pw)
for file_type in plugin.file_types:
self.fmts.append([file_type.upper(), pw.sync_enabled, pw])
except ImportError:
info("ImportError with %s" % name)
continue
else:
# Load dynamic tab
form = os.path.join(plugin.resources_path,'%s.ui' % name)
klass = os.path.join(plugin.resources_path,'%s.py' % name)
compiled_form = os.path.join(plugin.resources_path,'%s_ui.py' % name)
if os.path.exists(form) and os.path.exists(klass):
#info("Adding widget for user-installed Catalog plugin %s" % plugin.name)
# Compile the .ui form provided in plugin.zip
if not os.path.exists(compiled_form):
# info('\tCompiling form', form)
buf = cStringIO.StringIO()
compileUi(form, buf)
dat = buf.getvalue()
dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(?<!\\)",.+?\)',
re.DOTALL).sub(r'_("\1")', dat)
open(compiled_form, 'wb').write(dat)
# Import the dynamic PluginWidget() from .py file provided in plugin.zip
try:
sys.path.insert(0, plugin.resources_path)
catalog_widget = __import__(name, fromlist=[1])
pw = catalog_widget.PluginWidget()
pw.initialize(name)
pw.ICON = I('forward.svg')
self.widgets.append(pw)
for file_type in plugin.file_types:
self.fmts.append([file_type.upper(), pw.sync_enabled, pw])
except ImportError:
info("ImportError with %s" % name)
continue
finally:
sys.path.remove(plugin.resources_path)
else:
info("No dynamic tab resources found for %s" % name)
self.widgets = sorted(self.widgets, key=lambda x: x.TITLE)
for pw in self.widgets:
self.tabs.addTab(pw, pw.TITLE)
# Generate a sorted list of installed catalog formats/sync_enabled pairs
fmts = sorted([x[0] for x in self.fmts])
self.sync_enabled_formats = []
for fmt in self.fmts:
if fmt[1]:
self.sync_enabled_formats.append(fmt[0])
# Callback when format changes
self.format.currentIndexChanged.connect(self.format_changed)
# Add the installed catalog format list to the format QComboBox
self.format.addItems(fmts)
pref = dynamic.get('catalog_preferred_format', 'EPUB')
pref = dynamic.get('catalog_preferred_format', 'CSV')
idx = self.format.findText(pref)
if idx > -1:
self.format.setCurrentIndex(idx)
@ -38,7 +130,7 @@ class Catalog(QDialog, Ui_Dialog):
def format_changed(self, idx):
cf = unicode(self.format.currentText())
if cf in ('EPUB', 'MOBI'):
if cf in self.sync_enabled_formats:
self.sync.setEnabled(True)
else:
self.sync.setDisabled(True)

View File

@ -6,20 +6,26 @@
<rect>
<x>0</x>
<y>0</y>
<width>628</width>
<height>503</height>
<width>611</width>
<height>514</height>
</rect>
</property>
<property name="windowTitle">
<string>Generate catalog</string>
</property>
<property name="windowIcon">
<iconset resource="../../../work/calibre/resources/images.qrc">
<iconset>
<normaloff>:/images/library.png</normaloff>:/images/library.png</iconset>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="2" column="0">
<widget class="QDialogButtonBox" name="buttonBox">
<property name="geometry">
<rect>
<x>430</x>
<y>470</y>
<width>164</width>
<height>32</height>
</rect>
</property>
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
@ -27,9 +33,15 @@
<set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QTabWidget" name="tabs">
<property name="geometry">
<rect>
<x>12</x>
<y>39</y>
<width>579</width>
<height>411</height>
</rect>
</property>
<property name="currentIndex">
<number>0</number>
</property>
@ -64,6 +76,16 @@
</property>
</widget>
</item>
<item row="1" column="2">
<widget class="QLineEdit" name="title"/>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="sync">
<property name="text">
<string>&amp;Send catalog to device automatically</string>
</property>
</widget>
</item>
<item row="2" column="1">
<spacer name="verticalSpacer">
<property name="orientation">
@ -77,22 +99,18 @@
</property>
</spacer>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="sync">
<property name="text">
<string>&amp;Send catalog to device automatically</string>
</property>
</widget>
</item>
<item row="1" column="2">
<widget class="QLineEdit" name="title"/>
</item>
</layout>
</widget>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="count">
<property name="geometry">
<rect>
<x>12</x>
<y>12</y>
<width>205</width>
<height>17</height>
</rect>
</property>
<property name="font">
<font>
<weight>75</weight>
@ -103,8 +121,6 @@
<string>Generate catalog for {0} books</string>
</property>
</widget>
</item>
</layout>
</widget>
<resources>
<include location="../../../work/calibre/resources/images.qrc"/>

View File

@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
if self.cover_fetcher.exception is not None:
err = self.cover_fetcher.exception
error_dialog(self, _('Cannot fetch cover'),
_('<b>Could not fetch cover.</b><br/>')+repr(err)).exec_()
_('<b>Could not fetch cover.</b><br/>')+unicode(err)).exec_()
return
pix = QPixmap()

View File

@ -215,7 +215,7 @@ class TagsModel(QAbstractItemModel):
return QModelIndex()
child_item = index.internalPointer()
parent_item = child_item.parent
parent_item = getattr(child_item, 'parent', None)
if parent_item is self.root_item or parent_item is None:
return QModelIndex()

View File

@ -238,19 +238,36 @@ def fetch_scheduled_recipe(arg):
def generate_catalog(parent, dbspec, ids):
from calibre.gui2.dialogs.catalog import Catalog
# Build the Catalog dialog in gui2.dialogs.catalog
d = Catalog(parent, dbspec, ids)
if d.exec_() != d.Accepted:
return None
# Create the output file
out = PersistentTemporaryFile(suffix='_catalog_out.'+d.catalog_format.lower())
# Retrieve plugin options
fmt_options = {}
for x in range(d.tabs.count()):
if str(d.tabs.tabText(x)).find(str(d.catalog_format)) > -1:
for fmt in d.fmts:
if fmt[0] == d.catalog_format:
fmt_options = fmt[2].options()
# print "gui2.tools:generate_catalog(): options for %s: %s" % (fmt[0], fmt_options)
args = [
d.catalog_format,
d.catalog_title,
dbspec,
ids,
out.name,
fmt_options
]
out.close()
# This calls gui2.convert.gui_conversion:gui_catalog()
return 'gui_catalog', args, _('Generate catalog'), out.name, d.catalog_sync, \
d.catalog_title

View File

@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en'
'''The main GUI'''
import os, sys, textwrap, collections, time
import atexit, os, shutil, sys, tempfile, textwrap, collections, time
from xml.parsers.expat import ExpatError
from Queue import Queue, Empty
from threading import Thread
@ -357,7 +357,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
cm.addAction(_('Bulk convert'))
cm.addSeparator()
ac = cm.addAction(
_('Create catalog of the books in your calibre library'))
_('Create catalog of books in your calibre library'))
ac.triggered.connect(self.generate_catalog)
self.action_convert.setMenu(cm)
self._convert_single_hook = partial(self.convert_ebook, bulk=False)
@ -1361,23 +1361,29 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
def generate_catalog(self):
rows = self.library_view.selectionModel().selectedRows()
if not rows:
if not rows or len(rows) < 2:
rows = xrange(self.library_view.model().rowCount(QModelIndex()))
ids = map(self.library_view.model().id, rows)
dbspec = None
if not ids:
return error_dialog(self, _('No books selected'),
_('No books selected to generate catalog for'),
show=True)
# Calling gui2.tools:generate_catalog()
ret = generate_catalog(self, dbspec, ids)
if ret is None:
return
func, args, desc, out, sync, title = ret
fmt = os.path.splitext(out)[1][1:].upper()
job = self.job_manager.run_job(
Dispatcher(self.catalog_generated), func, args=args,
description=desc)
job.catalog_file_path = out
job.fmt = fmt
job.catalog_sync, job.catalog_title = sync, title
self.status_bar.showMessage(_('Generating %s catalog...')%fmt)
@ -1392,7 +1398,12 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
dynamic.set('catalogs_to_be_synced', sync)
self.status_bar.showMessage(_('Catalog generated.'), 3000)
self.sync_catalogs()
if job.fmt in ['CSV','XML']:
export_dir = choose_dir(self, 'Export Catalog Directory',
'Select destination for %s.%s' % (job.catalog_title, job.fmt.lower()))
if export_dir:
destination = os.path.join(export_dir, '%s.%s' % (job.catalog_title, job.fmt.lower()))
shutil.copyfile(job.catalog_file_path, destination)
############################### Fetch news #################################

View File

@ -7,14 +7,14 @@
<x>0</x>
<y>0</y>
<width>479</width>
<height>574</height>
<height>606</height>
</rect>
</property>
<property name="windowTitle">
<string>Configure Ebook viewer</string>
</property>
<property name="windowIcon">
<iconset resource="../../../../resources/images.qrc">
<iconset>
<normaloff>:/images/config.svg</normaloff>:/images/config.svg</iconset>
</property>
<layout class="QGridLayout" name="gridLayout_4">
@ -164,7 +164,7 @@
</item>
</widget>
</item>
<item row="6" column="0" colspan="2">
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remember_window_size">
<property name="text">
<string>Remember last used &amp;window size</string>
@ -218,6 +218,13 @@
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fit_images">
<property name="text">
<string>&amp;Resize images larger than the viewer window (needs restart)</string>
</property>
</widget>
</item>
</layout>
</item>
<item row="3" column="0">

View File

@ -10,7 +10,7 @@ from base64 import b64encode
from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
QColor, QPoint, QImage, QRegion, QVariant, QIcon, \
QFont, QObject, QApplication, pyqtSignature, QAction
QFont, pyqtSignature, QAction
from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
from calibre.utils.config import Config, StringConfig
@ -21,7 +21,7 @@ from calibre.constants import iswindows
from calibre import prints, guess_type
from calibre.gui2.viewer.keys import SHORTCUTS
bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = None
bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images = None
def load_builtin_fonts():
base = P('fonts/liberation/*.ttf')
@ -42,6 +42,8 @@ def config(defaults=None):
help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
c.add_opt('max_view_width', default=6000,
help=_('Maximum width of the viewer window, in pixels.'))
c.add_opt('fit_images', default=True,
help=_('Resize images larger than the viewer window to fit inside it'))
c.add_opt('hyphenate', default=False, help=_('Hyphenate text'))
c.add_opt('hyphenate_default_lang', default='en',
help=_('Default language for hyphenation rules'))
@ -59,20 +61,6 @@ def config(defaults=None):
return c
class PythonJS(QObject):
def __init__(self, callback):
QObject.__init__(self, QApplication.instance())
self.setObjectName("py_bridge")
self._callback = callback
@pyqtSignature("QString")
def callback(self, msg):
print "callback called"
self._callback(msg)
class ConfigDialog(QDialog, Ui_Dialog):
def __init__(self, shortcuts, parent=None):
@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
self.shortcut_config = ShortcutConfig(shortcuts, parent=self)
p = self.tabs.widget(1)
p.layout().addWidget(self.shortcut_config)
self.opt_fit_images.setChecked(opts.fit_images)
def accept(self, *args):
@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()])
c.set('user_css', unicode(self.css.toPlainText()))
c.set('remember_window_size', self.opt_remember_window_size.isChecked())
c.set('fit_images', self.opt_fit_images.isChecked())
c.set('max_view_width', int(self.max_view_width.value()))
c.set('hyphenate', self.hyphenate.isChecked())
idx = self.hyphenate_default_lang.currentIndex()
@ -157,7 +147,6 @@ class Document(QWebPage):
self.setObjectName("py_bridge")
self.debug_javascript = False
self.current_language = None
#self.js_bridge = PythonJS(self.js_callback)
self.setLinkDelegationPolicy(self.DelegateAllLinks)
self.scroll_marks = []
@ -197,9 +186,14 @@ class Document(QWebPage):
opts = config().parse()
self.hyphenate = opts.hyphenate
self.hyphenate_default_lang = opts.hyphenate_default_lang
self.do_fit_images = opts.fit_images
def fit_images(self):
if self.do_fit_images:
self.javascript('setup_image_scaling_handlers()')
def load_javascript_libraries(self):
global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator
global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images
self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
if jquery is None:
jquery = P('content_server/jquery.js', data=True)
@ -215,6 +209,9 @@ class Document(QWebPage):
if referencing is None:
referencing = P('viewer/referencing.js', data=True)
self.javascript(referencing)
if images is None:
images = P('viewer/images.js', data=True)
self.javascript(images)
if hyphenation is None:
hyphenation = P('viewer/hyphenation.js', data=True)
self.javascript(hyphenation)
@ -353,7 +350,13 @@ class Document(QWebPage):
return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results
def set_bottom_padding(self, amount):
self.javascript('$("body").css("padding-bottom", "%dpx")' % amount)
padding = '%dpx'%amount
try:
old_padding = unicode(self.javascript('$("body").css("padding-bottom")').toString())
except:
old_padding = ''
if old_padding != padding:
self.javascript('$("body").css("padding-bottom", "%s")' % padding)
class EntityDeclarationProcessor(object):
@ -541,6 +544,7 @@ class DocumentView(QWebView):
return
self.loading_url = None
self.document.set_bottom_padding(0)
self.document.fit_images()
self._size_hint = self.document.mainFrame().contentsSize()
scrolled = False
if self.to_bottom:

View File

@ -40,8 +40,9 @@ class CSV_XML(CatalogPlugin):
from calibre.utils.logging import Log
log = Log()
self.fmt = path_to_output[path_to_output.rfind('.') + 1:]
if opts.verbose:
self.fmt = path_to_output.rpartition('.')[2]
if False and opts.verbose:
log("%s:run" % self.name)
log(" path_to_output: %s" % path_to_output)
log(" Output format: %s" % self.fmt)

View File

@@ -644,6 +644,10 @@ def catalog_option_parser(args):
output, fmt = validate_command_line(parser, args, log)
# Add options common to all catalog plugins
parser.add_option('-i', '--ids', default=None, dest='ids',
help=_("Comma-separated list of database IDs to catalog.\n"
"If declared, --search is ignored.\n"
"Default: all"))
parser.add_option('-s', '--search', default=None, dest='search_text',
help=_("Filter the results by the search query. "
"For the format of the search query, please see "
@@ -656,31 +660,6 @@
# Add options specific to fmt plugin
plugin = add_plugin_parser_options(fmt, parser, log)
# Merge options from GUI Preferences
'''
# Placeholder sample code until we implement GUI preferences
from calibre.library.save_to_disk import config
c = config()
for pref in ['asciiize', 'update_metadata', 'write_opf', 'save_cover']:
opt = c.get_option(pref)
switch = '--dont-'+pref.replace('_', '-')
parser.add_option(switch, default=True, action='store_false',
help=opt.help+' '+_('Specifying this switch will turn '
'this behavior off.'), dest=pref)
for pref in ['timefmt', 'template', 'formats']:
opt = c.get_option(pref)
switch = '--'+pref
parser.add_option(switch, default=opt.default,
help=opt.help, dest=pref)
for pref in ('replace_whitespace', 'to_lowercase'):
opt = c.get_option(pref)
switch = '--'+pref.replace('_', '-')
parser.add_option(switch, default=False, action='store_true',
help=opt.help)
'''
return parser, plugin, log
def command_catalog(args, dbpath):
@@ -693,6 +672,9 @@ def command_catalog(args, dbpath):
return 1
if opts.verbose:
log("library.cli:command_catalog dispatching to plugin %s" % plugin.name)
if opts.ids:
opts.ids = [int(id) for id in opts.ids.split(',')]
with plugin:
plugin.run(args[1], opts, get_db(dbpath, opts))
return 0
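
For context, the new switch would be exercised with something like calibredb catalog /tmp/mybooks.csv --ids 1,5,12 (the path and IDs are illustrative); per the conversion in command_catalog() above, the plugin then receives a list of ints:

    # Illustrative values; mirrors the opts.ids conversion shown above.
    ids = [int(i) for i in '1,5,12'.split(',')]
    print ids    # [1, 5, 12]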

View File

@@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase):
for i in iter(self):
yield i[x]
def get_data_as_dict(self, prefix=None, authors_as_string=False):
def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
'''
Return all metadata stored in the database as a list of dicts. Includes
paths to the cover and each format.
:param prefix: The prefix for all paths. By default, the prefix is the absolute path
to the library folder.
:param ids: Set of ids to return the data for. If None, return data for
all entries in the database.
'''
if prefix is None:
prefix = self.library_path
@@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase):
data = []
for record in self.data:
if record is None: continue
db_id = record[FIELD_MAP['id']]
if ids is not None and db_id not in ids:
continue
x = {}
for field in FIELDS:
x[field] = record[FIELD_MAP[field]]
data.append(x)
x['id'] = record[FIELD_MAP['id']]
x['id'] = db_id
x['formats'] = []
if not x['authors']:
x['authors'] = _('Unknown')
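
A hedged usage sketch of the new ids parameter; db stands for an open LibraryDatabase2 instance and the id values are illustrative:

    # Only records whose id is in the given set are returned; ids=None
    # (the default) keeps the old return-everything behaviour.
    rows = db.get_data_as_dict(ids=set([1, 5, 12]))
    for row in rows:
        print row['id'], row['authors']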

View File

@@ -524,6 +524,7 @@ class DynamicConfig(dict):
pass
except:
import traceback
print 'Failed to unpickle stored object:'
traceback.print_exc()
d = {}
self.clear()

View File

@@ -104,6 +104,7 @@ _extra_lang_codes = {
'en_CY' : _('English (Cyprus)'),
'en_PK' : _('English (Pakistan)'),
'en_SG' : _('English (Singapore)'),
'en_YE' : _('English (Yemen)'),
'de_AT' : _('German (AT)'),
'nl' : _('Dutch (NL)'),
'nl_BE' : _('Dutch (BE)'),

View File

@@ -9,9 +9,22 @@ __docformat__ = 'restructuredtext en'
import __builtin__, sys, os
_dev_path = os.environ.get('CALIBRE_DEVELOP_FROM', None)
if _dev_path is not None:
_dev_path = os.path.join(os.path.abspath(os.path.dirname(_dev_path)), 'resources')
if not os.path.exists(_dev_path):
_dev_path = None
def get_path(path, data=False):
global _dev_path
path = path.replace(os.sep, '/')
path = os.path.join(sys.resources_location, *path.split('/'))
base = None
if _dev_path is not None:
if os.path.exists(os.path.join(_dev_path, *path.split('/'))):
base = _dev_path
if base is None:
base = sys.resources_location
path = os.path.join(base, *path.split('/'))
if data:
return open(path, 'rb').read()
return path
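
A short sketch of the lookup order above: with CALIBRE_DEVELOP_FROM set, a resource is served from the development tree when it exists there, otherwise from the installed sys.resources_location. Elsewhere in this commit the same function is reached through the P() alias, e.g. P('viewer/images.js', data=True) in the viewer hunk:

    raw  = get_path('viewer/images.js', data=True)   # file contents (bytes)
    path = get_path('viewer/images.js')              # filesystem path only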

View File

@@ -357,9 +357,17 @@ class BasicNewsRecipe(Recipe):
Override in a subclass to customize extraction of the :term:`URL` that points
to the content for each article. Return the
article URL. It is called with `article`, an object representing a parsed article
from a feed. See `feedsparser <http://www.feedparser.org/docs/>`_.
By default it returns `article.link <http://www.feedparser.org/docs/reference-entry-link.html>`_.
from a feed. See `feedparser <http://www.feedparser.org/docs/>`_.
By default it looks for the original link (for feeds syndicated via a
service like feedburner or pheedo) and, if one is found, returns it;
otherwise it returns
`article.link <http://www.feedparser.org/docs/reference-entry-link.html>`_.
'''
for key in article.keys():
if key.endswith('_origlink'):
url = article[key]
if url and url.startswith('http://'):
return url
return article.get('link', None)
def preprocess_html(self, soup):
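
A hedged sketch of a recipe overriding this hook per the docstring above; the 'guid' key is illustrative, and the fallback delegates to the origlink-aware default:

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'

        def get_article_url(self, article):
            # Prefer a feed-specific field when present (illustrative key);
            # otherwise fall back to the default implementation above.
            url = article.get('guid', None)
            if url and url.startswith('http://'):
                return url
            return BasicNewsRecipe.get_article_url(self, article)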