import re
from calibre.web.feeds.news import BasicNewsRecipe


class ADRecipe(BasicNewsRecipe):
    """Download news and sports from AD (ad.nl), a Dutch national daily."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 1

    title = u'AD'
    publisher = u'de Persgroep Publishing Nederland NV'
    category = u'News, Sports, the Netherlands'
    description = u'News and Sports from the Netherlands'

    oldest_article = 1.2
    max_articles_per_feed = 100
    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = []
    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
    keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))

    remove_tags = []
    remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
    remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))

    remove_attributes = ['style']

    # feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
    feeds = []
    feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
    feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
    feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
    feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
    feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
    feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
    feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
    feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
    feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
    feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
    feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
    feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
    feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
    feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
    feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
    feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
    feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
    feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
    feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
    feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
    feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
    feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
    feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
    feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
    feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
        .gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
    '''

    # BUG FIX: the language here was hard-coded to 'en' although this is a
    # Dutch ('nl') recipe; reuse the class attribute so the two cannot drift.
    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher}

    def print_version(self, url):
        """Rewrite an article URL into its printer-friendly counterpart.

        The print view reshuffles the path components and inserts a
        '/print/' segment; the indexes below follow the observed URL layout
        (assumes the standard ad.nl article URL shape -- TODO confirm).
        """
        parts = url.split('/')
        print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
                    + parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]

        return print_url

    def preprocess_html(self, soup):
        """Collapse runs of three or more <br> tags down to two."""
        for br in soup.findAll('br'):
            # Renamed locals: 'next' shadowed the builtin of the same name.
            prior_tag = br.findPreviousSibling(True)
            if hasattr(prior_tag, 'name') and prior_tag.name == 'br':
                next_tag = br.findNextSibling(True)
                if hasattr(next_tag, 'name') and next_tag.name == 'br':
                    # A <br> flanked by <br> on both sides is redundant.
                    br.extract()

        return soup
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
news.bbc.co.uk
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class BBC(BasicNewsRecipe):
    """Text-mostly BBC News recipe: skips pictures, so it downloads fast."""

    title = 'BBC News (fast)'
    __author__ = 'Darko Miletic'
    description = 'News from UK. A much faster version that does not download pictures'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'BBC'
    category = 'news, UK, world'
    language = 'en'
    extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    remove_tags_before = dict(name='div', attrs={'class': 'headline'})
    remove_tags_after = dict(name='div', attrs={'class': 'footer'})
    remove_tags = [
        dict(name=['object', 'link', 'script', 'iframe']),
        dict(name='div', attrs={'class': 'footer'}),
    ]

    # Every section feed hangs off the same world-edition RSS root, so the
    # list is generated from (title, path) pairs instead of spelled out.
    feeds = [(name, 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/%s/rss.xml' % path)
             for name, path in (
                 ('News Front Page', 'front_page'),
                 ('Science/Nature', 'science/nature'),
                 ('Technology', 'technology'),
                 ('Entertainment', 'entertainment'),
                 ('Magazine', 'uk_news/magazine'),
                 ('Business', 'business'),
                 ('Health', 'health'),
                 ('Americas', 'americas'),
                 ('Europe', 'europe'),
                 ('South Asia', 'south_asia'),
                 ('UK', 'uk_news'),
                 ('Asia-Pacific', 'asia-pacific'),
                 ('Africa', 'africa'),
             )]

    def print_version(self, url):
        # Swap the scheme+host prefix for the print-friendly newsvote host;
        # partition()[2] is everything after the 'http://' separator.
        return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + url.partition('http://')[2]

    def get_article_url(self, article):
        # Prefer the stable guid over the feed's link element.
        return article.get('guid', None)
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    """Shared recipe for the CanWest newspaper chain (canada.com sites).

    One class serves several papers: exactly one title/url_prefix/description
    triple below should be un-commented to select the paper.
    """

    # un-comment the following three lines for the Calgary Herald
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'


    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id': 'storyheader'}),
                      dict(name='div', attrs={'id': 'storycontent'})]
    remove_tags = [{'class': 'comments'},
                   dict(name='div', attrs={'class': 'navbar'}),
                   dict(name='div', attrs={'class': 'morelinks'}),
                   dict(name='div', attrs={'class': 'viewmore'}),
                   dict(name='li', attrs={'class': 'email'}),
                   dict(name='div', attrs={'class': 'story_tool_hr'}),
                   dict(name='div', attrs={'class': 'clear'}),
                   dict(name='div', attrs={'class': 'story_tool'}),
                   dict(name='div', attrs={'class': 'copyright'}),
                   dict(name='div', attrs={'class': 'rule_grey_solid'}),
                   dict(name='li', attrs={'class': 'print'}),
                   dict(name='li', attrs={'class': 'share'}),
                   dict(name='ul', attrs={'class': 'bullet'})]

    def preprocess_html(self, soup):
        # Delete empty id attributes -- they screw up the TOC for unknown reasons.
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        """Scrape the 'today's paper' index into [(section, [article dicts])]."""
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            # dict.has_key() is deprecated (and gone in Python 3);
            # setdefault replaces the has_key/assign pair in one step.
            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        # Preserve page order of sections; drop sections with no articles.
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
from calibre.web.feeds.news import BasicNewsRecipe


class CJR(BasicNewsRecipe):
    """Columbia Journalism Review -- news about journalism (cjr.org)."""

    title = u'Columbia Journalism Review'
    __author__ = u'Xanthan Gum'
    description = 'News about journalism.'
    language = 'en'

    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [(u'News Stories', u'http://www.cjr.org/index.xml')]

    def print_version(self, url):
        # Request the single-page, printer-friendly rendition.
        return '%s?page=all&print=true' % url
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
digitaljournal.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class DigitalJournal(BasicNewsRecipe):
    """Digital Journal -- a global citizen-journalism news network."""

    title = 'Digital Journal'
    __author__ = 'Darko Miletic'
    description = 'A Global Citizen Journalism News Network'
    category = 'news, politics, USA, world'
    publisher = 'Digital Journal'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    language = 'en'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [dict(name='div', attrs={'class': ['article', 'body']})]

    remove_tags = [dict(name=['object', 'table'])]

    # The latest-news feed has its own endpoint; every topical feed shares
    # one URL scheme where depname equals the section title.
    feeds = [(u'Latest News', u'http://digitaljournal.com/rss/?feed=latest_news')]
    feeds += [(name, u'http://digitaljournal.com/rss/?feed=top_news&depname=' + name)
              for name in (u'Business', u'Entertainment', u'Environment', u'Food',
                           u'Health', u'Internet', u'Politics', u'Religion',
                           u'Science', u'Sports', u'Technology', u'World', u'Arts')]

    def print_version(self, url):
        # The print rendition lives under /print/ with an otherwise identical path.
        return url.replace('digitaljournal.com/', 'digitaljournal.com/print/')
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    """Shared recipe for the CanWest newspaper chain (canada.com sites).

    One class serves several papers: exactly one title/url_prefix/description
    triple below should be un-commented to select the paper.
    """

    # un-comment the following three lines for the Edmonton Journal
    title = u'Edmonton Journal'
    url_prefix = 'http://www.edmontonjournal.com'
    description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'


    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id': 'storyheader'}),
                      dict(name='div', attrs={'id': 'storycontent'})]
    remove_tags = [{'class': 'comments'},
                   dict(name='div', attrs={'class': 'navbar'}),
                   dict(name='div', attrs={'class': 'morelinks'}),
                   dict(name='div', attrs={'class': 'viewmore'}),
                   dict(name='li', attrs={'class': 'email'}),
                   dict(name='div', attrs={'class': 'story_tool_hr'}),
                   dict(name='div', attrs={'class': 'clear'}),
                   dict(name='div', attrs={'class': 'story_tool'}),
                   dict(name='div', attrs={'class': 'copyright'}),
                   dict(name='div', attrs={'class': 'rule_grey_solid'}),
                   dict(name='li', attrs={'class': 'print'}),
                   dict(name='li', attrs={'class': 'share'}),
                   dict(name='ul', attrs={'class': 'bullet'})]

    def preprocess_html(self, soup):
        # Delete empty id attributes -- they screw up the TOC for unknown reasons.
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        """Scrape the 'today's paper' index into [(section, [article dicts])]."""
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            # dict.has_key() is deprecated (and gone in Python 3);
            # setdefault replaces the has_key/assign pair in one step.
            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        # Preserve page order of sections; drop sections with no articles.
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
+ dict(id='ExpV-1-0-0-0'), + dict(id='PollExpV-2-0-0-0'), dict(id='starRating'), dict(id='saveRating'), dict(id='yLayer'), @@ -44,14 +50,20 @@ class FTDe(BasicNewsRecipe): dict(name='ul', attrs={'class':'nav'}), dict(name='p', attrs={'class':'articleOptionHead'}), dict(name='p', attrs={'class':'articleOptionFoot'}), + dict(name='p', attrs={'class':'moreInfo'}), dict(name='div', attrs={'class':'chartBox'}), dict(name='div', attrs={'class':'ratingOpt starRatingContainer articleOptionFootFrame'}), dict(name='div', attrs={'class':'box boxArticleBasic boxComments boxTransparent'}), - dict(name='div', attrs={'class':'box boxNavTabs '}), + dict(name='div', attrs={'class':'box boxNavTabs'}), + dict(name='div', attrs={'class':'boxMMRgtLow'}), dict(name='span', attrs={'class':'vote_455857'}), dict(name='div', attrs={'class':'relatedhalb'}), dict(name='div', attrs={'class':'box boxListScrollOutline'}), + dict(name='div', attrs={'class':'box boxPhotoshow boxImgWide'}), + dict(name='div', attrs={'class':'box boxTeaser boxPhotoshow boxImgWide'}), + dict(name='div', attrs={'class':'box boxTeaser'}), dict(name='div', attrs={'class':'tagCloud'}), + dict(name='div', attrs={'class':'pollView'}), dict(name='div', attrs={'class':'box boxArticleBasic boxNavTabsOutline'}), dict(name='div', attrs={'class':'ftdHpNav'}), dict(name='div', attrs={'class':'ftdHead'}), @@ -67,11 +79,12 @@ class FTDe(BasicNewsRecipe): dict(name='div', attrs={'class':'wertungoben'}), dict(name='div', attrs={'class':'artikelfuss'}), dict(name='a', attrs={'class':'rating'}), + dict(name='a', attrs={'href':'#rt'}), dict(name='div', attrs={'class':'articleOptionFootFrame'}), dict(name='div', attrs={'class':'artikelsplitfaq'})] - remove_tags_after = [dict(name='a', attrs={'class':'more'})] - - feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'), + #remove_tags_after = [dict(name='a', attrs={'class':'more'})] + + feeds = [ ('Finanzen', 'http://www.ftd.de/rss2/finanzen/maerkte'), ('Meinungshungrige', 
'http://www.ftd.de/rss2/meinungshungrige'), ('Unternehmen', 'http://www.ftd.de/rss2/unternehmen'), ('Politik', 'http://www.ftd.de/rss2/politik'), @@ -82,8 +95,8 @@ class FTDe(BasicNewsRecipe): ('Auto', 'http://www.ftd.de/rss2/auto'), ('Lifestyle', 'http://www.ftd.de/rss2/lifestyle') - ] - + ] + def print_version(self, url): - return url + '?mode=print' + return url.replace('.html', '.html?mode=print') diff --git a/resources/recipes/globe_and_mail.recipe b/resources/recipes/globe_and_mail.recipe index 71d6b2b304..0ef8bd9dd8 100644 --- a/resources/recipes/globe_and_mail.recipe +++ b/resources/recipes/globe_and_mail.recipe @@ -32,7 +32,7 @@ class GlobeAndMail(BasicNewsRecipe): 'gallery-controls', 'video', 'galleryLoading','deck','header', 'toolsBottom'] }, {'class':['credit','inline-img-caption','tab-pointer'] }, - dict(name='div', attrs={'id':'lead-photo'}), + dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}), dict(name='div', attrs={'class':'right'}), dict(name='div', attrs={'id':'footer'}), dict(name='div', attrs={'id':'beta-msg'}), diff --git a/resources/recipes/kitsapun.recipe b/resources/recipes/kitsapun.recipe new file mode 100644 index 0000000000..e9a7c42f06 --- /dev/null +++ b/resources/recipes/kitsapun.recipe @@ -0,0 +1,44 @@ + +__license__ = 'GPL v3' +__copyright__ = '2010, Darko Miletic ' +''' +www.kitsapun.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class Kitsapsun(BasicNewsRecipe): + title = 'Kitsap Sun' + __author__ = 'Darko Miletic' + description = 'News from Kitsap County' + publisher = 'Scripps Interactive Newspapers Group' + category = 'news, Kitsap county, USA' + language = 'en' + oldest_article = 2 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'cp1252' + use_embedded_content = False + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'language' : language + ,'publisher': publisher + } + + + keep_only_tags = [dict(name='div', 
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
www.kitsapsun.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Kitsapsun(BasicNewsRecipe):
    """Kitsap Sun -- local news for Kitsap County (kitsapsun.com)."""

    title = 'Kitsap Sun'
    __author__ = 'Darko Miletic'
    description = 'News from Kitsap County'
    publisher = 'Scripps Interactive Newspapers Group'
    category = 'news, Kitsap county, USA'
    language = 'en'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [dict(name='div', attrs={'id': ['story_meta', 'story_content']})]

    remove_tags = [dict(name=['object', 'link', 'embed', 'form', 'iframe'])]

    # Section feeds; every URL follows the same /rss/headlines/<section>/ scheme.
    feeds = [(name, u'http://www.kitsapsun.com/rss/headlines/%s/' % slug)
             for name, slug in (
                 (u'News', 'news'),
                 (u'Business', 'business'),
                 (u'Communities', 'communities'),
                 (u'Entertainment', 'entertainment'),
                 (u'Lifestyles', 'lifestyles'),
             )]

    def print_version(self, url):
        # Drop the final path component and request the print rendition.
        base, _sep, _tail = url.rpartition('/')
        return base + '/?print=1'
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini'
__copyright__ = '2009, Lorenzo Vigentini '
__version__ = 'v1.01'
__date__ = '14, January 2010'
__description__ = 'Canadian Paper '

'''
http://www.ledevoir.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class ledevoir(BasicNewsRecipe):
    """Le Devoir -- French-language Canadian daily (ledevoir.com)."""

    author = 'Lorenzo Vigentini'
    description = 'Canadian Paper'

    cover_url = 'http://www.ledevoir.com/images/ul/graphiques/logo_devoir.gif'
    title = u'Le Devoir'
    publisher = 'leDevoir.com'
    category = 'News, finance, economy, politics'

    language = 'fr'
    encoding = 'utf-8'
    timefmt = '[%a, %d %b, %Y]'

    max_articles_per_feed = 50
    use_embedded_content = False
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    # Keep only the article body and the breadcrumb trail.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article'}),
        dict(name='ul', attrs={'id': 'ariane'}),
    ]

    # Strip reader-reaction widgets, tag clouds and navigation chrome.
    remove_tags = [
        dict(name='div', attrs={'id': 'dialog'}),
        dict(name='div', attrs={'class': ['interesse_actions', 'reactions']}),
        dict(name='ul', attrs={'class': 'mots_cles'}),
        dict(name='a', attrs={'class': 'haut'}),
        dict(name='h5', attrs={'class': 'interesse_actions'}),
    ]

    feeds = [
        (u'A la une', 'http://www.ledevoir.com/rss/manchettes.xml'),
        (u'Edition complete', 'http://feeds2.feedburner.com/fluxdudevoir'),
        (u'Opinions', 'http://www.ledevoir.com/rss/opinions.xml'),
        (u'Chroniques', 'http://www.ledevoir.com/rss/chroniques.xml'),
        (u'Politique', 'http://www.ledevoir.com/rss/section/politique.xml?id=51'),
        (u'International', 'http://www.ledevoir.com/rss/section/international.xml?id=76'),
        (u'Culture', 'http://www.ledevoir.com/rss/section/culture.xml?id=48'),
        (u'Environnement', 'http://www.ledevoir.com/rss/section/environnement.xml?id=78'),
        (u'Societe', 'http://www.ledevoir.com/rss/section/societe.xml?id=52'),
        (u'Economie', 'http://www.ledevoir.com/rss/section/economie.xml?id=49'),
        (u'Sports', 'http://www.ledevoir.com/rss/section/sports.xml?id=85'),
        (u'Loisirs', 'http://www.ledevoir.com/rss/section/loisirs.xml?id=50'),
    ]

    extra_css = '''
    h1 {color:#1C1E7C;font-family:Times,Georgia,serif;font-size:1.85em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:bold;line-height:1.2em;margin:0 0 5px;}
    h2 {color:#333333;font-family:Times,Georgia,serif;font-size:1.5em;font-size-adjust:none;font-stretch:normal;font-style:normal;font-variant:normal;font-weight:normal;line-height:1.2em;margin:0 0 5px;}
    h3 {color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px;}
    h4 {color:#333333; font-family:Arial,Helvetica,sans-serif;font-size:13px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; }
    h5 {color:#333333; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; line-height:14px; text-transform:uppercase;}
    .specs {line-height:1em;margin:1px 0;}
    .specs span.auteur {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
    .specs span.auteur a,
    .specs span.auteur span {text-transform:uppercase;color:#787878;}
    .specs .date {font:0.85em/1.1em Arial, Verdana, sans-serif;color:#787878;}
    ul#ariane {list-style-type:none;margin:0;padding:5px 0 8px 0;font:0.85em/1.2em Arial, Verdana, sans-serif;color:#2E2E2E;border-bottom:10px solid #fff;}
    ul#ariane li {display:inline;}
    ul#ariane a {color:#2E2E2E;text-decoration:underline;}
    .credit {color:#787878;font-size:0.71em;line-height:1.1em;font-weight:bold;}
    .texte {font-size:1.15em;line-height:1.4em;margin-bottom:17px;}
    '''
dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/ottawa_citizen.recipe b/resources/recipes/ottawa_citizen.recipe new file mode 100644 index 
0000000000..5465212d4c --- /dev/null +++ b/resources/recipes/ottawa_citizen.recipe @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Ottawa Citizen + title = u'Ottawa Citizen' + url_prefix = 'http://www.ottawacitizen.com' + description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + 
del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/pajama.recipe b/resources/recipes/pajama.recipe new file mode 100644 index 0000000000..8c5ba74317 --- /dev/null +++ b/resources/recipes/pajama.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class PajamasMedia(BasicNewsRecipe): + title = u'Pajamas Media' + description = u'Provides exclusive news and opinion for forty countries.' 
+ language = 'en' + __author__ = 'Krittika Goyal' + oldest_article = 1 #days + max_articles_per_feed = 25 + recursions = 1 + match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$'] + #encoding = 'latin1' + + remove_stylesheets = True + #remove_tags_before = dict(name='h1', attrs={'class':'heading'}) + remove_tags_after = dict(name='div', attrs={'class':'paged-nav'}) + remove_tags = [ + dict(name='iframe'), + dict(name='div', attrs={'class':['pages']}), + #dict(name='div', attrs={'id':['bookmark']}), + #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}), + #dict(name='ul', attrs={'class':'articleTools'}), + ] + + feeds = [ +('pajamas Media', + 'http://feeds.feedburner.com/PajamasMedia'), + +] + + def preprocess_html(self, soup): + story = soup.find(name='div', attrs={'id':'innerpage-content'}) + #td = heading.findParent(name='td') + #td.extract() + + soup = BeautifulSoup('t') + body = soup.find(name='body') + body.insert(0, story) + return soup + + def postprocess_html(self, soup, first): + if not first: + h = soup.find(attrs={'class':'innerpage-header'}) + if h: h.extract() + auth = soup.find(attrs={'class':'author'}) + if auth: auth.extract() + return soup diff --git a/resources/recipes/physics_today.recipe b/resources/recipes/physics_today.recipe index 9b236ff23c..d1ce17cf32 100644 --- a/resources/recipes/physics_today.recipe +++ b/resources/recipes/physics_today.recipe @@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe): description = u'Physics Today magazine' publisher = 'American Institute of Physics' category = 'Physics' - language = 'en' - + language = 'en' cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg') oldest_article = 30 max_articles_per_feed = 100 @@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe): def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - br.open('http://www.physicstoday.org/pt/sso_login.jsp') 
- br.select_form(name='login') + br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f') + br.select_form(name='login_form') br['username'] = self.username br['password'] = self.password br.submit() return br - feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')] \ No newline at end of file + feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')] diff --git a/resources/recipes/readers_digest.recipe b/resources/recipes/readers_digest.recipe new file mode 100644 index 0000000000..3689ca4c53 --- /dev/null +++ b/resources/recipes/readers_digest.recipe @@ -0,0 +1,188 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +''' +''' +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.web.feeds import Feed + + +class ReadersDigest(BasicNewsRecipe): + + title = 'Readers Digest' + __author__ = 'BrianG' + language = 'en' + description = 'Readers Digest Feeds' + no_stylesheets = True + use_embedded_content = False + oldest_article = 60 + max_articles_per_feed = 200 + + language = 'en' + remove_javascript = True + + extra_css = ''' h1 {font-family:georgia,serif;color:#000000;} + .mainHd{font-family:georgia,serif;color:#000000;} + h2 {font-family:Arial,Sans-serif;} + .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; } + .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;} + .byline{font-family:Arial,Sans-serif; font-size:x-small ;} + .photoBkt{ font-size:x-small ;} + .vertPhoto{font-size:x-small ;} + .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;} + .artTxt{font-family:georgia,serif;} + .caption{font-family:georgia,serif; font-size:x-small;color:#333333;} + .credit{font-family:georgia,serif; font-size:x-small;color:#999999;} + a:link{color:#CC0000;} + .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;} + ''' + + + remove_tags = [ + dict(name='h4', attrs={'class':'close'}), + dict(name='div', 
attrs={'class':'fromLine'}), + dict(name='img', attrs={'class':'colorTag'}), + dict(name='div', attrs={'id':'sponsorArticleHeader'}), + dict(name='div', attrs={'class':'horizontalAd'}), + dict(name='div', attrs={'id':'imageCounterLeft'}), + dict(name='div', attrs={'id':'commentsPrint'}) + ] + + + feeds = [ + ('New in RD', 'http://feeds.rd.com/ReadersDigest'), + ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'), + ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'), + ('Blogs','http://feeds.rd.com/ReadersDigestBlogs') + ] + + cover_url = 'http://www.rd.com/images/logo-main-rd.gif' + + + +#------------------------------------------------------------------------------------------------- + + def print_version(self, url): + + # Get the identity number of the current article and append it to the root print URL + + if url.find('/article') > 0: + ident = url[url.find('/article')+8:url.find('.html?')-4] + url = 'http://www.rd.com/content/printContent.do?contentId=' + ident + + elif url.find('/post') > 0: + + # in this case, have to get the page itself to derive the Print page. 
+ soup = self.index_to_soup(url) + newsoup = soup.find('ul',attrs={'class':'printBlock'}) + url = 'http://www.rd.com' + newsoup('a')[0]['href'] + url = url[0:url.find('&Keep')] + + return url + +#------------------------------------------------------------------------------------------------- + + def parse_index(self): + + pages = [ + ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}), + # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}), + ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'}) + + ] + + feeds = [] + + for page in pages: + section, url, divider, attrList = page + newArticles = self.page_parse(url, divider, attrList) + feeds.append((section,newArticles)) + + # after the pages of the site have been processed, parse several RSS feeds for additional sections + newfeeds = Feed() + newfeeds = self.parse_rss() + + + # The utility code in parse_rss returns a Feed object. Convert each feed/article combination into a form suitable + # for this module (parse_index). + + for feed in newfeeds: + newArticles = [] + for article in feed.articles: + newArt = { + 'title' : article.title, + 'url' : article.url, + 'date' : article.date, + 'description' : article.text_summary + } + newArticles.append(newArt) + + + # New and Blogs should be the first two feeds. 
+ if feed.title == 'New in RD': + feeds.insert(0,(feed.title,newArticles)) + elif feed.title == 'Blogs': + feeds.insert(1,(feed.title,newArticles)) + else: + feeds.append((feed.title,newArticles)) + + + return feeds + +#------------------------------------------------------------------------------------------------- + + def page_parse(self, mainurl, divider, attrList): + + articles = [] + mainsoup = self.index_to_soup(mainurl) + for item in mainsoup.findAll(attrs=attrList): + newArticle = { + 'title' : item('img')[0]['alt'], + 'url' : 'http://www.rd.com'+item('a')[0]['href'], + 'date' : '', + 'description' : '' + } + articles.append(newArticle) + + + + return articles + + + +#------------------------------------------------------------------------------------------------- + + def parse_rss (self): + + # Do the "official" parse_feeds first + feeds = BasicNewsRecipe.parse_feeds(self) + + + # Loop thru the articles in all feeds to find articles with "recipe" in it + recipeArticles = [] + for curfeed in feeds: + delList = [] + for a,curarticle in enumerate(curfeed.articles): + if curarticle.title.upper().find('RECIPE') >= 0: + recipeArticles.append(curarticle) + delList.append(curarticle) + if len(delList)>0: + for d in delList: + index = curfeed.articles.index(d) + curfeed.articles[index:index+1] = [] + + # If there are any recipes found, create a new Feed object and append. 
+ if len(recipeArticles) > 0: + pfeed = Feed() + pfeed.title = 'Recipes' + pfeed.descrition = 'Recipe Feed (Virtual)' + pfeed.image_url = None + pfeed.oldest_article = 30 + pfeed.id_counter = len(recipeArticles) + # Create a new Feed, add the recipe articles, and then append + # to "official" list of feeds + pfeed.articles = recipeArticles[:] + feeds.append(pfeed) + + return feeds + diff --git a/resources/recipes/regina_leader_post.recipe b/resources/recipes/regina_leader_post.recipe new file mode 100644 index 0000000000..9efec51848 --- /dev/null +++ b/resources/recipes/regina_leader_post.recipe @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Regina Leader-Post + title = u'Regina Leader-Post' + url_prefix = 'http://www.leaderpost.com' + description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + 
#storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = 
self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/saskatoon_star_phoenix.recipe b/resources/recipes/saskatoon_star_phoenix.recipe new file mode 100644 index 0000000000..25330478d4 --- /dev/null +++ b/resources/recipes/saskatoon_star_phoenix.recipe @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Saskatoon Star-Phoenix + title = u'Saskatoon Star-Phoenix' + url_prefix = 'http://www.thestarphoenix.com' + description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + 
#storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + 
#self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_provice.recipe b/resources/recipes/vancouver_provice.recipe new file mode 100644 index 0000000000..9375670c59 --- /dev/null +++ b/resources/recipes/vancouver_provice.recipe @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Province + title = u'Vancouver Province' + url_prefix = 'http://www.theprovince.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # 
un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for 
unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_sun.recipe b/resources/recipes/vancouver_sun.recipe new file mode 100644 index 0000000000..8f12869bf9 --- /dev/null +++ b/resources/recipes/vancouver_sun.recipe @@ -0,0 +1,131 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): 
+ + # un-comment the following three lines for the Vancouver Sun + title = u'Vancouver Sun' + url_prefix = 'http://www.vancouversun.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', 
attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + 
autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vic_times.recipe b/resources/recipes/vic_times.recipe new file mode 100644 index 0000000000..2dc8e96003 --- /dev/null +++ b/resources/recipes/vic_times.recipe @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Victoria Times Colonist + title = u'Victoria Times Colonist' + url_prefix = 'http://www.timescolonist.com' + description = u'News from Victoria, BC' + + # un-comment the following three lines for the Vancouver Province + #title = u'Vancouver Province' + #url_prefix = 'http://www.theprovince.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' 
+ #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + 
del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/windows_star.recipe b/resources/recipes/windows_star.recipe new file mode 100644 index 0000000000..4d34261bb7 --- /dev/null +++ b/resources/recipes/windows_star.recipe @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Windsor Star + title = u'Windsor Star' + url_prefix 
= 'http://www.windsorstar.com' + description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each 
instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index 70c05b1ded..3ced77023d 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe +from calibre import strftime # http://online.wsj.com/page/us_in_todays_paper.html @@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe): def parse_index(self): soup = self.wsj_get_index() + year = strftime('%Y') + for x in soup.findAll('td', attrs={'class':'b14'}): + txt = self.tag_to_string(x).strip() + if year in txt: + self.timefmt = ' [%s]'%txt + break + left_column = soup.find( text=lambda t: 
'begin ITP Left Column' in str(t)) @@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe): url = url.partition('#')[0] desc = '' d = x.findNextSibling(True) - if d.get('class', None) == 'arialResize': + if d is not None and d.get('class', None) == 'arialResize': desc = self.tag_to_string(d) desc = desc.partition(u'\u2022')[0] self.log('\t\tFound article:', title) diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b05da400ae..b190f43849 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -3,47 +3,139 @@ __license__ = 'GPL v3' ''' -online.wsj.com.com +online.wsj.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, NavigableString +from datetime import timedelta, date class WSJ(BasicNewsRecipe): # formatting adapted from original recipe by Kovid Goyal and Sujata Raman title = u'Wall Street Journal (free)' __author__ = 'Nick Redding' language = 'en' - description = ('All the free content from the Wall Street Journal (business' - ', financial and political news)') + description = ('All the free content from the Wall Street Journal (business, financial and political news)') + no_stylesheets = True timefmt = ' [%b %d]' - extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;} - h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;} - .targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;} - .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .tagline { ont-size:xx-small;} - .dateStamp {font-family:Arial,Helvetica,sans-serif;} - h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} - .byline 
{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;} + + # customization notes: delete sections you are not interested in + # set omit_paid_content to False if you want the paid content article snippets + # set oldest_article to the maximum number of days back from today to include articles + sectionlist = [ + ['/home-page','Front Page'], + ['/public/page/news-opinion-commentary.html','Commentary'], + ['/public/page/news-global-world.html','World News'], + ['/public/page/news-world-business.html','US News'], + ['/public/page/news-business-us.html','Business'], + ['/public/page/news-financial-markets-stock.html','Markets'], + ['/public/page/news-tech-technology.html','Technology'], + ['/public/page/news-personal-finance.html','Personal Finnce'], + ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'], + ['/public/page/news-real-estate-homes.html','Real Estate'], + ['/public/page/news-career-jobs.html','Careers'], + ['/public/page/news-small-business-marketing.html','Small Business'] + ] + oldest_article = 2 + omit_paid_content = True + + extra_css = '''h1{font-size:large; font-family:Times,serif;} + h2{font-family:Times,serif; font-size:small; font-style:italic;} + .subhead{font-family:Times,serif; font-size:small; font-style:italic;} + .insettipUnit {font-family:Times,serif;font-size:xx-small;} + .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;} + .article{font-family:Times,serif; font-size:x-small;} + .tagline { font-size:xx-small;} + .dateStamp {font-family:Times,serif;} + h3{font-family:Times,serif; font-size:xx-small;} + .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;} .metadataType-articleCredits {list-style-type: none;} - h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;} + h6{font-family:Times,serif; font-size:small; font-style:italic;} 
.paperLocation{font-size:xx-small;}''' - remove_tags_before = dict(name='h1') - remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", - "articleTabs_tab_interactive","articleTabs_tab_video", - "articleTabs_tab_map","articleTabs_tab_slideshow"]), - {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map', - 'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip', - 'adSummary', 'nav-inline','insetFullBracket']}, - dict(rel='shortcut icon'), + + remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')}) + remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}), + #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", + # "articleTabs_tab_interactive","articleTabs_tab_video", + # "articleTabs_tab_map","articleTabs_tab_slideshow"]), + {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map', + 'insettip','insetClose','more_in', "insetContent", + # 'articleTools_bottom','articleTools_bottom mjArticleTools', + 'aTools', 'tooltip', + 'adSummary', 'nav-inline','insetFullBracket']}, + dict({'class':re.compile('^articleTools_bottom')}), + dict(rel='shortcut icon') ] remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}] + def get_browser(self): + br = BasicNewsRecipe.get_browser() + return br + def preprocess_html(self,soup): + + def decode_us_date(datestr): + udate = datestr.strip().lower().split() + m = ['january','february','march','april','may','june','july','august','september','october','november','december'].index(udate[0])+1 + d = int(udate[1]) + y = int(udate[2]) + return date(y,m,d) + + # check if article is paid content + if self.omit_paid_content: + divtags = soup.findAll('div','tooltip') + if divtags: + for divtag in divtags: + if divtag.find(text="Subscriber Content"): + return None + + # check if article is too old + datetag = soup.find('li',attrs={'class' : 
re.compile("^dateStamp")}) + if datetag: + dateline_string = self.tag_to_string(datetag,False) + date_items = dateline_string.split(',') + datestring = date_items[0]+date_items[1] + article_date = decode_us_date(datestring) + earliest_date = date.today() - timedelta(days=self.oldest_article) + if article_date < earliest_date: + self.log("Skipping article dated %s" % datestring) + return None + datetag.parent.extract() + + # place dateline in article heading + + bylinetag = soup.find('h3','byline') + if bylinetag: + h3bylinetag = bylinetag + else: + bylinetag = soup.find('li','byline') + if bylinetag: + h3bylinetag = bylinetag.h3 + if not h3bylinetag: + h3bylinetag = bylinetag + bylinetag = bylinetag.parent + if bylinetag: + if h3bylinetag.a: + bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False) + else: + bylinetext = self.tag_to_string(h3bylinetag,False) + h3byline = Tag(soup,'h3',[('class','byline')]) + if bylinetext.isspace() or (bylinetext == ''): + h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1])) + else: + h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1])) + bylinetag.replaceWith(h3byline) + else: + headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")}) + if headlinetag: + dateline = Tag(soup,'h3', [('class','byline')]) + dateline.insert(0,NavigableString(date_items[0]+','+date_items[1])) + headlinetag.insert(len(headlinetag),dateline) + else: # if no date tag, don't process this page--it's not a news item + return None # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'}) if ultag: @@ -58,7 +150,7 @@ class WSJ(BasicNewsRecipe): key = None ans = [] - def parse_index_page(page_name,page_title,omit_paid_content): + def parse_index_page(page_name,page_title): def article_title(tag): atag = tag.find('h2') # title is usually in an h2 tag @@ -119,7 +211,6 @@ class 
WSJ(BasicNewsRecipe): soup = self.index_to_soup(pageurl) # Find each instance of div with class including "headlineSummary" for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}): - # divtag contains all article data as ul's and li's # first, check if there is an h3 tag which provides a section name stag = divtag.find('h3') @@ -162,7 +253,7 @@ class WSJ(BasicNewsRecipe): # now skip paid subscriber articles if desired subscriber_tag = litag.find(text="Subscriber Content") if subscriber_tag: - if omit_paid_content: + if self.omit_paid_content: continue # delete the tip div so it doesn't get in the way tiptag = litag.find("div", { "class" : "tipTargetBox" }) @@ -185,7 +276,7 @@ class WSJ(BasicNewsRecipe): continue if url.startswith("/article"): url = mainurl+url - if not url.startswith("http"): + if not url.startswith("http://online.wsj.com"): continue if not url.endswith(".html"): continue @@ -214,48 +305,10 @@ class WSJ(BasicNewsRecipe): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) - # customization notes: delete sections you are not interested in - # set omit_paid_content to False if you want the paid content article previews - sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets', - 'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business'] - omit_paid_content = True - if 'Front Page' in sectionlist: - parse_index_page('/home-page','Front Page',omit_paid_content) - ans.append('Front Page') - if 'Commentary' in sectionlist: - parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content) - ans.append('Commentary') - if 'World News' in sectionlist: - parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content) - ans.append('World News') - if 'US News' in sectionlist: - 
parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content) - ans.append('US News') - if 'Business' in sectionlist: - parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content) - ans.append('Business') - if 'Markets' in sectionlist: - parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content) - ans.append('Markets') - if 'Technology' in sectionlist: - parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content) - ans.append('Technology') - if 'Personal Finance' in sectionlist: - parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content) - ans.append('Personal Finance') - if 'Life & Style' in sectionlist: - parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content) - ans.append('Life & Style') - if 'Real Estate' in sectionlist: - parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content) - ans.append('Real Estate') - if 'Careers' in sectionlist: - parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content) - ans.append('Careers') - if 'Small Business' in sectionlist: - parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content) - ans.append('Small Business') + for page_name,page_title in self.sectionlist: + parse_index_page(page_name,page_title) + ans.append(page_title) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans diff --git a/resources/recipes/yementimes.recipe b/resources/recipes/yementimes.recipe new file mode 100644 index 0000000000..426c9a748c --- /dev/null +++ b/resources/recipes/yementimes.recipe @@ -0,0 +1,125 @@ +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + +class YemenTimesRecipe(BasicNewsRecipe): + __license__ = 'GPL v3' + __author__ = 'kwetal' + language = 
'en_YE' + country = 'YE' + version = 1 + + title = u'Yemen Times' + publisher = u'yementimes.com' + category = u'News, Opinion, Yemen' + description = u'Award winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.' + + oldest_article = 7 + max_articles_per_feed = 100 + use_embedded_content = False + encoding = 'utf-8' + + remove_empty_feeds = True + no_stylesheets = True + remove_javascript = True + + keep_only_tags = [] + keep_only_tags.append(dict(name = 'div', attrs = {'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1', + 'class': 'DMAIN2'})) + remove_attributes = ['style'] + + INDEX = 'http://www.yementimes.com/' + feeds = [] + feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT')) + feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news')) + feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News')) + feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report')) + feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health')) + feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview')) + feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion')) + feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business')) + feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed')) + feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture')) + feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View')) + feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety')) + feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education')) + + extra_css = ''' + body {font-family:verdana, 
arial, helvetica, geneva, sans-serif;} + div.yemen_byline {font-size: medium; font-weight: bold;} + div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;} + .yemen_caption {font-size: x-small; font-style: italic; color: #696969;} + ''' + + conversion_options = {'comments': description, 'tags': category, 'language': 'en', + 'publisher': publisher, 'linearize_tables': True} + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + br.set_handle_gzip(True) + + return br + + def parse_index(self): + answer = [] + for feed_title, feed in self.feeds: + soup = self.index_to_soup(feed) + + newsbox = soup.find('div', 'newsbox') + main = newsbox.findNextSibling('table') + + articles = [] + for li in main.findAll('li'): + title = self.tag_to_string(li.a) + url = self.INDEX + li.a['href'] + articles.append({'title': title, 'date': None, 'url': url, 'description': '
 '}) + + answer.append((feed_title, articles)) + + return answer + + def preprocess_html(self, soup): + freshSoup = self.getFreshSoup(soup) + + headline = soup.find('div', attrs = {'id': 'DVMTIT'}) + if headline: + div = headline.findNext('div', attrs = {'id': 'DVTOP'}) + img = None + if div: + img = div.find('img') + + headline.name = 'h1' + freshSoup.body.append(headline) + if img is not None: + freshSoup.body.append(img) + + byline = soup.find('div', attrs = {'id': 'DVTIT'}) + if byline: + date_el = byline.find('span') + if date_el: + pub_date = self.tag_to_string(date_el) + date = Tag(soup, 'div', attrs = [('class', 'yemen_date')]) + date.append(pub_date) + date_el.extract() + + raw = '
'.join(['%s' % (part) for part in byline.findAll(text = True)]) + author = BeautifulSoup('') + + if date is not None: + freshSoup.body.append(date) + freshSoup.body.append(author) + + story = soup.find('div', attrs = {'id': 'DVDET'}) + if story: + for table in story.findAll('table'): + if table.find('img'): + table['class'] = 'yemen_caption' + + freshSoup.body.append(story) + + return freshSoup + + def getFreshSoup(self, oldSoup): + freshSoup = BeautifulSoup('') + if oldSoup.head.title: + freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title)) + return freshSoup diff --git a/resources/viewer/images.js b/resources/viewer/images.js new file mode 100644 index 0000000000..ea68009254 --- /dev/null +++ b/resources/viewer/images.js @@ -0,0 +1,23 @@ +/* + * images management + * Copyright 2008 Kovid Goyal + * License: GNU GPL v3 + */ + +function scale_images() { + $("img:visible").each(function() { + var offset = $(this).offset(); + //window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width')); + $(this).css("max-width", (window.innerWidth-offset.left-5)+"px"); + $(this).css("max-height", (window.innerHeight-5)+"px"); + }); +} + +function setup_image_scaling_handlers() { + scale_images(); + $(window).resize(function(){ + scale_images(); + }); +} + + diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py index a6bf55eec4..5ab9ac6d1c 100644 --- a/src/calibre/customize/__init__.py +++ b/src/calibre/customize/__init__.py @@ -2,10 +2,11 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import sys +import atexit, os, shutil, sys, tempfile, zipfile -from calibre.ptempfile import PersistentTemporaryFile from calibre.constants import numeric_version +from calibre.ptempfile import PersistentTemporaryFile + class Plugin(object): ''' @@ -225,12 +226,14 @@ class MetadataWriterPlugin(Plugin): ''' pass - + class CatalogPlugin(Plugin): ''' A plugin that implements a 
catalog generator. ''' + resources_path = None + #: Output file type for which this plugin should be run #: For example: 'epub' or 'xml' file_types = set([]) @@ -248,15 +251,19 @@ class CatalogPlugin(Plugin): #: '%default' + "'"))] cli_options = [] + def search_sort_db(self, db, opts): - if opts.search_text: + + # If declared, --ids overrides any declared search criteria + if not opts.ids and opts.search_text: db.search(opts.search_text) + if opts.sort_by: # 2nd arg = ascending db.sort(opts.sort_by, True) - - return db.get_data_as_dict() + + return db.get_data_as_dict(ids=opts.ids) def get_output_fields(self, opts): # Return a list of requested fields, with opts.sort_by first @@ -272,11 +279,40 @@ class CatalogPlugin(Plugin): fields = list(all_fields & requested_fields) else: fields = list(all_fields) + fields.sort() - fields.insert(0,fields.pop(int(fields.index(opts.sort_by)))) + if opts.sort_by: + fields.insert(0,fields.pop(int(fields.index(opts.sort_by)))) return fields - def run(self, path_to_output, opts, db): + def initialize(self): + ''' + If plugin is not a built-in, copy the plugin's .ui and .py files from + the zip file to $TMPDIR. 
+ Tab will be dynamically generated and added to the Catalog Options dialog in + calibre.gui2.dialogs.catalog.py:Catalog + ''' + from calibre.customize.builtins import plugins as builtin_plugins + from calibre.customize.ui import config + from calibre.ptempfile import PersistentTemporaryDirectory + + if not type(self) in builtin_plugins and \ + not self.name in config['disabled_plugins']: + files_to_copy = ["%s.%s" % (self.name.lower(),ext) for ext in ["ui","py"]] + resources = zipfile.ZipFile(self.plugin_path,'r') + + if self.resources_path is None: + self.resources_path = PersistentTemporaryDirectory('_plugin_resources', prefix='') + + for file in files_to_copy: + try: + resources.extract(file, self.resources_path) + except: + print " customize:__init__.initialize(): %s not found in %s" % (file, os.path.basename(self.plugin_path)) + continue + resources.close() + + def run(self, path_to_output, opts, db, ids): ''' Run the plugin. Must be implemented in subclasses. It should generate the catalog in the format specified diff --git a/src/calibre/devices/blackberry/driver.py b/src/calibre/devices/blackberry/driver.py index 1d96d4118f..ec8a7e8f49 100644 --- a/src/calibre/devices/blackberry/driver.py +++ b/src/calibre/devices/blackberry/driver.py @@ -18,7 +18,7 @@ class BLACKBERRY(USBMS): VENDOR_ID = [0x0fca] PRODUCT_ID = [0x8004, 0x0004] - BCD = [0x0200, 0x0107] + BCD = [0x0200, 0x0107, 0x0201] VENDOR_NAME = 'RIM' WINDOWS_MAIN_MEM = 'BLACKBERRY_SD' diff --git a/src/calibre/devices/nook/driver.py b/src/calibre/devices/nook/driver.py index c74a964648..16bf9479d8 100644 --- a/src/calibre/devices/nook/driver.py +++ b/src/calibre/devices/nook/driver.py @@ -86,4 +86,5 @@ class NOOK(USBMS): return drives - + def sanitize_path_components(self, components): + return [x.replace('#', '_') for x in components] diff --git a/src/calibre/devices/usbms/device.py b/src/calibre/devices/usbms/device.py index ab91de2abf..6ddfc81cf3 100644 --- a/src/calibre/devices/usbms/device.py +++ 
b/src/calibre/devices/usbms/device.py @@ -782,6 +782,13 @@ class Device(DeviceConfig, DevicePlugin): ''' return default + def sanitize_path_components(self, components): + ''' + Perform any device specific sanitization on the path components + for files to be uploaded to the device + ''' + return components + def create_upload_path(self, path, mdata, fname): path = os.path.abspath(path) extra_components = [] @@ -834,6 +841,7 @@ class Device(DeviceConfig, DevicePlugin): extra_components = list(map(remove_trailing_periods, extra_components)) components = shorten_components_to(250 - len(path), extra_components) + components = self.sanitize_path_components(components) filepath = os.path.join(path, *components) filedir = os.path.dirname(filepath) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 42feeb2330..c8428cf136 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -132,7 +132,8 @@ class FB2MLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output @@ -152,7 +153,7 @@ class FB2MLizer(object): text = [] for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) text.append(self.add_page_anchor(item)) text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) return ''.join(text) diff --git a/src/calibre/ebooks/lit/output.py b/src/calibre/ebooks/lit/output.py index 2a08ff51a8..423fb9ce7c 100644 --- 
a/src/calibre/ebooks/lit/output.py +++ b/src/calibre/ebooks/lit/output.py @@ -32,7 +32,7 @@ class LITOutput(OutputFormatPlugin): mangler(oeb, opts) rasterizer = SVGRasterizer() rasterizer(oeb, opts) - lit = LitWriter() + lit = LitWriter(self.opts) lit(oeb, output_path) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 6dd5068032..cf9ea6aa77 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -134,7 +134,7 @@ def warn(x): class ReBinary(object): NSRMAP = {'': None, XML_NS: 'xml'} - def __init__(self, root, item, oeb, map=HTML_MAP): + def __init__(self, root, item, oeb, opts, map=HTML_MAP): self.item = item self.logger = oeb.logger self.manifest = oeb.manifest @@ -143,7 +143,7 @@ class ReBinary(object): self.anchors = [] self.page_breaks = [] self.is_html = is_html = map is HTML_MAP - self.stylizer = Stylizer(root, item.href, oeb) if is_html else None + self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None self.tree_to_binary(root) self.content = self.buf.getvalue() self.ahc = self.build_ahc() if is_html else None @@ -295,9 +295,8 @@ def preserve(function): return wrapper class LitWriter(object): - def __init__(self): - # Wow, no options - pass + def __init__(self, opts): + self.opts = opts def _litize_oeb(self): oeb = self._oeb @@ -469,7 +468,7 @@ class LitWriter(object): secnum = 0 if isinstance(data, etree._Element): self._add_folder(name) - rebin = ReBinary(data, item, self._oeb, map=HTML_MAP) + rebin = ReBinary(data, item, self._oeb, self.opts, map=HTML_MAP) self._add_file(name + '/ahc', rebin.ahc, 0) self._add_file(name + '/aht', rebin.aht, 0) item.page_breaks = rebin.page_breaks @@ -562,7 +561,7 @@ class LitWriter(object): meta.attrib['ms--minimum_level'] = '0' meta.attrib['ms--attr5'] = '1' meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper() - rebin = ReBinary(meta, None, self._oeb, map=OPF_MAP) + rebin = ReBinary(meta, None, self._oeb, self.opts, 
map=OPF_MAP) meta = rebin.content self._meta = meta self._add_file('/meta', meta) diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index e4ea1a3931..5de8b76c43 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -128,6 +128,10 @@ def do_set_metadata(opts, mi, stream, stream_type): mi.title_sort = title_sort(opts.title) if getattr(opts, 'tags', None) is not None: mi.tags = [t.strip() for t in opts.tags.split(',')] + if getattr(opts, 'series', None) is not None: + mi.series = opts.series.strip() + if getattr(opts, 'series_index', None) is not None: + mi.series_index = float(opts.series_index.strip()) if getattr(opts, 'cover', None) is not None: ext = os.path.splitext(opts.cover)[1].replace('.', '').upper() diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 479b513ea5..60228f57dd 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -134,7 +134,10 @@ def metadata_from_filename(name, pat=None): mi.authors = aus if prefs['swap_author_names'] and mi.authors: def swap(a): - parts = a.split() + if ',' in a: + parts = a.split(',', 1) + else: + parts = a.split(None, 1) if len(parts) > 1: t = parts[-1] parts = parts[:-1] diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index aa69ba446b..f958b63a12 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -92,6 +92,7 @@ class MobiMLizer(object): def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb + self.opts = context self.profile = profile = context.dest self.fnums = fnums = dict((v, k) for k, v in profile.fnums.items()) self.fmap = KeyMapper(profile.fbase, profile.fbase, fnums.keys()) @@ -114,7 +115,7 @@ class MobiMLizer(object): def mobimlize_spine(self): 'Iterate over the spine and convert it to MOBIML' for item in self.oeb.spine: - stylizer = 
Stylizer(item.data, item.href, self.oeb, self.profile) + stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile) body = item.data.find(XHTML('body')) nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) nbody = etree.SubElement(nroot, XHTML('body')) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index b8557aea98..4f894ce088 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -563,6 +563,16 @@ class MobiReader(object): recindex = attrib.pop(attr, None) or recindex if recindex is not None: attrib['src'] = 'images/%s.jpg' % recindex + for attr in ('width', 'height'): + if attr in attrib: + val = attrib[attr] + if val.lower().endswith('em'): + try: + nval = float(val[:-2]) + nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile + attrib[attr] = "%dpx"%int(nval) + except: + del attrib[attr] elif tag.tag == 'pre': if not tag.text: tag.tag = 'div' diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py deleted file mode 100644 index 8add71d20d..0000000000 --- a/src/calibre/ebooks/oeb/factory.py +++ /dev/null @@ -1,99 +0,0 @@ -''' -Registry associating file extensions with Reader classes. -''' -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. 
Vandegrift ' - -import sys, os, logging -from itertools import chain -import calibre -from calibre.ebooks.oeb.base import OEBError -from calibre.ebooks.oeb.reader import OEBReader -from calibre.ebooks.oeb.writer import OEBWriter -from calibre.ebooks.lit.reader import LitReader -from calibre.ebooks.lit.writer import LitWriter -from calibre.ebooks.mobi.reader import MobiReader -from calibre.ebooks.mobi.writer import MobiWriter -from calibre.ebooks.oeb.base import OEBBook -from calibre.ebooks.oeb.profile import Context -from calibre.utils.config import Config - -__all__ = ['get_reader'] - -REGISTRY = { - '.opf': (OEBReader, None), - '.lit': (LitReader, LitWriter), - '.mobi': (MobiReader, MobiWriter), - } - -def ReaderFactory(path): - if os.path.isdir(path): - return OEBReader - ext = os.path.splitext(path)[1].lower() - Reader = REGISTRY.get(ext, (None, None))[0] - if Reader is None: - raise OEBError('Unknown e-book file extension %r' % ext) - return Reader - -def WriterFactory(path): - if os.path.isdir(path): - return OEBWriter - ext = os.path.splitext(path)[1].lower() - if not os.path.exists(path) and not ext: - return OEBWriter - Writer = REGISTRY.get(ext, (None, None))[1] - if Writer is None: - raise OEBError('Unknown e-book file extension %r' % ext) - return Writer - - -def option_parser(Reader, Writer): - cfg = Config('ebook-convert', _('Options to control e-book conversion.')) - Reader.config(cfg) - for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): - Transform.config(cfg) - Writer.config(cfg) - parser = cfg.option_parser() - parser.add_option('--encoding', default=None, - help=_('Character encoding for input. Default is to auto detect.')) - parser.add_option('-o', '--output', default=None, - help=_('Output file. 
Default is derived from input filename.')) - parser.add_option('-p', '--pretty-print', action='store_true', - default=False, help=_('Produce more human-readable XML output.')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def main(argv=sys.argv): - if len(argv) < 3: - print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") - return 1 - inpath, outpath = argv[1], argv[2] - Reader = ReaderFactory(inpath) - Writer = WriterFactory(outpath) - parser = option_parser(Reader, Writer) - opts, args = parser.parse_args(argv[3:]) - if len(args) != 0: - parser.print_help() - return 1 - logger = logging.getLogger('ebook-convert') - calibre.setup_cli_handlers(logger, logging.DEBUG) - encoding = opts.encoding - pretty_print = opts.pretty_print - oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) - context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) - reader = Reader.generate(opts) - writer = Writer.generate(opts) - transforms = [] - for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): - transforms.append(Transform.generate(opts)) - reader(oeb, inpath) - for transform in transforms: - transform(oeb, context) - writer(oeb, outpath) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py index 9f50796615..26fb4ca980 100644 --- a/src/calibre/ebooks/oeb/stylizer.py +++ b/src/calibre/ebooks/oeb/stylizer.py @@ -110,9 +110,9 @@ class CSSSelector(etree.XPath): class Stylizer(object): STYLESHEETS = WeakKeyDictionary() - def __init__(self, tree, path, oeb, profile=PROFILES['PRS505'], + def __init__(self, tree, path, oeb, opts, profile=PROFILES['PRS505'], extra_css='', user_css=''): - self.oeb = oeb + self.oeb, self.opts = oeb, opts self.profile = profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] @@ -249,6 +249,8 @@ class Stylizer(object): 
style.update(self._normalize_font(prop.cssValue)) elif name == 'list-style': style.update(self._normalize_list_style(prop.cssValue)) + elif name == 'text-align': + style.update(self._normalize_text_align(prop.cssValue)) else: style[name] = prop.value if 'font-size' in style: @@ -306,6 +308,19 @@ class Stylizer(object): return style + def _normalize_text_align(self, cssvalue): + style = {} + text = cssvalue.cssText + if text == 'inherit': + style['text-align'] = 'inherit' + else: + if text in ('left', 'justify'): + val = 'left' if self.opts.dont_justify else 'justify' + style['text-align'] = val + else: + style['text-align'] = text + return style + def _normalize_font(self, cssvalue): composition = ('font-style', 'font-variant', 'font-weight', 'font-size', 'line-height', 'font-family') @@ -411,6 +426,7 @@ class Style(object): return result def _unit_convert(self, value, base=None, font=None): + ' Return value in pts' if isinstance(value, (int, long, float)): return value try: @@ -447,6 +463,9 @@ class Style(object): result = value * 0.40 return result + def pt_to_px(self, value): + return (self._profile.dpi / 72.0) * value + @property def fontSize(self): def normalize_fontsize(value, base): diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 61226ca4f4..1eb6afc1b5 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -141,7 +141,7 @@ class CSSFlattener(object): bs.append('text-align: '+ \ ('left' if self.context.dont_justify else 'justify')) body.set('style', '; '.join(bs)) - stylizer = Stylizer(html, item.href, self.oeb, profile, + stylizer = Stylizer(html, item.href, self.oeb, self.context, profile, user_css=self.context.extra_css, extra_css=css) self.stylizers[item] = stylizer diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 4b852db6c4..04bf63ac1d 100644 --- 
a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,13 +29,14 @@ class CaseMangler(object): @classmethod def generate(cls, opts): return cls() - + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb + self.opts = context self.profile = context.source self.mangle_spine() - + def mangle_spine(self): id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) @@ -44,9 +45,9 @@ class CaseMangler(object): relhref = item.relhref(href) etree.SubElement(html.find(XHTML('head')), XHTML('link'), rel='stylesheet', href=relhref, type=CSS_MIME) - stylizer = Stylizer(html, item.href, self.oeb, self.profile) + stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) self.mangle_elem(html.find(XHTML('body')), stylizer) - + def text_transform(self, transform, text): if transform == 'capitalize': return text.title() @@ -55,7 +56,7 @@ class CaseMangler(object): elif transform == 'lowercase': return text.lower() return text - + def split_text(self, text): results = [''] isupper = text[0].isupper() @@ -66,7 +67,7 @@ class CaseMangler(object): isupper = not isupper results.append(char) return results - + def smallcaps_elem(self, elem, attr): texts = self.split_text(getattr(elem, attr)) setattr(elem, attr, None) @@ -90,7 +91,7 @@ class CaseMangler(object): last.tail = tail child.tail = None last = child - + def mangle_elem(self, elem, stylizer): if not isinstance(elem.tag, basestring) or \ namespace(elem.tag) != XHTML_NS: diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 30357b10d2..ac28e51b15 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -44,6 +44,7 @@ class SVGRasterizer(object): def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') 
self.oeb = oeb + self.opts = context self.profile = context.dest self.images = {} self.dataize_manifest() @@ -102,7 +103,7 @@ class SVGRasterizer(object): def rasterize_spine(self): for item in self.oeb.spine: html = item.data - stylizer = Stylizer(html, item.href, self.oeb, self.profile) + stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) self.rasterize_item(item, stylizer) def rasterize_item(self, item, stylizer): diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 1b2149cf3a..721df28537 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -20,6 +20,10 @@ class Font(object): class Column(object): + # A column contains an element is the element bulges out to + # the left or the right by at most HFUZZ*col width. + HFUZZ = 0.2 + def __init__(self): self.left = self.right = self.top = self.bottom = 0 self.width = self.height = 0 @@ -41,6 +45,10 @@ class Column(object): for x in self.elements: yield x + def contains(self, elem): + return elem.left > self.left - self.HFUZZ*self.width and \ + elem.right < self.right + self.HFUZZ*self.width + class Element(object): def __eq__(self, other): @@ -132,6 +140,18 @@ class Interval(object): def __hash__(self): return hash('(%f,%f)'%self.left, self.right) +class Region(object): + + def __init__(self): + self.columns = [] + self.top = self.bottom = self.left = self.right = self.width = self.height = 0 + + def add_columns(self, columns): + if not self.columns: + for x in sorted(columns, cmp=lambda x,y: cmp(x.left, y.left)): + self.columns.append(x) + else: + pass class Page(object): @@ -238,11 +258,10 @@ class Page(object): return columns def find_elements_in_row_of(self, x): - interval = Interval(x.top - self.YFUZZ * self.average_text_height, + interval = Interval(x.top, x.top + self.YFUZZ*(1+self.average_text_height)) h_interval = Interval(x.left, x.right) - m = max(0, x.idx-15) - for y in self.elements[m:x.idx+15]: + for y in 
self.elements[x.idx:x.idx+15]: if y is not x: y_interval = Interval(y.top, y.bottom) x_interval = Interval(y.left, y.right) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 4f3d5f23df..e3609fcddb 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -113,7 +113,8 @@ class PMLMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index 5574aa94b6..50153d7d4d 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -90,7 +90,8 @@ class RBMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += ''.join(self.dump_text(item.data.find(XHTML('body')), stylizer, item)) return output @@ -111,7 +112,7 @@ class RBMLizer(object): output = [u''] for item in self.oeb_book.spine: self.log.debug('Converting %s to RocketBook HTML...' 
% item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) output.append(self.add_page_anchor(item)) output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) return ''.join(output) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 6aa48ad61b..1217482823 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -111,12 +111,13 @@ class RTFMLizer(object): href = self.oeb_book.guide['titlepage'].href item = self.oeb_book.manifest.hrefs[href] if item.spine_position is None: - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, + self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += '{\\page } ' for item in self.oeb_book.spine: self.log.debug('Converting %s to RTF markup...' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.footer() output = self.insert_images(output) diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 7642e051fe..bb730c0720 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -54,7 +54,7 @@ class TXTMLizer(object): output.append(self.get_toc()) for item in self.oeb_book.spine: self.log.debug('Converting %s to TXT...' 
% item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = self.remove_newlines(content) output += self.dump_text(etree.fromstring(content), stylizer) diff --git a/src/calibre/gui2/convert/gui_conversion.py b/src/calibre/gui2/convert/gui_conversion.py index 32cd883727..b951244e71 100644 --- a/src/calibre/gui2/convert/gui_conversion.py +++ b/src/calibre/gui2/convert/gui_conversion.py @@ -4,9 +4,14 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -from calibre.ebooks.conversion.plumber import Plumber -from calibre.utils.logging import Log +import os +from optparse import OptionParser + from calibre.customize.conversion import OptionRecommendation, DummyReporter +from calibre.ebooks.conversion.plumber import Plumber +from calibre.customize.ui import plugin_for_catalog_format +from calibre.utils.logging import Log +from calibre.gui2 import choose_dir, Application def gui_convert(input, output, recommendations, notification=DummyReporter(), abort_after_input_dump=False, log=None): @@ -20,7 +25,7 @@ def gui_convert(input, output, recommendations, notification=DummyReporter(), plumber.run() -def gui_catalog(fmt, title, dbspec, ids, out_file_name, +def gui_catalog(fmt, title, dbspec, ids, out_file_name, fmt_options, notification=DummyReporter(), log=None): if log is None: log = Log() @@ -31,8 +36,28 @@ def gui_catalog(fmt, title, dbspec, ids, out_file_name, db = LibraryDatabase2(dbpath) else: # To be implemented in the future pass - # Implement the interface to the catalog generating code here - db + + # Create a minimal OptionParser that we can append to + parser = OptionParser() + args = [] + parser.add_option("--verbose", action="store_true", dest="verbose", default=True) + opts, args = 
parser.parse_args() + + # Populate opts + opts.ids = ids + opts.search_text = None + opts.sort_by = None + + # Extract the option dictionary to comma-separated lists + for option in fmt_options: + setattr(opts,option, ','.join(fmt_options[option])) + + # Fetch and run the plugin for fmt + plugin = plugin_for_catalog_format(fmt) + plugin.run(out_file_name, opts, db) + + + diff --git a/src/calibre/gui2/dialogs/catalog.py b/src/calibre/gui2/dialogs/catalog.py index 29b6ef972d..8407e2c426 100644 --- a/src/calibre/gui2/dialogs/catalog.py +++ b/src/calibre/gui2/dialogs/catalog.py @@ -6,39 +6,131 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' __docformat__ = 'restructuredtext en' -from PyQt4.Qt import QDialog +import os, shutil, sys, tempfile +from PyQt4.Qt import QDialog, QWidget + +from calibre.customize.ui import config from calibre.gui2.dialogs.catalog_ui import Ui_Dialog -from calibre.gui2 import dynamic -from calibre.customize.ui import available_catalog_formats +from calibre.gui2 import gprefs, dynamic +from calibre.customize.ui import available_catalog_formats, catalog_plugins +from calibre.gui2.catalog.catalog_csv_xml import PluginWidget class Catalog(QDialog, Ui_Dialog): + ''' Catalog Dialog builder''' + widgets = [] def __init__(self, parent, dbspec, ids): + import re, cStringIO + from calibre import prints as info + from calibre.gui2 import dynamic + from PyQt4.uic import compileUi + QDialog.__init__(self, parent) + + # Run the dialog setup generated from catalog.ui self.setupUi(self) self.dbspec, self.ids = dbspec, ids + # Display the number of books we've been passed self.count.setText(unicode(self.count.text()).format(len(ids))) + + # Display the last-used title self.title.setText(dynamic.get('catalog_last_used_title', _('My Books'))) - fmts = sorted([x.upper() for x in available_catalog_formats()]) + # GwR *** Add option tabs for built-in formats + # This code models #69 in calibre/gui2/dialogs/config/__init__.py + + self.fmts = [] + + from 
calibre.customize.builtins import plugins as builtin_plugins + from calibre.customize import CatalogPlugin + + for plugin in catalog_plugins(): + if plugin.name in config['disabled_plugins']: + continue + + name = plugin.name.lower().replace(' ', '_') + if type(plugin) in builtin_plugins: + #info("Adding widget for builtin Catalog plugin %s" % plugin.name) + try: + catalog_widget = __import__('calibre.gui2.catalog.'+name, + fromlist=[1]) + pw = catalog_widget.PluginWidget() + pw.initialize(name) + pw.ICON = I('forward.svg') + self.widgets.append(pw) + [self.fmts.append([file_type.upper(), pw.sync_enabled,pw]) for file_type in plugin.file_types] + except ImportError: + info("ImportError with %s" % name) + continue + else: + # Load dynamic tab + form = os.path.join(plugin.resources_path,'%s.ui' % name) + klass = os.path.join(plugin.resources_path,'%s.py' % name) + compiled_form = os.path.join(plugin.resources_path,'%s_ui.py' % name) + + if os.path.exists(form) and os.path.exists(klass): + #info("Adding widget for user-installed Catalog plugin %s" % plugin.name) + + # Compile the .ui form provided in plugin.zip + if not os.path.exists(compiled_form): + # info('\tCompiling form', form) + buf = cStringIO.StringIO() + compileUi(form, buf) + dat = buf.getvalue() + dat = re.compile(r'QtGui.QApplication.translate\(.+?,\s+"(.+?)(? 
-1: self.format.setCurrentIndex(idx) if self.sync.isEnabled(): self.sync.setChecked(dynamic.get('catalog_sync_to_device', True)) - + def format_changed(self, idx): cf = unicode(self.format.currentText()) - if cf in ('EPUB', 'MOBI'): + if cf in self.sync_enabled_formats: self.sync.setEnabled(True) else: self.sync.setDisabled(True) diff --git a/src/calibre/gui2/dialogs/catalog.ui b/src/calibre/gui2/dialogs/catalog.ui index aa47f3c0c3..c18e08ef65 100644 --- a/src/calibre/gui2/dialogs/catalog.ui +++ b/src/calibre/gui2/dialogs/catalog.ui @@ -6,105 +6,121 @@ 0 0 - 628 - 503 + 611 + 514 Generate catalog - + :/images/library.png:/images/library.png - - - - - Qt::Horizontal - - - QDialogButtonBox::Cancel|QDialogButtonBox::Ok - - - - - - - 0 - - - - Catalog options - - - - - - Catalog &format: - - - format - - - - - - - - - - Catalog &title (existing catalog with the same title will be replaced): - - - true - - - title - - - - - - - Qt::Vertical - - - - 20 - 299 - - - - - - - - &Send catalog to device automatically - - - - - - - - - - - - - - - 75 - true - - - - Generate catalog for {0} books - - - - + + + + 430 + 470 + 164 + 32 + + + + Qt::Horizontal + + + QDialogButtonBox::Cancel|QDialogButtonBox::Ok + + + + + + 12 + 39 + 579 + 411 + + + + 0 + + + + Catalog options + + + + + + Catalog &format: + + + format + + + + + + + + + + Catalog &title (existing catalog with the same title will be replaced): + + + true + + + title + + + + + + + + + + &Send catalog to device automatically + + + + + + + Qt::Vertical + + + + 20 + 299 + + + + + + + + + + + 12 + 12 + 205 + 17 + + + + + 75 + true + + + + Generate catalog for {0} books + + diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py index a9130b2ea2..89b7c92125 100644 --- a/src/calibre/gui2/dialogs/metadata_single.py +++ b/src/calibre/gui2/dialogs/metadata_single.py @@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog): if 
self.cover_fetcher.exception is not None: err = self.cover_fetcher.exception error_dialog(self, _('Cannot fetch cover'), - _('Could not fetch cover.
')+repr(err)).exec_() + _('Could not fetch cover.
')+unicode(err)).exec_() return pix = QPixmap() diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index 0f02f2a591..8ad0dff4d2 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -215,7 +215,7 @@ class TagsModel(QAbstractItemModel): return QModelIndex() child_item = index.internalPointer() - parent_item = child_item.parent + parent_item = getattr(child_item, 'parent', None) if parent_item is self.root_item or parent_item is None: return QModelIndex() diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py index 2bb891d36b..b23e0b6259 100644 --- a/src/calibre/gui2/tools.py +++ b/src/calibre/gui2/tools.py @@ -238,19 +238,36 @@ def fetch_scheduled_recipe(arg): def generate_catalog(parent, dbspec, ids): from calibre.gui2.dialogs.catalog import Catalog + + # Build the Catalog dialog in gui2.dialogs.catalog d = Catalog(parent, dbspec, ids) + if d.exec_() != d.Accepted: return None + + # Create the output file out = PersistentTemporaryFile(suffix='_catalog_out.'+d.catalog_format.lower()) + + # Retrieve plugin options + fmt_options = {} + for x in range(d.tabs.count()): + if str(d.tabs.tabText(x)).find(str(d.catalog_format)) > -1: + for fmt in d.fmts: + if fmt[0] == d.catalog_format: + fmt_options = fmt[2].options() + # print "gui2.tools:generate_catalog(): options for %s: %s" % (fmt[0], fmt_options) + args = [ d.catalog_format, d.catalog_title, dbspec, ids, out.name, + fmt_options ] out.close() + # This calls gui2.convert.gui_conversion:gui_catalog() return 'gui_catalog', args, _('Generate catalog'), out.name, d.catalog_sync, \ d.catalog_title diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py index 6cbae7f7b0..ccff7ccdc8 100644 --- a/src/calibre/gui2/ui.py +++ b/src/calibre/gui2/ui.py @@ -9,7 +9,7 @@ __docformat__ = 'restructuredtext en' '''The main GUI''' -import os, sys, textwrap, collections, time +import atexit, os, shutil, sys, tempfile, textwrap, collections, time from xml.parsers.expat import 
ExpatError from Queue import Queue, Empty from threading import Thread @@ -357,7 +357,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): cm.addAction(_('Bulk convert')) cm.addSeparator() ac = cm.addAction( - _('Create catalog of the books in your calibre library')) + _('Create catalog of books in your calibre library')) ac.triggered.connect(self.generate_catalog) self.action_convert.setMenu(cm) self._convert_single_hook = partial(self.convert_ebook, bulk=False) @@ -1359,26 +1359,32 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): ############################### Generate catalog ########################### - def generate_catalog(self): + def generate_catalog(self): rows = self.library_view.selectionModel().selectedRows() - if not rows: + if not rows or len(rows) < 2: rows = xrange(self.library_view.model().rowCount(QModelIndex())) ids = map(self.library_view.model().id, rows) + dbspec = None if not ids: return error_dialog(self, _('No books selected'), _('No books selected to generate catalog for'), show=True) + + # Calling gui2.tools:generate_catalog() ret = generate_catalog(self, dbspec, ids) if ret is None: return + func, args, desc, out, sync, title = ret + fmt = os.path.splitext(out)[1][1:].upper() job = self.job_manager.run_job( Dispatcher(self.catalog_generated), func, args=args, description=desc) job.catalog_file_path = out - job.catalog_sync, job.catalog_title = sync, title + job.fmt = fmt + job.catalog_sync, job.catalog_title = sync, title self.status_bar.showMessage(_('Generating %s catalog...')%fmt) def catalog_generated(self, job): @@ -1392,8 +1398,13 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI): dynamic.set('catalogs_to_be_synced', sync) self.status_bar.showMessage(_('Catalog generated.'), 3000) self.sync_catalogs() - - + if job.fmt in ['CSV','XML']: + export_dir = choose_dir(self, 'Export Catalog Directory', + 'Select destination for %s.%s' % (job.catalog_title, job.fmt.lower())) + if export_dir: + destination = os.path.join(export_dir, 
'%s.%s' % (job.catalog_title, job.fmt.lower())) + shutil.copyfile(job.catalog_file_path, destination) + ############################### Fetch news ################################# def download_scheduled_recipe(self, arg): diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui index fe1dc85c93..d6e71c77d2 100644 --- a/src/calibre/gui2/viewer/config.ui +++ b/src/calibre/gui2/viewer/config.ui @@ -7,14 +7,14 @@ 0 0 479 - 574 + 606 Configure Ebook viewer - + :/images/config.svg:/images/config.svg @@ -164,7 +164,7 @@ - + Remember last used &window size @@ -218,6 +218,13 @@ + + + + &Resize images larger than the viewer window (needs restart) + + + diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py index b35e28121a..aedd709bb8 100644 --- a/src/calibre/gui2/viewer/documentview.py +++ b/src/calibre/gui2/viewer/documentview.py @@ -10,7 +10,7 @@ from base64 import b64encode from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \ QPainter, QPalette, QBrush, QFontDatabase, QDialog, \ QColor, QPoint, QImage, QRegion, QVariant, QIcon, \ - QFont, QObject, QApplication, pyqtSignature, QAction + QFont, pyqtSignature, QAction from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings from calibre.utils.config import Config, StringConfig @@ -21,7 +21,7 @@ from calibre.constants import iswindows from calibre import prints, guess_type from calibre.gui2.viewer.keys import SHORTCUTS -bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = None +bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images =None def load_builtin_fonts(): base = P('fonts/liberation/*.ttf') @@ -42,6 +42,8 @@ def config(defaults=None): help=_('Set the user CSS stylesheet. 
This can be used to customize the look of all books.')) c.add_opt('max_view_width', default=6000, help=_('Maximum width of the viewer window, in pixels.')) + c.add_opt('fit_images', default=True, + help=_('Resize images larger than the viewer window to fit inside it')) c.add_opt('hyphenate', default=False, help=_('Hyphenate text')) c.add_opt('hyphenate_default_lang', default='en', help=_('Default language for hyphenation rules')) @@ -59,20 +61,6 @@ def config(defaults=None): return c -class PythonJS(QObject): - - def __init__(self, callback): - QObject.__init__(self, QApplication.instance()) - self.setObjectName("py_bridge") - self._callback = callback - - @pyqtSignature("QString") - def callback(self, msg): - print "callback called" - self._callback(msg) - - - class ConfigDialog(QDialog, Ui_Dialog): def __init__(self, shortcuts, parent=None): @@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog): self.shortcut_config = ShortcutConfig(shortcuts, parent=self) p = self.tabs.widget(1) p.layout().addWidget(self.shortcut_config) + self.opt_fit_images.setChecked(opts.fit_images) def accept(self, *args): @@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog): c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()]) c.set('user_css', unicode(self.css.toPlainText())) c.set('remember_window_size', self.opt_remember_window_size.isChecked()) + c.set('fit_images', self.opt_fit_images.isChecked()) c.set('max_view_width', int(self.max_view_width.value())) c.set('hyphenate', self.hyphenate.isChecked()) idx = self.hyphenate_default_lang.currentIndex() @@ -157,7 +147,6 @@ class Document(QWebPage): self.setObjectName("py_bridge") self.debug_javascript = False self.current_language = None - #self.js_bridge = PythonJS(self.js_callback) self.setLinkDelegationPolicy(self.DelegateAllLinks) self.scroll_marks = [] @@ -197,9 +186,14 @@ class Document(QWebPage): opts = config().parse() self.hyphenate = opts.hyphenate self.hyphenate_default_lang = 
opts.hyphenate_default_lang + self.do_fit_images = opts.fit_images + + def fit_images(self): + if self.do_fit_images: + self.javascript('setup_image_scaling_handlers()') def load_javascript_libraries(self): - global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator + global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images self.mainFrame().addToJavaScriptWindowObject("py_bridge", self) if jquery is None: jquery = P('content_server/jquery.js', data=True) @@ -215,6 +209,9 @@ class Document(QWebPage): if referencing is None: referencing = P('viewer/referencing.js', data=True) self.javascript(referencing) + if images is None: + images = P('viewer/images.js', data=True) + self.javascript(images) if hyphenation is None: hyphenation = P('viewer/hyphenation.js', data=True) self.javascript(hyphenation) @@ -353,7 +350,13 @@ class Document(QWebPage): return self.mainFrame().contentsSize().width() # offsetWidth gives inaccurate results def set_bottom_padding(self, amount): - self.javascript('$("body").css("padding-bottom", "%dpx")' % amount) + padding = '%dpx'%amount + try: + old_padding = unicode(self.javascript('$("body").css("padding-bottom")').toString()) + except: + old_padding = '' + if old_padding != padding: + self.javascript('$("body").css("padding-bottom", "%s")' % padding) class EntityDeclarationProcessor(object): @@ -541,6 +544,7 @@ class DocumentView(QWebView): return self.loading_url = None self.document.set_bottom_padding(0) + self.document.fit_images() self._size_hint = self.document.mainFrame().contentsSize() scrolled = False if self.to_bottom: diff --git a/src/calibre/library/catalog.py b/src/calibre/library/catalog.py index 42f0139cd8..32f2503b2c 100644 --- a/src/calibre/library/catalog.py +++ b/src/calibre/library/catalog.py @@ -40,8 +40,9 @@ class CSV_XML(CatalogPlugin): from calibre.utils.logging import Log log = Log() - self.fmt = path_to_output[path_to_output.rfind('.') + 1:] - if opts.verbose: + 
self.fmt = path_to_output.rpartition('.')[2] + + if False and opts.verbose: log("%s:run" % self.name) log(" path_to_output: %s" % path_to_output) log(" Output format: %s" % self.fmt) @@ -53,7 +54,7 @@ class CSV_XML(CatalogPlugin): log(" opts:") for key in keys: log(" %s: %s" % (key, opts_dict[key])) - + # Get the sorted, filtered database as a dictionary data = self.search_sort_db(db, opts) diff --git a/src/calibre/library/cli.py b/src/calibre/library/cli.py index 6e2d672202..ddfb96704c 100644 --- a/src/calibre/library/cli.py +++ b/src/calibre/library/cli.py @@ -644,6 +644,10 @@ def catalog_option_parser(args): output, fmt = validate_command_line(parser, args, log) # Add options common to all catalog plugins + parser.add_option('-i', '--ids', default=None, dest='ids', + help=_("Comma-separated list of database IDs to catalog.\n" + "If declared, --search is ignored.\n" + "Default: all")) parser.add_option('-s', '--search', default=None, dest='search_text', help=_("Filter the results by the search query. 
" "For the format of the search query, please see " @@ -656,31 +660,6 @@ def catalog_option_parser(args): # Add options specific to fmt plugin plugin = add_plugin_parser_options(fmt, parser, log) - # Merge options from GUI Preferences - ''' - # Placeholder sample code until we implement GUI preferences - from calibre.library.save_to_disk import config - c = config() - for pref in ['asciiize', 'update_metadata', 'write_opf', 'save_cover']: - opt = c.get_option(pref) - switch = '--dont-'+pref.replace('_', '-') - parser.add_option(switch, default=True, action='store_false', - help=opt.help+' '+_('Specifying this switch will turn ' - 'this behavior off.'), dest=pref) - - for pref in ['timefmt', 'template', 'formats']: - opt = c.get_option(pref) - switch = '--'+pref - parser.add_option(switch, default=opt.default, - help=opt.help, dest=pref) - - for pref in ('replace_whitespace', 'to_lowercase'): - opt = c.get_option(pref) - switch = '--'+pref.replace('_', '-') - parser.add_option(switch, default=False, action='store_true', - help=opt.help) - ''' - return parser, plugin, log def command_catalog(args, dbpath): @@ -693,6 +672,9 @@ def command_catalog(args, dbpath): return 1 if opts.verbose: log("library.cli:command_catalog dispatching to plugin %s" % plugin.name) + if opts.ids: + opts.ids = [int(id) for id in opts.ids.split(',')] + with plugin: plugin.run(args[1], opts, get_db(dbpath, opts)) return 0 diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 84638410c7..7b0f7a083e 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase): for i in iter(self): yield i[x] - def get_data_as_dict(self, prefix=None, authors_as_string=False): + def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None): ''' Return all metadata stored in the database as a dict. Includes paths to the cover and each format. :param prefix: The prefix for all paths. 
By default, the prefix is the absolute path to the library folder. + :param ids: Set of ids to return the data for. If None return data for + all entries in database. ''' if prefix is None: prefix = self.library_path @@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase): data = [] for record in self.data: if record is None: continue + db_id = record[FIELD_MAP['id']] + if ids is not None and db_id not in ids: + continue x = {} for field in FIELDS: x[field] = record[FIELD_MAP[field]] data.append(x) - x['id'] = record[FIELD_MAP['id']] + x['id'] = db_id x['formats'] = [] if not x['authors']: x['authors'] = _('Unknown') diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py index a0e5632cb7..22e31c3005 100644 --- a/src/calibre/utils/config.py +++ b/src/calibre/utils/config.py @@ -524,6 +524,7 @@ class DynamicConfig(dict): pass except: import traceback + print 'Failed to unpickle stored object:' traceback.print_exc() d = {} self.clear() diff --git a/src/calibre/utils/localization.py b/src/calibre/utils/localization.py index 1ade012b1f..90f86a8368 100644 --- a/src/calibre/utils/localization.py +++ b/src/calibre/utils/localization.py @@ -104,6 +104,7 @@ _extra_lang_codes = { 'en_CY' : _('English (Cyprus)'), 'en_PK' : _('English (Pakistan)'), 'en_SG' : _('English (Singapore)'), + 'en_YE' : _('English (Yemen)'), 'de_AT' : _('German (AT)'), 'nl' : _('Dutch (NL)'), 'nl_BE' : _('Dutch (BE)'), diff --git a/src/calibre/utils/resources.py b/src/calibre/utils/resources.py index adfbebd9f0..a69db34f2e 100644 --- a/src/calibre/utils/resources.py +++ b/src/calibre/utils/resources.py @@ -9,9 +9,22 @@ __docformat__ = 'restructuredtext en' import __builtin__, sys, os +_dev_path = os.environ.get('CALIBRE_DEVELOP_FROM', None) +if _dev_path is not None: + _dev_path = os.path.join(os.path.abspath(os.path.dirname(_dev_path)), 'resources') + if not os.path.exists(_dev_path): + _dev_path = None + def get_path(path, data=False): + global _dev_path path = 
path.replace(os.sep, '/') - path = os.path.join(sys.resources_location, *path.split('/')) + base = None + if _dev_path is not None: + if os.path.exists(os.path.join(_dev_path, *path.split('/'))): + base = _dev_path + if base is None: + base = sys.resources_location + path = os.path.join(base, *path.split('/')) if data: return open(path, 'rb').read() return path diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index dfcadf03ed..60b5ad0174 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -357,9 +357,17 @@ class BasicNewsRecipe(Recipe): Override in a subclass to customize extraction of the :term:`URL` that points to the content for each article. Return the article URL. It is called with `article`, an object representing a parsed article - from a feed. See `feedsparser `_. - By default it returns `article.link `_. + from a feed. See `feedparser `_. + By default it looks for the original link (for feeds syndicated via a + service like feedburner or pheedo) and if found, + returns that or else returns + `article.link `_. ''' + for key in article.keys(): + if key.endswith('_origlink'): + url = article[key] + if url and url.startswith('http://'): + return url return article.get('link', None) def preprocess_html(self, soup):