diff --git a/resources/images/news/ad.png b/resources/images/news/ad.png
new file mode 100644
index 0000000000..8b017910df
Binary files /dev/null and b/resources/images/news/ad.png differ
diff --git a/resources/images/news/digitaljournal.png b/resources/images/news/digitaljournal.png
new file mode 100644
index 0000000000..ea4637b8ad
Binary files /dev/null and b/resources/images/news/digitaljournal.png differ
diff --git a/resources/recipes/ad.recipe b/resources/recipes/ad.recipe
new file mode 100644
index 0000000000..bc3fe40dad
--- /dev/null
+++ b/resources/recipes/ad.recipe
@@ -0,0 +1,86 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ADRecipe(BasicNewsRecipe):
+    __license__ = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'nl'
+    country = 'NL'
+    version = 1
+
+    title = u'AD'
+    publisher = u'de Persgroep Publishing Nederland NV'
+    category = u'News, Sports, the Netherlands'
+    description = u'News and Sports from the Netherlands'
+
+    oldest_article = 1.2
+    max_articles_per_feed = 100
+    use_embedded_content = False
+
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'div', attrs = {'id': 'art_box2'}))
+    keep_only_tags.append(dict(name = 'p', attrs = {'class': 'gen_footnote3'}))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'div', attrs = {'class': 'gen_clear'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class': re.compile(r'gen_spacer.*')}))
+
+    remove_attributes = ['style']
+
+    # feeds from http://ad.nl/ad/nl/1401/home/integration/nmc/frameset/ad_footer/rssFeeds.dhtml
+    feeds = []
+    feeds.append((u'Binnenland', u'http://www.ad.nl/nieuws/binnenland/rss.xml'))
+    feeds.append((u'Buitenland', u'http://www.ad.nl/nieuws/buitenland/rss.xml'))
+    feeds.append((u'Bizar', u'http://www.ad.nl/nieuws/bizar/rss.xml'))
+    feeds.append((u'Gezondheid & Wetenschap', u'http://www.ad.nl/nieuws/gezondheidwetenschap/rss.xml'))
+    feeds.append((u'Economie', u'http://www.ad.nl/nieuws/economie/rss.xml'))
+    feeds.append((u'Nederlands Voetbal', u'http://www.ad.nl/sportwereld/nederlandsvoetbal/rss.xml'))
+    feeds.append((u'Buitenlands Voetbal', u'http://www.ad.nl/sportwereld/buitenlandsvoetbal/rss.xml'))
+    feeds.append((u'Champions League/Europa League', u'http://www.ad.nl/sportwereld/championsleagueeuropaleague/rss.xml'))
+    feeds.append((u'Wielrennen', u'http://www.ad.nl/sportwereld/wielrennen/rss.xml'))
+    feeds.append((u'Tennis', u'http://www.ad.nl/sportwereld/tennis/rss.xml'))
+    feeds.append((u'Formule 1', u'http://www.ad.nl/sportwereld/formule1/rss.xml'))
+    feeds.append((u'Meer Sport', u'http://www.ad.nl/sportwereld/meersport/rss.xml'))
+    feeds.append((u'Celebs', u'http://www.ad.nl/showbizz/celebs/rss.xml'))
+    feeds.append((u'Film', u'http://www.ad.nl/showbizz/film/rss.xml'))
+    feeds.append((u'Muziek', u'http://www.ad.nl/showbizz/muziek/rss.xml'))
+    feeds.append((u'TV', u'http://www.ad.nl/showbizz/tv/rss.xml'))
+    feeds.append((u'Kunst & Literatuur', u'http://www.ad.nl/showbizz/kunstenliteratuur/rss.xml'))
+    feeds.append((u'Jouw Wereld', u'http://www.ad.nl/you/rss.xml'))
+    feeds.append((u'Consument', u'http://www.ad.nl/consument/rss.xml'))
+    feeds.append((u'Autowereld', u'http://www.ad.nl/autowereld/rss.xml'))
+    feeds.append((u'Reiswereld', u'http://www.ad.nl/reiswereld/rss.xml'))
+    feeds.append((u'Internet', u'http://www.ad.nl/digitaal/internet/rss.xml'))
+    feeds.append((u'Games', u'http://www.ad.nl/digitaal/games/rss.xml'))
+    feeds.append((u'Multimedia', u'http://www.ad.nl/digitaal/multimedia/rss.xml'))
+    feeds.append((u'Planet Watch', u'http://www.ad.nl/planetwatch/rss.xml'))
+
+    extra_css = '''
+        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
+        div.captionEmbeddedMasterObject {font-size: x-small; font-style: italic; color: #696969;}
+        .gen_footnote3 {font-size: small; color: #666666; margin-top: 0.6em;}
+        '''
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'nl',
+                          'publisher': publisher}
+
+    def print_version(self, url):
+        parts = url.split('/')
+        print_url = 'http://' + parts[2] + '/' + parts[3] + '/' + parts[4] + '/' + parts[5] + '/' \
+                    + parts[10] + '/' + parts[7] + '/print/' + parts[8] + '/' + parts[9] + '/' + parts[13]
+
+        return print_url
+
+    def preprocess_html(self, soup):
+        for br in soup.findAll('br'):
+            prev = br.findPreviousSibling(True)
+            if hasattr(prev, 'name') and prev.name == 'br':
+                next = br.findNextSibling(True)
+                if hasattr(next, 'name') and next.name == 'br':
+                    br.extract()
+
+        return soup
diff --git a/resources/recipes/amspec.recipe b/resources/recipes/amspec.recipe
index 62bec5ae18..e5a76a4f86 100644
--- a/resources/recipes/amspec.recipe
+++ b/resources/recipes/amspec.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic '
+__copyright__ = '2009-2010, Darko Miletic '
 '''
 spectator.org
 '''
@@ -11,20 +9,22 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class TheAmericanSpectator(BasicNewsRecipe):
     title = 'The American Spectator'
     __author__ = 'Darko Miletic'
-    language = 'en'
-    description = 'News from USA'
+    category = 'news, politics, USA, world'
+    publisher = 'The American Spectator'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
+    language = 'en'
     INDEX = 'http://spectator.org'
-    html2lrf_options = [
-                          '--comment' , description
-                        , '--category' , 'news, politics, USA'
-                        , '--publisher' , title
-                        ]
+    conversion_options = {
+                          'comments'  : description
+                         ,'tags'      : category
+                         ,'language'  : language
+                         ,'publisher' : publisher
+                         }
 
     keep_only_tags = [
                        dict(name='div', attrs={'class':'post inner'})
@@ -33,13 +33,11 @@ class TheAmericanSpectator(BasicNewsRecipe):
 
     remove_tags = [
                     dict(name='object')
-                   ,dict(name='div', attrs={'class':'col3' })
-                   ,dict(name='div', attrs={'class':'post-options' })
-                   ,dict(name='p' , attrs={'class':'letter-editor'})
-                   ,dict(name='div', attrs={'class':'social' })
+                   ,dict(name='div', attrs={'class':['col3','post-options','social']})
+                   ,dict(name='p' , attrs={'class':['letter-editor','meta']})
                   ]
 
-    feeds = [ (u'Articles', u'http://feedproxy.google.com/amspecarticles')]
+    feeds = [ (u'Articles', u'http://feeds.feedburner.com/amspecarticles')]
 
     def get_cover_url(self):
         cover_url = None
@@ -53,3 +51,7 @@ class TheAmericanSpectator(BasicNewsRecipe):
 
     def print_version(self, url):
         return url + '/print'
+
+    def get_article_url(self, article):
+        return article.get('guid', None)
+
diff --git a/resources/recipes/bbc_fast.recipe b/resources/recipes/bbc_fast.recipe
new file mode 100644
index 0000000000..12ae9ce1eb
--- /dev/null
+++ b/resources/recipes/bbc_fast.recipe
@@ -0,0 +1,60 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic '
+'''
+news.bbc.co.uk
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class BBC(BasicNewsRecipe):
+    title = 'BBC News (fast)'
+    __author__ = 'Darko Miletic'
+    description = 'News from UK. A much faster version that does not download pictures'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    #delay = 1
+    use_embedded_content = False
+    encoding = 'utf8'
+    publisher = 'BBC'
+    category = 'news, UK, world'
+    language = 'en'
+    extra_css = ' body{ font-family: sans-serif; } .headline{font-size: xx-large; font-weight: bold} .ibox{display: block; margin: 20px 50px; padding: 10px; border: 1px solid } '
+
+    conversion_options = {
+                          'comments'  : description
+                         ,'tags'      : category
+                         ,'language'  : language
+                         ,'publisher' : publisher
+                         }
+
+    remove_tags_before = dict(name='div',attrs={'class':'headline'})
+    remove_tags_after = dict(name='div', attrs={'class':'footer'})
+    remove_tags = [
+                    dict(name=['object','link','script','iframe'])
+                   ,dict(name='div', attrs={'class':'footer'})
+                  ]
+
+    feeds = [
+              ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
+              ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
+              ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
+              ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
+              ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
+              ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
+              ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
+              ('Americas', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml'),
+              ('Europe', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml'),
+              ('South Asia', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml'),
+              ('UK', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/rss.xml'),
+              ('Asia-Pacific', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml'),
+              ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
+            ]
+
+    def print_version(self, url):
+        emp,sep,rstrip = url.partition('http://')
+        return 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/' + rstrip
+
+    def get_article_url(self, article):
+        return article.get('guid', None)
+
diff --git a/resources/recipes/calgary_herald.recipe b/resources/recipes/calgary_herald.recipe
new file mode 100644
index 0000000000..884a951d96
--- /dev/null
+++ b/resources/recipes/calgary_herald.recipe
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Calgary Herald
+    title = u'Calgary Herald'
+    url_prefix = 'http://www.calgaryherald.com'
+    description = u'News from Calgary, AB'
+
+    # un-comment the following three lines for the Regina Leader-Post
+    #title = u'Regina Leader-Post'
+    #url_prefix = 'http://www.leaderpost.com'
+    #description = u'News from Regina, SK'
+
+    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    #title = u'Saskatoon Star-Phoenix'
+    #url_prefix = 'http://www.thestarphoenix.com'
+    #description = u'News from Saskatoon, SK'
+
+    # un-comment the following three lines for the Windsor Star
+    #title = u'Windsor Star'
+    #url_prefix = 'http://www.windsorstar.com'
+    #description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/digitaljournal.recipe b/resources/recipes/digitaljournal.recipe
new file mode 100644
index 0000000000..c49caf9580
--- /dev/null
+++ b/resources/recipes/digitaljournal.recipe
@@ -0,0 +1,52 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Darko Miletic '
+'''
+digitaljournal.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DigitalJournal(BasicNewsRecipe):
+    title = 'Digital Journal'
+    __author__ = 'Darko Miletic'
+    description = 'A Global Citizen Journalism News Network'
+    category = 'news, politics, USA, world'
+    publisher = 'Digital Journal'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf8'
+    language = 'en'
+
+    conversion_options = {
+                          'comments'  : description
+                         ,'tags'      : category
+                         ,'language'  : language
+                         ,'publisher' : publisher
+                         }
+
+    keep_only_tags = [dict(name='div', attrs={'class':['article','body']})]
+
+    remove_tags = [dict(name=['object','table'])]
+
+    feeds = [
+              (u'Latest News'  , u'http://digitaljournal.com/rss/?feed=latest_news'                   )
+             ,(u'Business'     , u'http://digitaljournal.com/rss/?feed=top_news&depname=Business'     )
+             ,(u'Entertainment', u'http://digitaljournal.com/rss/?feed=top_news&depname=Entertainment')
+             ,(u'Environment'  , u'http://digitaljournal.com/rss/?feed=top_news&depname=Environment'  )
+             ,(u'Food'         , u'http://digitaljournal.com/rss/?feed=top_news&depname=Food'         )
+             ,(u'Health'       , u'http://digitaljournal.com/rss/?feed=top_news&depname=Health'       )
+             ,(u'Internet'     , u'http://digitaljournal.com/rss/?feed=top_news&depname=Internet'     )
+             ,(u'Politics'     , u'http://digitaljournal.com/rss/?feed=top_news&depname=Politics'     )
+             ,(u'Religion'     , u'http://digitaljournal.com/rss/?feed=top_news&depname=Religion'     )
+             ,(u'Science'      , u'http://digitaljournal.com/rss/?feed=top_news&depname=Science'      )
+             ,(u'Sports'       , u'http://digitaljournal.com/rss/?feed=top_news&depname=Sports'       )
+             ,(u'Technology'   , u'http://digitaljournal.com/rss/?feed=top_news&depname=Technology'   )
+             ,(u'World'        , u'http://digitaljournal.com/rss/?feed=top_news&depname=World'        )
+             ,(u'Arts'         , u'http://digitaljournal.com/rss/?feed=top_news&depname=Arts'         )
+            ]
+
+    def print_version(self, url):
+        return url.replace('digitaljournal.com/','digitaljournal.com/print/')
+
diff --git a/resources/recipes/edmonton_journal.recipe b/resources/recipes/edmonton_journal.recipe
new file mode 100644
index 0000000000..ac28b18f71
--- /dev/null
+++ b/resources/recipes/edmonton_journal.recipe
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Edmonton Journal
+    title = u'Edmonton Journal'
+    url_prefix = 'http://www.edmontonjournal.com'
+    description = u'News from Edmonton, AB'
+
+    # un-comment the following three lines for the Calgary Herald
+    #title = u'Calgary Herald'
+    #url_prefix = 'http://www.calgaryherald.com'
+    #description = u'News from Calgary, AB'
+
+    # un-comment the following three lines for the Regina Leader-Post
+    #title = u'Regina Leader-Post'
+    #url_prefix = 'http://www.leaderpost.com'
+    #description = u'News from Regina, SK'
+
+    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    #title = u'Saskatoon Star-Phoenix'
+    #url_prefix = 'http://www.thestarphoenix.com'
+    #description = u'News from Saskatoon, SK'
+
+    # un-comment the following three lines for the Windsor Star
+    #title = u'Windsor Star'
+    #url_prefix = 'http://www.windsorstar.com'
+    #description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/montreal_gazette.recipe b/resources/recipes/montreal_gazette.recipe
new file mode 100644
index 0000000000..3061cc37e4
--- /dev/null
+++ b/resources/recipes/montreal_gazette.recipe
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Montreal Gazette
+    title = u'Montreal Gazette'
+    url_prefix = 'http://www.montrealgazette.com'
+    description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/ottawa_citizen.recipe b/resources/recipes/ottawa_citizen.recipe
new file mode 100644
index 0000000000..5465212d4c
--- /dev/null
+++ b/resources/recipes/ottawa_citizen.recipe
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Ottawa Citizen
+    title = u'Ottawa Citizen'
+    url_prefix = 'http://www.ottawacitizen.com'
+    description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/pajama.recipe b/resources/recipes/pajama.recipe
new file mode 100644
index 0000000000..8c5ba74317
--- /dev/null
+++ b/resources/recipes/pajama.recipe
@@ -0,0 +1,48 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class PajamasMedia(BasicNewsRecipe):
+    title = u'Pajamas Media'
+    description = u'Provides exclusive news and opinion for forty countries.'
+    language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    recursions = 1
+    match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
+    #encoding = 'latin1'
+
+    remove_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    remove_tags_after = dict(name='div', attrs={'class':'paged-nav'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':['pages']}),
+       #dict(name='div', attrs={'id':['bookmark']}),
+       #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
+       #dict(name='ul', attrs={'class':'articleTools'}),
+    ]
+
+    feeds = [
+('Pajamas Media',
+ 'http://feeds.feedburner.com/PajamasMedia'),
+
+]
+
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'innerpage-content'})
+        #td = heading.findParent(name='td')
+        #td.extract()
+
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        if not first:
+            h = soup.find(attrs={'class':'innerpage-header'})
+            if h: h.extract()
+            auth = soup.find(attrs={'class':'author'})
+            if auth: auth.extract()
+        return soup
diff --git a/resources/recipes/physics_today.recipe b/resources/recipes/physics_today.recipe
index 9b236ff23c..d1ce17cf32 100644
--- a/resources/recipes/physics_today.recipe
+++ b/resources/recipes/physics_today.recipe
@@ -8,8 +8,7 @@ class Physicstoday(BasicNewsRecipe):
     description = u'Physics Today magazine'
     publisher = 'American Institute of Physics'
     category = 'Physics'
-    language = 'en'
-
+    language = 'en'
     cover_url = strftime('http://ptonline.aip.org/journals/doc/PHTOAD-home/jrnls/images/medcover%m_%Y.jpg')
     oldest_article = 30
     max_articles_per_feed = 100
@@ -30,11 +29,11 @@ class Physicstoday(BasicNewsRecipe):
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
-            br.open('http://www.physicstoday.org/pt/sso_login.jsp')
-            br.select_form(name='login')
+            br.open('http://ptonline.aip.org/journals/doc/PHTOAD-home/pt_login.jsp?fl=f')
+            br.select_form(name='login_form')
             br['username'] = self.username
             br['password'] = self.password
             br.submit()
         return br
 
-    feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')]
\ No newline at end of file
+    feeds = [(u'All', u'http://www.physicstoday.org/feed.xml')]
diff --git a/resources/recipes/readers_digest.recipe b/resources/recipes/readers_digest.recipe
new file mode 100644
index 0000000000..3689ca4c53
--- /dev/null
+++ b/resources/recipes/readers_digest.recipe
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+'''
+'''
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds import Feed
+
+
+class ReadersDigest(BasicNewsRecipe):
+
+    title = 'Readers Digest'
+    __author__ = 'BrianG'
+    language = 'en'
+    description = 'Readers Digest Feeds'
+    no_stylesheets = True
+    use_embedded_content = False
+    oldest_article = 60
+    max_articles_per_feed = 200
+
+    language = 'en'
+    remove_javascript = True
+
+    extra_css = ''' h1 {font-family:georgia,serif;color:#000000;}
+                .mainHd{font-family:georgia,serif;color:#000000;}
+                h2 {font-family:Arial,Sans-serif;}
+                .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
+                .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
+                .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
+                .photoBkt{ font-size:x-small ;}
+                .vertPhoto{font-size:x-small ;}
+                .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
+                .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
+                .artTxt{font-family:georgia,serif;}
+                .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
+                .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
+                a:link{color:#CC0000;}
+                .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
+                '''
+
+
+    remove_tags = [
+        dict(name='h4', attrs={'class':'close'}),
+        dict(name='div', attrs={'class':'fromLine'}),
+        dict(name='img', attrs={'class':'colorTag'}),
+        dict(name='div', attrs={'id':'sponsorArticleHeader'}),
+        dict(name='div', attrs={'class':'horizontalAd'}),
+        dict(name='div', attrs={'id':'imageCounterLeft'}),
+        dict(name='div', attrs={'id':'commentsPrint'})
+        ]
+
+
+    feeds = [
+        ('New in RD', 'http://feeds.rd.com/ReadersDigest'),
+        ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
+        ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
+        ('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
+        ]
+
+    cover_url = 'http://www.rd.com/images/logo-main-rd.gif'
+
+
+
+#-------------------------------------------------------------------------------------------------
+
+    def print_version(self, url):
+
+        # Get the identity number of the current article and append it to the root print URL
+
+        if url.find('/article') > 0:
+            ident = url[url.find('/article')+8:url.find('.html?')-4]
+            url = 'http://www.rd.com/content/printContent.do?contentId=' + ident
+
+        elif url.find('/post') > 0:
+
+            # in this case, have to get the page itself to derive the Print page.
+            soup = self.index_to_soup(url)
+            newsoup = soup.find('ul',attrs={'class':'printBlock'})
+            url = 'http://www.rd.com' + newsoup('a')[0]['href']
+            url = url[0:url.find('&Keep')]
+
+        return url
+
+#-------------------------------------------------------------------------------------------------
+
+    def parse_index(self):
+
+        pages = [
+                 ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
+                 # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
+                 ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
+
+                ]
+
+        feeds = []
+
+        for page in pages:
+            section, url, divider, attrList = page
+            newArticles = self.page_parse(url, divider, attrList)
+            feeds.append((section,newArticles))
+
+        # after the pages of the site have been processed, parse several RSS feeds for additional sections
+        newfeeds = Feed()
+        newfeeds = self.parse_rss()
+
+
+        # The utility code in parse_rss returns Feed objects. Convert each feed/article combination into a form suitable
+        # for this module (parse_index).
+
+        for feed in newfeeds:
+            newArticles = []
+            for article in feed.articles:
+                newArt = {
+                          'title' : article.title,
+                          'url'   : article.url,
+                          'date'  : article.date,
+                          'description' : article.text_summary
+                          }
+                newArticles.append(newArt)
+
+
+            # New and Blogs should be the first two feeds.
+            if feed.title == 'New in RD':
+                feeds.insert(0,(feed.title,newArticles))
+            elif feed.title == 'Blogs':
+                feeds.insert(1,(feed.title,newArticles))
+            else:
+                feeds.append((feed.title,newArticles))
+
+
+        return feeds
+
+#-------------------------------------------------------------------------------------------------
+
+    def page_parse(self, mainurl, divider, attrList):
+
+        articles = []
+        mainsoup = self.index_to_soup(mainurl)
+        for item in mainsoup.findAll(attrs=attrList):
+            newArticle = {
+                          'title' : item('img')[0]['alt'],
+                          'url'   : 'http://www.rd.com'+item('a')[0]['href'],
+                          'date'  : '',
+                          'description' : ''
+                          }
+            articles.append(newArticle)
+
+
+
+        return articles
+
+
+
+#-------------------------------------------------------------------------------------------------
+
+    def parse_rss (self):
+
+        # Do the "official" parse_feeds first
+        feeds = BasicNewsRecipe.parse_feeds(self)
+
+
+        # Loop through the articles in all feeds to find articles with "recipe" in the title
+        recipeArticles = []
+        for curfeed in feeds:
+            delList = []
+            for a,curarticle in enumerate(curfeed.articles):
+                if curarticle.title.upper().find('RECIPE') >= 0:
+                    recipeArticles.append(curarticle)
+                    delList.append(curarticle)
+            if len(delList)>0:
+                for d in delList:
+                    index = curfeed.articles.index(d)
+                    curfeed.articles[index:index+1] = []
+
+        # If there are any recipes found, create a new Feed object and append.
+        if len(recipeArticles) > 0:
+            pfeed = Feed()
+            pfeed.title = 'Recipes'
+            pfeed.description = 'Recipe Feed (Virtual)'
+            pfeed.image_url = None
+            pfeed.oldest_article = 30
+            pfeed.id_counter = len(recipeArticles)
+            # Create a new Feed, add the recipe articles, and then append
+            # to "official" list of feeds
+            pfeed.articles = recipeArticles[:]
+            feeds.append(pfeed)
+
+        return feeds
+
diff --git a/resources/recipes/regina_leader_post.recipe b/resources/recipes/regina_leader_post.recipe
new file mode 100644
index 0000000000..9efec51848
--- /dev/null
+++ b/resources/recipes/regina_leader_post.recipe
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Regina Leader-Post
+    title = u'Regina Leader-Post'
+    url_prefix = 'http://www.leaderpost.com'
+    description = u'News from Regina, SK'
+
+    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    #title = u'Saskatoon Star-Phoenix'
+    #url_prefix = 'http://www.thestarphoenix.com'
+    #description = u'News from Saskatoon, SK'
+
+    # un-comment the following three lines for the Windsor Star
+    #title = u'Windsor Star'
+    #url_prefix = 'http://www.windsorstar.com'
+    #description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/saskatoon_star_phoenix.recipe b/resources/recipes/saskatoon_star_phoenix.recipe
new file mode 100644
index 0000000000..25330478d4
--- /dev/null
+++ b/resources/recipes/saskatoon_star_phoenix.recipe
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    title = u'Saskatoon Star-Phoenix'
+    url_prefix = 'http://www.thestarphoenix.com'
+    description = u'News from Saskatoon, SK'
+
+    # un-comment the following three lines for the Windsor Star
+    #title = u'Windsor Star'
+    #url_prefix = 'http://www.windsorstar.com'
+    #description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+            #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+            #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
diff --git a/resources/recipes/vancouver_provice.recipe b/resources/recipes/vancouver_provice.recipe
new file mode 100644
index 0000000000..9375670c59
--- /dev/null
+++ b/resources/recipes/vancouver_provice.recipe
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Vancouver Province
+    title = u'Vancouver Province'
+    url_prefix = 'http://www.theprovince.com'
+    description = u'News from Vancouver, BC'
+
+    # un-comment the following three lines for the Vancouver Sun
+    #title = u'Vancouver Sun'
+    #url_prefix = 'http://www.vancouversun.com'
+    #description = u'News from Vancouver, BC'
+
+    # un-comment the following three lines for the Edmonton Journal
+    #title = u'Edmonton Journal'
+    #url_prefix = 'http://www.edmontonjournal.com'
+    #description = u'News from Edmonton, AB'
+
+    # un-comment the following three lines for the Calgary Herald
+    #title = u'Calgary Herald'
+    #url_prefix = 'http://www.calgaryherald.com'
+    #description = u'News from Calgary, AB'
+
+    # un-comment the following three lines for the Regina Leader-Post
+    #title = u'Regina Leader-Post'
+    #url_prefix = 'http://www.leaderpost.com'
+    #description = u'News from Regina, SK'
+
+    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    #title = u'Saskatoon Star-Phoenix'
+    #url_prefix = 'http://www.thestarphoenix.com'
+    #description = u'News from Saskatoon, SK'
+
+    # un-comment the following three lines for the Windsor Star
+    #title = u'Windsor Star'
+    #url_prefix = 'http://www.windsorstar.com'
+    #description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp { font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large; font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_sun.recipe b/resources/recipes/vancouver_sun.recipe new file mode 100644 index 0000000000..8f12869bf9 --- /dev/null +++ b/resources/recipes/vancouver_sun.recipe @@ -0,0 +1,131 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Sun + title = u'Vancouver Sun' + url_prefix = 'http://www.vancouversun.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', 
attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vic_times.recipe b/resources/recipes/vic_times.recipe new file mode 100644 index 0000000000..2dc8e96003 --- /dev/null +++ b/resources/recipes/vic_times.recipe @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Victoria Times Colonist + title = u'Victoria Times Colonist' + url_prefix = 'http://www.timescolonist.com' + description = u'News from Victoria, BC' + + # un-comment the following three lines for the Vancouver Province + #title = u'Vancouver Province' + #url_prefix = 'http://www.theprovince.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following 
three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if 
diff --git a/resources/recipes/windows_star.recipe b/resources/recipes/windows_star.recipe
new file mode 100644
index 0000000000..4d34261bb7
--- /dev/null
+++ b/resources/recipes/windows_star.recipe
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following three lines for the Windsor Star
+    title = u'Windsor Star'
+    url_prefix = 'http://www.windsorstar.com'
+    description = u'News from Windsor, ON'
+
+    # un-comment the following three lines for the Ottawa Citizen
+    #title = u'Ottawa Citizen'
+    #url_prefix = 'http://www.ottawacitizen.com'
+    #description = u'News from Ottawa, ON'
+
+    # un-comment the following three lines for the Montreal Gazette
+    #title = u'Montreal Gazette'
+    #url_prefix = 'http://www.montrealgazette.com'
+    #description = u'News from Montreal, QC'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+        .timestamp { font-size:xx-small; display: block; }
+        #storyheader { font-size: medium; }
+        #storyheader h1 { font-size: x-large; }
+        #storyheader h2 { font-size: large; font-style: italic; }
+        .byline { font-size:xx-small; }
+        #photocaption { font-size: small; font-style: italic }
+        #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+        dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        # delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="sectiontitle", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+            #self.log(" div class = %s" % divtag['class'])
+            if divtag['class'].startswith('section_title'):
+                # div contains section title
+                if not divtag.h3:
+                    continue
+                key = self.tag_to_string(divtag.h3,False)
+                ans.append(key)
+                self.log("Section name %s" % key)
+                continue
+            # div contains article data
+            h1tag = divtag.find('h1')
+            if not h1tag:
+                continue
+            atag = h1tag.find('a',href=True)
+            if not atag:
+                continue
+            url = self.url_prefix+'/news/todays-paper/'+atag['href']
+            #self.log("Section %s" % key)
+            #self.log("url %s" % url)
+            title = self.tag_to_string(atag,False)
+            #self.log("title %s" % title)
+            pubdate = ''
+            description = ''
+            ptag = divtag.find('p')
+            if ptag:
+                description = self.tag_to_string(ptag,False)
+                #self.log("description %s" % description)
+            author = ''
+            autag = divtag.find('h4')
+            if autag:
+                author = self.tag_to_string(autag,False)
+                #self.log("author %s" % author)
+            if not articles.has_key(key):
+                articles[key] = []
+            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
#self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe index 70c05b1ded..3ced77023d 100644 --- a/resources/recipes/wsj.recipe +++ b/resources/recipes/wsj.recipe @@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe +from calibre import strftime # http://online.wsj.com/page/us_in_todays_paper.html @@ -67,6 +68,13 @@ class WallStreetJournal(BasicNewsRecipe): def parse_index(self): soup = self.wsj_get_index() + year = strftime('%Y') + for x in soup.findAll('td', attrs={'class':'b14'}): + txt = self.tag_to_string(x).strip() + if year in txt: + self.timefmt = ' [%s]'%txt + break + left_column = soup.find( text=lambda t: 'begin ITP Left Column' in str(t)) @@ -91,7 +99,7 @@ class WallStreetJournal(BasicNewsRecipe): url = url.partition('#')[0] desc = '' d = x.findNextSibling(True) - if d.get('class', None) == 'arialResize': + if d is not None and d.get('class', None) == 'arialResize': desc = self.tag_to_string(d) desc = desc.partition(u'\u2022')[0] self.log('\t\tFound article:', title) diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe index b05da400ae..495a7c343b 100644 --- a/resources/recipes/wsj_free.recipe +++ b/resources/recipes/wsj_free.recipe @@ -3,47 +3,122 @@ __license__ = 'GPL v3' ''' -online.wsj.com.com +online.wsj.com ''' import re from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, NavigableString +from datetime import timedelta, datetime, date class WSJ(BasicNewsRecipe): # formatting adapted from original recipe by Kovid Goyal and Sujata Raman title = u'Wall Street Journal (free)' __author__ = 'Nick Redding' language = 'en' - description = ('All the free content from the Wall Street Journal (business' - ', financial and political news)') + description = ('All the free content from the Wall Street Journal (business, financial and political news)') + no_stylesheets = True timefmt = ' [%b %d]' - extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;} - h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} - .insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;} - .targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;} - .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} - .tagline { ont-size:xx-small;} - .dateStamp {font-family:Arial,Helvetica,sans-serif;} - h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} - .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;} + + # customization notes: delete sections you are not interested in + # set omit_paid_content to False if you want the paid content article snippets + # set oldest_article to the maximum number of days back from today to include articles + sectionlist = [ + ['/home-page','Front Page'], + ['/public/page/news-opinion-commentary.html','Commentary'], + ['/public/page/news-global-world.html','World News'], + ['/public/page/news-world-business.html','US News'], + 
diff --git a/resources/recipes/wsj_free.recipe b/resources/recipes/wsj_free.recipe
index b05da400ae..495a7c343b 100644
--- a/resources/recipes/wsj_free.recipe
+++ b/resources/recipes/wsj_free.recipe
@@ -3,47 +3,122 @@
 __license__ = 'GPL v3'
 
 '''
-online.wsj.com.com
+online.wsj.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+from datetime import timedelta, datetime, date
 
 class WSJ(BasicNewsRecipe):
     # formatting adapted from original recipe by Kovid Goyal and Sujata Raman
     title = u'Wall Street Journal (free)'
     __author__ = 'Nick Redding'
     language = 'en'
-    description = ('All the free content from the Wall Street Journal (business'
-                   ', financial and political news)')
+    description = ('All the free content from the Wall Street Journal (business, financial and political news)')
+
     no_stylesheets = True
     timefmt = ' [%b %d]'
-    extra_css = '''h1{font-size:large; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
-                h2{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                .subhead{font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
-                .insettipUnit {font-family:Arial,Sans-serif;font-size:xx-small;}
-                .targetCaption{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
-                .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
-                .tagline { ont-size:xx-small;}
-                .dateStamp {font-family:Arial,Helvetica,sans-serif;}
-                h3{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
-                .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small; list-style-type: none;}
+
+    # customization notes: delete sections you are not interested in
+    # set omit_paid_content to False if you want the paid content article snippets
+    # set oldest_article to the maximum number of days back from today to include articles
+    sectionlist = [
+        ['/home-page','Front Page'],
+        ['/public/page/news-opinion-commentary.html','Commentary'],
+        ['/public/page/news-global-world.html','World News'],
+        ['/public/page/news-world-business.html','US News'],
+        ['/public/page/news-business-us.html','Business'],
+        ['/public/page/news-financial-markets-stock.html','Markets'],
+        ['/public/page/news-tech-technology.html','Technology'],
+        ['/public/page/news-personal-finance.html','Personal Finance'],
+        ['/public/page/news-lifestyle-arts-entertainment.html','Life & Style'],
+        ['/public/page/news-real-estate-homes.html','Real Estate'],
+        ['/public/page/news-career-jobs.html','Careers'],
+        ['/public/page/news-small-business-marketing.html','Small Business']
+        ]
+    oldest_article = 2
+    omit_paid_content = True
+
+    extra_css = '''h1{font-size:large; font-family:Times,serif;}
+                h2{font-family:Times,serif; font-size:small; font-style:italic;}
+                .subhead{font-family:Times,serif; font-size:small; font-style:italic;}
+                .insettipUnit {font-family:Times,serif;font-size:xx-small;}
+                .targetCaption{font-size:x-small; font-family:Times,serif; font-style:italic; margin-top: 0.25em;}
+                .article{font-family:Times,serif; font-size:x-small;}
+                .tagline { font-size:xx-small;}
+                .dateStamp {font-family:Times,serif;}
+                h3{font-family:Times,serif; font-size:xx-small;}
+                .byline {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
                 .metadataType-articleCredits {list-style-type: none;}
-                h6{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic;}
+                h6{font-family:Times,serif; font-size:small; font-style:italic;}
                 .paperLocation{font-size:xx-small;}'''
 
-    remove_tags_before = dict(name='h1')
-    remove_tags = [ dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
-                    "articleTabs_tab_interactive","articleTabs_tab_video",
-                    "articleTabs_tab_map","articleTabs_tab_slideshow"]),
-                {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
-                    'insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', 'tooltip',
-                    'adSummary', 'nav-inline','insetFullBracket']},
-                dict(rel='shortcut icon'),
+
+    remove_tags_before = dict({'class':re.compile('^articleHeadlineBox')})
+    remove_tags = [ dict({'id':re.compile('^articleTabs_tab_')}),
+                    #dict(id=["articleTabs_tab_article", "articleTabs_tab_comments",
+                    #    "articleTabs_tab_interactive","articleTabs_tab_video",
+                    #    "articleTabs_tab_map","articleTabs_tab_slideshow"]),
+                {'class': ['footer_columns','network','insetCol3wide','interactive','video','slideshow','map',
+                    'insettip','insetClose','more_in', "insetContent",
+                    # 'articleTools_bottom','articleTools_bottom mjArticleTools',
+                    'aTools', 'tooltip',
+                    'adSummary', 'nav-inline','insetFullBracket']},
+                dict({'class':re.compile('^articleTools_bottom')}),
+                dict(rel='shortcut icon')
                 ]
     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"}]
 
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        return br
 
     def preprocess_html(self,soup):
+
+        # check if article is too old
+        datetag = soup.find('li',attrs={'class' : re.compile("^dateStamp")})
+        if datetag:
+            dateline_string = self.tag_to_string(datetag,False)
+            date_items = dateline_string.split(',')
+            datestring = date_items[0]+date_items[1]
+            article_date = datetime.strptime(datestring.title(),"%B %d %Y")
+            earliest_date = date.today() - timedelta(days=self.oldest_article)
+            if article_date.date() < earliest_date:
+                self.log("Skipping article dated %s" % datestring)
+                return None
+            datetag.parent.extract()
+
+            # place dateline in article heading
+
+            bylinetag = soup.find('h3','byline')
+            if bylinetag:
+                h3bylinetag = bylinetag
+            else:
+                bylinetag = soup.find('li','byline')
+                if bylinetag:
+                    h3bylinetag = bylinetag.h3
+                    if not h3bylinetag:
+                        h3bylinetag = bylinetag
+                    bylinetag = bylinetag.parent
+            if bylinetag:
+                if h3bylinetag.a:
+                    bylinetext = 'By '+self.tag_to_string(h3bylinetag.a,False)
+                else:
+                    bylinetext = self.tag_to_string(h3bylinetag,False)
+                h3byline = Tag(soup,'h3',[('class','byline')])
+                if bylinetext.isspace() or (bylinetext == ''):
+                    h3byline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
+                else:
+                    h3byline.insert(0,NavigableString(bylinetext+u'\u2014'+date_items[0]+','+date_items[1]))
+                bylinetag.replaceWith(h3byline)
+            else:
+                headlinetag = soup.find('div',attrs={'class' : re.compile("^articleHeadlineBox")})
+                if headlinetag:
+                    dateline = Tag(soup,'h3', [('class','byline')])
+                    dateline.insert(0,NavigableString(date_items[0]+','+date_items[1]))
+                    headlinetag.insert(len(headlinetag),dateline)
+        else: # if no date tag, don't process this page--it's not a news item
+            return None
         # This gets rid of the annoying superfluous bullet symbol preceding columnist bylines
         ultag = soup.find('ul',attrs={'class' : 'cMetadata metadataType-articleCredits'})
         if ultag:
@@ -58,7 +133,7 @@ class WSJ(BasicNewsRecipe):
         key = None
         ans = []
 
-        def parse_index_page(page_name,page_title,omit_paid_content):
+        def parse_index_page(page_name,page_title):
 
             def article_title(tag):
                 atag = tag.find('h2') # title is usually in an h2 tag
@@ -119,7 +194,6 @@ class WSJ(BasicNewsRecipe):
             soup = self.index_to_soup(pageurl)
             # Find each instance of div with class including "headlineSummary"
             for divtag in soup.findAll('div',attrs={'class' : re.compile("^headlineSummary")}):
-
                 # divtag contains all article data as ul's and li's
                 # first, check if there is an h3 tag which provides a section name
                 stag = divtag.find('h3')
@@ -162,7 +236,7 @@ class WSJ(BasicNewsRecipe):
                     # now skip paid subscriber articles if desired
                     subscriber_tag = litag.find(text="Subscriber Content")
                     if subscriber_tag:
-                        if omit_paid_content:
+                        if self.omit_paid_content:
                             continue
                     # delete the tip div so it doesn't get in the way
                     tiptag = litag.find("div", { "class" : "tipTargetBox" })
@@ -185,7 +259,7 @@ class WSJ(BasicNewsRecipe):
                         continue
                     if url.startswith("/article"):
                         url = mainurl+url
-                    if not url.startswith("http"):
+                    if not url.startswith("http://online.wsj.com"):
                         continue
                     if not url.endswith(".html"):
                         continue
@@ -214,48 +288,10 @@ class WSJ(BasicNewsRecipe):
                         articles[page_title] = []
                     articles[page_title].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
 
-        # customization notes: delete sections you are not interested in
-        # set omit_paid_content to False if you want the paid content article previews
-        sectionlist = ['Front Page','Commentary','World News','US News','Business','Markets',
-                        'Technology','Personal Finance','Life & Style','Real Estate','Careers','Small Business']
-        omit_paid_content = True
-        if 'Front Page' in sectionlist:
-            parse_index_page('/home-page','Front Page',omit_paid_content)
-            ans.append('Front Page')
-        if 'Commentary' in sectionlist:
-            parse_index_page('/public/page/news-opinion-commentary.html','Commentary',omit_paid_content)
-            ans.append('Commentary')
-        if 'World News' in sectionlist:
-            parse_index_page('/public/page/news-global-world.html','World News',omit_paid_content)
-            ans.append('World News')
-        if 'US News' in sectionlist:
-            parse_index_page('/public/page/news-world-business.html','US News',omit_paid_content)
-            ans.append('US News')
-        if 'Business' in sectionlist:
-            parse_index_page('/public/page/news-business-us.html','Business',omit_paid_content)
-            ans.append('Business')
-        if 'Markets' in sectionlist:
-            parse_index_page('/public/page/news-financial-markets-stock.html','Markets',omit_paid_content)
-            ans.append('Markets')
-        if 'Technology' in sectionlist:
-            parse_index_page('/public/page/news-tech-technology.html','Technology',omit_paid_content)
-            ans.append('Technology')
-        if 'Personal Finance' in sectionlist:
-            parse_index_page('/public/page/news-personal-finance.html','Personal Finance',omit_paid_content)
-            ans.append('Personal Finance')
-        if 'Life & Style' in sectionlist:
-            parse_index_page('/public/page/news-lifestyle-arts-entertainment.html','Life & Style',omit_paid_content)
-            ans.append('Life & Style')
-        if 'Real Estate' in sectionlist:
-            parse_index_page('/public/page/news-real-estate-homes.html','Real Estate',omit_paid_content)
-            ans.append('Real Estate')
-        if 'Careers' in sectionlist:
-            parse_index_page('/public/page/news-career-jobs.html','Careers',omit_paid_content)
-            ans.append('Careers')
-        if 'Small Business' in sectionlist:
-            parse_index_page('/public/page/news-small-business-marketing.html','Small Business',omit_paid_content)
-            ans.append('Small Business')
+        for page_name,page_title in self.sectionlist:
+            parse_index_page(page_name,page_title)
+            ans.append(page_title)
 
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
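
The oldest_article check in the new preprocess_html boils down to ordinary datetime arithmetic. A self-contained sketch, with the dateline string assumed to match WSJ's on-page format:

from datetime import datetime, date, timedelta

oldest_article = 2
# Assumed shape of a WSJ dateStamp, e.g. 'JANUARY 5, 2010, 9:03 A.M. ET'
dateline_string = 'JANUARY 5, 2010, 9:03 A.M. ET'
date_items = dateline_string.split(',')
datestring = date_items[0] + date_items[1]               # 'JANUARY 5 2010'
article_date = datetime.strptime(datestring.title(), '%B %d %Y')
earliest_date = date.today() - timedelta(days=oldest_article)
if article_date.date() < earliest_date:
    print 'Skipping article dated %s' % datestring
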
diff --git a/resources/viewer/images.js b/resources/viewer/images.js
new file mode 100644
index 0000000000..ea68009254
--- /dev/null
+++ b/resources/viewer/images.js
@@ -0,0 +1,23 @@
+/*
+ * images management
+ * Copyright 2008 Kovid Goyal
+ * License: GNU GPL v3
+ */
+
+function scale_images() {
+    $("img:visible").each(function() {
+        var offset = $(this).offset();
+        //window.py_bridge.debug(window.getComputedStyle(this, '').getPropertyValue('max-width'));
+        $(this).css("max-width", (window.innerWidth-offset.left-5)+"px");
+        $(this).css("max-height", (window.innerHeight-5)+"px");
+    });
+}
+
+function setup_image_scaling_handlers() {
+    scale_images();
+    $(window).resize(function(){
+        scale_images();
+    });
+}
+
+
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 81e9021817..cf6eddce72 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -382,6 +382,7 @@ from calibre.ebooks.rtf.input import RTFInput
 from calibre.ebooks.tcr.input import TCRInput
 from calibre.ebooks.txt.input import TXTInput
 from calibre.ebooks.lrf.input import LRFInput
+from calibre.ebooks.chm.input import CHMInput # XXMODIFIED
 
 from calibre.ebooks.epub.output import EPUBOutput
 from calibre.ebooks.fb2.output import FB2Output
@@ -440,6 +441,7 @@ plugins += [
     TCRInput,
     TXTInput,
     LRFInput,
+    CHMInput, # XXMODIFIED
 ]
 plugins += [
     EPUBOutput,
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index b8557aea98..4f894ce088 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -563,6 +563,16 @@ class MobiReader(object):
                 recindex = attrib.pop(attr, None) or recindex
             if recindex is not None:
                 attrib['src'] = 'images/%s.jpg' % recindex
+            for attr in ('width', 'height'):
+                if attr in attrib:
+                    val = attrib[attr]
+                    if val.lower().endswith('em'):
+                        try:
+                            nval = float(val[:-2])
+                            nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
+                            attrib[attr] = "%dpx"%int(nval)
+                        except:
+                            del attrib[attr]
         elif tag.tag == 'pre':
             if not tag.text:
                 tag.tag = 'div'
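
The arithmetic in the width/height fix above assumes the em values were authored against the Kindle profile (168.451 dpi), with 16 pixels to the em at 72 dpi. A worked example with a made-up attribute value:

val = '1.5em'                  # sample width attribute from MOBI markup
nval = float(val[:-2])         # 1.5
nval *= 16 * (168.451 / 72)    # scale 16px/em from 72 dpi up to 168.451 dpi
print "%dpx" % int(nval)       # -> 56px
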
diff --git a/src/calibre/ebooks/oeb/stylizer.py b/src/calibre/ebooks/oeb/stylizer.py
index 9f50796615..d0e394b9e5 100644
--- a/src/calibre/ebooks/oeb/stylizer.py
+++ b/src/calibre/ebooks/oeb/stylizer.py
@@ -411,6 +411,7 @@ class Style(object):
         return result
 
     def _unit_convert(self, value, base=None, font=None):
+        ' Return value in pts'
         if isinstance(value, (int, long, float)):
             return value
         try:
@@ -447,6 +448,9 @@ class Style(object):
             result = value * 0.40
         return result
 
+    def pt_to_px(self, value):
+        return (self._profile.dpi / 72.0) * value
+
     @property
     def fontSize(self):
         def normalize_fontsize(value, base):
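
With _unit_convert now documented as returning points, pt_to_px is just a dpi rescale. For instance, on a hypothetical 166 dpi output profile:

dpi = 166.0                    # hypothetical output profile dpi

def pt_to_px(value):
    return (dpi / 72.0) * value

print pt_to_px(12)             # 12pt -> ~27.7px on this profile
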
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 1b2149cf3a..f4bdb9c7ac 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -20,6 +20,10 @@ class Font(object):
 
 class Column(object):
 
+    # A column contains an element if the element bulges out to
+    # the left or the right by at most HFUZZ*col width.
+    HFUZZ = 0.2
+
     def __init__(self):
         self.left = self.right = self.top = self.bottom = 0
         self.width = self.height = 0
@@ -41,6 +45,10 @@ class Column(object):
         for x in self.elements:
             yield x
 
+    def contains(self, elem):
+        return elem.left > self.left - self.HFUZZ*self.width and \
+               elem.right < self.right + self.HFUZZ*self.width
+
 class Element(object):
 
     def __eq__(self, other):
@@ -238,11 +246,10 @@ class Page(object):
         return columns
 
     def find_elements_in_row_of(self, x):
-        interval = Interval(x.top - self.YFUZZ * self.average_text_height,
+        interval = Interval(x.top,
                 x.top + self.YFUZZ*(1+self.average_text_height))
         h_interval = Interval(x.left, x.right)
-        m = max(0, x.idx-15)
-        for y in self.elements[m:x.idx+15]:
+        for y in self.elements[x.idx:x.idx+15]:
             if y is not x:
                 y_interval = Interval(y.top, y.bottom)
                 x_interval = Interval(y.left, y.right)
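
The HFUZZ constant makes Column.contains tolerant of slight overhang: an element still counts as inside a column if it sticks out by at most 20% of the column width on either side. With made-up coordinates:

HFUZZ = 0.2
col_left, col_right = 100.0, 300.0       # hypothetical column bounds
width = col_right - col_left             # 200, so the fuzz margin is 40

def contains(elem_left, elem_right):
    return (elem_left > col_left - HFUZZ * width and
            elem_right < col_right + HFUZZ * width)

print contains(70, 330)   # True: 30px overhang each side, within the 40px fuzz
print contains(40, 300)   # False: 60px overhang on the left
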
diff --git a/src/calibre/gui2/dialogs/metadata_single.py b/src/calibre/gui2/dialogs/metadata_single.py
index a9130b2ea2..89b7c92125 100644
--- a/src/calibre/gui2/dialogs/metadata_single.py
+++ b/src/calibre/gui2/dialogs/metadata_single.py
@@ -532,7 +532,7 @@ class MetadataSingleDialog(ResizableDialog, Ui_MetadataSingleDialog):
             if self.cover_fetcher.exception is not None:
                 err = self.cover_fetcher.exception
                 error_dialog(self, _('Cannot fetch cover'),
-                        _('Could not fetch cover. ')+repr(err)).exec_()
+                        _('Could not fetch cover. ')+unicode(err)).exec_()
                 return
 
             pix = QPixmap()
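
The repr-to-unicode switch only changes how the error reads in the dialog: unicode(err) yields the bare message where repr(err) wraps it in the exception's constructor syntax. For example:

err = ValueError('HTTP Error 404: Not Found')
print repr(err)      # ValueError('HTTP Error 404: Not Found',)
print unicode(err)   # HTTP Error 404: Not Found
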
diff --git a/src/calibre/gui2/ui.py b/src/calibre/gui2/ui.py
index 6cbae7f7b0..98b416eaa3 100644
--- a/src/calibre/gui2/ui.py
+++ b/src/calibre/gui2/ui.py
@@ -1361,7 +1361,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
 
     def generate_catalog(self):
         rows = self.library_view.selectionModel().selectedRows()
-        if not rows:
+        if not rows or len(rows) < 3:
             rows = xrange(self.library_view.model().rowCount(QModelIndex()))
         ids = map(self.library_view.model().id, rows)
         dbspec = None
diff --git a/src/calibre/gui2/viewer/config.ui b/src/calibre/gui2/viewer/config.ui
index fe1dc85c93..d6e71c77d2 100644
--- a/src/calibre/gui2/viewer/config.ui
+++ b/src/calibre/gui2/viewer/config.ui
@@ -7,14 +7,14 @@
     0
     0
     479
-    574
+    606
   Configure Ebook viewer
-
+
   :/images/config.svg:/images/config.svg
@@ -164,7 +164,7 @@
-
+
   Remember last used &window size
@@ -218,6 +218,13 @@
+
+
+
+   &Resize images larger than the viewer window (needs restart)
+
+
+
diff --git a/src/calibre/gui2/viewer/documentview.py b/src/calibre/gui2/viewer/documentview.py
index b35e28121a..790b1c4f2f 100644
--- a/src/calibre/gui2/viewer/documentview.py
+++ b/src/calibre/gui2/viewer/documentview.py
@@ -10,7 +10,7 @@ from base64 import b64encode
 from PyQt4.Qt import QSize, QSizePolicy, QUrl, SIGNAL, Qt, QTimer, \
                      QPainter, QPalette, QBrush, QFontDatabase, QDialog, \
                      QColor, QPoint, QImage, QRegion, QVariant, QIcon, \
-                     QFont, QObject, QApplication, pyqtSignature, QAction
+                     QFont, pyqtSignature, QAction
 from PyQt4.QtWebKit import QWebPage, QWebView, QWebSettings
 
 from calibre.utils.config import Config, StringConfig
@@ -21,7 +21,7 @@ from calibre.constants import iswindows
 from calibre import prints, guess_type
 from calibre.gui2.viewer.keys import SHORTCUTS
 
-bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = None
+bookmarks = referencing = hyphenation = jquery = jquery_scrollTo = hyphenator = images = None
 
 def load_builtin_fonts():
     base = P('fonts/liberation/*.ttf')
@@ -42,6 +42,8 @@ def config(defaults=None):
         help=_('Set the user CSS stylesheet. This can be used to customize the look of all books.'))
     c.add_opt('max_view_width', default=6000,
         help=_('Maximum width of the viewer window, in pixels.'))
+    c.add_opt('fit_images', default=True,
+        help=_('Resize images larger than the viewer window to fit inside it'))
     c.add_opt('hyphenate', default=False,
         help=_('Hyphenate text'))
     c.add_opt('hyphenate_default_lang', default='en',
@@ -59,20 +61,6 @@ def config(defaults=None):
 
     return c
 
-class PythonJS(QObject):
-
-    def __init__(self, callback):
-        QObject.__init__(self, QApplication.instance())
-        self.setObjectName("py_bridge")
-        self._callback = callback
-
-    @pyqtSignature("QString")
-    def callback(self, msg):
-        print "callback called"
-        self._callback(msg)
-
-
-
 class ConfigDialog(QDialog, Ui_Dialog):
 
     def __init__(self, shortcuts, parent=None):
@@ -110,6 +98,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
         self.shortcut_config = ShortcutConfig(shortcuts, parent=self)
         p = self.tabs.widget(1)
         p.layout().addWidget(self.shortcut_config)
+        self.opt_fit_images.setChecked(opts.fit_images)
 
     def accept(self, *args):
@@ -122,6 +111,7 @@ class ConfigDialog(QDialog, Ui_Dialog):
         c.set('standard_font', {0:'serif', 1:'sans', 2:'mono'}[self.standard_font.currentIndex()])
         c.set('user_css', unicode(self.css.toPlainText()))
         c.set('remember_window_size', self.opt_remember_window_size.isChecked())
+        c.set('fit_images', self.opt_fit_images.isChecked())
         c.set('max_view_width', int(self.max_view_width.value()))
         c.set('hyphenate', self.hyphenate.isChecked())
         idx = self.hyphenate_default_lang.currentIndex()
@@ -157,7 +147,6 @@ class Document(QWebPage):
         self.setObjectName("py_bridge")
         self.debug_javascript = False
         self.current_language = None
-        #self.js_bridge = PythonJS(self.js_callback)
 
         self.setLinkDelegationPolicy(self.DelegateAllLinks)
         self.scroll_marks = []
@@ -197,9 +186,14 @@ class Document(QWebPage):
         opts = config().parse()
         self.hyphenate = opts.hyphenate
         self.hyphenate_default_lang = opts.hyphenate_default_lang
+        self.do_fit_images = opts.fit_images
+
+    def fit_images(self):
+        if self.do_fit_images:
+            self.javascript('setup_image_scaling_handlers()')
 
     def load_javascript_libraries(self):
-        global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator
+        global bookmarks, referencing, hyphenation, jquery, jquery_scrollTo, hyphenator, images
         self.mainFrame().addToJavaScriptWindowObject("py_bridge", self)
         if jquery is None:
             jquery = P('content_server/jquery.js', data=True)
@@ -215,6 +209,9 @@ class Document(QWebPage):
         if referencing is None:
             referencing = P('viewer/referencing.js', data=True)
         self.javascript(referencing)
+        if images is None:
+            images = P('viewer/images.js', data=True)
+        self.javascript(images)
         if hyphenation is None:
             hyphenation = P('viewer/hyphenation.js', data=True)
         self.javascript(hyphenation)
@@ -541,6 +538,7 @@ class DocumentView(QWebView):
             return
         self.loading_url = None
         self.document.set_bottom_padding(0)
+        self.document.fit_images()
         self._size_hint = self.document.mainFrame().contentsSize()
         scrolled = False
         if self.to_bottom:
diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py
index 84638410c7..7b0f7a083e 100644
--- a/src/calibre/library/database2.py
+++ b/src/calibre/library/database2.py
@@ -1634,13 +1634,15 @@ class LibraryDatabase2(LibraryDatabase):
             for i in iter(self):
                 yield i[x]
 
-    def get_data_as_dict(self, prefix=None, authors_as_string=False):
+    def get_data_as_dict(self, prefix=None, authors_as_string=False, ids=None):
         '''
         Return all metadata stored in the database as a dict. Includes paths to
         the cover and each format.
 
         :param prefix: The prefix for all paths. By default, the prefix is the absolute path
         to the library folder.
+        :param ids: Set of ids to return the data for. If None return data for
+        all entries in database.
         '''
         if prefix is None:
             prefix = self.library_path
@@ -1650,11 +1652,14 @@ class LibraryDatabase2(LibraryDatabase):
         data = []
         for record in self.data:
             if record is None: continue
+            db_id = record[FIELD_MAP['id']]
+            if ids is not None and db_id not in ids:
+                continue
             x = {}
             for field in FIELDS:
                 x[field] = record[FIELD_MAP[field]]
             data.append(x)
-            x['id'] = record[FIELD_MAP['id']]
+            x['id'] = db_id
             x['formats'] = []
             if not x['authors']:
                 x['authors'] = _('Unknown')
diff --git a/src/calibre/utils/config.py b/src/calibre/utils/config.py
index a0e5632cb7..22e31c3005 100644
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@@ -524,6 +524,7 @@ class DynamicConfig(dict):
                 pass
             except:
                 import traceback
+                print 'Failed to unpickle stored object:'
                 traceback.print_exc()
                 d = {}
         self.clear()
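
The new ids parameter to get_data_as_dict lets callers export metadata for a subset of books instead of the whole library. A hypothetical usage sketch, assuming db is an open LibraryDatabase2 and the book ids exist:

def export_some(db):
    # db: an open LibraryDatabase2 instance (assumed); ids are made up
    data = db.get_data_as_dict(ids=set([4, 8, 15]))
    for x in data:
        print x['id'], x['title']
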