diff --git a/resources/recipes/the_age.recipe b/resources/recipes/the_age.recipe
index 8e4ae05575..eddb5e5000 100644
--- a/resources/recipes/the_age.recipe
+++ b/resources/recipes/the_age.recipe
@@ -9,15 +9,19 @@ theage.com.au
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
+import re
class TheAge(BasicNewsRecipe):
- title = 'The Age'
- description = 'Business News, World News and Breaking News in Melbourne, Australia'
- __author__ = 'Matthew Briggs'
- language = 'en_AU'
-
+ title = 'The Age'
+ description = 'Business News, World News and Breaking News in Melbourne, Australia'
+ publication_type = 'newspaper'
+ __author__ = 'Matthew Briggs'
+ language = 'en_AU'
+
+ max_articles_per_feed = 1000
+ recursions = 0
+ remove_tags = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -28,30 +32,81 @@ class TheAge(BasicNewsRecipe):
soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())
- feeds, articles = [], []
- feed = None
-
+ section = None
+ sections = {}
for tag in soup.findAll(['h3', 'a']):
if tag.name == 'h3':
- if articles:
- feeds.append((feed, articles))
- articles = []
- feed = self.tag_to_string(tag)
- elif feed is not None and tag.has_key('href') and tag['href'].strip():
+ section = self.tag_to_string(tag)
+ sections[section] = []
+
+ # Skip links whose href is empty or a single character such as "/" (presumably the "TheAge" link back to the front page)
+
+ elif section and tag.has_key('href') and len(tag['href'].strip())>1:
url = tag['href'].strip()
if url.startswith('/'):
- url = 'http://www.theage.com.au' + url
+ url = 'http://www.theage.com.au' + url
title = self.tag_to_string(tag)
- articles.append({
+ sections[section].append({
'title': title,
'url' : url,
'date' : strftime('%a, %d %b'),
'description' : '',
'content' : '',
})
+
+ feeds = []
+ # Insert feeds in specified order, if available
+
+ feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment' ]
+ for name in feedSort:
+     # pop() with a default both fetches and removes the section in one step;
+     # a bare `del sections[name]` raises KeyError when the page did not list
+     # one of the preferred sections.
+     articles = sections.pop(name, None)
+     if articles is not None:
+         feeds.append((name,articles))
+
+ # Append what is left over (sections not named in feedSort)...
+
+ for name in sections:
+     feeds.append((name,sections[name]))
+
return feeds
+ def get_cover_url(self):
+     '''
+     Look for a link to the front-page PDF on the "todays-paper" page.
+
+     Returns the href of the first anchor whose target matches the
+     front-page PDF pattern, or None when no anchor matches.
+     '''
+     soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
+
+     # Dots are escaped so that they only match a literal '.' in the URL
+     cover_re = re.compile(r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf')
+
+     for anchor in soup.findAll('a'):
+         # Indexing an anchor that has no href attribute raises KeyError,
+         # so test for the attribute first (same check parse_index uses).
+         if not anchor.has_key('href'):
+             continue
+         href = anchor['href']
+         if href and cover_re.match(href):
+             return href
+
+     return None
+
+ def preprocess_html(self,soup):
+     '''
+     Tidy up the paragraphs of an article page and return the same soup.
+
+     Paragraphs whose direct text is only a bracketed run of separators are
+     removed; the copyright notice paragraph gets a small font size.
+     '''
+
+     fine_print = 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.'
+
+     for para in soup.findAll('p'):
+
+         # Only the bare strings directly inside the paragraph are joined; child tags are ignored
+
+         pieces = [node for node in para.contents if isinstance(node,unicode)]
+         if not pieces:
+             continue
+         text = ''.join(pieces)
+
+         if re.match('((\s)|(\ \;))*\[[\|\s*]*\]((\s)|(\ \;))*$',text):
+
+             # What's left of the text-mode navigation stuff: drop the paragraph
+
+             para.extract()
+
+         elif text==fine_print:
+
+             # Shrink the fine print font
+
+             para['style'] = 'font-size:small'
+
+     return soup
diff --git a/resources/recipes/the_oz.recipe b/resources/recipes/the_oz.recipe
index ccdce0acb6..6a897589db 100644
--- a/resources/recipes/the_oz.recipe
+++ b/resources/recipes/the_oz.recipe
@@ -16,7 +16,7 @@ class DailyTelegraph(BasicNewsRecipe):
language = 'en_AU'
oldest_article = 2
- max_articles_per_feed = 20
+ max_articles_per_feed = 30
remove_javascript = True
no_stylesheets = True
encoding = 'utf8'
@@ -48,22 +48,24 @@ class DailyTelegraph(BasicNewsRecipe):
.caption{font-family:Trebuchet MS,Trebuchet,Helvetica,sans-serif; font-size: xx-small;}
'''
- feeds = [(u'News', u'http://feeds.news.com.au/public/rss/2.0/aus_news_807.xml'),
+ feeds = [ (u'News', u'http://feeds.news.com.au/public/rss/2.0/aus_news_807.xml'),
(u'Opinion', u'http://feeds.news.com.au/public/rss/2.0/aus_opinion_58.xml'),
- (u'Business', u'http://feeds.news.com.au/public/rss/2.0/aus_business_811.xml'),
- (u'Media', u'http://feeds.news.com.au/public/rss/2.0/aus_media_57.xml'),
- (u'Higher Education', u'http://feeds.news.com.au/public/rss/2.0/aus_higher_education_56.xml'),
- (u'The Arts', u'http://feeds.news.com.au/public/rss/2.0/aus_arts_51.xml'),
- (u'Commercial Property', u'http://feeds.news.com.au/public/rss/2.0/aus_business_commercial_property_708.xml'),
(u'The Nation', u'http://feeds.news.com.au/public/rss/2.0/aus_the_nation_62.xml'),
- (u'Sport', u'http://feeds.news.com.au/public/rss/2.0/aus_sport_61.xml'),
- (u'Travel', u'http://feeds.news.com.au/public/rss/2.0/aus_travel_and_indulgence_63.xml'),
- (u'Defence', u'http://feeds.news.com.au/public/rss/2.0/aus_defence_54.xml'),
- (u'Aviation', u'http://feeds.news.com.au/public/rss/2.0/aus_business_aviation_706.xml'),
- (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml'),
+ (u'World News', u'http://feeds.news.com.au/public/rss/2.0/aus_world_808.xml'),
+ (u'US Election', u'http://feeds.news.com.au/public/rss/2.0/aus_uselection_687.xml'),
(u'Climate', u'http://feeds.news.com.au/public/rss/2.0/aus_climate_809.xml'),
+ (u'Media', u'http://feeds.news.com.au/public/rss/2.0/aus_media_57.xml'),
+ (u'IT', u'http://feeds.news.com.au/public/rss/2.0/ausit_itnews_topstories_367.xml'),
+ (u'Exec Tech', u'http://feeds.news.com.au/public/rss/2.0/ausit_exec_topstories_385.xml'),
+ (u'Higher Education', u'http://feeds.news.com.au/public/rss/2.0/aus_higher_education_56.xml'),
+ (u'Arts', u'http://feeds.news.com.au/public/rss/2.0/aus_arts_51.xml'),
+ (u'Travel', u'http://feeds.news.com.au/public/rss/2.0/aus_travel_and_indulgence_63.xml'),
(u'Property', u'http://feeds.news.com.au/public/rss/2.0/aus_property_59.xml'),
- (u'US Election', u'http://feeds.news.com.au/public/rss/2.0/aus_uselection_687.xml')]
+ (u'Sport', u'http://feeds.news.com.au/public/rss/2.0/aus_sport_61.xml'),
+ (u'Business', u'http://feeds.news.com.au/public/rss/2.0/aus_business_811.xml'),
+ (u'Aviation', u'http://feeds.news.com.au/public/rss/2.0/aus_business_aviation_706.xml'),
+ (u'Commercial Property', u'http://feeds.news.com.au/public/rss/2.0/aus_business_commercial_property_708.xml'),
+ (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml')]
def get_article_url(self, article):
return article.id
diff --git a/resources/recipes/wikinews_en.recipe b/resources/recipes/wikinews_en.recipe
index cf83793702..538ab241c2 100644
--- a/resources/recipes/wikinews_en.recipe
+++ b/resources/recipes/wikinews_en.recipe
@@ -55,6 +55,9 @@ class WikiNews(BasicNewsRecipe):
rest, sep, article_id = url.rpartition('/')
return 'http://en.wikinews.org/w/index.php?title=' + article_id + '&printable=yes'
+ def get_cover_url(self):
+     '''Return the fixed URL of the English Wikinews logo image.'''
+     logo_url = 'http://upload.wikimedia.org/wikipedia/commons/b/bd/Wikinews-logo-en.png'
+     return logo_url
+
def preprocess_html(self, soup):
mtag = ''
soup.head.insert(0,mtag)