From 23b01a98e0231110f8916bccb43324bbd279dcf5 Mon Sep 17 00:00:00 2001
From: Nigel Stewart <nigels.com@gmail.com>
Date: Fri, 8 Oct 2010 09:51:19 -0500
Subject: [PATCH 1/3] Enhancements for The Age recipe: strip out useless links,
 grab the pdf front page, editorial and letters, sort the feeds explicitly

---
 resources/recipes/the_age.recipe | 87 ++++++++++++++++++++++++++------
 1 file changed, 71 insertions(+), 16 deletions(-)
diff --git a/resources/recipes/the_age.recipe b/resources/recipes/the_age.recipe
index 8e4ae05575..eddb5e5000 100644
--- a/resources/recipes/the_age.recipe
+++ b/resources/recipes/the_age.recipe
@@ -9,15 +9,19 @@ theage.com.au
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
+import re
 
 class TheAge(BasicNewsRecipe):
 
-    title = 'The Age'
-    description = 'Business News, World News and Breaking News in Melbourne, Australia'
-    __author__ = 'Matthew Briggs'
-    language = 'en_AU'
-
+    title            = 'The Age'
+    description      = 'Business News, World News and Breaking News in Melbourne, Australia'
+    publication_type = 'newspaper'
+    __author__       = 'Matthew Briggs'
+    language         = 'en_AU'
+    
+    max_articles_per_feed = 1000
+    recursions        = 0
+    remove_tags       = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -28,30 +32,81 @@ class TheAge(BasicNewsRecipe):
 
         soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())
 
-        feeds, articles = [], []
-        feed = None
-
+        section = None
+        sections = {}
 
         for tag in soup.findAll(['h3', 'a']):
             if tag.name == 'h3':
-                if articles:
-                    feeds.append((feed, articles))
-                    articles = []
-                feed = self.tag_to_string(tag)
-            elif feed is not None and tag.has_key('href') and tag['href'].strip():
+                section = self.tag_to_string(tag)
+                sections[section] = []
+
+            # Make sure to skip: <a href="/">TheAge</a>
+
+            elif section and tag.has_key('href') and len(tag['href'].strip())>1:
                 url = tag['href'].strip()
                 if url.startswith('/'):
-                    url   = 'http://www.theage.com.au' + url
+                    url = 'http://www.theage.com.au' + url
                 title = self.tag_to_string(tag)
-                articles.append({
+                sections[section].append({
                                  'title': title,
                                  'url'  : url,
                                  'date' : strftime('%a, %d %b'),
                                  'description' : '',
                                  'content'     : '',
                                  })
+                                 
+        feeds = []
 
+        # Insert feeds in specified order, if available
+        
+        feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment' ]
+        for i in feedSort:
+          if i in sections:
+            feeds.append((i,sections[i]))
+
+        # Done with the sorted feeds; pop() tolerates names the page never listed
+
+        for i in feedSort:
+          sections.pop(i, None)
+        
+        # Append what is left over...
+
+        for i in sections:
+          feeds.append((i,sections[i]))
+            
         return feeds
 
+    def get_cover_url(self):
 
+        # Cover is the front-page PDF linked from the todays-paper page, else None
 
+        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
+        for i in soup.findAll('a'):
+            href = i.get('href')  # anchors without an href give None, not KeyError
+            if href and re.match(r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf',href):
+                return href
+        return None
+
+    def preprocess_html(self,soup):
+        # Clean every <p> in soup (in place) and return soup: leftover text-mode
+        # navigation paragraphs are removed, the copyright notice is set small.
+
+        fine_print = 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.'
+        # A lone [...] holding only '|', '*' and whitespace, padded by whitespace/&nbsp;
+        nav_debris = '((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$'
+
+        for para in soup.findAll('p'):
+
+            # Judge the paragraph by its direct text nodes only; child tags are skipped
+
+            pieces = [node for node in para.contents if isinstance(node,unicode)]
+            if not pieces:
+                continue
+            text = ''.join(pieces)
+
+            if re.match(nav_debris,text):
+                para.extract()
+            elif text==fine_print:
+                para['style'] = 'font-size:small'
+
+        return soup

From 93f561249008323e4a7a94370663aa2f75b34da7 Mon Sep 17 00:00:00 2001
From: Nigel Stewart <nigels.com@gmail.com>
Date: Fri, 8 Oct 2010 09:52:44 -0500
Subject: [PATCH 2/3] Enhancements for The Australian recipe: more feeds, more
 articles

---
 resources/recipes/the_oz.recipe | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/resources/recipes/the_oz.recipe b/resources/recipes/the_oz.recipe
index ccdce0acb6..6a897589db 100644
--- a/resources/recipes/the_oz.recipe
+++ b/resources/recipes/the_oz.recipe
@@ -16,7 +16,7 @@ class DailyTelegraph(BasicNewsRecipe):
     language = 'en_AU'
 
     oldest_article = 2
-    max_articles_per_feed = 20
+    max_articles_per_feed = 30
     remove_javascript      = True
     no_stylesheets         = True
     encoding               = 'utf8'
@@ -48,22 +48,24 @@ class DailyTelegraph(BasicNewsRecipe):
                     .caption{font-family:Trebuchet MS,Trebuchet,Helvetica,sans-serif; font-size: xx-small;}
                 '''
 
-    feeds = [(u'News', u'http://feeds.news.com.au/public/rss/2.0/aus_news_807.xml'),
+    feeds = [       (u'News', u'http://feeds.news.com.au/public/rss/2.0/aus_news_807.xml'),
                     (u'Opinion', u'http://feeds.news.com.au/public/rss/2.0/aus_opinion_58.xml'),
-                    (u'Business', u'http://feeds.news.com.au/public/rss/2.0/aus_business_811.xml'),
-                    (u'Media', u'http://feeds.news.com.au/public/rss/2.0/aus_media_57.xml'),
-                    (u'Higher Education', u'http://feeds.news.com.au/public/rss/2.0/aus_higher_education_56.xml'),
-                    (u'The Arts', u'http://feeds.news.com.au/public/rss/2.0/aus_arts_51.xml'),
-                    (u'Commercial Property', u'http://feeds.news.com.au/public/rss/2.0/aus_business_commercial_property_708.xml'),
                     (u'The Nation', u'http://feeds.news.com.au/public/rss/2.0/aus_the_nation_62.xml'),
-                    (u'Sport', u'http://feeds.news.com.au/public/rss/2.0/aus_sport_61.xml'),
-                    (u'Travel', u'http://feeds.news.com.au/public/rss/2.0/aus_travel_and_indulgence_63.xml'),
-                    (u'Defence', u'http://feeds.news.com.au/public/rss/2.0/aus_defence_54.xml'),
-                    (u'Aviation', u'http://feeds.news.com.au/public/rss/2.0/aus_business_aviation_706.xml'),
-                    (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml'),
+                    (u'World News', u'http://feeds.news.com.au/public/rss/2.0/aus_world_808.xml'),
+                    (u'US Election', u'http://feeds.news.com.au/public/rss/2.0/aus_uselection_687.xml'),
                     (u'Climate', u'http://feeds.news.com.au/public/rss/2.0/aus_climate_809.xml'),
+                    (u'Media', u'http://feeds.news.com.au/public/rss/2.0/aus_media_57.xml'),
+                    (u'IT', u'http://feeds.news.com.au/public/rss/2.0/ausit_itnews_topstories_367.xml'),
+                    (u'Exec Tech', u'http://feeds.news.com.au/public/rss/2.0/ausit_exec_topstories_385.xml'),
+                    (u'Higher Education', u'http://feeds.news.com.au/public/rss/2.0/aus_higher_education_56.xml'),
+                    (u'Arts', u'http://feeds.news.com.au/public/rss/2.0/aus_arts_51.xml'),
+                    (u'Travel', u'http://feeds.news.com.au/public/rss/2.0/aus_travel_and_indulgence_63.xml'),
                     (u'Property', u'http://feeds.news.com.au/public/rss/2.0/aus_property_59.xml'),
-                    (u'US Election', u'http://feeds.news.com.au/public/rss/2.0/aus_uselection_687.xml')]
+                    (u'Sport', u'http://feeds.news.com.au/public/rss/2.0/aus_sport_61.xml'),
+                    (u'Business', u'http://feeds.news.com.au/public/rss/2.0/aus_business_811.xml'),
+                    (u'Aviation', u'http://feeds.news.com.au/public/rss/2.0/aus_business_aviation_706.xml'),
+                    (u'Commercial Property', u'http://feeds.news.com.au/public/rss/2.0/aus_business_commercial_property_708.xml'),
+                    (u'Mining', u'http://feeds.news.com.au/public/rss/2.0/aus_business_mining_704.xml')]
 
     def get_article_url(self, article):
         return article.id

From 17f3f48616a3981eadd314c952b9b83e2398d469 Mon Sep 17 00:00:00 2001
From: Nigel Stewart <nigels.com@gmail.com>
Date: Fri, 8 Oct 2010 09:53:15 -0500
Subject: [PATCH 3/3] Enhancement for WikiNews recipe: fetch cover from
 Wikimedia

---
 resources/recipes/wikinews_en.recipe | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/resources/recipes/wikinews_en.recipe b/resources/recipes/wikinews_en.recipe
index cf83793702..538ab241c2 100644
--- a/resources/recipes/wikinews_en.recipe
+++ b/resources/recipes/wikinews_en.recipe
@@ -55,6 +55,9 @@ class WikiNews(BasicNewsRecipe):
         rest, sep, article_id  = url.rpartition('/')
         return 'http://en.wikinews.org/w/index.php?title=' + article_id + '&printable=yes'
 
+    def get_cover_url(self):
+        return 'http://upload.wikimedia.org/wikipedia/commons/b/bd/Wikinews-logo-en.png'  # static cover: Wikinews logo file on Wikimedia Commons
+
     def preprocess_html(self, soup):
         mtag = '<meta http-equiv="Content-Language" content="en"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
         soup.head.insert(0,mtag)