Enhancements for The Age recipe: strip out useless links, grab the PDF front page, editorial and letters, and sort the feeds explicitly

This commit is contained in:
Nigel Stewart 2010-10-08 09:51:19 -05:00
parent 257a82f06b
commit 23b01a98e0

View File

@ -9,15 +9,19 @@ theage.com.au
import re

from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class TheAge(BasicNewsRecipe):
    """Calibre news recipe for The Age (Melbourne, Australia).

    Scrapes the text-mode site for the article index, pulls the PDF
    front page as the cover, and strips navigation leftovers.
    """

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0

    # Drop layout tables/scripts and the text-mode navigation anchors
    # (<a href="/"> and <a href="/text/">) that are useless in an e-book.
    remove_tags = [
        dict(name=['table', 'script', 'noscript', 'style']),
        dict(name='a', attrs={'href': '/'}),
        dict(name='a', attrs={'href': '/text/'}),
    ]
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
@ -28,30 +32,81 @@ class TheAge(BasicNewsRecipe):
soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read()) soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())
feeds, articles = [], [] section = None
feed = None sections = {}
for tag in soup.findAll(['h3', 'a']): for tag in soup.findAll(['h3', 'a']):
if tag.name == 'h3': if tag.name == 'h3':
if articles: section = self.tag_to_string(tag)
feeds.append((feed, articles)) sections[section] = []
articles = []
feed = self.tag_to_string(tag) # Make sure to skip: <a href="/">TheAge</a>
elif feed is not None and tag.has_key('href') and tag['href'].strip():
elif section and tag.has_key('href') and len(tag['href'].strip())>1:
url = tag['href'].strip() url = tag['href'].strip()
if url.startswith('/'): if url.startswith('/'):
url = 'http://www.theage.com.au' + url url = 'http://www.theage.com.au' + url
title = self.tag_to_string(tag) title = self.tag_to_string(tag)
articles.append({ sections[section].append({
'title': title, 'title': title,
'url' : url, 'url' : url,
'date' : strftime('%a, %d %b'), 'date' : strftime('%a, %d %b'),
'description' : '', 'description' : '',
'content' : '', 'content' : '',
}) })
feeds = []
# Insert feeds in specified order, if available
feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment' ]
for i in feedSort:
if i in sections:
feeds.append((i,sections[i]))
# Done with the sorted feeds
for i in feedSort:
del sections[i]
# Append what is left over...
for i in sections:
feeds.append((i,sections[i]))
return feeds return feeds
def get_cover_url(self):
soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
for i in soup.findAll('a'):
href = i['href']
if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf',href):
return href
return None
def preprocess_html(self,soup):
for p in soup.findAll('p'):
# Collapse the paragraph by joining the non-tag contents
contents = [i for i in p.contents if isinstance(i,unicode)]
if len(contents):
contents = ''.join(contents)
# Filter out what's left of the text-mode navigation stuff
if re.match('((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$',contents):
p.extract()
continue
# Shrink the fine print font
if contents=='This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
p['style'] = 'font-size:small'
continue
return soup