# calibre/recipes/the_age.recipe

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
theage.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re


class TheAge(BasicNewsRecipe):
    '''
    News recipe for The Age (theage.com.au).

    The article index is scraped from the site's text edition at ``/text/``:
    each ``<h3>`` heading opens a section and every link that follows it is
    recorded as an article of that section.
    '''

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0
    # Tag specs listed for removal: tables, script/style blocks and the
    # anchors whose href is exactly '/' or '/text/'.
    remove_tags = [dict(name=['table', 'script', 'noscript', 'style']), dict(
        name='a', attrs={'href': '/'}), dict(name='a', attrs={'href': '/text/'})]

    # Prefix used to turn site-relative hrefs into absolute urls.
    _site_url = 'http://www.theage.com.au'

    # Sections emitted first, in this order, when they exist on the index
    # page; any other section found is appended after them.
    _feed_order = ('National', 'World', 'Opinion',
                   'Columns', 'Business', 'Sport', 'Entertainment')

    # Link to the front page PDF. Matched from the start of the href only
    # (no end anchor), as before; the dots are escaped so they match a
    # literal '.' rather than any character.
    _cover_re = re.compile(
        r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf')

    # What is left of the text-mode navigation bar once its links have been
    # removed: brackets containing only '|' separators and whitespace.
    _nav_residue_re = re.compile(
        r'((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$')

    _copyright_notice = 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.'

    def get_browser(self):
        '''
        Return the base class browser with refresh handling switched off.
        '''
        br = BasicNewsRecipe.get_browser(self)
        # NOTE(review): presumably stops a refresh directive on the text
        # edition from redirecting the fetch - confirm.
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Build the list of feeds from the text edition index page.

        :return: a list of ``(section_title, articles)`` tuples, where
                 ``articles`` is a list of dicts with the keys ``title``,
                 ``url``, ``date``, ``description`` and ``content``. The
                 sections named in ``_feed_order`` come first, followed by
                 every other section found on the page.
        '''
        soup = BeautifulSoup(self.browser.open(
            self._site_url + '/text/').read())

        # The same date string is attached to every article.
        today = strftime('%a, %d %b')

        section = None
        sections = {}

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                # setdefault: a heading that shows up twice must not throw
                # away the articles already collected under it.
                sections.setdefault(section, [])
                continue

            # Links that appear before the first heading belong to no
            # section, and anchors without an href are not articles.
            if not section or not tag.get('href'):
                continue

            url = tag['href'].strip()
            if url.startswith('/'):
                url = self._site_url + url

            # Skip the home link: <a href="/">TheAge</a>. Once made absolute
            # it carries a trailing slash, so compare without it.
            if url.rstrip('/') == self._site_url:
                continue

            sections[section].append({
                'title': self.tag_to_string(tag),
                'url': url,
                'date': today,
                'description': '',
                'content': '',
            })

        # Preferred sections first, in the specified order, if available.
        # Missing ones are simply skipped instead of raising KeyError.
        feeds = [(name, sections[name])
                 for name in self._feed_order if name in sections]

        # Append what is left over...
        feeds.extend((name, articles) for name, articles in sections.items()
                     if name not in self._feed_order)

        return feeds

    def get_cover_url(self):
        '''
        Look for the front page PDF link on the "today's paper" page.

        :return: the href of the first matching link, or ``None`` when the
                 page contains no such link.
        '''
        soup = BeautifulSoup(self.browser.open(
            self._site_url + '/todays-paper').read())

        for link in soup.findAll('a'):
            # .get() rather than ['href']: anchors without an href must be
            # skipped, not abort the search with a KeyError.
            href = link.get('href')
            if href and self._cover_re.match(href):
                return href

        return None

    def preprocess_html(self, soup):
        '''
        Tidy up the paragraphs of a downloaded article.

        Paragraphs holding only the remains of the text-mode navigation bar
        are removed, and the copyright notice is set in a small font.

        :param soup: parsed article page; modified in place.
        :return: the same ``soup`` object.
        '''
        for p in soup.findAll('p'):

            # Collapse the paragraph by joining the non-tag contents
            text_nodes = [i for i in p.contents if isinstance(i, type(u''))]
            if not text_nodes:
                continue
            text = ''.join(text_nodes)

            if self._nav_residue_re.match(text):
                # Filter out what's left of the text-mode navigation stuff
                p.extract()
            elif text == self._copyright_notice:
                # Shrink the fine print font
                p['style'] = 'font-size:small'

        return soup