calibre/recipes/billorielly.recipe

#!/usr/bin/env python

# ebook-convert.exe c:\billorielly.recipe c:\test -vv

from calibre.web.feeds.recipes import BasicNewsRecipe


class BillOReilly(BasicNewsRecipe):
    """Recipe that builds up to two feeds from billoreilly.com index pages.

    ``parse_index`` scrapes the TV show archive page and the newspaper
    column listing page and returns whichever of the two produced articles.
    """

    cover_url = 'http://images.billoreilly.com/images/headers/borbanner.jpg'
    title = u"Bill O'Reilly"
    __author__ = 'Rob Lammert - rob.lammert[at]gmail.com'
    description = u"Articles from Bill O'Reilly's website and his Fox New's website"
    language = 'en'
    oldest_article = 7.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'utf8'
    no_stylesheets = True
    remove_javascript = True

    # Prefix used to turn root-relative links from the index pages into
    # absolute URLs.
    bo_site_root = 'http://www.billoreilly.com'
    # Maximum number of articles collected from each index page.
    bo_max_articles = 6

    def parse_index(self):
        """Return the list of ``(feed title, articles)`` tuples.

        A feed is only included when its index page yielded at least one
        article.
        """
        feeds = []

        articles_shows = self.bo_parse_shows(
            'http://www.billoreilly.com/show?action=tvShowArchive')
        articles_columns = self.bo_parse_columns(
            'http://www.billoreilly.com/columns')

        if articles_shows:
            feeds.append(("O'Reilly Factor", articles_shows))

        if articles_columns:
            feeds.append(("Newspaper Column", articles_columns))

        return feeds

    @staticmethod
    def _bo_has_class(tag, name):
        """Return True if ``name`` is one of the CSS classes set on ``tag``."""
        classes = tag.get('class') or []
        # Depending on the BeautifulSoup version the ``class`` attribute is
        # either a plain string or a list of class names; normalise to a list
        # so the membership test works for both.
        if hasattr(classes, 'split'):
            classes = classes.split()
        return name in classes

    def _bo_collect_articles(self, first_link, attrs, url_suffix,
                             required_class=None):
        """Collect article dicts from the links that follow ``first_link``.

        :param first_link: Element returned by ``soup.find`` for the index
            page, or ``None`` when nothing matched.
        :param attrs: Attribute filter passed to ``findAllNext``.
        :param url_suffix: Query-string suffix appended to root-relative
            links (selects the printer-friendly page).
        :param required_class: When given, only links carrying this CSS
            class are kept.
        :return: List of dicts with ``title``, ``url``, ``description`` and
            ``date`` keys; at most ``bo_max_articles`` entries.
        """
        if first_link is None:
            # The page layout changed or the page was empty: skip this feed
            # instead of failing the whole recipe with an AttributeError.
            self.log('\t\tNo matching links found on index page')
            return []

        articles = []
        # NOTE(review): findAllNext yields only elements *after*
        # ``first_link``, so the first match itself is never included.
        # This mirrors the original behaviour -- confirm it is intended.
        for lnk in first_link.findAllNext(attrs=attrs):
            if len(articles) >= self.bo_max_articles:
                break  # cap reached; no need to scan the remaining links

            if required_class is not None and \
                    not self._bo_has_class(lnk, required_class):
                continue

            title = self.tag_to_string(lnk)
            href = lnk.get('href', False)

            if not href or not title:
                continue

            if href.startswith('/'):
                href = self.bo_site_root + href + url_suffix

            self.log('\t\tFound article:', title)
            self.log('\t\t\t', href)
            articles.append(
                {'title': title, 'url': href, 'description': '', 'date': ''})
        return articles

    def bo_parse_shows(self, url):
        """Scrape the show archive page at ``url`` into a list of articles.

        Links are the elements with class ``showLinks``; root-relative links
        are rewritten to point at the print view of the show page.
        """
        soup = self.index_to_soup(url)
        first_link = soup.find(attrs={'class': 'showLinks'})
        return self._bo_collect_articles(
            first_link,
            {'class': ['showLinks']},
            '&dest=/pg/jsp/community/tvshowprint.jsp')

    def bo_parse_columns(self, url):
        """Scrape the column listing page at ``url`` into a list of articles.

        Links are the elements with id ``bold`` that also carry the
        ``defaultLinks`` class; root-relative links get the
        ``printerFriendly`` query parameter appended.
        """
        soup = self.index_to_soup(url)
        first_link = soup.find(attrs={'id': 'bold'})
        return self._bo_collect_articles(
            first_link,
            {'id': ['bold']},
            '&printerFriendly=true',
            required_class='defaultLinks')