#!/usr/bin/env python

# ebook-convert.exe c:\billorielly.recipe c:\test -vv

from calibre.web.feeds.recipes import BasicNewsRecipe


class BillOReilly(BasicNewsRecipe):
    cover_url = 'http://images.billoreilly.com/images/headers/borbanner.jpg'
    title = u"Bill O'Reilly"
    __author__ = 'Rob Lammert - rob.lammert[at]gmail.com'
    description = u"Articles from Bill O'Reilly's website and his Fox News website"
    language = 'en'

    oldest_article = 7.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'utf8'
    no_stylesheets = True
    remove_javascript = True
    # use_embedded_content = False

    # feeds = [
    #     ('Talking Points Memo', u'http://www.foxnews.com/xmlfeed/column/0,5184,19,00.rss'),
    #     ('No Spin News', u'http://www.billoreilly.com/blog?rss=true&size=50&useBlurbs=true&categoryID=7')
    # ]

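    # parse_index builds the index by hand instead of using RSS feeds: it
    # returns a list of (section title, article list) tuples, one section for
    # the TV show archive and one for the newspaper columns.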
    def parse_index(self):
        feeds = []

        articles_shows = self.bo_parse_shows('http://www.billoreilly.com/show?action=tvShowArchive')
        articles_columns = self.bo_parse_columns('http://www.billoreilly.com/columns')

        if articles_shows:
            feeds.append(("O'Reilly Factor", articles_shows))

        if articles_columns:
            feeds.append(("Newspaper Column", articles_columns))

        return feeds

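    # Collect up to six entries from the TV show archive page: each tag with
    # class 'showLinks' is treated as an article link, and relative URLs are
    # rewritten to point at the printer-friendly view.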
    def bo_parse_shows(self, url):
        soup = self.index_to_soup(url)
        links = soup.find(attrs={'class': 'showLinks'})

        current_articles = []
        counter = 0
        for lnk in links.findAllNext(attrs={'class': ['showLinks']}):
            if counter <= 5:
                title = self.tag_to_string(lnk)
                url = lnk.get('href', False)

                if not url or not title:
                    continue

                if url.startswith('/'):
                    url = 'http://www.billoreilly.com' + url + '&dest=/pg/jsp/community/tvshowprint.jsp'

                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                counter += 1
        return current_articles

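    # Collect up to six entries from the columns page: only 'bold' tags whose
    # class is 'defaultLinks' are treated as article links, and relative URLs
    # get the printer-friendly query parameter appended.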
    def bo_parse_columns(self, url):
        soup = self.index_to_soup(url)
        links = soup.find(attrs={'id': 'bold'})

        current_articles = []
        counter = 0
        for lnk in links.findAllNext(attrs={'id': ['bold']}):
            test = lnk.get('class', False)
            if counter <= 5 and test == 'defaultLinks':
                title = self.tag_to_string(lnk)
                url = lnk.get('href', False)

                if not url or not title:
                    continue

                if url.startswith('/'):
                    url = 'http://www.billoreilly.com' + url + '&printerFriendly=true'

                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                counter += 1
        return current_articles