#!/usr/bin/env python
# ebook-convert.exe c:\billorielly.recipe c:\test -vv

from calibre.web.feeds.recipes import BasicNewsRecipe


class BillOReilly(BasicNewsRecipe):
    """Recipe that builds two feeds from billoreilly.com index pages.

    ``parse_index`` fetches the TV show archive page and the columns page,
    extracts article links from each, and returns them as the
    "O'Reilly Factor" and "Newspaper Column" feeds respectively.
    """

    cover_url = 'http://images.billoreilly.com/images/headers/borbanner.jpg'
    title = u"Bill O'Reilly"
    __author__ = 'Rob Lammert - rob.lammert[at]gmail.com'
    description = u"Articles from Bill O'Reilly's website and his Fox New's website"
    language = 'en'
    oldest_article = 7.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'utf8'
    no_stylesheets = True
    remove_javascript = True

    # Prefix used to turn site-relative hrefs into absolute URLs.
    BO_SITE = 'http://www.billoreilly.com'
    # Links collected per index page (the original loops used ``counter <= 5``
    # with a zero-based counter, i.e. six articles).
    BO_MAX_ARTICLES = 6

    def parse_index(self):
        """Return the list of ``(feed title, articles)`` tuples.

        A feed is only included when at least one article was found for it.
        """
        feeds = []
        articles_shows = self.bo_parse_shows(
            'http://www.billoreilly.com/show?action=tvShowArchive')
        articles_columns = self.bo_parse_columns(
            'http://www.billoreilly.com/columns')
        if articles_shows:
            feeds.append(("O'Reilly Factor", articles_shows))
        if articles_columns:
            feeds.append(("Newspaper Column", articles_columns))
        return feeds

    def bo_parse_shows(self, url):
        """Collect show articles from the index page at ``url``.

        Links are the elements with class ``showLinks``; site-relative hrefs
        are made absolute and given the print-view ``dest`` suffix.

        Returns a list of article dicts (``title``, ``url``, ``description``,
        ``date``); empty when the page has no matching elements.
        """
        return self._bo_collect_articles(
            url,
            first_attrs={'class': 'showLinks'},
            next_attrs={'class': ['showLinks']},
            url_suffix='&dest=/pg/jsp/community/tvshowprint.jsp',
        )

    def bo_parse_columns(self, url):
        """Collect column articles from the index page at ``url``.

        Links are the elements with id ``bold`` that also carry the class
        ``defaultLinks``; site-relative hrefs are made absolute and given the
        ``printerFriendly`` suffix.

        Returns a list of article dicts (``title``, ``url``, ``description``,
        ``date``); empty when the page has no matching elements.
        """
        return self._bo_collect_articles(
            url,
            first_attrs={'id': 'bold'},
            next_attrs={'id': ['bold']},
            # The original suffix ended with a stray '"' that became part of
            # every generated URL.
            url_suffix='&printerFriendly=true',
            required_class='defaultLinks',
        )

    def _bo_collect_articles(self, index_url, first_attrs, next_attrs,
                             url_suffix, required_class=None):
        """Shared link scraper behind ``bo_parse_shows``/``bo_parse_columns``.

        Finds the first element matching ``first_attrs`` and then walks the
        elements after it that match ``next_attrs`` (so the first match itself
        is not turned into an article, as in the original code). At most
        ``BO_MAX_ARTICLES`` articles are returned. When ``required_class`` is
        given, elements without that class are skipped.
        """
        soup = self.index_to_soup(index_url)
        anchor = soup.find(attrs=first_attrs)
        if anchor is None:
            # Page layout changed or the request returned unexpected markup;
            # report it and let parse_index drop this feed instead of crashing.
            self.log('\t\tNo article links found at', index_url)
            return []

        articles = []
        for lnk in anchor.findAllNext(attrs=next_attrs):
            if len(articles) >= self.BO_MAX_ARTICLES:
                break  # limit reached; no need to scan the rest of the page
            if required_class is not None and not self._bo_has_class(
                    lnk, required_class):
                continue
            title = self.tag_to_string(lnk)
            href = lnk.get('href', False)
            if not href or not title:
                continue
            if href.startswith('/'):
                href = self.BO_SITE + href + url_suffix
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', href)
            articles.append(
                {'title': title, 'url': href, 'description': '', 'date': ''})
        return articles

    @staticmethod
    def _bo_has_class(tag, name):
        """Return True when ``tag`` carries the CSS class ``name``.

        Depending on the parser version, the ``class`` attribute may come back
        as a single string or as a list of class names; both are accepted.
        """
        classes = tag.get('class', None)
        if not classes:
            return False
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        return name in classes