# calibre/recipes/oreilly_premium.recipe
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
# BeautifulSoup makes re-parsing fetched HTML easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    description = "Retrieves premium and newsletter content from BillOReilly.com. Requires a Bill O'Reilly Premium Membership."
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 2000
    language = 'en'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br
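
    # A minimal sketch of how a successful sign-in could be verified before
    # parsing begins. The 'Sign Out' marker is an assumption about the member
    # pages, not something the original recipe checks for.
    def isLoggedIn(self, br):
        # Re-fetch the sign-in page; a logged-in session is assumed to show a
        # sign-out link instead of the login form (hypothetical marker).
        raw = br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp').read()
        return 'Sign Out' in raw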

    def extractPrintURL(self, baseURL, pageURL, printString):
        # Follow the "print this" link on pageURL if one exists; otherwise
        # fall back to the original page URL.
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to load "+pageURL+" while looking for print string "+printString)
        if printText:
            tag = printText.parent
            tagURL = baseURL+tag['href']
        return tagURL
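
    # For example, a call like
    #   self.extractPrintURL(baseURL, url, "Print this entry")
    # returns the print-friendly URL when the page carries a "Print this
    # entry" anchor, and falls back to the page URL otherwise.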

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    # Returns a qualifying article list.
    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
            a = div.find('a', href=True)
            if not a:
                continue
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', a['href'])
            # Prefer the print version when available.
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            if debugMessages:
                print("No Spin Archive Title: "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList
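
    # Every parser in this recipe builds the same article dict; a helper like
    # this one (a sketch, not used by the original code) captures the shared
    # pattern.
    def makeArticle(self, title, url, description='None'):
        # The site lists everything as "today", so a formatted timestamp
        # stands in for a real publication date.
        return dict(title=title, url=url, date=time.strftime('%a, %d %b'),
                    description=description, content='')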

    def parseTVArchives(self, baseURL, soupURL, debugMessages):
        # The TV Archives page has some Ajax, so look only at the static content.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseTVArchives")
        for a in soup.findAll('a', {'class':['showLinks', 'homeLinks']}):
            url = baseURL
            url += a['href']
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            title = self.stripBadChars(title)
            if debugMessages:
                print("TV Archive "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = a.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseTVArchives")
        return articleList

    # Get the Daily Briefing archives.
    def parseDailyBriefs(self, baseURL, soupURL, debugMessages):
        if debugMessages:
            print("Starting daily briefs")
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', div['href'])
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = div.contents[0]
            if debugMessages:
                print("Daily Brief - title: "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving daily briefs")
        return articleList

    # Get the weekly Stratfor intelligence report.
    def parseStratfor(self, baseURL, soupURL, debugMessages):
        # http://www.billoreilly.com/blog?categoryID=5
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseStratfor")
        a = soup.find('a', {'class':['blogLinks']})
        url = baseURL
        url += a['href']
        title = self.tag_to_string(a, use_alt=True).strip()
        if debugMessages:
            print("url: "+url)
            print("title: "+title)
        # Fetch the Stratfor page itself so we can get the real title.
        stratSoup = self.index_to_soup(url)
        title = stratSoup.html.head.title.string
        stratIndex = title.find('Stratfor.com:', 0)
        if stratIndex > -1:
            # Drop the "Stratfor.com: " prefix (13 characters plus the space).
            title = title[stratIndex+14:-1]
        # Look for the first <td class="blogBody">.
        stratBody = stratSoup.find('td', {'class':['blogBody']})
        if debugMessages:
            print("Strat content title: "+title)
            print("Strat body: "+stratBody.contents[0])
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseStratfor")
        return articleList

    def parseTalkingPoints(self, baseURL, soupURL, debugMessages):
        # Look for blogDate: that has the date. The next blogBody has the
        # title, and an anchor with class "homeBlogReadMore bold" has the URL.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("Starting Talking Points")
        topDate = soup.find("td", "blogBody")
        if not topDate:
            print("Failed to find date in Talking Points")
            return articleList
        # This page has the contents in double-wrapped tables!
        myTable = topDate.findParents('table')[0]
        upOneTable = myTable.findParents('table')[0]
        upTwo = upOneTable.findParents('table')[0]
        # Now navigate the rows of upTwo.
        if debugMessages:
            print("Entering rows")
        for rows in upTwo.findChildren("tr", recursive=False):
            # Inside the top-level table, each row is one article. The middle
            # table is only for formatting, so skip straight to the row's first <tr>.
            articleTable = rows.find("tr")
            blogDate = articleTable.find("a", "blogDate").contents[0]
            # The title is in the second blogBody cell.
            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', blogURL)
            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
            if debugMessages:
                print("Talking Points Memo title "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Exiting parseTalkingPoints\n")
        return articleList
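
    # The nested layout walked above, as inferred from the selectors used (a
    # sketch, not verified against the live page):
    #   <table>                                    upTwo
    #     <tr>                                     one article per row
    #       <table><tr>                            formatting wrapper
    #         <a class="blogDate">                 date
    #         <td class="blogBody">                title (second occurrence)
    #         <a class="homeBlogReadMore bold">    article URL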

    def parseCurrentColumn(self, baseURL, soupURL, debugMessages):
        # Only needed to get the column title; otherwise the page is fine
        # as-is, and there is only one column.
        articleList = []
        soup = self.index_to_soup(soupURL)
        titleSpan = soup.find('span', {'class':['defaultHeader']})
        title = titleSpan.contents[0]
        # Use the print URL, since one is available.
        url = soupURL
        printURL = self.extractPrintURL(baseURL, soupURL, "Print This Article")
        if printURL:
            if debugMessages:
                print("Found print URL")
            url = printURL
        if debugMessages:
            print("url: "+url)
            print("title: "+title)
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseCurrentColumn")
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of article dicts), where each dict is:
    # {
    # 'title'       : article title,
    # 'url'         : URL of the print version,
    # 'date'        : the publication date of the article as a string,
    # 'description' : a summary of the article,
    # 'content'     : the full article (can be an empty string; used by FullContentProfile)
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
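    # For example, a single-feed result would look like this (values illustrative):
    #   [('No Spin News', [{'title': 'Some entry', 'url': printURL,
    #                       'date': 'Sun, 08 Jan', 'description': '...', 'content': ''}])]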
    def parse_index(self):
        # Parse each index page into a soup and collect the articles.
        debugMessages = True
        baseURL = "https://www.billoreilly.com"

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=False)).strip()

        articleList = []
        ans = []
        showList = self.parseTVArchives(baseURL, 'https://www.billoreilly.com/show?action=tvShowArchive', debugMessages)
        articleList = self.parseNoSpinArchives(baseURL, 'https://www.billoreilly.com/blog?categoryID=7', debugMessages)
        stratList = self.parseStratfor(baseURL, 'http://www.billoreilly.com/blog?categoryID=5', debugMessages)
        dailyBriefs = self.parseDailyBriefs(baseURL, 'http://www.billoreilly.com/blog?categoryID=11', debugMessages)
        talkingPoints = self.parseTalkingPoints(baseURL, 'https://www.billoreilly.com/blog?categoryID=12', debugMessages)
        currentColumn = self.parseCurrentColumn(baseURL, 'https://www.billoreilly.com/currentcolumn', debugMessages)
        # Return a list of ('feed title', article list) tuples. Each feed
        # works when enabled individually (Talking Points and No Spin News,
        # TV Shows and Stratfor Weekly, Daily Briefing and Current Column);
        # no idea why only the first two show up in the TOC now.
        ans = [("Talking Points Memos", talkingPoints), ("No Spin News", articleList), ("TV Shows", showList),
               ("Stratfor Weekly", stratList), ("Daily Briefing", dailyBriefs), ("Current Column", currentColumn)]
        if debugMessages:
            print(ans)
        return ans

    def preprocess_html(self, soup):
        # Some pages come back as a client-side redirect; follow the meta
        # refresh target and re-parse the fetched page.
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
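
# A redirecting page carries a tag like the following (URL illustrative);
# partition('=')[2] takes everything after the first '=', i.e. the
# site-relative target that preprocess_html re-fetches:
#
#   <meta http-equiv="refresh" content="0;URL=/pg/jsp/member/memberhome.jsp">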