Updated OReilly Premium and Real Clear

This commit is contained in:
Kovid Goyal 2012-04-02 09:00:25 +05:30
parent ee108790db
commit bb443d01f1
2 changed files with 284 additions and 109 deletions

@@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one is added?
import string, re
import time
import traceback
# the imports above are for debugging via stack traces
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# This is derived from BasicNewsRecipe, so it can only overload those methods.
# Some of what we need is otherwise only in the article itself, so we have more copying to do than we otherwise would.
class OReillyPremium(BasicNewsRecipe):
    __author__ = 'TMcN'
    description = 'Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
    title = 'Bill O\'Reilly Premium'
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    needs_subscription = True
    language = 'en'
    no_stylesheets = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages
    recursions = 0
    max_articles_per_feed = 20
    debugMessages = True
    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList; see the illustrative note after the list.
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
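    # Illustrative note: for the "TV Archives" row above, parseGeneric() below ends up doing
    #   soup.findAll('a', {'class':['showLinks','homeLinks']})
    # i.e. row[2] is the tag to find, row[3] the attrs to match, and row[4] collects the articles.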
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is Word of the Day.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +94,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
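        # Strips apostrophes from the string; used to clean up article titles.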
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@@ -73,6 +102,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -81,14 +111,12 @@
            # Categories 0-2 list several articles per page; 3-5 create just one,
            # so there is no for-div loop for 3-5.
if i < 3 :
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
                topDate = soup.find("td", "blogBody")
                if not topDate :
                    print("Failed to find date in Talking Points")
                    continue
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
            if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
                if summary:
                    print("At Summary")
                    print(summary)
                if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            self.catList[i][4] = articleList  # articleList is the last slot (index 4); index 3 holds the findAll attrs
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,12 +191,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
    # This is used instead of BasicNewsRecipe.parse_feeds(); it is called by download().
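    # A sketch of the return value, with hypothetical article values:
    # [
    #   ('TV Archives', [
    #       {'title': 'Show recap', 'url': 'https://www.billoreilly.com/...',
    #        'date': 'Mon, 02 Apr', 'description': 'None', 'content': ''},
    #   ]),
    #   ('Current Column', [...]),
    # ]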
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +211,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
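        # Overridden from BasicNewsRecipe so the articles scraped by parse_index()
        # can be merged with the regular RSS feeds before downloading.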
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index

@@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@@ -20,11 +22,12 @@ class RealClear(BasicNewsRecipe):
    # Don't recurse into linked pages
recursions = 0
max_articles_per_feed = 400
    debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
    # First column is the URL snippet. Then the string to search for as link text, and the attributes to look for above it. Start with attributes and drill down; see the sketch after the list.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@
# usatoday - just prints with all current crap anyhow
]
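    # Illustrative sketch of how a row drives extractPrintURL() below: the "realclear"
    # row has an empty phLinkText and phHrefSearch == 'printpage', so we search by href:
    #   soup.find(href=re.compile('printpage'))
    # while the politico.com row searches by tag, attrs and link text:
    #   soup.find('a', attrs='share-print', text='Print')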
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
    # The print link isn't obvious, and only the end is needed (the -full append). So maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
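    # A sketch for building the RCP print URL, assuming the URL scheme above still holds:
    #   printURL = "http://www.realclearpolitics.com/printpage/?url=" + \
    #       re.sub(r'\.html$', '-full.html', pageURL)
    # (single-page articles would skip the '-full' insertion).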
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
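    # A sketch for the NYT single-page form, based on the URLs above (hypothetical printURL):
    #   if "nytimes.com" in pageURL:
    #       sep = '&' if '?' in pageURL else '?'
    #       printURL = pageURL + sep + 'pagewanted=all'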
    # Returns the best-guess print URL.
    # The pageURL argument is returned unchanged if nothing better is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
        hintsCount = len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("search1")
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
            if not isinstance(printFind, NavigableString):
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@
print(ans)
return ans