diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index 9dc11059c4..4a9b9e54c3 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,45 +1,73 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
+import string, re
import time
+import traceback
+# traceback is used below to dump the call stack for debugging.
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
-# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
-# Newsletters: Talking Points Memos covered by cat12
+import os, urlparse, sys, cStringIO
+from collections import defaultdict
+from functools import partial
+from contextlib import nested, closing
+
+
+from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+
+# To Do: strip ads and graphics; the Current Column lacks a title.
+# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
+# Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+
+# This class derives from BasicNewsRecipe, so it can only override that class's methods.
+# Some of what we need lives in the article handling instead, so more code has to be copied here than would otherwise be necessary.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
- language = 'en'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+ custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
+ title = 'Bill O\'Reilly Premium'
auto_cleanup = True
+ conversion_options = {'linearize_tables': True}
encoding = 'utf8'
- needs_subscription = True
+ language = 'en'
no_stylesheets = True
- oldest_article = 20
+ needs_subscription = True
+ oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
- max_articles_per_feed = 2000
-
+ max_articles_per_feed = 20
+
debugMessages = True
-
+
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
- ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
- ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
- ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
- ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+ # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+ # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+ # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+ # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
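+    # Illustrative reading of a row (using catList[0]): parseGeneric() below calls
+    # soup.findAll(catList[i][2], catList[i][3]) -- here
+    # findAll('a', {'class':['showLinks','homeLinks']}) -- and appends article
+    # dicts to the trailing list.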
-
+
+ feeds = [
+ (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
+ (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+ (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
+ (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
+ (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
+ ]
+    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the Word of the Day feed.
+
+    # Note: Talking Points is broken in the scraping model above; the site moved to a more Ajax-driven design.
+    # It is now fetched via RSS instead.
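+    # A minimal sketch (hypothetical helper, not used by the recipe) of the URL
+    # pattern the feeds above share:
+    #   def category_rss_url(category_id):
+    #       return ('http://www.billoreilly.com/blog?action=blogArchive'
+    #               '&rss=true&categoryID=%d' % category_id)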
+
def get_browser(self):
+ print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe):
br['formPasswordField'] = self.password
br.submit()
return br
-
+
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, baseURL, pageURL, printString):
@@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe):
tag = printText.parent
tagURL = baseURL+tag['href']
return tagURL
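+    # Illustrative use (markup assumed): on a page containing
+    #   <a href="/printBlog?blogID=123">Print this entry</a>
+    # extractPrintURL(baseURL, pageURL, "Print this entry") returns
+    # baseURL+"/printBlog?blogID=123"; with no match it returns pageURL.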
-
+
def stripBadChars(self, inString) :
return inString.replace("\'", "")
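+    # e.g. stripBadChars("Bill's Column") returns "Bills Column".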
-
+
+
def parseGeneric(self, baseURL):
- # Does a generic parsing of the articles. There are six categories (0-5)
+        # Does a generic parsing of the articles. Only two categories (TV Archives and Current Column) remain active here; the rest come from the RSS feeds above.
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
# NoSpin and TV are generic
fullReturn = []
- for i in range(len(self.catList)) :
+ for i in range(len(self.catList)) :
articleList = []
+ print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe):
# Problem: 0-2 create many in an array
# 3-5 create one.
# So no for-div for 3-5
-
- if i < 3 :
+
+ if i == 0 :
+ print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+ print("Next DIV:")
print(div)
- if i == 1:
- a = div.find('a', href=True)
- else :
- a = div
- print(a)
+ a = div
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
- if i < 2 :
- url = self.extractPrintURL(baseURL, url, "Print this entry")
- title = self.tag_to_string(a, use_alt=True).strip()
- elif i == 2 :
- # Daily Briefs
- url = self.extractPrintURL(baseURL, url, "Print this entry")
- title = div.contents[0]
- if self.debugMessages :
- print(title+" @ "+url)
+ url = self.extractPrintURL(baseURL, url, "Print this entry")
+ title = self.tag_to_string(a, use_alt=True).strip()
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
- elif i == 3 : # Stratfor
- a = soup.find('a', self.catList[i][3])
- if a is None :
- continue
- url = baseURL+a['href']
- title = self.tag_to_string(a, use_alt=True).strip()
- # Get Stratfor contents so we can get the real title.
- stratSoup = self.index_to_soup(url)
- title = stratSoup.html.head.title.string
- stratIndex = title.find('Stratfor.com:', 0)
- if (stratIndex > -1) :
- title = title[stratIndex+14:-1]
-                # Look for first blogBody; if it is under 2K, it is used as the article.
+
+
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,16 +191,148 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
+    # It is called by download().
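+    # Illustrative shape of the return value (article values invented):
+    #   [('TV Archives',
+    #     [{'title': 'Show recap', 'url': 'https://...', 'date': pubdate,
+    #       'description': 'None', 'content': ''}]),
+    #    ('Current Column', [...])]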
def parse_index(self):
# Parse the page into Python Soup
+        print("Entering recipe parse_index from:")
+        traceback.print_stack()
baseURL = "https://www.billoreilly.com"
- return self.parseGeneric(baseURL)
-
+ masterList = self.parseGeneric(baseURL)
+ #print(masterList)
+ return masterList
+
def preprocess_html(self, soup):
+ print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
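+    # For example (markup assumed): a redirect page containing
+    #   <meta http-equiv="refresh" content="0;URL=/pg/jsp/somepage.jsp">
+    # yields '/pg/jsp/somepage.jsp' from partition('=')[2] above, which is then
+    # fetched and re-souped.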
+
+ def build_index(self):
+ print("In OReilly build_index()\n\n")
+ feedsRSS = []
+ self.report_progress(0, _('Fetching feeds...'))
+ #try:
+ feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+ max_articles_per_feed=self.max_articles_per_feed,
+ log=self.log)
+ self.report_progress(0, _('Got feeds from index page'))
+ #except NotImplementedError:
+ # feeds = self.parse_feeds()
+ # Now add regular feeds.
+ feedsRSS = self.parse_feeds()
+        print("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+ for articles in feedsRSS:
+ print("articles is type "+articles.__class__.__name__)
+ print("Title:" + articles.title)
+ feeds.append(articles)
+ if not feeds:
+ raise ValueError('No articles found, aborting')
+
+ #feeds = FeedCollection(feeds)
+
+ self.report_progress(0, _('Trying to download cover...'))
+ self.download_cover()
+ self.report_progress(0, _('Generating masthead...'))
+ self.masthead_path = None
+
+ try:
+ murl = self.get_masthead_url()
+ except:
+ self.log.exception('Failed to get masthead url')
+ murl = None
+
+ if murl is not None:
+ # Try downloading the user-supplied masthead_url
+ # Failure sets self.masthead_path to None
+ self.download_masthead(murl)
+ if self.masthead_path is None:
+ self.log.info("Synthesizing mastheadImage")
+ self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+ try:
+ self.default_masthead_image(self.masthead_path)
+ except:
+ self.log.exception('Failed to generate default masthead image')
+ self.masthead_path = None
+
+ if self.test:
+ feeds = feeds[:2]
+ self.has_single_feed = len(feeds) == 1
+
+ index = os.path.join(self.output_dir, 'index.html')
+
+ html = self.feeds2index(feeds)
+ with open(index, 'wb') as fi:
+ fi.write(html)
+
+ self.jobs = []
+
+ if self.reverse_article_order:
+ for feed in feeds:
+ if hasattr(feed, 'reverse'):
+ feed.reverse()
+
+ self.feed_objects = feeds
+ for f, feed in enumerate(feeds):
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ if not os.path.isdir(feed_dir):
+ os.makedirs(feed_dir)
+
+ for a, article in enumerate(feed):
+ if a >= self.max_articles_per_feed:
+ break
+ art_dir = os.path.join(feed_dir, 'article_%d'%a)
+ if not os.path.isdir(art_dir):
+ os.makedirs(art_dir)
+ try:
+ url = self.print_version(article.url)
+ except NotImplementedError:
+ url = article.url
+ except:
+ self.log.exception('Failed to find print version for: '+article.url)
+ url = None
+ if not url:
+ continue
+ func, arg = (self.fetch_embedded_article, article) \
+                    if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
+ else \
+ ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+ else self.fetch_article), url)
+ req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+ {}, (f, a), self.article_downloaded,
+ self.error_in_article_download)
+ req.feed = feed
+ req.article = article
+ req.feed_dir = feed_dir
+ self.jobs.append(req)
+
+
+ self.jobs_done = 0
+ tp = ThreadPool(self.simultaneous_downloads)
+ for req in self.jobs:
+ tp.putRequest(req, block=True, timeout=0)
+
+
+ self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+ while True:
+ try:
+ tp.poll()
+ time.sleep(0.1)
+ except NoResultsPending:
+ break
+ for f, feed in enumerate(feeds):
+ print("Writing feeds for "+feed.title)
+ html = self.feed2index(f,feeds)
+ feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+ with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+ fi.write(html)
+ self.create_opf(feeds)
+ self.report_progress(1, _('Feeds downloaded to %s')%index)
+
+ return index
+
diff --git a/recipes/real_clear.recipe b/recipes/real_clear.recipe
index 19add74fcd..2dfe56d207 100644
--- a/recipes/real_clear.recipe
+++ b/recipes/real_clear.recipe
@@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import string, re
import time
+from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
- debugMessages = False
-
- # Numeric parameter is type, controls whether we look for
+ debugMessages = True
+
+ # Numeric parameter is type, controls whether we look for
feedsets = [
- ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
- ["Science", "http://www.realclearscience.com/index.xml", 0],
+ ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
+ ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+ ["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
- printhints = [
+ phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
+
+ printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
- ["politico.com", "Print", 'a', 'share-print'],
+ ["politico.com", "Print", 'a', 'share-print'],
["nationalreview.com", ">Print<", 'a', ''],
["reason.com", "", 'a', 'printer']
# The following are not supported due to JavaScripting, and would require obfuscated_article to handle
- # forbes,
+ # forbes,
# usatoday - just prints with all current crap anyhow
-
+
]
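+    # Illustrative reading under the named indices: in
+    # ["realclear", "", '', 'printpage'], phUrlSnip ("realclear") must appear in
+    # the article URL, phLinkText is empty, and phHrefSearch ('printpage') is
+    # used as soup.find(href=re.compile('printpage')) in extractPrintURL below.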
-
+ # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+    # The print link isn't obvious, and only the end is needed (the -full append). So maybe try that first?
+ # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+ # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+ # Use the FULL PRINTPAGE URL; it formats it better too!
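+    # A hedged sketch of the assumed mapping (helper name hypothetical):
+    #   def rcpPrintURL(articleURL):
+    #       return ("http://www.realclearpolitics.com/printpage/?url="
+    #               + articleURL.replace(".html", "-full.html"))
+    # Single-page articles (no -full variant) would need the plain URL instead.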
+ #
+ # NYT - try single page...
+ # Need special code - is it one page or several? Which URL?
+ # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+ # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+    # which is at link rel="canonical".
-            if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+            if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phMainSearch]) == 0:
+ # e.g. RealClear
if self.debugMessages == True :
- print("search1")
+ print("Search by href: "+self.printhints[x][self.phHrefSearch])
+ printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+ elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+ if self.debugMessages == True :
+ print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+ print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
+ if self.debugMessages == True:
+ print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
+ # print(soup)
+                    print("end soup\n\n")
continue
+
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
+ print("Check "+printFind['href']+" for base of "+baseURL)
+ if printFind['href'].find("http")!=0 :
+ return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
return br
-
+
def parseRSS(self, index) :
if self.debugMessages == True :
print("\n\nStarting "+self.feedsets[index][0])
@@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe):
pubDateEl = div.find("pubDate")
if pubDateEl is None :
pubDateEl = div.find("pubdate")
- if pubDateEl is None :
+ if pubDateEl is None :
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
@@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe):
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
-
+
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe):
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
# Parse the page into Python Soup
-
+
+ articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -167,4 +200,5 @@ class RealClear(BasicNewsRecipe):
if self.debugMessages == True :
print(ans)
return ans
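+    # Illustrative result (article values invented): ans is a list of
+    # ('feed title', articleList) tuples, e.g.
+    #   [('Politics', [{'title': ..., 'url': ..., 'date': ..., 'description': ..., 'content': ''}]), ...]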
+
|