Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Commit bb443d01f1 (parent ee108790db): Updated OReilly Premium and Real Clear
OReilly Premium recipe:

@@ -1,45 +1,73 @@
-# Talking Points is not grabbing everything.
-# The look is right, but only the last one added?
-import re
+import string, re
 import time
+import traceback
+# above for debugging via stack
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-# strip ads and graphics
-# Current Column lacks a title.
-# Talking Points Memo - shorten title - Remove year and Bill's name
-# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
-# Newsletters: Talking Points Memos covered by cat12

+import os, time, traceback, re, urlparse, sys, cStringIO
+from collections import defaultdict
+from functools import partial
+from contextlib import nested, closing
+
+
+from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
+from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
+
+
+# To Do: strip ads and graphics, Current Column lacks a title.
+# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
+# Newsletters: Talking Points Memos covered by cat12
+# ./ebook-convert --username xxx --password xxx
+
+# this is derived from BasicNewsRecipe, so it can only overload those.
+# Soome of what we need is otherwise in article, so we have more copy to do than otherwise.
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
     __author__ = 'TMcN'
-    language = 'en'
     description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
     cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+    custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
+    title = 'Bill O\'Reilly Premium'
     auto_cleanup = True
+    conversion_options = {'linearize_tables': True}
     encoding = 'utf8'
-    needs_subscription = True
+    language = 'en'
     no_stylesheets = True
-    oldest_article = 20
+    needs_subscription = True
+    oldest_article = 31
     remove_javascript = True
     remove_tags = [dict(name='img', attrs={})]
     # Don't go down
     recursions = 0
-    max_articles_per_feed = 2000
+    max_articles_per_feed = 20

     debugMessages = True

     # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
     catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
-                ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+                # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
-                ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+                # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
-                ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+                # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
-                ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+                # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
                 ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
                 ]

+    feeds = [
+        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
+        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
+        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
+        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
+        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
+    ]
+    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
+
+    # Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
+    # Now using RSS

     def get_browser(self):
+        print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
             br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -48,7 +76,7 @@ class OReillyPremium(BasicNewsRecipe):
             br['formPasswordField'] = self.password
             br.submit()
         return br

     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, baseURL, pageURL, printString):
@@ -62,17 +90,19 @@ class OReillyPremium(BasicNewsRecipe):
             tag = printText.parent
             tagURL = baseURL+tag['href']
         return tagURL

     def stripBadChars(self, inString) :
         return inString.replace("\'", "")


     def parseGeneric(self, baseURL):
         # Does a generic parsing of the articles. There are six categories (0-5)
         # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
         # NoSpin and TV are generic
         fullReturn = []
         for i in range(len(self.catList)) :
             articleList = []
+            print("In "+self.catList[i][0]+", index: "+ str(i))
             soup = self.index_to_soup(self.catList[i][1])
             # Set defaults
             description = 'None'
@@ -80,15 +110,13 @@ class OReillyPremium(BasicNewsRecipe):
             # Problem: 0-2 create many in an array
             # 3-5 create one.
             # So no for-div for 3-5

-            if i < 3 :
+            if i == 0 :
+                print("Starting TV Archives")
                 for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
+                    print("Next DIV:")
                     print(div)
-                    if i == 1:
-                        a = div.find('a', href=True)
-                    else :
-                        a = div
-                    print(a)
+                    a = div
                     summary = div.find(True, attrs={'class':'summary'})
                     if summary:
                         description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
                         continue
                     # url = baseURL+re.sub(r'\?.*', '', a['href'])
                     url = baseURL+a['href']
-                    if i < 2 :
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = self.tag_to_string(a, use_alt=True).strip()
-                    elif i == 2 :
-                        # Daily Briefs
-                        url = self.extractPrintURL(baseURL, url, "Print this entry")
-                        title = div.contents[0]
-                    if self.debugMessages :
-                        print(title+" @ "+url)
+                    url = self.extractPrintURL(baseURL, url, "Print this entry")
+                    title = self.tag_to_string(a, use_alt=True).strip()
                     articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))

-            elif i == 3 : # Stratfor
-                a = soup.find('a', self.catList[i][3])
-                if a is None :
-                    continue
-                url = baseURL+a['href']
-                title = self.tag_to_string(a, use_alt=True).strip()
-                # Get Stratfor contents so we can get the real title.
-                stratSoup = self.index_to_soup(url)
-                title = stratSoup.html.head.title.string
-                stratIndex = title.find('Stratfor.com:', 0)
-                if (stratIndex > -1) :
-                    title = title[stratIndex+14:-1]
-                # Look for first blogBody <td class="blogBody"
-                # Changed 12 Jan 2012 - new page format
-                #stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
-                #stratBody = stratSoup.find('td', {'class':['blogBody']})
-            elif i == 4 : # Talking Points
-                topDate = soup.find("td", "blogBody")
-                if not topDate :
-                    print("Failed to find date in Talking Points")
-                # This page has the contents in double-wrapped tables!
-                myTable = topDate.findParents('table')[0]
-                if myTable is not None:
-                    upOneTable = myTable.findParents('table')[0]
-                    if upOneTable is not None:
-                        upTwo = upOneTable.findParents('table')[0]
-                        if upTwo is None:
-                            continue
-                        # Now navigate rows of upTwo
-                        if self.debugMessages :
-                            print("Entering rows")
-                        for rows in upTwo.findChildren("tr", recursive=False):
-                            # Inside top level table, each row is an article
-                            rowTable = rows.find("table")
-                            articleTable = rowTable.find("table")
-                            # This looks wrong.
-                            articleTable = rows.find("tr")
-                            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
-                            blogDate = articleTable.find("a","blogDate").contents[0]
-                            # Skip to second blogBody for this.
-                            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
-                            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
-                            url = baseURL+re.sub(r'\?.*', '', blogURL)
-                            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
-                            if self.debugMessages :
-                                print("Talking Points Memo title "+title+" at url: "+url)
-                            pubdate = time.strftime('%a, %d %b')
-                            articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
             else : # Current Column
                 titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                 if titleSpan is None :
+                    print("No Current Column Title Span")
+                    print(soup)
                     continue
                 title = titleSpan.contents[0]
                 url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
-            if i == 3 or i == 5 :
+            if i == 1 :
                 if self.debugMessages :
                     print(self.catList[i][0]+" Title:"+title+" at url: "+url)
                 summary = div.find(True, attrs={'class':'summary'})
-                if summary:
+                print("At Summary")
+                print(summary)
+                if summary is not None:
                     description = self.tag_to_string(summary, use_alt=False)
+                    print("At append")
                 articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
             self.catList[i][3] = articleList
             fullReturn.append((self.catList[i][0], articleList))
+        print("Returning")
+        # print fullReturn
         return fullReturn


+    # build_index() starts with:
+    # try:
+    #     feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+    #         max_articles_per_feed=self.max_articles_per_feed,
+    #         log=self.log)
+    #     self.report_progress(0, _('Got feeds from index page'))
+    # except NotImplementedError:
+    #     feeds = self.parse_feeds()
+
+    # which in turn is from __init__.py
+    #def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
+    #        log=default_log):
+    #'''
+    #@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    #@return: A list of L{Feed} objects.
+    #@rtype: list
+    #'''
+    #feeds = []
+    #for title, articles in index:
+    #    pfeed = Feed(log=log)
+    #    pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+    #        max_articles_per_feed=max_articles_per_feed)
+    #    feeds.append(pfeed)
+    #return feeds
+
+    # use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.


     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -182,16 +191,148 @@
     # 'content' : The full article (can be an empty string). This is used by FullContentProfile
     # }
     # this is used instead of BasicNewsRecipe.parse_feeds().
+    # it is called by download
     def parse_index(self):
         # Parse the page into Python Soup
+        print("Entering recipe print_index from:")
+        traceback.print_stack()
+        print("web")
         baseURL = "https://www.billoreilly.com"
-        return self.parseGeneric(baseURL)
+        masterList = self.parseGeneric(baseURL)
+        #print(masterList)
+        return masterList

     def preprocess_html(self, soup):
+        print("In preprocess_html")
         refresh = soup.find('meta', {'http-equiv':'refresh'})
         if refresh is None:
             return soup
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('https://www.billoreilly.com'+content).read()
         return BeautifulSoup(raw.decode('cp1252', 'replace'))

+    def build_index(self):
+        print("In OReilly build_index()\n\n")
+        feedsRSS = []
+        self.report_progress(0, _('Fetching feeds...'))
+        #try:
+        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                 max_articles_per_feed=self.max_articles_per_feed,
+                                 log=self.log)
+        self.report_progress(0, _('Got feeds from index page'))
+        #except NotImplementedError:
+        #    feeds = self.parse_feeds()
+        # Now add regular feeds.
+        feedsRSS = self.parse_feeds()
+        print ("feedsRSS is type "+feedsRSS.__class__.__name__)
+
+        for articles in feedsRSS:
+            print("articles is type "+articles.__class__.__name__)
+            print("Title:" + articles.title)
+            feeds.append(articles)
+        if not feeds:
+            raise ValueError('No articles found, aborting')
+
+        #feeds = FeedCollection(feeds)
+
+        self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
+        self.report_progress(0, _('Generating masthead...'))
+        self.masthead_path = None
+
+        try:
+            murl = self.get_masthead_url()
+        except:
+            self.log.exception('Failed to get masthead url')
+            murl = None
+
+        if murl is not None:
+            # Try downloading the user-supplied masthead_url
+            # Failure sets self.masthead_path to None
+            self.download_masthead(murl)
+        if self.masthead_path is None:
+            self.log.info("Synthesizing mastheadImage")
+            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
+            try:
+                self.default_masthead_image(self.masthead_path)
+            except:
+                self.log.exception('Failed to generate default masthead image')
+                self.masthead_path = None
+
+        if self.test:
+            feeds = feeds[:2]
+        self.has_single_feed = len(feeds) == 1
+
+        index = os.path.join(self.output_dir, 'index.html')
+
+        html = self.feeds2index(feeds)
+        with open(index, 'wb') as fi:
+            fi.write(html)
+
+        self.jobs = []
+
+        if self.reverse_article_order:
+            for feed in feeds:
+                if hasattr(feed, 'reverse'):
+                    feed.reverse()
+
+        self.feed_objects = feeds
+        for f, feed in enumerate(feeds):
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            if not os.path.isdir(feed_dir):
+                os.makedirs(feed_dir)
+
+            for a, article in enumerate(feed):
+                if a >= self.max_articles_per_feed:
+                    break
+                art_dir = os.path.join(feed_dir, 'article_%d'%a)
+                if not os.path.isdir(art_dir):
+                    os.makedirs(art_dir)
+                try:
+                    url = self.print_version(article.url)
+                except NotImplementedError:
+                    url = article.url
+                except:
+                    self.log.exception('Failed to find print version for: '+article.url)
+                    url = None
+                if not url:
+                    continue
+                func, arg = (self.fetch_embedded_article, article) \
+                            if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
+                            else \
+                            ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+                              else self.fetch_article), url)
+                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
+                                  {}, (f, a), self.article_downloaded,
+                                  self.error_in_article_download)
+                req.feed = feed
+                req.article = article
+                req.feed_dir = feed_dir
+                self.jobs.append(req)
+
+
+        self.jobs_done = 0
+        tp = ThreadPool(self.simultaneous_downloads)
+        for req in self.jobs:
+            tp.putRequest(req, block=True, timeout=0)
+
+
+        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+        while True:
+            try:
+                tp.poll()
+                time.sleep(0.1)
+            except NoResultsPending:
+                break
+        for f, feed in enumerate(feeds):
+            print("Writing feeds for "+feed.title)
+            html = self.feed2index(f,feeds)
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
+                fi.write(html)
+        self.create_opf(feeds)
+        self.report_progress(1, _('Feeds downloaded to %s')%index)
+
+        return index

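The comment block above parse_index() spells out the contract both recipes rely on: parse_index() returns a list of ('feed title', list-of-articles) tuples, and each article is a dict with title, url, date, description and content keys, which is exactly the shape the dict(title=..., url=..., ...) calls in parseGeneric() build. A minimal sketch of that return value (the feed name and URL here are illustrative placeholders, not values from the recipe):

import time

def example_parse_index():
    # One article entry in the shape BasicNewsRecipe expects from parse_index().
    article = dict(
        title='Example article',                   # shown in the generated table of contents
        url='http://www.example.com/story.html',   # page that will be downloaded
        date=time.strftime('%a, %d %b'),           # same date format the recipes above use
        description='One-line summary',            # optional teaser text
        content='')                                # empty unless the feed carries full content
    # parse_index() returns a list of ('feed title', [article, ...]) tuples.
    return [('Example feed', [article])]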
Real Clear recipe:

@@ -1,7 +1,9 @@
 # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
+import string, re
 import time
+from urlparse import urlparse
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString

 class RealClear(BasicNewsRecipe):
     title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 400
-    debugMessages = False
+    debugMessages = True

     # Numeric parameter is type, controls whether we look for
     feedsets = [
         ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
-        ["Science", "http://www.realclearscience.com/index.xml", 0],
+        ["Policy", "http://www.realclearpolicy.com/index.xml", 0],
+        ["Science", "http://www.realclearscience.com/index.xml", 0],
         ["Tech", "http://www.realcleartechnology.com/index.xml", 0],
         # The feedburner is essentially the same as the top feed, politics.
         # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,22 +40,37 @@ class RealClear(BasicNewsRecipe):
     ]
     # Hints to extractPrintURL.
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
-    printhints = [
+    phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)

+    printhints = [ ["realclear", "", '' , 'printpage'],
         ["billoreilly.com", "Print this entry", 'a', ''],
         ["billoreilly.com", "Print This Article", 'a', ''],
         ["politico.com", "Print", 'a', 'share-print'],
         ["nationalreview.com", ">Print<", 'a', ''],
         ["reason.com", "", 'a', 'printer']
         # The following are not supported due to JavaScripting, and would require obfuscated_article to handle
         # forbes,
         # usatoday - just prints with all current crap anyhow

     ]
+    # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
+    # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s
+    # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
+    # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
+    # Use the FULL PRINTPAGE URL; it formats it better too!
+    #
+    # NYT - try single page...
+    # Need special code - is it one page or several? Which URL?
+    # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
+    # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
+    # which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"

     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, pageURL):
         tagURL = pageURL
+        baseParse = urlparse(pageURL)
+        baseURL = baseParse[0]+"://"+baseParse[1]
         hintsCount =len(self.printhints)
         for x in range(0,hintsCount):
             if pageURL.find(self.printhints[x][0])== -1 :
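The RCP comments in the hunk above describe the printpage scheme the recipe is aiming at: the print view lives at /printpage/?url=<article URL>, and multi-page articles additionally take a -full suffix before .html, while single-page articles have no -full variant. A rough sketch of that rewrite, as an illustration of the comments rather than the recipe's actual code path (the function name is hypothetical):

def rcp_printpage_url(article_url, multipage=True):
    # Illustrative only: build the RealClearPolitics print-view URL described
    # in the comments above. Multi-page articles use a "-full" variant of the
    # page; single-page articles do not, so a caller would have to check which
    # form actually exists before committing to it.
    target = article_url
    if multipage and target.endswith('.html'):
        target = target[:-len('.html')] + '-full.html'
    return 'http://www.realclearpolitics.com/printpage/?url=' + target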
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
             soup = self.index_to_soup(pageURL)
             if soup is None:
                 return pageURL
-            if len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+            if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
+                # e.g. RealClear
                 if self.debugMessages == True :
-                    print("search1")
+                    print("Search by href: "+self.printhints[x][self.phHrefSearch])
+                printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
+            elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
+                if self.debugMessages == True :
+                    print("Search 1: "+self.printhints[x][2]+" Attributes: ")
+                    print(self.printhints[x][3])
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
             elif len(self.printhints[x][3])>0 :
                 if self.debugMessages == True :
                     print("search2")
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
             else :
+                if self.debugMessages == True:
+                    print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
                 printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
             if printFind is None:
                 if self.debugMessages == True :
                     print("Not Found")
+                    # print(soup)
+                    print("end soup\n\n");
                 continue

             print(printFind)
             if isinstance(printFind, NavigableString)==False:
                 if printFind['href'] is not None:
+                    print("Check "+printFind['href']+" for base of "+baseURL)
+                    if printFind['href'].find("http")!=0 :
+                        return baseURL+printFind['href']
                     return printFind['href']
             tag = printFind.parent
             print(tag)
@@ -98,7 +130,7 @@ class RealClear(BasicNewsRecipe):
         print("In get_browser")
         br = BasicNewsRecipe.get_browser()
         return br

     def parseRSS(self, index) :
         if self.debugMessages == True :
             print("\n\nStarting "+self.feedsets[index][0])
@@ -128,7 +160,7 @@ class RealClear(BasicNewsRecipe):
             pubDateEl = div.find("pubDate")
             if pubDateEl is None :
                 pubDateEl = div.find("pubdate")
                 if pubDateEl is None :
                     pubDate = time.strftime('%a, %d %b')
                 else :
                     pubDate = pubDateEl.contents[0]
@@ -144,7 +176,7 @@ class RealClear(BasicNewsRecipe):
             pubdate = time.strftime('%a, %d %b')
             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
         return articleList

     # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
     # returns a list of tuple ('feed title', list of articles)
     # {
@@ -157,7 +189,8 @@ class RealClear(BasicNewsRecipe):
     # this is used instead of BasicNewsRecipe.parse_feeds().
     def parse_index(self):
         # Parse the page into Python Soup

+        articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount): # should be ,4
@@ -167,4 +200,5 @@
         if self.debugMessages == True :
             print(ans)
         return ans
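Taken together, the printhints table and extractPrintURL() form a small dispatch: column 0 (phUrlSnip) decides whether a hint applies to the page URL at all, and the remaining columns pick the search strategy — an href pattern (phHrefSearch), a tag-plus-attributes lookup, or a plain link-text match (phLinkText). A condensed sketch of that dispatch, with the debug printing left out and the recipe's four branches folded into three (hint and find_print_link are illustrative names, not part of the recipe):

import re

def find_print_link(soup, hint):
    # hint is one printhints row: [url_snip, link_text, tag, attrs_or_href],
    # and soup is the BeautifulSoup parse of the article page.
    url_snip, link_text, tag, attrs_or_href = hint
    if attrs_or_href and not link_text:
        # e.g. the "realclear" row: match the link's href against a pattern
        return soup.find(href=re.compile(attrs_or_href))
    if attrs_or_href:
        # e.g. the politico.com row: tag plus attribute (and text) lookup
        return soup.find(tag, attrs=attrs_or_href, text=link_text)
    # e.g. the billoreilly.com rows: plain tag plus link-text lookup
    return soup.find(tag, text=link_text)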