import os
import time
import traceback  # for debugging via stack traces

from calibre.ebooks.BeautifulSoup import BeautifulSoup  # the Python soup converter, which makes parsing easier
from calibre.utils.threadpool import NoResultsPending, ThreadPool, WorkRequest
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.recipes import BasicNewsRecipe

# To do: strip ads and graphics; the Current Column lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos are covered by cat12.
# ./ebook-convert --username xxx --password xxx

# This class is derived from BasicNewsRecipe, so it can only override that class's methods.
# Some of what we need is otherwise in the article, so we have more copying to do than we otherwise would.
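# A typical invocation of the ebook-convert command noted above might look like the
# following (a sketch only: the recipe filename and output name are illustrative, and
# the username and password are your own membership credentials):
#
#   ebook-convert "OReilly Premium.recipe" oreilly_premium.epub \
#       --username YOUR_USERNAME --password YOUR_PASSWORD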
class OReillyPremium(BasicNewsRecipe):
    __author__ = 'TMcN'
    description = 'Retrieves Premium and newsletter content from BillOReilly.com. Requires a Bill OReilly Premium membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = 'Bill O\'Reilly Premium - ' + time.strftime('%d %b %Y')
    title = 'Bill O\'Reilly Premium'
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    language = 'en'
    no_stylesheets = True
    needs_subscription = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Do not recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 20

    debugMessages = True
    # Each entry: Name, URL, Soup findAll tag name, findAll attrs (the last two are a special case), articleList
    catList = [
        ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class': ['showLinks', 'homeLinks']}, []],
        # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
        # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
        # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
        # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class': ['defaultHeader']}, []]
    ]
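    # For illustration only (not executed): a row of catList unpacks as
    #   name, url, tagName, attrs, articles = catList[0]
    # so tagName ('a') and attrs ({'class': ['showLinks', 'homeLinks']}) are exactly the
    # arguments parseGeneric() passes to soup.findAll() for that category.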
    feeds = [
        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the word of the day.

    # Note: Talking Points is broken in the catList model above; the site changed to be more Ajax-heavy,
    # so it is now fetched via RSS instead.
    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br
    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing better is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find Print string "+printString+" in "+pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL+tag['href']
        return tagURL

    def stripBadChars(self, inString):
        return inString.replace("\'", "")
    def parseGeneric(self, baseURL):
        # Does a generic parsing of the articles. There are six categories (0-5), most of them
        # commented out in catList above.
        # Each entry: Name, URL, Soup findAll tag name, findAll attrs (the last two are a special case), articleList.
        # No Spin and TV are generic.
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            print("In "+self.catList[i][0]+", index: "+str(i))
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # Problem: categories 0-2 create many articles per page,
            # while 3-5 create a single one, so there is no for-div loop for 3-5.
            if i == 0:
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print("Next DIV:")
                    print(div)
                    a = div
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL+re.sub(r'\?.*', '', a['href'])
                    url = baseURL+a['href']
                    url = self.extractPrintURL(baseURL, url, "Print this entry")
                    title = self.tag_to_string(a, use_alt=True).strip()
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            else:  # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None:
                    print("No Current Column Title Span")
                    print(soup)
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
                if i == 1:
                    if self.debugMessages:
                        print(self.catList[i][0]+" Title:"+title+" at url: "+url)
                # Look for the summary on the page itself (not in div, which is only
                # defined inside the TV Archives branch above).
                summary = soup.find(True, attrs={'class': 'summary'})
                print("At Summary")
                print(summary)
                if summary is not None:
                    description = self.tag_to_string(summary, use_alt=False)
                print("At append")
                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            # Cache the parsed articles in the last element of the category entry.
            self.catList[i][4] = articleList
            fullReturn.append((self.catList[i][0], articleList))
        print("Returning")
        # print(fullReturn)
        return fullReturn
    # build_index() starts with:
    #   try:
    #       feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
    #                                max_articles_per_feed=self.max_articles_per_feed,
    #                                log=self.log)
    #       self.report_progress(0, _('Got feeds from index page'))
    #   except NotImplementedError:
    #       feeds = self.parse_feeds()

    # which in turn is from __init__.py:
    #   def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
    #                        log=default_log):
    #       '''
    #       @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
    #       @return: A list of L{Feed} objects.
    #       @rtype: list
    #       '''
    #       feeds = []
    #       for title, articles in index:
    #           pfeed = Feed(log=log)
    #           pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
    #                                              max_articles_per_feed=max_articles_per_feed)
    #           feeds.append(pfeed)
    #       return feeds

    # use_embedded_content defaults to None; at that setting, if the content is > 2K it is used as the article.
    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # It returns a list of tuples ('feed title', list of articles), where each article is a dict:
    # {
    #     'title'       : article title,
    #     'url'         : URL of the print version,
    #     'date'        : the publication date of the article as a string,
    #     'description' : a summary of the article,
    #     'content'     : the full article (can be an empty string). This is used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
    # It is called by download().
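    # A sketch of the shape parse_index() returns here (values are illustrative, not real articles):
    #   [
    #       ('TV Archives', [
    #           {'title': 'Example segment', 'url': 'https://www.billoreilly.com/...',
    #            'date': 'Mon, 01 Jan', 'description': 'Example summary', 'content': ''},
    #       ]),
    #       ('Current Column', [...]),
    #   ]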
    def parse_index(self):
        # Parse the page into Python Soup
        print("Entering recipe parse_index from:")
        traceback.print_stack()
        print("web")
        baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
        # print(masterList)
        return masterList
    def preprocess_html(self, soup):
        print("In preprocess_html")
        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
        # A meta refresh looks like <meta http-equiv="refresh" content="0;URL=/some/path">;
        # everything after the first '=' in the content attribute is the redirect target.
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
        self.report_progress(0, ('Fetching feeds...'))
        # try:
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, ('Got feeds from index page'))
        # except NotImplementedError:
        #     feeds = self.parse_feeds()
        # Now add the regular RSS feeds.
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type "+feedsRSS.__class__.__name__)
        for articles in feedsRSS:
            print("articles is type "+articles.__class__.__name__)
            print("Title:" + articles.title)
            feeds.append(articles)
        if not feeds:
            raise ValueError('No articles found, aborting')

        # feeds = FeedCollection(feeds)

        self.report_progress(0, ('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, ('Generating masthead...'))
        self.masthead_path = None

        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None

        if murl is not None:
            # Try downloading the user-supplied masthead_url.
            # Failure sets self.masthead_path to None.
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None

        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        self.jobs = []

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception('Failed to find print version for: '+article.url)
                    url = None
                if not url:
                    continue
                if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()):
                    func, arg = self.fetch_embedded_article, article
                elif self.articles_are_obfuscated:
                    func, arg = self.fetch_obfuscated_article, url
                else:
                    func, arg = self.fetch_article, url
                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)
        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(0, ('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break
        for f, feed in enumerate(feeds):
            print("Writing feeds for "+feed.title)
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, ('Feeds downloaded to %s') % index)

        return index