calibre/recipes/real_clear.recipe

from __future__ import print_function
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
__author__ = 'TMcN'
description = 'Real Clear Politics/Science/etc... aggregation of news\n'
cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
custom_title = 'Real Clear - ' + time.strftime('%d %b %Y')
auto_cleanup = True
encoding = 'utf8'
language = 'en'
needs_subscription = False
no_stylesheets = True
oldest_article = 7
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages
recursions = 0
max_articles_per_feed = 400
debugMessages = True
    # Numeric parameter is the feed type; it is not currently read by parseRSS.
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
# ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
["Markets", "http://www.realclearmarkets.com/articles/index.xml", 0],
["World", "http://www.realclearworld.com/index.xml", 0],
["World Blog", "http://www.realclearworld.com/blog/index.xml", 2]
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text,
# and the attributes to look for above it. Start with attributes and
# drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
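    # For example, the first row below, ["realclear", "", '', 'printpage'],
    # reads as: phUrlSnip="realclear" (the article URL must contain this),
    # phLinkText="" (no link text to match), phMainSearch='' (no tag name),
    # and phHrefSearch='printpage' (match hrefs containing "printpage").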
    printhints = [["realclear", "", '', 'printpage'],
                  ["billoreilly.com", "Print this entry", 'a', ''],
                  ["billoreilly.com", "Print This Article", 'a', ''],
                  ["politico.com", "Print", 'a', 'share-print'],
                  ["nationalreview.com", ">Print<", 'a', ''],
                  ["reason.com", "", 'a', 'printer']
    # The following are not supported due to JavaScript, and would require obfuscated_article to handle:
# forbes,
# usatoday - just prints with all current crap anyhow
]
    # The print link isn't obvious, and only the end is needed (the -full suffix), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url"
# or look for "Single Page"
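    # A rough, untested sketch of how the canonical / og:url lookup described
    # above could be done with BeautifulSoup (tag and attribute names come
    # from the comment above; the variable names are illustrative only):
    #   canonical = soup.find('link', rel='canonical')
    #   og = soup.find('meta', attrs={'property': 'og:url'})
    #   single_page_url = canonical['href'] if canonical else og['content']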
# Returns the best-guess print url.
    # The input pageURL is returned unchanged if no print link is found.
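    # For example (per the realclear hint and the URLs above): an article URL
    # such as
    #   http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
    # comes back as the /printpage/ link found on that article page, if one is
    # present, and is otherwise returned as-is.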
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0] + "://" + baseParse[1]
hintsCount = len(self.printhints)
for x in range(0, hintsCount):
if pageURL.find(self.printhints[x][0]) == -1:
continue
print("Trying " + self.printhints[x][0])
# Only retrieve the soup if we have a match to check for the
# printed article with.
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
            if len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                # e.g. RealClear: look for a link whose href matches the hint
                if self.debugMessages is True:
                    print("Search by href: " +
                          self.printhints[x][self.phHrefSearch])
                printFind = soup.find(href=re.compile(
                    self.printhints[x][self.phHrefSearch]))
            elif len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                # Note: this condition duplicates the branch above, so this
                # block is currently unreachable; kept as-is to preserve behaviour.
                if self.debugMessages is True:
                    print("Search by tag: " +
                          self.printhints[x][self.phMainSearch] + " Attributes: ")
                    print(self.printhints[x][self.phHrefSearch])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch], attrs=self.printhints[x][self.phHrefSearch])
            elif len(self.printhints[x][self.phHrefSearch]) > 0:
                if self.debugMessages is True:
                    print("Search by tag, attributes and link text")
                printFind = soup.find(self.printhints[x][self.phMainSearch],
                                      attrs=self.printhints[x][self.phHrefSearch],
                                      text=self.printhints[x][self.phLinkText])
            else:
                if self.debugMessages is True:
                    print("Default search: " + self.printhints[x][self.phMainSearch] +
                          " Text: " + self.printhints[x][self.phLinkText])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch], text=self.printhints[x][self.phLinkText])
if printFind is None:
if self.debugMessages is True:
print("Not Found")
# print(soup)
print("end soup\n\n")
continue
print(printFind)
            if not isinstance(printFind, NavigableString):
                # Use .get() so a matched tag without an href does not raise a KeyError
                href = printFind.get('href')
                if href is not None:
                    print("Check " + href + " for base of " + baseURL)
                    if not href.startswith("http"):
                        return baseURL + href
                    return href
            tag = printFind.parent
            print(tag)
            if tag.get('href', None) is None:
                if self.debugMessages is True:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages is True:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent['href']
            return tag['href']
return tagURL
def get_browser(self):
if self.debugMessages is True:
print("In get_browser")
br = BasicNewsRecipe.get_browser(self)
return br
def parseRSS(self, index):
if self.debugMessages is True:
print("\n\nStarting " + self.feedsets[index][0])
articleList = []
soup = self.index_to_soup(self.feedsets[index][1])
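        # Each RSS <item> is expected to look roughly like this (illustrative,
        # not copied from a live feed):
        #   <item>
        #     <title>...</title>
        #     <originalLink>http://...</originalLink>   <!-- or <link>/<guid> -->
        #     <description>...</description>
        #     <pubDate>...</pubDate>
        #   </item>
        # The loop below prefers originalLink/originallink, then falls back to
        # link and finally guid.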
for div in soup.findAll("item"):
            titleEl = div.find("title")
            title = titleEl.contents[0] if titleEl is not None and len(titleEl.contents) > 0 else None
urlEl = div.find("originalLink")
if urlEl is None or len(urlEl.contents) == 0:
urlEl = div.find("originallink")
if urlEl is None or len(urlEl.contents) == 0:
urlEl = div.find("link")
if urlEl is None or len(urlEl.contents) == 0:
urlEl = div.find("guid")
if urlEl is None or title is None or len(urlEl.contents) == 0:
print("Error in feed " + self.feedsets[index][0])
print(div)
continue
print(title)
print(urlEl)
            url = urlEl.contents[0]  # keep as text; encoding to bytes breaks the URL string handling below on Python 3
description = div.find("description")
if description is not None and description.contents is not None and len(description.contents) > 0:
description = description.contents[0]
else:
description = "None"
pubDateEl = div.find("pubDate")
if pubDateEl is None:
pubDateEl = div.find("pubdate")
if pubDateEl is None:
pubDate = time.strftime('%a, %d %b')
else:
pubDate = pubDateEl.contents[0]
if self.debugMessages is True:
print("Article")
print(title)
print(description)
print(pubDate)
print(url)
try:
url = self.extractPrintURL(url)
except Exception:
self.log.exception('Failed to extract print URL for %s' % url)
print(url)
# url +=re.sub(r'\?.*', '', div['href'])
            # Use the date parsed from the feed (falls back to today's date above)
            articleList.append(
                dict(title=title, url=url, date=pubDate, description=description, content=''))
return articleList
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of articles); each article is a dict:
# {
# 'title' : article title,
# 'url' : URL of print version,
# 'date' : The publication date of the article as a string,
# 'description' : A summary of the article
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
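    # A successful run therefore returns something shaped like (feed titles
    # from feedsets above; values illustrative):
    #   [('Politics', [{'title': ..., 'url': ..., 'date': ..., 'description': ..., 'content': ''}, ...]),
    #    ('Policy', [...]), ...]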
def parse_index(self):
# Parse the page into Python Soup
# articleList = []
ans = []
feedsCount = len(self.feedsets)
        for x in range(feedsCount):
feedarticles = self.parseRSS(x)
if feedarticles is not None:
ans.append((self.feedsets[x][0], feedarticles))
if self.debugMessages is True:
print(ans)
return ans