Updated OReilly Premium and Real Clear

This commit is contained in:
Kovid Goyal 2012-04-02 09:00:25 +05:30
parent ee108790db
commit bb443d01f1
2 changed files with 284 additions and 109 deletions

View File

@@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import string, re
import re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# This is derived from BasicNewsRecipe, so it can only override what BasicNewsRecipe provides.
# Some of what we need is otherwise in the article, so we have more copying to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
language = 'en'
no_stylesheets = True
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
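# Sketch only (not part of the recipe): each catList row is consumed generically by
# parseGeneric() below, roughly as
# name, url, tag, attrs, _ = self.catList[i]
# soup = self.index_to_soup(url)
# soup.findAll(tag, attrs)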
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the Word of the Day feed.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
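# All of the RSS entries above share one URL pattern (blogArchive&rss=true&categoryID=N).
# As a sketch only (not part of this recipe), the Word of the Day feed noted above could
# be added the same way:
# (u'Word of the Day', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8'),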
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +94,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@@ -73,6 +102,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -81,14 +111,12 @@ class OReillyPremium(BasicNewsRecipe):
# 3-5 create one.
# So no for-div for 3-5
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
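# Illustration only -- the build_index() override near the end of this recipe effectively does:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article, ...)
# feeds.extend(self.parse_feeds())   # then append the RSS feeds listed at the top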
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,12 +191,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
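# Illustration only -- a hypothetical sketch of the shape parse_index()/parseGeneric()
# hand back (titles and URLs below are made up, not taken from the site):
# [
#   ('TV Archives', [
#     {'title': 'Example show page', 'url': 'https://www.billoreilly.com/example-print-url',
#      'date': 'Mon, 02 Apr', 'description': 'None', 'content': ''}
#   ]),
#   ('Current Column', [
#     {'title': 'Example column', 'url': 'https://www.billoreilly.com/example-column-print-url',
#      'date': 'Mon, 02 Apr', 'description': 'None', 'content': ''}
#   ])
# ]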
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +211,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index

View File

@@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
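# A minimal sketch (hypothetical, standalone) of how extractPrintURL() below reads one
# printhints row via the named indices; the hint values here mirror the realclear row above:
# hint = ["realclear", "", '', 'printpage']
# if len(hint[phHrefSearch]) > 0 and len(hint[phLinkText]) == 0:
#     # realclear-style row: find a link whose href mentions "printpage"
#     printFind = soup.find(href=re.compile(hint[phHrefSearch]))
# else:
#     # billoreilly/politico-style rows: find the tag by its visible link text
#     printFind = soup.find(hint[phMainSearch], text=hint[phLinkText])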
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
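# Sketch of the transform described above (illustration only; the recipe itself locates
# the print link through the printhints search rather than building the URL by hand,
# and single-page articles have no "-full" variant):
# print_url = baseURL + "/printpage/?url=" + pageURL.replace(".html", "-full.html")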
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans