This commit is contained in:
Kovid Goyal 2014-11-27 13:11:54 +05:30
parent b524709809
commit e9c2e24155

View File

@ -42,7 +42,7 @@ class RealClear(BasicNewsRecipe):
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
printhints = [["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@ -53,17 +53,17 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a -full suffix; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a -full suffix; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
@ -80,33 +80,33 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
if self.debugMessages is True :
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
if self.debugMessages is True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
elif len(self.printhints[x][3])>0 :
if self.debugMessages is True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
if self.debugMessages is True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not Found")
# print(soup)
print("end soup\n\n");
print("end soup\n\n")
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if isinstance(printFind, NavigableString) is False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
@ -115,24 +115,24 @@ class RealClear(BasicNewsRecipe):
tag = printFind.parent
print(tag)
if tag['href'] is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not in parent, trying skip-up")
if tag.parent['href'] is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not in skip either, aborting")
continue;
continue
return tag.parent['href']
return tag['href']
return tagURL
def get_browser(self):
if self.debugMessages == True :
if self.debugMessages is True :
print("In get_browser")
br = BasicNewsRecipe.get_browser(self)
return br
def parseRSS(self, index) :
if self.debugMessages == True :
if self.debugMessages is True :
print("\n\nStarting "+self.feedsets[index][0])
articleList = []
soup = self.index_to_soup(self.feedsets[index][1])
@ -145,7 +145,7 @@ class RealClear(BasicNewsRecipe):
urlEl = div.find("link")
if urlEl is None or len(urlEl.contents)==0 :
urlEl = div.find("guid")
if urlEl is None or title is None or len(urlEl.contents)==0 :
if urlEl is None or title is None or len(urlEl.contents)==0 :
print("Error in feed "+ self.feedsets[index][0])
print(div)
continue
@ -164,15 +164,15 @@ class RealClear(BasicNewsRecipe):
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
if self.debugMessages == True :
print("Article");
if self.debugMessages is True :
print("Article")
print(title)
print(description)
print(pubDate)
print(url)
url = self.extractPrintURL(url)
print(url)
#url +=re.sub(r'\?.*', '', div['href'])
# url +=re.sub(r'\?.*', '', div['href'])
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
@ -190,15 +190,13 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
#articleList = []
# articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
for x in range(0,feedsCount): # should be ,4
feedarticles = self.parseRSS(x)
if feedarticles is not None:
ans.append((self.feedsets[x][0], feedarticles))
if self.debugMessages == True :
if self.debugMessages is True :
print(ans)
return ans