commit e9c2e24155
parent b524709809
Author: Kovid Goyal
Date:   2014-11-27 13:11:54 +05:30


@@ -42,7 +42,7 @@ class RealClear(BasicNewsRecipe):
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
     phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
-    printhints = [ ["realclear", "", '' , 'printpage'],
+    printhints = [["realclear", "", '' , 'printpage'],
                   ["billoreilly.com", "Print this entry", 'a', ''],
                   ["billoreilly.com", "Print This Article", 'a', ''],
                   ["politico.com", "Print", 'a', 'share-print'],
@@ -53,17 +53,17 @@ class RealClear(BasicNewsRecipe):
                   # usatoday - just prints with all current crap anyhow
                   ]
     # RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
     # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s
     # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
     # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
     # Use the FULL PRINTPAGE URL; it formats it better too!
     #
     # NYT - try single page...
     # Need special code - is it one page or several? Which URL?
     # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
     # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
     # which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
     # Returns the best-guess print url.
     # The second parameter (pageURL) is returned if nothing is found.
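
The comments in the hunk above describe two URL rewrites rather than code: wrapping a RealClearPolitics article in the printpage URL (with the -full variant for multi-page pieces) and requesting the NYT single-page view. A minimal standalone sketch of both, with hypothetical helper names; the recipe itself finds the print link by scanning the page in extractPrintURL:

# Illustrative helpers only (hypothetical names), mirroring the comments above.

def rcp_print_url(article_url):
    # Multi-page RCP articles use a -full variant inside the printpage wrapper;
    # single-page articles have no -full version, so this naive rewrite only
    # suits the multi-page case described in the comment.
    full_url = article_url.replace(".html", "-full.html")
    return "http://www.realclearpolitics.com/printpage/?url=" + full_url

def nyt_single_page_url(article_url):
    # The NYT single-page view is the same URL with pagewanted=all appended.
    sep = "&" if "?" in article_url else "?"
    return article_url + sep + "pagewanted=all"

print(rcp_print_url("http://www.realclearpolitics.com/articles/2012/01/24/"
                    "in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html"))
print(nyt_single_page_url("http://www.nytimes.com/2012/01/22/business/"
                          "apple-america-and-a-squeezed-middle-class.html?_r=1"))
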
@@ -80,33 +80,33 @@ class RealClear(BasicNewsRecipe):
             soup = self.index_to_soup(pageURL)
             if soup is None:
                 return pageURL
             if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
                 # e.g. RealClear
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Search by href: "+self.printhints[x][self.phHrefSearch])
                 printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
             elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Search 1: "+self.printhints[x][2]+" Attributes: ")
                     print(self.printhints[x][3])
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
             elif len(self.printhints[x][3])>0 :
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("search2")
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
             else :
-                if self.debugMessages == True:
+                if self.debugMessages is True:
                     print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
                 printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
             if printFind is None:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Not Found")
                 # print(soup)
-                print("end soup\n\n");
+                print("end soup\n\n")
                 continue
             print(printFind)
-            if isinstance(printFind, NavigableString)==False:
+            if isinstance(printFind, NavigableString) is False:
                 if printFind['href'] is not None:
                     print("Check "+printFind['href']+" for base of "+baseURL)
                     if printFind['href'].find("http")!=0 :
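
For the RealClear rule only the href hint is set, so the lookup above reduces to a single soup.find(href=re.compile(...)) call followed by the relative-href fix-up. A small standalone sketch of that branch, using the bs4 package directly (the recipe goes through calibre's index_to_soup) and made-up HTML:

import re
from bs4 import BeautifulSoup

# Made-up page fragment standing in for a RealClear article page.
html = '<a href="/printpage/?url=http://example.com/story-full.html">Print</a>'
soup = BeautifulSoup(html, "html.parser")
baseURL = "http://www.realclearpolitics.com"

printFind = soup.find(href=re.compile("printpage"))  # href-regex branch above
if printFind is not None and printFind.get("href"):
    href = printFind["href"]
    if not href.startswith("http"):
        href = baseURL + href  # same base-URL prepend the recipe applies
    print(href)
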
@@ -115,24 +115,24 @@ class RealClear(BasicNewsRecipe):
             tag = printFind.parent
             print(tag)
             if tag['href'] is None:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Not in parent, trying skip-up")
                 if tag.parent['href'] is None:
-                    if self.debugMessages == True :
+                    if self.debugMessages is True :
                         print("Not in skip either, aborting")
-                    continue;
+                    continue
                 return tag.parent['href']
             return tag['href']
         return tagURL

     def get_browser(self):
-        if self.debugMessages == True :
+        if self.debugMessages is True :
             print("In get_browser")
         br = BasicNewsRecipe.get_browser(self)
         return br

     def parseRSS(self, index) :
-        if self.debugMessages == True :
+        if self.debugMessages is True :
             print("\n\nStarting "+self.feedsets[index][0])
         articleList = []
         soup = self.index_to_soup(self.feedsets[index][1])
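
When the hint matches by link text instead of by href, the match may not carry the href itself, which is why the code above climbs from the match to its parent and then grandparent before giving up. A standalone sketch of that climb over made-up HTML (bs4 directly, not index_to_soup):

from bs4 import BeautifulSoup

# Made-up markup: the matched text sits in a <span>, the href on the enclosing <a>.
html = '<a href="/print/123"><span>Print This Article</span></a>'
soup = BeautifulSoup(html, "html.parser")

hit = soup.find(text="Print This Article")  # a NavigableString, as in the recipe
tag = hit.parent                             # the <span>; no href here
if tag.get("href") is None:
    tag = tag.parent                         # skip up, as the code above does
print(tag.get("href"))                       # -> /print/123
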
@@ -145,7 +145,7 @@ class RealClear(BasicNewsRecipe):
             urlEl = div.find("link")
             if urlEl is None or len(urlEl.contents)==0 :
                 urlEl = div.find("guid")
                 if urlEl is None or title is None or len(urlEl.contents)==0 :
                     print("Error in feed "+ self.feedsets[index][0])
                     print(div)
                     continue
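
parseRSS walks the raw feed soup item by item and falls back from <link> to <guid> when the link element is empty, as above. A minimal standalone sketch of that fallback over a made-up feed fragment (bs4 with the lxml-backed "xml" parser rather than calibre's index_to_soup):

from bs4 import BeautifulSoup

# Made-up feed fragment: the <link> element is empty, so the guid is used instead.
rss = """<rss><channel>
<item><title>First story</title><link></link><guid>http://example.com/a1</guid></item>
</channel></rss>"""
soup = BeautifulSoup(rss, "xml")  # assumes lxml is installed

for div in soup.findAll("item"):
    title = div.find("title").contents[0]
    urlEl = div.find("link")
    if urlEl is None or len(urlEl.contents) == 0:
        urlEl = div.find("guid")  # same fallback as the recipe
    print(title, urlEl.contents[0])
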
@@ -164,15 +164,15 @@ class RealClear(BasicNewsRecipe):
                 pubDate = time.strftime('%a, %d %b')
             else :
                 pubDate = pubDateEl.contents[0]
-            if self.debugMessages == True :
-                print("Article");
+            if self.debugMessages is True :
+                print("Article")
                 print(title)
                 print(description)
                 print(pubDate)
                 print(url)
             url = self.extractPrintURL(url)
             print(url)
-            #url +=re.sub(r'\?.*', '', div['href'])
+            # url +=re.sub(r'\?.*', '', div['href'])
             pubdate = time.strftime('%a, %d %b')
             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
         return articleList
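
Each entry appended above is a plain dict in the shape calibre's parse_index machinery expects (title, url, date, description, content). A minimal sketch of one such entry with placeholder values:

import time

# One article entry as built above; only the key names matter to calibre.
article = dict(
    title="Example headline",
    url="http://www.realclearpolitics.com/printpage/?url=http://example.com/story-full.html",
    date=time.strftime('%a, %d %b'),
    description="One-line summary taken from the feed",
    content='')
print(article)
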
@@ -190,15 +190,13 @@ class RealClear(BasicNewsRecipe):
     def parse_index(self):
         # Parse the page into Python Soup
-        #articleList = []
+        # articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount):  # should be ,4
             feedarticles = self.parseRSS(x)
             if feedarticles is not None:
                 ans.append((self.feedsets[x][0], feedarticles))
-        if self.debugMessages == True :
+        if self.debugMessages is True :
             print(ans)
         return ans
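
parse_index therefore returns a list of (feed title, article list) tuples, one per feedsets entry. A sketch of the returned shape with placeholder data:

# Shape of the value parse_index returns above; all values are placeholders.
ans = [
    ("Politics", [
        dict(title="Example headline", url="http://example.com/story.html",
             date="Thu, 27 Nov", description="Summary", content=''),
    ]),
]
for feed_title, articles in ans:
    print(feed_title, len(articles))
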