Kovid Goyal 2014-11-27 13:11:54 +05:30
parent b524709809
commit e9c2e24155

@@ -42,7 +42,7 @@ class RealClear(BasicNewsRecipe):
     # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
     phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
-    printhints = [ ["realclear", "", '' , 'printpage'],
+    printhints = [["realclear", "", '' , 'printpage'],
                   ["billoreilly.com", "Print this entry", 'a', ''],
                   ["billoreilly.com", "Print This Article", 'a', ''],
                   ["politico.com", "Print", 'a', 'share-print'],
@@ -82,31 +82,31 @@ class RealClear(BasicNewsRecipe):
                 return pageURL
             if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
                 # e.g. RealClear
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Search by href: "+self.printhints[x][self.phHrefSearch])
                 printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
             elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Search 1: "+self.printhints[x][2]+" Attributes: ")
                     print(self.printhints[x][3])
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
             elif len(self.printhints[x][3])>0 :
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("search2")
                 printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
             else :
-                if self.debugMessages == True:
+                if self.debugMessages is True:
                     print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
                 printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
             if printFind is None:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Not Found")
                 # print(soup)
-                print("end soup\n\n");
+                print("end soup\n\n")
                 continue
             print(printFind)
-            if isinstance(printFind, NavigableString)==False:
+            if isinstance(printFind, NavigableString) is False:
                 if printFind['href'] is not None:
                     print("Check "+printFind['href']+" for base of "+baseURL)
                     if printFind['href'].find("http")!=0 :
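The hunk above tries three styles of lookup depending on which hint fields are filled in: by href pattern, by tag plus attributes, and by tag plus link text. A standalone sketch of those three calls, not from the recipe itself (bs4 with made-up HTML; the recipe works on the soup returned by index_to_soup):

import re
from bs4 import BeautifulSoup

html = '''<a href="/printpage/?url=/articles/example.html">Print</a>
          <a class="share-print" href="/politico/print/example">Print</a>'''
soup = BeautifulSoup(html, "html.parser")

by_href = soup.find(href=re.compile("printpage"))           # href hint, e.g. RealClear
by_attrs = soup.find('a', attrs={'class': 'share-print'})   # tag + attribute hint
by_text = soup.find('a', text="Print")                      # tag + link-text hint

print(by_href['href'], by_attrs['href'], by_text['href'])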
@@ -115,24 +115,24 @@ class RealClear(BasicNewsRecipe):
             tag = printFind.parent
             print(tag)
             if tag['href'] is None:
-                if self.debugMessages == True :
+                if self.debugMessages is True :
                     print("Not in parent, trying skip-up")
                 if tag.parent['href'] is None:
-                    if self.debugMessages == True :
+                    if self.debugMessages is True :
                         print("Not in skip either, aborting")
-                    continue;
+                    continue
                 return tag.parent['href']
             return tag['href']
         return tagURL
     def get_browser(self):
-        if self.debugMessages == True :
+        if self.debugMessages is True :
             print("In get_browser")
         br = BasicNewsRecipe.get_browser(self)
         return br
     def parseRSS(self, index) :
-        if self.debugMessages == True :
+        if self.debugMessages is True :
            print("\n\nStarting "+self.feedsets[index][0])
         articleList = []
         soup = self.index_to_soup(self.feedsets[index][1])
@@ -164,15 +164,15 @@ class RealClear(BasicNewsRecipe):
                 pubDate = time.strftime('%a, %d %b')
             else :
                 pubDate = pubDateEl.contents[0]
-            if self.debugMessages == True :
-                print("Article");
+            if self.debugMessages is True :
+                print("Article")
                 print(title)
                 print(description)
                 print(pubDate)
                 print(url)
             url = self.extractPrintURL(url)
             print(url)
-            #url +=re.sub(r'\?.*', '', div['href'])
+            # url +=re.sub(r'\?.*', '', div['href'])
             pubdate = time.strftime('%a, %d %b')
             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
         return articleList
@@ -190,15 +190,13 @@ class RealClear(BasicNewsRecipe):
     def parse_index(self):
         # Parse the page into Python Soup
-        #articleList = []
+        # articleList = []
         ans = []
         feedsCount = len(self.feedsets)
         for x in range(0,feedsCount):  # should be ,4
             feedarticles = self.parseRSS(x)
             if feedarticles is not None:
                 ans.append((self.feedsets[x][0], feedarticles))
-        if self.debugMessages == True :
+        if self.debugMessages is True :
             print(ans)
         return ans
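For reference, the value parse_index() returns (built from parseRSS() and the article dicts appended above) is the shape BasicNewsRecipe consumes: a list of (feed title, article list) tuples. A sketch with placeholder values only; real titles, URLs and dates come from the RSS feeds:

# Placeholder data illustrating the return structure, not real feed content.
ans = [
    ("Politics", [
        dict(title="Example article",
             url="http://www.realclearpolitics.com/printpage/?url=/articles/example.html",  # hypothetical
             date="Thu, 27 Nov",
             description="Summary text from the RSS item",
             content=''),
    ]),
]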