This commit is contained in:
Kovid Goyal 2014-11-27 13:11:54 +05:30
parent b524709809
commit e9c2e24155

View File

@ -42,7 +42,7 @@ class RealClear(BasicNewsRecipe):
# Each printhints row has four columns, indexed by the ph* constants below: the URL snippet identifying the site, the link text to search for, the tag name to search, and the href/attribute value to match. Start with attributes and drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
printhints = [["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@ -82,31 +82,31 @@ class RealClear(BasicNewsRecipe):
return pageURL
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
if self.debugMessages is True :
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
if self.debugMessages is True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
if self.debugMessages is True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
if self.debugMessages is True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not Found")
# print(soup)
print("end soup\n\n");
print("end soup\n\n")
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if isinstance(printFind, NavigableString) is False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
@ -115,24 +115,24 @@ class RealClear(BasicNewsRecipe):
tag = printFind.parent
print(tag)
if tag['href'] is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not in parent, trying skip-up")
if tag.parent['href'] is None:
if self.debugMessages == True :
if self.debugMessages is True :
print("Not in skip either, aborting")
continue;
continue
return tag.parent['href']
return tag['href']
return tagURL
def get_browser(self):
    """Return the browser produced by ``BasicNewsRecipe.get_browser``.

    Adds nothing beyond the base-class call except an optional trace line
    printed when ``self.debugMessages`` is set.
    """
    # Single debug guard: the diff residue carried both the old `== True`
    # and the new `is True` spelling of this check; a truthiness test
    # covers both.
    if self.debugMessages:
        print("In get_browser")
    return BasicNewsRecipe.get_browser(self)
def parseRSS(self, index) :
if self.debugMessages == True :
if self.debugMessages is True :
print("\n\nStarting "+self.feedsets[index][0])
articleList = []
soup = self.index_to_soup(self.feedsets[index][1])
@ -164,15 +164,15 @@ class RealClear(BasicNewsRecipe):
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
if self.debugMessages == True :
print("Article");
if self.debugMessages is True :
print("Article")
print(title)
print(description)
print(pubDate)
print(url)
url = self.extractPrintURL(url)
print(url)
#url +=re.sub(r'\?.*', '', div['href'])
# url +=re.sub(r'\?.*', '', div['href'])
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
@ -190,15 +190,13 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
    """Collect the articles of every configured feed.

    Walks ``self.feedsets`` in order, calling ``self.parseRSS(index)`` for
    each entry, and pairs the entry's first element (used here as the
    section title) with the article list that call returns.

    Returns:
        A list of ``(title, articles)`` tuples, one per feed for which
        ``parseRSS`` returned something other than ``None``. Empty when
        ``self.feedsets`` is empty.
    """
    ans = []
    # enumerate gives both the index parseRSS expects and the feed row,
    # replacing the manual range(0, len(...)) index loop.
    for index, feed in enumerate(self.feedsets):
        feedarticles = self.parseRSS(index)
        # A None result means the feed produced nothing usable; skip it
        # instead of emitting a section without an article list.
        if feedarticles is not None:
            ans.append((feed[0], feedarticles))
    # NOTE(review): indentation was lost in the source view; the debug dump
    # is assumed to run once after the loop, not once per feed - confirm.
    if self.debugMessages:
        print(ans)
    return ans