mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
pep8
This commit is contained in:
parent
b524709809
commit
e9c2e24155
@ -42,7 +42,7 @@ class RealClear(BasicNewsRecipe):
|
||||
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
|
||||
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
|
||||
|
||||
printhints = [ ["realclear", "", '' , 'printpage'],
|
||||
printhints = [["realclear", "", '' , 'printpage'],
|
||||
["billoreilly.com", "Print this entry", 'a', ''],
|
||||
["billoreilly.com", "Print This Article", 'a', ''],
|
||||
["politico.com", "Print", 'a', 'share-print'],
|
||||
@ -53,17 +53,17 @@ class RealClear(BasicNewsRecipe):
|
||||
# usatoday - just prints with all current crap anyhow
|
||||
|
||||
]
|
||||
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
|
||||
# The print link isn't obvious, and only the end is needed (the -full suffix). So maybe try that first?
|
||||
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
|
||||
# Single page articles don't have a -full suffix; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
|
||||
# Use the FULL PRINTPAGE URL; it formats it better too!
|
||||
#
|
||||
# NYT - try single page...
|
||||
# Need special code - is it one page or several? Which URL?
|
||||
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
|
||||
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
|
||||
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
|
||||
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
|
||||
# The print link isn't obvious, and only the end is needed (the -full suffix). So maybe try that first?
|
||||
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
|
||||
# Single page articles don't have a -full suffix; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
|
||||
# Use the FULL PRINTPAGE URL; it formats it better too!
|
||||
#
|
||||
# NYT - try single page...
|
||||
# Need special code - is it one page or several? Which URL?
|
||||
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
|
||||
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
|
||||
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
|
||||
|
||||
# Returns the best-guess print url.
|
||||
# The second parameter (pageURL) is returned if nothing is found.
|
||||
@ -80,33 +80,33 @@ class RealClear(BasicNewsRecipe):
|
||||
soup = self.index_to_soup(pageURL)
|
||||
if soup is None:
|
||||
return pageURL
|
||||
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
|
||||
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
|
||||
# e.g. RealClear
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("Search by href: "+self.printhints[x][self.phHrefSearch])
|
||||
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
|
||||
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
|
||||
print(self.printhints[x][3])
|
||||
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
|
||||
elif len(self.printhints[x][3])>0 :
|
||||
if self.debugMessages == True :
|
||||
elif len(self.printhints[x][3])>0 :
|
||||
if self.debugMessages is True :
|
||||
print("search2")
|
||||
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
|
||||
else :
|
||||
if self.debugMessages == True:
|
||||
if self.debugMessages is True:
|
||||
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
|
||||
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
|
||||
if printFind is None:
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("Not Found")
|
||||
# print(soup)
|
||||
print("end soup\n\n");
|
||||
print("end soup\n\n")
|
||||
continue
|
||||
|
||||
print(printFind)
|
||||
if isinstance(printFind, NavigableString)==False:
|
||||
if isinstance(printFind, NavigableString) is False:
|
||||
if printFind['href'] is not None:
|
||||
print("Check "+printFind['href']+" for base of "+baseURL)
|
||||
if printFind['href'].find("http")!=0 :
|
||||
@ -115,24 +115,24 @@ class RealClear(BasicNewsRecipe):
|
||||
tag = printFind.parent
|
||||
print(tag)
|
||||
if tag['href'] is None:
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("Not in parent, trying skip-up")
|
||||
if tag.parent['href'] is None:
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("Not in skip either, aborting")
|
||||
continue;
|
||||
continue
|
||||
return tag.parent['href']
|
||||
return tag['href']
|
||||
return tagURL
|
||||
|
||||
def get_browser(self):
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("In get_browser")
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
return br
|
||||
|
||||
def parseRSS(self, index) :
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print("\n\nStarting "+self.feedsets[index][0])
|
||||
articleList = []
|
||||
soup = self.index_to_soup(self.feedsets[index][1])
|
||||
@ -145,7 +145,7 @@ class RealClear(BasicNewsRecipe):
|
||||
urlEl = div.find("link")
|
||||
if urlEl is None or len(urlEl.contents)==0 :
|
||||
urlEl = div.find("guid")
|
||||
if urlEl is None or title is None or len(urlEl.contents)==0 :
|
||||
if urlEl is None or title is None or len(urlEl.contents)==0 :
|
||||
print("Error in feed "+ self.feedsets[index][0])
|
||||
print(div)
|
||||
continue
|
||||
@ -164,15 +164,15 @@ class RealClear(BasicNewsRecipe):
|
||||
pubDate = time.strftime('%a, %d %b')
|
||||
else :
|
||||
pubDate = pubDateEl.contents[0]
|
||||
if self.debugMessages == True :
|
||||
print("Article");
|
||||
if self.debugMessages is True :
|
||||
print("Article")
|
||||
print(title)
|
||||
print(description)
|
||||
print(pubDate)
|
||||
print(url)
|
||||
url = self.extractPrintURL(url)
|
||||
print(url)
|
||||
#url +=re.sub(r'\?.*', '', div['href'])
|
||||
# url +=re.sub(r'\?.*', '', div['href'])
|
||||
pubdate = time.strftime('%a, %d %b')
|
||||
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
|
||||
return articleList
|
||||
@ -190,15 +190,13 @@ class RealClear(BasicNewsRecipe):
|
||||
def parse_index(self):
|
||||
# Parse the page into Python Soup
|
||||
|
||||
#articleList = []
|
||||
# articleList = []
|
||||
ans = []
|
||||
feedsCount = len(self.feedsets)
|
||||
for x in range(0,feedsCount): # should be ,4
|
||||
for x in range(0,feedsCount): # should be ,4
|
||||
feedarticles = self.parseRSS(x)
|
||||
if feedarticles is not None:
|
||||
ans.append((self.feedsets[x][0], feedarticles))
|
||||
if self.debugMessages == True :
|
||||
if self.debugMessages is True :
|
||||
print(ans)
|
||||
return ans
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user