diff --git a/recipes/oreilly_premium.recipe b/recipes/oreilly_premium.recipe
index c5a615900c..94d24c1e8e 100644
--- a/recipes/oreilly_premium.recipe
+++ b/recipes/oreilly_premium.recipe
@@ -1,8 +1,15 @@
+# Talking Points is not grabbing everything.
+# The look is right, but only the last one added?
 import re
 import time
 from calibre.web.feeds.recipes import BasicNewsRecipe
 # Allows the Python soup converter, which makes parsing easier.
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
+# strip ads and graphics
+# Current Column lacks a title.
+# Talking Points Memo - shorten title - Remove year and Bill's name
+# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
+# Newsletters: Talking Points Memos covered by cat12
 
 class OReillyPremium(BasicNewsRecipe):
     title = u'OReilly Premium'
@@ -19,7 +26,17 @@ class OReillyPremium(BasicNewsRecipe):
     # Don't go down
     recursions = 0
     max_articles_per_feed = 2000
-    language = 'en'
+
+    debugMessages = True
+
+    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
+    catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
+                ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
+                ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
+                ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
+                ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
+                ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
+    ]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -31,6 +48,8 @@ class OReillyPremium(BasicNewsRecipe):
             br.submit()
         return br
 
+    # Returns the best-guess print url.
+    # The second parameter (pageURL) is returned if nothing is found.
     def extractPrintURL(self, baseURL, pageURL, printString):
         tagURL = pageURL
         soup = self.index_to_soup(pageURL)
@@ -38,7 +57,6 @@ class OReillyPremium(BasicNewsRecipe):
             printText = soup.find('a', text=printString)
         else :
             print("Failed to find Print string "+printString+ " in "+pageURL)
-
         if printText:
             tag = printText.parent
             tagURL = baseURL+tag['href']
@@ -47,177 +65,111 @@ class OReillyPremium(BasicNewsRecipe):
     def stripBadChars(self, inString) :
         return inString.replace("\'", "")
 
-
-    # returns a qualifying article list
-    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
-            a = div.find('a', href=True)
-            if not a:
-                continue
-            # re == regex. [href] is the link
-            url = baseURL
-            url +=re.sub(r'\?.*', '', a['href'])
-            # Get print version
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = self.tag_to_string(a, use_alt=True).strip()
-            if debugMessages :
-                print("No Spin Archive Title:"+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        return articleList
-
-
-    def parseTVArchives(self, baseURL, soupURL, debugMessages):
-        # TV Archives page has some Ajax, so look for the static only.
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        if debugMessages :
-            print("In parseTVArchives")
-        for div in soup.findAll('a', {'class':['showLinks','homeLinks']}):
-            a = div
-            url = baseURL
-            url +=a['href']
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = self.tag_to_string(a, use_alt=True).strip()
-            title = self.stripBadChars(title)
-            if debugMessages :
-                print("TV Archive "+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        if debugMessages :
-            print("Leaving TV Parse ")
-        return articleList
-
-    # Get Daily Briefing Archives
-    def parseDailyBriefs(self, baseURL, soupURL, debugMessages) :
-        print("Starting daily briefs")
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
-            # re == regex. [href] is the link
-            url = baseURL
-            url +=re.sub(r'\?.*', '', div['href'])
-            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
-            if printURL:
-                url = printURL
-            title = div.contents[0]
-            if debugMessages :
-                print("Daily Brief - title:"+title+" at url: "+url)
-            description = 'None'
-            pubdate = time.strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
-        print("Leaving daily briefs")
-        return articleList
-
-    # Get the weekly Stratfor intelligence report
-    def parseStratfor(self, baseURL, soupURL, debugMessages):
-        # http://www.billoreilly.com/blog?categoryID=5
-        articleList = []
-        soup = self.index_to_soup(soupURL)
-        if debugMessages :
-            print("In parseStratfor")
-        a = soup.find('a', {'class':['blogLinks']})
-        url = baseURL
-        url +=a['href']
-        title = self.tag_to_string(a, use_alt=True).strip()
-        if debugMessages :
-            print("url: "+url)
-            print("title:"+title)
-        # Get Stratfor contents so we can get the real title.
-        stratSoup = self.index_to_soup(url)
-        title = stratSoup.html.head.title.string
-        stratIndex = title.find('Stratfor.com:', 0)
-        if (stratIndex > -1) :
-            title = title[stratIndex+14:-1]
-        # Look for first blogBody
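
Reviewer note: the deletion hunk above continues past this excerpt. For orientation, the four deleted methods (parseNoSpinArchives, parseTVArchives, parseDailyBriefs, parseStratfor) all repeat the same shape: fetch an index page, findAll on a category-specific tag/attrs pair, swap in the print URL via extractPrintURL, and append a dict to an article list. That is exactly what the new catList table factors out. Below is a minimal sketch (not the committed replacement, which is outside this hunk; parseGeneric is an assumed name) of how one catList-driven method could stand in for all four. The catList comment flags the last two entries ("Talking Points Memo", "Current Column") as special cases, which this sketch glosses over.

    # Sketch only. Assumes it is added as a method of OReillyPremium, so
    # self.index_to_soup/tag_to_string come from BasicNewsRecipe; re and time
    # are already imported at the top of the recipe.
    def parseGeneric(self, baseURL):
        # Each cat is [name, index URL, findAll tag, findAll attrs, articleList].
        for cat in self.catList:
            soup = self.index_to_soup(cat[1])
            for tag in soup.findAll(cat[2], attrs=cat[3]):
                # Some categories match the <a> itself; others match a container.
                a = tag if tag.name == 'a' else tag.find('a', href=True)
                href = a.get('href') if a else None
                if not href:
                    continue
                # Drop any query string, then prefer the print-friendly page;
                # extractPrintURL returns the page URL unchanged if no print link is found.
                url = baseURL + re.sub(r'\?.*', '', href)
                url = self.extractPrintURL(baseURL, url, "Print this entry")
                title = self.stripBadChars(self.tag_to_string(a, use_alt=True).strip())
                if self.debugMessages:
                    print(cat[0] + ": " + title + " at url: " + url)
                cat[4].append(dict(title=title, url=url,
                                   date=time.strftime('%a, %d %b'),
                                   description='None', content=''))
        return [[cat[0], cat[4]] for cat in self.catList]

The payoff of the table-driven layout is that adding a category becomes a one-line catList entry instead of another copy-pasted parse method, which is presumably why the old methods are deleted wholesale here.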