import re
import time

from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    description = 'Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 2000
    language = 'en'

    def get_browser(self):
        # The base class method is not static, so the instance must be passed in.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Given a page, find the link to its print-friendly version; fall back
    # to the original page URL if no print link is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find Print string " + printString + " in " + pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL + tag['href']
        return tagURL

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    # Returns a qualifying article list.
    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class': ['blogBody'], 'style': ['padding-top:10px;']}):
            a = div.find('a', href=True)
            if not a:
                continue
            # re == regex; [href] is the link. Strip any query string from it.
            url = baseURL
            url += re.sub(r'\?.*', '', a['href'])
            # Prefer the print version of the page.
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            if debugMessages:
                print("No Spin Archive Title: " + title + " at url: " + url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class': 'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList

    def parseTVArchives(self, baseURL, soupURL, debugMessages):
        # The TV Archives page has some Ajax, so look only for the static links.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseTVArchives")
        for a in soup.findAll('a', {'class': ['showLinks', 'homeLinks']}):
            url = baseURL
            url += a['href']
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            title = self.stripBadChars(title)
            if debugMessages:
                print("TV Archive " + title + " at url: " + url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = a.find(True, attrs={'class': 'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving TV Parse")
        return articleList

    # Get the Daily Briefing Archives.
    def parseDailyBriefs(self, baseURL, soupURL, debugMessages):
        print("Starting daily briefs")
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class': ['defaultHeaderSmallLinks']}):
            # re == regex; [href] is the link. Strip any query string from it.
            url = baseURL
            url += re.sub(r'\?.*', '', div['href'])
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = div.contents[0]
            if debugMessages:
                print("Daily Brief - title: " + title + " at url: " + url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class': 'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        print("Leaving daily briefs")
        return articleList

    # Get the weekly Stratfor intelligence report.
    def parseStratfor(self, baseURL, soupURL, debugMessages):
        # http://www.billoreilly.com/blog?categoryID=5
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseStratfor")
        a = soup.find('a', {'class': ['blogLinks']})
        url = baseURL
        url += a['href']
        title = self.tag_to_string(a, use_alt=True).strip()
        if debugMessages:
            print("url: " + url)
            print("title: " + title)
        # Get the Stratfor contents so we can get the real title.
        stratSoup = self.index_to_soup(url)
        title = stratSoup.html.head.title.string
        stratIndex = title.find('Stratfor.com:', 0)
        if stratIndex > -1:
            # Drop the 'Stratfor.com: ' prefix and the trailing character.
            title = title[stratIndex + 14:-1]
        # Look for first blogBody