# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import re
import time

from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup

# strip ads and graphics
# Current Column lacks a title.
# Talking Points Memo - shorten title - remove year and Bill's name.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12


class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    language = 'en'
    description = 'Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down into linked pages.
    recursions = 0
    max_articles_per_feed = 2000
    debugMessages = True

    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
    catList = [
        ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class': ['showLinks', 'homeLinks']}, []],
        ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class': ['blogBody'], 'style': ['padding-top:10px;']}, []],
        ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class': ['defaultHeaderSmallLinks']}, []],
        ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class': ['blogLinks']}, []],
        ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class': ['defaultHeader']}, []]
    ]

    def get_browser(self):
        # Log in to the members area so premium pages are reachable.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None  # Guard against a page that fails to load.
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find print string " + printString + " in " + pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL + tag['href']
        return tagURL

    def stripBadChars(self, inString):
        return inString.replace("'", "")

    def parseGeneric(self, baseURL):
        # Does a generic parsing of the articles.  There are six categories (0-5):
        # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList.
        # NoSpin and TV are generic.
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # Problem: categories 0-2 create many articles per page,
            # while 3-5 create a single one.
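            # (The index i refers to self.catList above: 0 = TV Archives,
            # 1 = No Spin Archives, 2 = Daily Briefings, 3 = Stratfor,
            # 4 = Talking Points Memo, 5 = Current Column.)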
            # So there is no for-div loop for 3-5.
            if i < 3:
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print(div)
                    if i == 1:
                        a = div.find('a', href=True)
                    else:
                        a = div
                    print(a)
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL + re.sub(r'\?.*', '', a['href'])
                    url = baseURL + a['href']
                    if i < 2:
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = self.tag_to_string(a, use_alt=True).strip()
                    elif i == 2:
                        # Daily Briefs
                        url = self.extractPrintURL(baseURL, url, "Print this entry")
                        title = div.contents[0]
                    if self.debugMessages:
                        print(title + " @ " + url)
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            elif i == 3:
                # Stratfor
                a = soup.find('a', self.catList[i][3])
                if a is None:
                    continue
                url = baseURL + a['href']
                title = self.tag_to_string(a, use_alt=True).strip()
                # Fetch the Stratfor page so we can get the real title.
                stratSoup = self.index_to_soup(url)
                title = stratSoup.html.head.title.string
                stratIndex = title.find('Stratfor.com:', 0)
                if stratIndex > -1:
                    # Drop the 'Stratfor.com: ' prefix and the trailing character.
                    title = title[stratIndex + 14:-1]
                # Look for first blogBody