import os
import time
import traceback  # for debugging via stack traces

from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

# To do: strip ads and graphics; Current Column lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos are covered by cat12.
# ./ebook-convert --username xxx --password xxx
# This class is derived from BasicNewsRecipe, so it can only overload that class's methods.
# Some of what we need is otherwise in the article, so we have more copying to do than we otherwise would.


class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = 'Bill O\'Reilly Premium - ' + time.strftime('%d %b %Y')
    title = 'Bill O\'Reilly Premium'
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    language = 'en'
    no_stylesheets = True
    needs_subscription = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down into linked pages.
    recursions = 0
    max_articles_per_feed = 20
    debugMessages = True

    # Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
    catList = [
        ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class': ['showLinks', 'homeLinks']}, []],
        # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
        # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
        # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
        # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class': ['defaultHeader']}, []]
    ]

    feeds = [
        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the word of the day.

    # Note: Talking Points is broken in the above model; the site changed to be more Ajax-y,
    # so it is now fetched via RSS instead.

    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find Print string " + printString + " in " + pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL + tag['href']
        return tagURL

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    # Does a generic parsing of the articles. Of the original six categories (0-5),
    # only TV Archives and Current Column are still parsed here; the rest come in via RSS.
    # Each catList entry: Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList.
    # NoSpin and TV are generic.
    def parseGeneric(self, baseURL):
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            print("In " + self.catList[i][0] + ", index: " + str(i))
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # Problem: categories 0-2 create many articles in an array,
            # categories 3-5 create one, so there is no for-div loop for 3-5.
            if i == 0:
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print("Next DIV:")
                    print(div)
                    a = div
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL + re.sub(r'\?.*', '', a['href'])
                    url = baseURL + a['href']
                    url = self.extractPrintURL(baseURL, url, "Print this entry")
                    title = self.tag_to_string(a, use_alt=True).strip()
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            else:  # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None:
                    print("No Current Column Title Span")
                    print(soup)
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
            if i == 1:
                if self.debugMessages:
                    print(self.catList[i][0] + " Title:" + title + " at url: " + url)
                # Look for a summary on the column page itself.
                summary = soup.find(True, attrs={'class': 'summary'})
                print("At Summary")
                print(summary)
                if summary is not None:
                    description = self.tag_to_string(summary, use_alt=False)
                print("At append")
                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            # Stash the article list in the category entry and add it to the result.
            self.catList[i][4] = articleList
            fullReturn.append((self.catList[i][0], articleList))
        print("Returning")
        # print(fullReturn)
        return fullReturn

    # build_index() starts with:
    # try:
    #     feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
    #                              max_articles_per_feed=self.max_articles_per_feed,
    #                              log=self.log)
    #     self.report_progress(0, _('Got feeds from index page'))
    # except NotImplementedError:
    #     feeds = self.parse_feeds()
    #
    # which in turn uses feeds_from_index() from calibre.web.feeds.__init__.py:
    # def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
    #                      log=default_log):
    #     '''
    #     @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
    #     @return: A list of L{Feed} objects.
    #     @rtype: list
    #     '''
    #     feeds = []
    #     for title, articles in index:
    #         pfeed = Feed(log=log)
    #         pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
    #                                            max_articles_per_feed=max_articles_per_feed)
    #         feeds.append(pfeed)
    #     return feeds
    #
    # use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # It returns a list of tuples ('feed title', list of articles), where each article is a dict:
    # {
    #     'title'       : article title,
    #     'url'         : URL of the print version,
    #     'date'        : the publication date of the article as a string,
    #     'description' : a summary of the article,
    #     'content'     : the full article (can be an empty string); used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds(); it is called by download().
    def parse_index(self):
        # Parse the index pages into article lists (see parseGeneric()).
        print("Entering recipe parse_index from:")
        traceback.print_stack()
        print("web")
        baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
        # print(masterList)
        return masterList

    def preprocess_html(self, soup):
        print("In preprocess_html")
        # Follow a meta-refresh redirect if the page contains one.
        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com' + content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))

    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
        self.report_progress(0, ('Fetching feeds...'))
        # try:
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, ('Got feeds from index page'))
        # except NotImplementedError:
        #     feeds = self.parse_feeds()

        # Now add the regular RSS feeds.
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type " + feedsRSS.__class__.__name__)
        for articles in feedsRSS:
            print("articles is type " + articles.__class__.__name__)
            print("Title:" + articles.title)
            feeds.append(articles)
        if not feeds:
            raise ValueError('No articles found, aborting')
        # feeds = FeedCollection(feeds)

        self.report_progress(0, ('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, ('Generating masthead...'))
        self.masthead_path = None
        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None
        if murl is not None:
            # Try downloading the user-supplied masthead_url.
            # Failure sets self.masthead_path to None.
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None

        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')
        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        self.jobs = []
        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()
        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)
            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception('Failed to find print version for: ' + article.url)
                    url = None
                if not url:
                    continue
                func, arg = (self.fetch_embedded_article, article) \
                    if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
                    else \
                    ((self.fetch_obfuscated_article if self.articles_are_obfuscated
                      else self.fetch_article), url)
                req = WorkRequest(func, (arg,
                                         art_dir, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)

        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(0, ('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break

        for f, feed in enumerate(feeds):
            print("Writing feeds for " + feed.title)
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)

        self.create_opf(feeds)
        self.report_progress(1, ('Feeds downloaded to %s') % index)
        return index
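
# ---------------------------------------------------------------------------
# Illustration only -- nothing below is used by the recipe. A minimal sketch of
# the value parse_index()/parseGeneric() return for the two active categories,
# matching the article-dict format documented above parse_index(). The titles,
# URLs and descriptions here are made up for the example.
# ---------------------------------------------------------------------------
_EXAMPLE_INDEX = [
    ('TV Archives', [
        {'title': 'The Factor - Monday',                               # hypothetical
         'url': 'https://www.billoreilly.com/printarticle?id=123',     # hypothetical print-version URL
         'date': time.strftime('%a, %d %b'),
         'description': 'Talking Points Memo and Top Story.',
         'content': ''},
    ]),
    ('Current Column', [
        {'title': 'This Week\'s Column',                               # hypothetical
         'url': 'https://www.billoreilly.com/printarticle?id=456',     # hypothetical print-version URL
         'date': time.strftime('%a, %d %b'),
         'description': 'None',
         'content': ''},
    ]),
]
# feeds_from_index(_EXAMPLE_INDEX) would turn each (title, articles) pair into a
# Feed object, exactly as build_index() does with the real parse_index() output.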