# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug import time from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import NavigableString class RealClear(BasicNewsRecipe): title = u'Real Clear' __author__ = 'TMcN' description = 'Real Clear Politics/Science/etc... aggregation of news\n' cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif' custom_title = 'Real Clear - '+ time.strftime('%d %b %Y') auto_cleanup = True encoding = 'utf8' language = 'en' needs_subscription = False no_stylesheets = True oldest_article = 7 remove_javascript = True remove_tags = [dict(name='img', attrs={})] # Don't go down recursions = 0 max_articles_per_feed = 400 debugMessages = False # Numeric parameter is type, controls whether we look for feedsets = [ ["Politics", "http://www.realclearpolitics.com/index.xml", 0], ["Science", "http://www.realclearscience.com/index.xml", 0], ["Tech", "http://www.realcleartechnology.com/index.xml", 0], # The feedburner is essentially the same as the top feed, politics. # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1], # ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1], ["Markets Home", "http://www.realclearmarkets.com/index.xml", 0], ["Markets", "http://www.realclearmarkets.com/articles/index.xml", 0], ["World", "http://www.realclearworld.com/index.xml", 0], ["World Blog", "http://www.realclearworld.com/blog/index.xml", 2] ] # Hints to extractPrintURL. # First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down. printhints = [ ["billoreilly.com", "Print this entry", 'a', ''], ["billoreilly.com", "Print This Article", 'a', ''], ["politico.com", "Print", 'a', 'share-print'], ["nationalreview.com", ">Print<", 'a', ''], ["reason.com", "", 'a', 'printer'] # The following are not supported due to JavaScripting, and would require obfuscated_article to handle # forbes, # usatoday - just prints with all current crap anyhow ] # Returns the best-guess print url. # The second parameter (pageURL) is returned if nothing is found. def extractPrintURL(self, pageURL): tagURL = pageURL hintsCount =len(self.printhints) for x in range(0,hintsCount): if pageURL.find(self.printhints[x][0])== -1 : continue print("Trying "+self.printhints[x][0]) # Only retrieve the soup if we have a match to check for the printed article with. 
    # Returns the best-guess print url.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, pageURL):
        tagURL = pageURL
        hintsCount = len(self.printhints)
        for x in range(0, hintsCount):
            if pageURL.find(self.printhints[x][0]) == -1:
                continue
            print("Trying " + self.printhints[x][0])
            # Only retrieve the soup if we have a match to check for the printed article with.
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
            if len(self.printhints[x][3]) > 0 and len(self.printhints[x][1]) == 0:
                if self.debugMessages:
                    print("search1")
                printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
            elif len(self.printhints[x][3]) > 0:
                if self.debugMessages:
                    print("search2")
                printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
            else:
                printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
            if printFind is None:
                if self.debugMessages:
                    print("Not Found")
                continue
            print(printFind)
            if not isinstance(printFind, NavigableString):
                # Use get() so a tag without an href yields None instead of raising KeyError.
                if printFind.get('href') is not None:
                    return printFind['href']
            tag = printFind.parent
            print(tag)
            if tag.get('href') is None:
                if self.debugMessages:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent['href']
            return tag['href']
        return tagURL

    def get_browser(self):
        if self.debugMessages:
            print("In get_browser")
        # get_browser is an instance method, so self must be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        return br

    def parseRSS(self, index):
        if self.debugMessages:
            print("\n\nStarting " + self.feedsets[index][0])
        articleList = []
        soup = self.index_to_soup(self.feedsets[index][1])
        for div in soup.findAll("item"):
            title = div.find("title").contents[0]
            # Feeds vary in which element carries the article URL; try each in turn.
            urlEl = div.find("originalLink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("originallink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("link")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("guid")
            if urlEl is None or title is None or len(urlEl.contents) == 0:
                print("Error in feed " + self.feedsets[index][0])
                print(div)
                continue
            print(title)
            print(urlEl)
            url = urlEl.contents[0].encode("utf-8")
            description = div.find("description")
            if description is not None and description.contents is not None and len(description.contents) > 0:
                description = description.contents[0]
            else:
                description = "None"
            pubDateEl = div.find("pubDate")
            if pubDateEl is None:
                pubDateEl = div.find("pubdate")
            if pubDateEl is None:
                # Fall back to today's date when the feed item has no pubDate.
                pubDate = time.strftime('%a, %d %b')
            else:
                pubDate = pubDateEl.contents[0]
            if self.debugMessages:
                print("Article")
                print(title)
                print(description)
                print(pubDate)
                print(url)
            url = self.extractPrintURL(url)
            print(url)
            # url += re.sub(r'\?.*', '', div['href'])
            articleList.append(dict(title=title, url=url, date=pubDate, description=description, content=''))
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of articles), where each article is
    # {
    #     'title'       : article title,
    #     'url'         : URL of print version,
    #     'date'        : the publication date of the article as a string,
    #     'description' : a summary of the article,
    #     'content'     : the full article (can be an empty string). This is used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
    def parse_index(self):
        # Parse the page into Python Soup
        ans = []
        feedsCount = len(self.feedsets)
        for x in range(0, feedsCount):  # should be ,4
            feedarticles = self.parseRSS(x)
            if feedarticles is not None:
                ans.append((self.feedsets[x][0], feedarticles))
        if self.debugMessages:
            print(ans)
        return ans
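
# A minimal sketch, with hypothetical values, of the structure parse_index()
# returns: one ('feed title', articles) tuple per feedsets entry, each article
# dict shaped as documented above parse_index().
#
#     [('Politics', [{'title': 'Sample headline',
#                     'url': 'http://www.realclearpolitics.com/sample_print.html',
#                     'date': 'Mon, 01 Jan',
#                     'description': 'Summary text from the RSS item',
#                     'content': ''}]),
#      ('Science', [...])]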