# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString


class RealClear(BasicNewsRecipe):
    """Aggregate the Real Clear (Politics/Science/Tech/Markets/World) RSS
    feeds, resolve each entry to the original article URL, and substitute a
    printer-friendly version of the page when one can be located."""
    title = u'Real Clear'
    __author__ = 'TMcN'
    description = 'Real Clear Politics/Science/etc... aggregation of news\n'
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'Real Clear - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't follow links out of the fetched article page.
    recursions = 0
    max_articles_per_feed = 400
    debugMessages = False

    # [feed title, feed URL, numeric type].  The numeric parameter is type,
    # controls whether we look for an original-source link in the feed.
    feedsets = [
        ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
        ["Science", "http://www.realclearscience.com/index.xml", 0],
        ["Tech", "http://www.realcleartechnology.com/index.xml", 0],
        # The feedburner is essentially the same as the top feed, politics.
        # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
        # ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
        ["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
        ["Markets", "http://www.realclearmarkets.com/articles/index.xml", 0],
        ["World", "http://www.realclearworld.com/index.xml", 0],
        ["World Blog", "http://www.realclearworld.com/blog/index.xml", 2]
    ]
    # Hints to extractPrintURL.
    # Columns: [URL snippet to match, link text to search for, tag name,
    # class attribute].  Start with attributes and drill down.
    printhints = [
        ["billoreilly.com", "Print this entry", 'a', ''],
        ["billoreilly.com", "Print This Article", 'a', ''],
        ["politico.com", "Print", 'a', 'share-print'],
        ["nationalreview.com", ">Print<", 'a', ''],
        ["reason.com", "", 'a', 'printer']
        # The following are not supported due to JavaScripting, and would
        # require obfuscated_article to handle
        # forbes,
        # usatoday - just prints with all current crap anyhow
    ]

    def extractPrintURL(self, pageURL):
        """Return the best-guess print-version URL for pageURL.

        pageURL itself is returned if no printer-friendly link is found.
        """
        for snippet, linkText, tagName, className in self.printhints:
            if pageURL.find(snippet) == -1:
                continue
            print("Trying " + snippet)
            # Only retrieve the soup if we have a match to check for the
            # printed article with.
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
            # Pick the narrowest search the hint allows: class only,
            # class + text, or text only.
            if len(className) > 0 and len(linkText) == 0:
                if self.debugMessages:
                    print("search1")
                printFind = soup.find(tagName, attrs=className)
            elif len(className) > 0:
                if self.debugMessages:
                    print("search2")
                printFind = soup.find(tagName, attrs=className, text=linkText)
            else:
                printFind = soup.find(tagName, text=linkText)
            if printFind is None:
                if self.debugMessages:
                    print("Not Found")
                continue
            print(printFind)
            if not isinstance(printFind, NavigableString):
                # BUG FIX: Tag['href'] raises KeyError when the attribute is
                # missing, so the original None-checks never ran; Tag.get()
                # returns None instead and lets the fall-back logic work.
                href = printFind.get('href')
                if href is not None:
                    return href
            # Matched a text node (or a tag without href): climb to the
            # enclosing anchor.
            tag = printFind.parent
            print(tag)
            if tag.get('href') is None:
                if self.debugMessages:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent.get('href')
            return tag.get('href')
        return pageURL

    def get_browser(self):
        """Return the default calibre browser."""
        if self.debugMessages:
            print("In get_browser")
        # BUG FIX: the base-class method is unbound here and must be given
        # self explicitly; the original called it with no arguments.
        return BasicNewsRecipe.get_browser(self)

    def parseRSS(self, index):
        """Parse the feed self.feedsets[index] into a list of article dicts
        (title/url/date/description/content) for parse_index()."""
        if self.debugMessages:
            print("\n\nStarting " + self.feedsets[index][0])
        articleList = []
        soup = self.index_to_soup(self.feedsets[index][1])
        for div in soup.findAll("item"):
            title = div.find("title").contents[0]
            # Prefer the original-source link; fall back through the other
            # element names the various feeds use.
            urlEl = div.find("originalLink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("originallink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("link")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("guid")
            if urlEl is None or title is None or len(urlEl.contents) == 0:
                print("Error in feed " + self.feedsets[index][0])
                print(div)
                continue
            print(title)
            print(urlEl)
            url = urlEl.contents[0].encode("utf-8")
            description = div.find("description")
            if description is not None and description.contents is not None and len(description.contents) > 0:
                description = description.contents[0]
            else:
                description = "None"
            pubDateEl = div.find("pubDate")
            if pubDateEl is None:
                pubDateEl = div.find("pubdate")
            if pubDateEl is None:
                # No date in the feed; fall back to today.
                pubDate = time.strftime('%a, %d %b')
            else:
                pubDate = pubDateEl.contents[0]
            if self.debugMessages:
                print("Article")
                print(title)
                print(description)
                print(pubDate)
                print(url)
            url = self.extractPrintURL(url)
            print(url)
            # BUG FIX: the original discarded the parsed pubDate and stamped
            # every article with the current date; use the feed's date.
            articleList.append(dict(title=title, url=url, date=pubDate,
                                    description=description, content=''))
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # returns a list of tuple ('feed title', list of articles)
    # {
    # 'title'       : article title,
    # 'url'         : URL of print version,
    # 'date'        : The publication date of the article as a string,
    # 'description' : A summary of the article
    # 'content'     : The full article (can be an empty string). This is used by FullContentProfile
    # }
    # this is used instead of BasicNewsRecipe.parse_feeds().
    def parse_index(self):
        """Build and return the [(feed title, article list), ...] structure
        from every entry in self.feedsets."""
        ans = []
        for index in range(len(self.feedsets)):
            feedarticles = self.parseRSS(index)
            if feedarticles is not None:
                ans.append((self.feedsets[index][0], feedarticles))
        if self.debugMessages:
            print(ans)
        return ans