from __future__ import print_function # Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe # .epub --test -vv --debug-pipeline debug import re import time from calibre.ebooks.BeautifulSoup import NavigableString from calibre.web.feeds.recipes import BasicNewsRecipe try: from urllib.parse import urlparse except ImportError: from urlparse import urlparse class RealClear(BasicNewsRecipe): title = u'Real Clear' __author__ = 'TMcN' description = 'Real Clear Politics/Science/etc... aggregation of news\n' cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif' custom_title = 'Real Clear - ' + time.strftime('%d %b %Y') auto_cleanup = True encoding = 'utf8' language = 'en' needs_subscription = False no_stylesheets = True oldest_article = 7 remove_javascript = True remove_tags = [dict(name='img', attrs={})] # Don't go down recursions = 0 max_articles_per_feed = 400 debugMessages = True # Numeric parameter is type, controls whether we look for feedsets = [ ['Politics', 'http://www.realclearpolitics.com/index.xml', 0], ['Policy', 'http://www.realclearpolicy.com/index.xml', 0], ['Science', 'http://www.realclearscience.com/index.xml', 0], ['Tech', 'http://www.realcleartechnology.com/index.xml', 0], # The feedburner is essentially the same as the top feed, politics. # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1], # ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1], ['Markets Home', 'http://www.realclearmarkets.com/index.xml', 0], ['Markets', 'http://www.realclearmarkets.com/articles/index.xml', 0], ['World', 'http://www.realclearworld.com/index.xml', 0], ['World Blog', 'http://www.realclearworld.com/blog/index.xml', 2] ] # Hints to extractPrintURL. # First column is the URL snippet. Then the string to search for as text, # and the attributes to look for above it. Start with attributes and # drill down. phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4) printhints = [['realclear', '', '', 'printpage'], ['billoreilly.com', 'Print this entry', 'a', ''], ['billoreilly.com', 'Print This Article', 'a', ''], ['politico.com', 'Print', 'a', 'share-print'], ['nationalreview.com', '>Print<', 'a', ''], ['reason.com', '', 'a', 'printer'] # The following are not supported due to JavaScripting, and would require obfuscated_article to handle # forbes, # usatoday - just prints with all current crap anyhow ] # The print link isn't obvious, and only the end is needed (the -full append.) SO maybe try that first?s # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html # Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html # Use the FULL PRINTPAGE URL; it formats it better too! # # NYT - try single page... # Need special code - is it one page or several? Which URL? # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1 # to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all # which is at link rel="canonical" and at 0 and len(self.printhints[x][self.phLinkText]) == 0: # e.g. RealClear if self.debugMessages is True: print('Search by href: ' + self.printhints[x][self.phHrefSearch]) printFind = soup.find(href=re.compile( self.printhints[x][self.phHrefSearch])) elif len(self.printhints[x][3]) > 0 and len(self.printhints[x][1]) == 0: if self.debugMessages is True: print('Search 1: ' + self.printhints[x][2] + ' Attributes: ') print(self.printhints[x][3]) printFind = soup.find( self.printhints[x][2], attrs=self.printhints[x][3]) elif len(self.printhints[x][3]) > 0: if self.debugMessages is True: print('search2') printFind = soup.find(self.printhints[x][2], attrs=self.printhints[ x][3], text=self.printhints[x][1]) else: if self.debugMessages is True: print( 'Default Search: ' + self.printhints[x][2] + ' Text: ' + self.printhints[x][1]) printFind = soup.find( self.printhints[x][2], text=self.printhints[x][1]) if printFind is None: if self.debugMessages is True: print('Not Found') # print(soup) print('end soup\n\n') continue print(printFind) if isinstance(printFind, NavigableString) is False: if printFind['href'] is not None: print('Check ' + printFind['href'] + ' for base of ' + baseURL) if printFind['href'].find('http') != 0: return baseURL + printFind['href'] return printFind['href'] tag = printFind.parent print(tag) if tag.get('href', None) is None: if self.debugMessages is True: print('Not in parent, trying skip-up') if tag.parent['href'] is None: if self.debugMessages is True: print('Not in skip either, aborting') continue return tag.parent['href'] return tag['href'] return tagURL def get_browser(self): if self.debugMessages is True: print('In get_browser') br = BasicNewsRecipe.get_browser(self) return br def parseRSS(self, index): if self.debugMessages is True: print('\n\nStarting ' + self.feedsets[index][0]) articleList = [] soup = self.index_to_soup(self.feedsets[index][1]) for div in soup.findAll('item'): title = div.find('title').contents[0] urlEl = div.find('originalLink') if urlEl is None or len(urlEl.contents) == 0: urlEl = div.find('originallink') if urlEl is None or len(urlEl.contents) == 0: urlEl = div.find('link') if urlEl is None or len(urlEl.contents) == 0: urlEl = div.find('guid') if urlEl is None or title is None or len(urlEl.contents) == 0: print('Error in feed ' + self.feedsets[index][0]) print(div) continue print(title) print(urlEl) url = urlEl.contents[0].encode('utf-8') description = div.find('description') if description is not None and description.contents is not None and len(description.contents) > 0: description = description.contents[0] else: description = 'None' pubDateEl = div.find('pubDate') if pubDateEl is None: pubDateEl = div.find('pubdate') if pubDateEl is None: pubDate = time.strftime('%a, %d %b') else: pubDate = pubDateEl.contents[0] if self.debugMessages is True: print('Article') print(title) print(description) print(pubDate) print(url) try: url = self.extractPrintURL(url) except Exception: self.log.exception('Failed to extract print URL for %s' % url) print(url) # url +=re.sub(r'\?.*', '', div['href']) pubdate = time.strftime('%a, %d %b') articleList.append( dict(title=title, url=url, date=pubdate, description=description, content='')) return articleList # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles. # returns a list of tuple ('feed title', list of articles) # { # 'title' : article title, # 'url' : URL of print version, # 'date' : The publication date of the article as a string, # 'description' : A summary of the article # 'content' : The full article (can be an empty string). This is used by FullContentProfile # } # this is used instead of BasicNewsRecipe.parse_feeds(). def parse_index(self): # Parse the page into Python Soup # articleList = [] ans = [] feedsCount = len(self.feedsets) for x in range(feedsCount): # should be ,4 feedarticles = self.parseRSS(x) if feedarticles is not None: ans.append((self.feedsets[x][0], feedarticles)) if self.debugMessages is True: print(ans) return ans