from __future__ import print_function

# Test with:
#   "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import re
import time

from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


class RealClear(BasicNewsRecipe):
    title = u'Real Clear'
    __author__ = 'TMcN'
    description = 'Real Clear Politics/Science/etc... aggregation of news\n'
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'Real Clear - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't follow links out of the article pages.
    recursions = 0
    max_articles_per_feed = 400
    debugMessages = True

    # The numeric third column is a feed-type flag; it is kept for
    # reference but is not used by parseRSS() below.
    feedsets = [
        ["Politics",     "http://www.realclearpolitics.com/index.xml", 0],
        ["Policy",       "http://www.realclearpolicy.com/index.xml", 0],
        ["Science",      "http://www.realclearscience.com/index.xml", 0],
        ["Tech",         "http://www.realcleartechnology.com/index.xml", 0],
        # The feedburner feed is essentially the same as the top feed, Politics.
        # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
        # ["Commentary",      "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
        ["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
        ["Markets",      "http://www.realclearmarkets.com/articles/index.xml", 0],
        ["World",        "http://www.realclearworld.com/index.xml", 0],
        ["World Blog",   "http://www.realclearworld.com/blog/index.xml", 2],
    ]
    # Hints to extractPrintURL.
    # First column is the URL snippet.  Then the link text to search for,
    # the tag to look for, and the attributes/href pattern to match.
    # Start with the attributes and drill down.
    phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
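
    # Illustrative only: how one hint row maps onto the named columns
    # (row values taken from the politico.com entry below).
    #   hint = ["politico.com", "Print", 'a', 'share-print']
    #   hint[phUrlSnip]    == "politico.com"   # substring matched against the article URL
    #   hint[phLinkText]   == "Print"          # link text to search for
    #   hint[phMainSearch] == 'a'              # tag name to find
    #   hint[phHrefSearch] == 'share-print'    # attrs / href pattern to match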

    printhints = [
        ["realclear",          "",                   '',  'printpage'],
        ["billoreilly.com",    "Print this entry",   'a', ''],
        ["billoreilly.com",    "Print This Article", 'a', ''],
        ["politico.com",       "Print",              'a', 'share-print'],
        ["nationalreview.com", ">Print<",            'a', ''],
        ["reason.com",         "",                   'a', 'printer'],
        # The following are not supported because they require JavaScript;
        # handling them would need get_obfuscated_article():
        # forbes
        # usatoday - just prints with all the current cruft anyhow
    ]
    # The print link isn't obvious, and only the ending differs (the -full
    # append), so maybe try that first?  (See the sketch below.)
    # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
    # Single-page articles don't have a _full; e.g.
    # http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
    # Use the FULL PRINTPAGE URL; it formats it better too!
    #
    # NYT - try single page...
    # Needs special code - is it one page or several?  Which URL?
    # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
    # to   http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
    # which is at <link rel="canonical"> and at <meta property="og:url">,
    # or look for "Single Page".

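    # A minimal sketch (illustration only, not used by the recipe) of the
    # "-full printpage" guess described above; the helper name and the URL
    # rewriting are assumptions, not a documented RealClear API:
    #
    #   def _guess_printpage_url(article_url):
    #       full = re.sub(r'\.html$', '-full.html', article_url)
    #       return 'http://www.realclearpolitics.com/printpage/?url=' + full
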
    # Returns the best-guess print URL.
    # The parameter (pageURL) is returned unchanged if nothing better is found.
    def extractPrintURL(self, pageURL):
        tagURL = pageURL
        baseParse = urlparse(pageURL)
        baseURL = baseParse[0] + "://" + baseParse[1]
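        # Illustration: urlparse('http://www.realclearpolitics.com/articles/x.html')
        # has scheme 'http' at index 0 and netloc 'www.realclearpolitics.com' at
        # index 1, so baseURL becomes 'http://www.realclearpolitics.com'.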
        hintsCount = len(self.printhints)
        for x in range(0, hintsCount):
            if pageURL.find(self.printhints[x][self.phUrlSnip]) == -1:
                continue
            print("Trying " + self.printhints[x][self.phUrlSnip])
            # Only retrieve the soup if we have a match to check for the
            # printed article with.
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
            if len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                # e.g. RealClear: search by href pattern only.
                if self.debugMessages:
                    print("Search by href: " +
                          self.printhints[x][self.phHrefSearch])
                printFind = soup.find(href=re.compile(
                    self.printhints[x][self.phHrefSearch]))
            # NOTE: the next branch repeats the condition above and is
            # therefore unreachable as written; kept for reference.
            elif len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                if self.debugMessages:
                    print("Search 1: " +
                          self.printhints[x][self.phMainSearch] + " Attributes: ")
                    print(self.printhints[x][self.phHrefSearch])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    attrs=self.printhints[x][self.phHrefSearch])
            elif len(self.printhints[x][self.phHrefSearch]) > 0:
                if self.debugMessages:
                    print("search2")
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    attrs=self.printhints[x][self.phHrefSearch],
                    text=self.printhints[x][self.phLinkText])
            else:
                if self.debugMessages:
                    print("Default Search: " + self.printhints[x][self.phMainSearch] +
                          " Text: " + self.printhints[x][self.phLinkText])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    text=self.printhints[x][self.phLinkText])
            if printFind is None:
                if self.debugMessages:
                    print("Not Found")
                    # print(soup)
                    print("end soup\n\n")
                continue

            print(printFind)
            if not isinstance(printFind, NavigableString):
                # Use .get() so a tag without an href doesn't raise KeyError.
                href = printFind.get('href')
                if href is not None:
                    print("Check " + href + " for base of " + baseURL)
                    # A relative link needs the scheme://host prefix restored.
                    if href.find("http") != 0:
                        return baseURL + href
                    return href
            # The match was a text node (or a tag without an href): look for
            # the link on the parent, then on the grandparent.
            tag = printFind.parent
            print(tag)
            if tag.get('href', None) is None:
                if self.debugMessages:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent['href']
            return tag['href']
        return tagURL
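
    # Illustration (hypothetical values): for an article URL matching the
    # "realclear" hint, extractPrintURL() is expected to return the printpage
    # link found on the page, e.g.
    #   extractPrintURL('http://www.realclearpolitics.com/articles/.../foo_112897.html')
    #   -> 'http://www.realclearpolitics.com/printpage/?url=...foo_112897-full.html'
    # and to return its argument unchanged when no hint matches.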

    def get_browser(self):
        if self.debugMessages:
            print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        return br

    def parseRSS(self, index):
        if self.debugMessages:
            print("\n\nStarting " + self.feedsets[index][0])
        articleList = []
        soup = self.index_to_soup(self.feedsets[index][1])
        for div in soup.findAll("item"):
            # Guard against items without a <title>, so the error check
            # below can run instead of raising AttributeError.
            titleEl = div.find("title")
            title = titleEl.contents[0] if titleEl is not None and len(titleEl.contents) > 0 else None
            # Feeds are inconsistent about where the article URL lives:
            # try originalLink, then its lowercased form, then link, then guid.
            urlEl = div.find("originalLink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("originallink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("link")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("guid")
            if urlEl is None or title is None or len(urlEl.contents) == 0:
                print("Error in feed " + self.feedsets[index][0])
                print(div)
                continue
            print(title)
            print(urlEl)
            # Keep the URL as text; encoding to bytes here breaks the string
            # comparisons in extractPrintURL() under Python 3.
            url = urlEl.contents[0]
            description = div.find("description")
            if description is not None and description.contents is not None and len(description.contents) > 0:
                description = description.contents[0]
            else:
                description = "None"
            pubDateEl = div.find("pubDate")
            if pubDateEl is None:
                pubDateEl = div.find("pubdate")
            if pubDateEl is None:
                pubDate = time.strftime('%a, %d %b')
            else:
                pubDate = pubDateEl.contents[0]
            if self.debugMessages:
                print("Article")
                print(title)
                print(description)
                print(pubDate)
                print(url)
            try:
                url = self.extractPrintURL(url)
            except Exception:
                self.log.exception('Failed to extract print URL for %s' % url)
            print(url)
            # url += re.sub(r'\?.*', '', div['href'])
            # Use the pubDate parsed above rather than regenerating the
            # current date, which would discard the feed's real timestamp.
            articleList.append(
                dict(title=title, url=url, date=pubDate,
                     description=description, content=''))
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of articles), where each
    # article is a dict:
    # {
    # 'title'       : article title,
    # 'url'         : URL of print version,
    # 'date'        : the publication date of the article as a string,
    # 'description' : a summary of the article,
    # 'content'     : the full article (can be an empty string). This is used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
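    # Illustrative shape of the return value (hypothetical data):
    #   [('Politics', [{'title': 'Some headline',
    #                   'url': 'http://www.realclearpolitics.com/printpage/?url=...',
    #                   'date': 'Tue, 24 Jan',
    #                   'description': 'A summary',
    #                   'content': ''}]),
    #    ('Science', [...])]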
    def parse_index(self):
        # Parse each feed into a ('feed title', list of articles) tuple.
        ans = []
        feedsCount = len(self.feedsets)
        for x in range(0, feedsCount):
            feedarticles = self.parseRSS(x)
            if feedarticles is not None:
                ans.append((self.feedsets[x][0], feedarticles))
        if self.debugMessages:
            print(ans)
        return ans