mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Real Clear by TMcN
parent 541b8cc368, commit 0be2914578
170 recipes/real_clear.recipe Normal file
@@ -0,0 +1,170 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import time

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString


class RealClear(BasicNewsRecipe):
    title = u'Real Clear'
    __author__ = 'TMcN'
    description = 'Real Clear Politics/Science/etc... aggregation of news\n'
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'Real Clear - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]

    # Don't follow links out of an article page.
    recursions = 0
    max_articles_per_feed = 400
    debugMessages = False

    # Each entry is [feed title, feed URL, type]. The numeric third column is a
    # feed-type flag (0 for the site index feeds, 1 for the commented-out
    # feedburner feeds, 2 for the blog); parseRSS() does not currently use it.
    feedsets = [
        ["Politics", "http://www.realclearpolitics.com/index.xml", 0],
        ["Science", "http://www.realclearscience.com/index.xml", 0],
        ["Tech", "http://www.realcleartechnology.com/index.xml", 0],
        # The feedburner feed is essentially the same as the top (politics) feed.
        # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
        # ["Commentary", "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
        ["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
        ["Markets", "http://www.realclearmarkets.com/articles/index.xml", 0],
        ["World", "http://www.realclearworld.com/index.xml", 0],
        ["World Blog", "http://www.realclearworld.com/blog/index.xml", 2]
    ]
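    # To add another feed, append a row of the same shape; for example, a
    # hypothetical entry (URL invented purely for illustration, not from the
    # original recipe):
    # ["Energy", "http://www.realclearenergy.org/index.xml", 0],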

    # Hints to extractPrintURL. Each row is:
    #   [URL snippet, link text to search for, tag name, class attribute]
    # The snippet selects which row applies to a given article URL; the other
    # columns drive the search for the print link, starting with the attributes
    # and drilling down to the text.
    printhints = [
        ["billoreilly.com", "Print this entry", 'a', ''],
        ["billoreilly.com", "Print This Article", 'a', ''],
        ["politico.com", "Print", 'a', 'share-print'],
        ["nationalreview.com", ">Print<", 'a', ''],
        ["reason.com", "", 'a', 'printer']
        # The following sites are not supported because their print links are
        # built by JavaScript, and would require obfuscated_article to handle:
        #   forbes
        #   usatoday - its print view includes all the current page clutter anyhow
    ]
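    # How a row is applied, as I read extractPrintURL below (an illustration,
    # not part of the original recipe): for the politico.com row the lookup is
    # roughly
    #   soup.find('a', attrs='share-print', text='Print')
    # and for the reason.com row, whose text column is empty, it reduces to
    #   soup.find('a', attrs='printer')
    # BeautifulSoup 3 accepts a bare string for attrs and matches it against
    # the tag's class attribute.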

    # Returns the best-guess print URL for pageURL.
    # If no print link can be found, pageURL itself is returned.
    def extractPrintURL(self, pageURL):
        tagURL = pageURL
        for hint in self.printhints:
            if pageURL.find(hint[0]) == -1:
                continue
            print("Trying " + hint[0])
            # Only fetch the soup once we have a matching hint to check for
            # the print link with.
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
            if len(hint[3]) > 0 and len(hint[1]) == 0:
                # Class attribute given but no link text: match on class alone.
                if self.debugMessages:
                    print("search1")
                printFind = soup.find(hint[2], attrs=hint[3])
            elif len(hint[3]) > 0:
                # Both class attribute and link text given.
                if self.debugMessages:
                    print("search2")
                printFind = soup.find(hint[2], attrs=hint[3], text=hint[1])
            else:
                # Link text only.
                printFind = soup.find(hint[2], text=hint[1])
            if printFind is None:
                if self.debugMessages:
                    print("Not Found")
                continue
            print(printFind)
            if not isinstance(printFind, NavigableString):
                # Matched a tag directly; use its href if it has one.
                if printFind.get('href') is not None:
                    return printFind['href']
            # Otherwise (a text node, or a tag without its own href), walk up
            # to the enclosing anchor.
            tag = printFind.parent
            print(tag)
            if tag.get('href') is None:
                if self.debugMessages:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent['href']
            return tag['href']
        return tagURL
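    # Intended behavior, illustrated (hypothetical scenario, not captured
    # output): a politico.com article URL matches the third printhints row, the
    # page is fetched, the <a class="share-print">Print</a> anchor is found,
    # and its href is returned; a URL matching no row comes back unchanged.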

    def get_browser(self):
        if self.debugMessages:
            print("In get_browser")
        # Delegate to the base class implementation; the unbound call needs
        # self passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        return br

    # Parses one feed from feedsets into a list of article dicts for parse_index.
    def parseRSS(self, index):
        if self.debugMessages:
            print("\n\nStarting " + self.feedsets[index][0])
        articleList = []
        soup = self.index_to_soup(self.feedsets[index][1])
        for div in soup.findAll("item"):
            title = div.find("title").contents[0]
            # Prefer originalLink (feedburner), then fall back to link and
            # finally guid. The parser may lowercase tag names, so try both
            # spellings of originalLink.
            urlEl = div.find("originalLink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("originallink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("link")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("guid")
            if urlEl is None or title is None or len(urlEl.contents) == 0:
                print("Error in feed " + self.feedsets[index][0])
                print(div)
                continue
            print(title)
            print(urlEl)
            url = urlEl.contents[0].encode("utf-8")
            description = div.find("description")
            if description is not None and description.contents is not None and len(description.contents) > 0:
                description = description.contents[0]
            else:
                description = "None"
            pubDateEl = div.find("pubDate")
            if pubDateEl is None:
                pubDateEl = div.find("pubdate")
            if pubDateEl is None:
                pubDate = time.strftime('%a, %d %b')
            else:
                pubDate = pubDateEl.contents[0]
            if self.debugMessages:
                print("Article")
                print(title)
                print(description)
                print(pubDate)
                print(url)
            url = self.extractPrintURL(url)
            print(url)
            # The feed's pubDate is collected above for debugging, but the
            # article itself is stamped with today's date.
            pubdate = time.strftime('%a, %d %b')
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # It returns a list of tuples ('feed title', list of articles), where each
    # article is a dict:
    # {
    #     'title'       : article title,
    #     'url'         : URL of print version,
    #     'date'        : the publication date of the article as a string,
    #     'description' : a summary of the article,
    #     'content'     : the full article (can be an empty string). This is used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
    def parse_index(self):
        ans = []
        for x in range(len(self.feedsets)):
            feedarticles = self.parseRSS(x)
            if feedarticles is not None:
                ans.append((self.feedsets[x][0], feedarticles))
        if self.debugMessages:
            print(ans)
        return ans
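    # Shape of the return value, with illustrative values (not real output):
    # [
    #     ('Politics', [{'title': u'Some headline',
    #                    'url': 'http://www.realclearpolitics.com/...',
    #                    'date': 'Mon, 09 Jan',
    #                    'description': u'One-line summary', 'content': ''}]),
    #     ('Science', [...]),
    # ]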