from __future__ import print_function

# Test with:
#   "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import re
import time

from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


class RealClear(BasicNewsRecipe):
    title = u'Real Clear'
    __author__ = 'TMcN'
    description = 'Real Clear Politics/Science/etc... aggregation of news\n'
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'Real Clear - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't follow links out of the article pages.
    recursions = 0
    max_articles_per_feed = 400
    debugMessages = True

    # The numeric third column is a feed-type flag; it is kept for
    # reference but is not used by parseRSS() below.
    feedsets = [
        ["Politics",     "http://www.realclearpolitics.com/index.xml", 0],
        ["Policy",       "http://www.realclearpolicy.com/index.xml", 0],
        ["Science",      "http://www.realclearscience.com/index.xml", 0],
        ["Tech",         "http://www.realcleartechnology.com/index.xml", 0],
        # The feedburner feed is essentially the same as the top feed, Politics.
        # ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
        # ["Commentary",      "http://feeds.feedburner.com/Realclearpolitics-Articles", 1],
        ["Markets Home", "http://www.realclearmarkets.com/index.xml", 0],
        ["Markets",      "http://www.realclearmarkets.com/articles/index.xml", 0],
        ["World",        "http://www.realclearworld.com/index.xml", 0],
        ["World Blog",   "http://www.realclearworld.com/blog/index.xml", 2],
    ]
    # Hints to extractPrintURL.
    # First column is the URL snippet.  Then the link text to search for,
    # the tag to look for, and the attributes/href pattern to match.
    # Start with the attributes and drill down.
    phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
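
    # Illustrative only: how one hint row maps onto the named columns
    # (row values taken from the politico.com entry below).
    #   hint = ["politico.com", "Print", 'a', 'share-print']
    #   hint[phUrlSnip]    == "politico.com"   # substring matched against the article URL
    #   hint[phLinkText]   == "Print"          # link text to search for
    #   hint[phMainSearch] == 'a'              # tag name to find
    #   hint[phHrefSearch] == 'share-print'    # attrs / href pattern to match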

    printhints = [
        ["realclear",          "",                   '',  'printpage'],
        ["billoreilly.com",    "Print this entry",   'a', ''],
        ["billoreilly.com",    "Print This Article", 'a', ''],
        ["politico.com",       "Print",              'a', 'share-print'],
        ["nationalreview.com", ">Print<",            'a', ''],
        ["reason.com",         "",                   'a', 'printer'],
        # The following are not supported because they require JavaScript;
        # handling them would need get_obfuscated_article():
        # forbes
        # usatoday - just prints with all the current cruft anyhow
    ]
    # The print link isn't obvious, and only the ending differs (the -full
    # append), so maybe try that first?  (See the sketch below.)
    # http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
    # Single-page articles don't have a _full; e.g.
    # http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
    # Use the FULL PRINTPAGE URL; it formats it better too!
    #
    # NYT - try single page...
    # Needs special code - is it one page or several?  Which URL?
    # from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
    # to   http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
    # which is at <link rel="canonical"> and at <meta property="og:url">,
    # or look for "Single Page".

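    # A minimal sketch (illustration only, not used by the recipe) of the
    # "-full printpage" guess described above; the helper name and the URL
    # rewriting are assumptions, not a documented RealClear API:
    #
    #   def _guess_printpage_url(article_url):
    #       full = re.sub(r'\.html$', '-full.html', article_url)
    #       return 'http://www.realclearpolitics.com/printpage/?url=' + full
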
    # Returns the best-guess print URL.
    # The parameter (pageURL) is returned unchanged if nothing better is found.
    def extractPrintURL(self, pageURL):
        tagURL = pageURL
        baseParse = urlparse(pageURL)
        baseURL = baseParse[0] + "://" + baseParse[1]
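        # Illustration: urlparse('http://www.realclearpolitics.com/articles/x.html')
        # has scheme 'http' at index 0 and netloc 'www.realclearpolitics.com' at
        # index 1, so baseURL becomes 'http://www.realclearpolitics.com'.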
        hintsCount = len(self.printhints)
        for x in range(0, hintsCount):
            if pageURL.find(self.printhints[x][self.phUrlSnip]) == -1:
                continue
            print("Trying " + self.printhints[x][self.phUrlSnip])
            # Only retrieve the soup if we have a match to check for the
            # printed article with.
            soup = self.index_to_soup(pageURL)
            if soup is None:
                return pageURL
            if len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                # e.g. RealClear: search by href pattern only.
                if self.debugMessages:
                    print("Search by href: " +
                          self.printhints[x][self.phHrefSearch])
                printFind = soup.find(href=re.compile(
                    self.printhints[x][self.phHrefSearch]))
            # NOTE: the next branch repeats the condition above and is
            # therefore unreachable as written; kept for reference.
            elif len(self.printhints[x][self.phHrefSearch]) > 0 and len(self.printhints[x][self.phLinkText]) == 0:
                if self.debugMessages:
                    print("Search 1: " +
                          self.printhints[x][self.phMainSearch] + " Attributes: ")
                    print(self.printhints[x][self.phHrefSearch])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    attrs=self.printhints[x][self.phHrefSearch])
            elif len(self.printhints[x][self.phHrefSearch]) > 0:
                if self.debugMessages:
                    print("search2")
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    attrs=self.printhints[x][self.phHrefSearch],
                    text=self.printhints[x][self.phLinkText])
            else:
                if self.debugMessages:
                    print("Default Search: " + self.printhints[x][self.phMainSearch] +
                          " Text: " + self.printhints[x][self.phLinkText])
                printFind = soup.find(
                    self.printhints[x][self.phMainSearch],
                    text=self.printhints[x][self.phLinkText])
            if printFind is None:
                if self.debugMessages:
                    print("Not Found")
                    # print(soup)
                    print("end soup\n\n")
                continue

            print(printFind)
            if not isinstance(printFind, NavigableString):
                # Use .get() so a tag without an href doesn't raise KeyError.
                href = printFind.get('href')
                if href is not None:
                    print("Check " + href + " for base of " + baseURL)
                    # A relative link needs the scheme://host prefix restored.
                    if href.find("http") != 0:
                        return baseURL + href
                    return href
            # The match was a text node (or a tag without an href): look for
            # the link on the parent, then on the grandparent.
            tag = printFind.parent
            print(tag)
            if tag.get('href', None) is None:
                if self.debugMessages:
                    print("Not in parent, trying skip-up")
                if tag.parent.get('href') is None:
                    if self.debugMessages:
                        print("Not in skip either, aborting")
                    continue
                return tag.parent['href']
            return tag['href']
        return tagURL
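
    # Illustration (hypothetical values): for an article URL matching the
    # "realclear" hint, extractPrintURL() is expected to return the printpage
    # link found on the page, e.g.
    #   extractPrintURL('http://www.realclearpolitics.com/articles/.../foo_112897.html')
    #   -> 'http://www.realclearpolitics.com/printpage/?url=...foo_112897-full.html'
    # and to return its argument unchanged when no hint matches.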

    def get_browser(self):
        if self.debugMessages:
            print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        return br

    def parseRSS(self, index):
        if self.debugMessages:
            print("\n\nStarting " + self.feedsets[index][0])
        articleList = []
        soup = self.index_to_soup(self.feedsets[index][1])
        for div in soup.findAll("item"):
            # Guard against items without a <title>, so the error check
            # below can run instead of raising AttributeError.
            titleEl = div.find("title")
            title = titleEl.contents[0] if titleEl is not None and len(titleEl.contents) > 0 else None
            # Feeds are inconsistent about where the article URL lives:
            # try originalLink, then its lowercased form, then link, then guid.
            urlEl = div.find("originalLink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("originallink")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("link")
            if urlEl is None or len(urlEl.contents) == 0:
                urlEl = div.find("guid")
            if urlEl is None or title is None or len(urlEl.contents) == 0:
                print("Error in feed " + self.feedsets[index][0])
                print(div)
                continue
            print(title)
            print(urlEl)
            # Keep the URL as text; encoding to bytes here breaks the string
            # comparisons in extractPrintURL() under Python 3.
            url = urlEl.contents[0]
            description = div.find("description")
            if description is not None and description.contents is not None and len(description.contents) > 0:
                description = description.contents[0]
            else:
                description = "None"
            pubDateEl = div.find("pubDate")
            if pubDateEl is None:
                pubDateEl = div.find("pubdate")
            if pubDateEl is None:
                pubDate = time.strftime('%a, %d %b')
            else:
                pubDate = pubDateEl.contents[0]
            if self.debugMessages:
                print("Article")
                print(title)
                print(description)
                print(pubDate)
                print(url)
            try:
                url = self.extractPrintURL(url)
            except Exception:
                self.log.exception('Failed to extract print URL for %s' % url)
            print(url)
            # url += re.sub(r'\?.*', '', div['href'])
            # Use the pubDate parsed above rather than regenerating the
            # current date, which would discard the feed's real timestamp.
            articleList.append(
                dict(title=title, url=url, date=pubDate,
                     description=description, content=''))
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of articles), where each
    # article is a dict:
    # {
    # 'title'       : article title,
    # 'url'         : URL of print version,
    # 'date'        : the publication date of the article as a string,
    # 'description' : a summary of the article,
    # 'content'     : the full article (can be an empty string). This is used by FullContentProfile.
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
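    # Illustrative shape of the return value (hypothetical data):
    #   [('Politics', [{'title': 'Some headline',
    #                   'url': 'http://www.realclearpolitics.com/printpage/?url=...',
    #                   'date': 'Tue, 24 Jan',
    #                   'description': 'A summary',
    #                   'content': ''}]),
    #    ('Science', [...])]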
    def parse_index(self):
        # Parse each feed into a ('feed title', list of articles) tuple.
        ans = []
        feedsCount = len(self.feedsets)
        for x in range(0, feedsCount):
            feedarticles = self.parseRSS(x)
            if feedarticles is not None:
                ans.append((self.feedsets[x][0], feedarticles))
        if self.debugMessages:
            print(ans)
        return ans