# calibre/recipes/oreilly_premium.recipe
import re
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
# BeautifulSoup makes re-parsing fetched HTML easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class OReillyPremium(BasicNewsRecipe):
    title = u'OReilly Premium'
    __author__ = 'TMcN'
    description = "Retrieves premium and newsletter content from BillOReilly.com. Requires a Bill O'Reilly Premium Membership."
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    auto_cleanup = True
    encoding = 'utf8'
    needs_subscription = True
    no_stylesheets = True
    oldest_article = 20
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 2000
    language = 'en'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br
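
    # A minimal sketch of how a successful sign-in could be verified before
    # parsing begins. The 'Sign Out' marker is an assumption about the member
    # pages, not something the original recipe checks for.
    def isLoggedIn(self, br):
        # Re-fetch the sign-in page; a logged-in session is assumed to show a
        # sign-out link instead of the login form (hypothetical marker).
        raw = br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp').read()
        return 'Sign Out' in raw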

    def extractPrintURL(self, baseURL, pageURL, printString):
        # Follow the "print this" link on pageURL if one exists; otherwise
        # fall back to the original page URL.
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to load "+pageURL+" while looking for print string "+printString)
        if printText:
            tag = printText.parent
            tagURL = baseURL+tag['href']
        return tagURL
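
    # For example, a call like
    #   self.extractPrintURL(baseURL, url, "Print this entry")
    # returns the print-friendly URL when the page carries a "Print this
    # entry" anchor, and falls back to the page URL otherwise.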

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    # Returns a qualifying article list.
    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
            a = div.find('a', href=True)
            if not a:
                continue
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', a['href'])
            # Prefer the print version when available.
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            if debugMessages:
                print("No Spin Archive Title: "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        return articleList
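
    # Every parser in this recipe builds the same article dict; a helper like
    # this one (a sketch, not used by the original code) captures the shared
    # pattern.
    def makeArticle(self, title, url, description='None'):
        # The site lists everything as "today", so a formatted timestamp
        # stands in for a real publication date.
        return dict(title=title, url=url, date=time.strftime('%a, %d %b'),
                    description=description, content='')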

    def parseTVArchives(self, baseURL, soupURL, debugMessages):
        # The TV Archives page has some Ajax, so look only at the static content.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseTVArchives")
        for a in soup.findAll('a', {'class':['showLinks', 'homeLinks']}):
            url = baseURL
            url += a['href']
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = self.tag_to_string(a, use_alt=True).strip()
            title = self.stripBadChars(title)
            if debugMessages:
                print("TV Archive "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = a.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseTVArchives")
        return articleList

    # Get the Daily Briefing archives.
    def parseDailyBriefs(self, baseURL, soupURL, debugMessages):
        if debugMessages:
            print("Starting daily briefs")
        articleList = []
        soup = self.index_to_soup(soupURL)
        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', div['href'])
            printURL = self.extractPrintURL(baseURL, url, "Print this entry")
            if printURL:
                url = printURL
            title = div.contents[0]
            if debugMessages:
                print("Daily Brief - title: "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving daily briefs")
        return articleList

    # Get the weekly Stratfor intelligence report.
    def parseStratfor(self, baseURL, soupURL, debugMessages):
        # http://www.billoreilly.com/blog?categoryID=5
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("In parseStratfor")
        a = soup.find('a', {'class':['blogLinks']})
        url = baseURL
        url += a['href']
        title = self.tag_to_string(a, use_alt=True).strip()
        if debugMessages:
            print("url: "+url)
            print("title: "+title)
        # Fetch the Stratfor page itself so we can get the real title.
        stratSoup = self.index_to_soup(url)
        title = stratSoup.html.head.title.string
        stratIndex = title.find('Stratfor.com:', 0)
        if stratIndex > -1:
            # Drop the "Stratfor.com: " prefix (13 characters plus the space).
            title = title[stratIndex+14:-1]
        # Look for the first <td class="blogBody">.
        stratBody = stratSoup.find('td', {'class':['blogBody']})
        if debugMessages:
            print("Strat content title: "+title)
            print("Strat body: "+stratBody.contents[0])
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseStratfor")
        return articleList

    def parseTalkingPoints(self, baseURL, soupURL, debugMessages):
        # Look for blogDate: that has the date. The next blogBody has the
        # title, and an anchor with class "homeBlogReadMore bold" has the URL.
        articleList = []
        soup = self.index_to_soup(soupURL)
        if debugMessages:
            print("Starting Talking Points")
        topDate = soup.find("td", "blogBody")
        if not topDate:
            print("Failed to find date in Talking Points")
            return articleList
        # This page has the contents in double-wrapped tables!
        myTable = topDate.findParents('table')[0]
        upOneTable = myTable.findParents('table')[0]
        upTwo = upOneTable.findParents('table')[0]
        # Now navigate the rows of upTwo.
        if debugMessages:
            print("Entering rows")
        for rows in upTwo.findChildren("tr", recursive=False):
            # Inside the top-level table, each row is one article. The middle
            # table is only for formatting, so skip straight to the row's first <tr>.
            articleTable = rows.find("tr")
            blogDate = articleTable.find("a", "blogDate").contents[0]
            # The title is in the second blogBody cell.
            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
            # Strip any query string from the link.
            url = baseURL
            url += re.sub(r'\?.*', '', blogURL)
            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
            if debugMessages:
                print("Talking Points Memo title "+title+" at url: "+url)
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Exiting parseTalkingPoints\n")
        return articleList
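
    # The nested layout walked above, as inferred from the selectors used (a
    # sketch, not verified against the live page):
    #   <table>                                    upTwo
    #     <tr>                                     one article per row
    #       <table><tr>                            formatting wrapper
    #         <a class="blogDate">                 date
    #         <td class="blogBody">                title (second occurrence)
    #         <a class="homeBlogReadMore bold">    article URL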

    def parseCurrentColumn(self, baseURL, soupURL, debugMessages):
        # Only needed to get the column title; otherwise the page is fine
        # as-is, and there is only one column.
        articleList = []
        soup = self.index_to_soup(soupURL)
        titleSpan = soup.find('span', {'class':['defaultHeader']})
        title = titleSpan.contents[0]
        # Use the print URL, since one is available.
        url = soupURL
        printURL = self.extractPrintURL(baseURL, soupURL, "Print This Article")
        if printURL:
            if debugMessages:
                print("Found print URL")
            url = printURL
        if debugMessages:
            print("url: "+url)
            print("title: "+title)
        description = 'None'
        pubdate = time.strftime('%a, %d %b')
        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
        if debugMessages:
            print("Leaving parseCurrentColumn")
        return articleList

    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # Returns a list of tuples ('feed title', list of article dicts), where each dict is:
    # {
    # 'title'       : article title,
    # 'url'         : URL of the print version,
    # 'date'        : the publication date of the article as a string,
    # 'description' : a summary of the article,
    # 'content'     : the full article (can be an empty string; used by FullContentProfile)
    # }
    # This is used instead of BasicNewsRecipe.parse_feeds().
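    # For example, a single-feed result would look like this (values illustrative):
    #   [('No Spin News', [{'title': 'Some entry', 'url': printURL,
    #                       'date': 'Sun, 08 Jan', 'description': '...', 'content': ''}])]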
    def parse_index(self):
        # Parse each index page into a soup and collect the articles.
        debugMessages = True
        baseURL = "https://www.billoreilly.com"

        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=False)).strip()

        articleList = []
        ans = []
        showList = self.parseTVArchives(baseURL, 'https://www.billoreilly.com/show?action=tvShowArchive', debugMessages)
        articleList = self.parseNoSpinArchives(baseURL, 'https://www.billoreilly.com/blog?categoryID=7', debugMessages)
        stratList = self.parseStratfor(baseURL, 'http://www.billoreilly.com/blog?categoryID=5', debugMessages)
        dailyBriefs = self.parseDailyBriefs(baseURL, 'http://www.billoreilly.com/blog?categoryID=11', debugMessages)
        talkingPoints = self.parseTalkingPoints(baseURL, 'https://www.billoreilly.com/blog?categoryID=12', debugMessages)
        currentColumn = self.parseCurrentColumn(baseURL, 'https://www.billoreilly.com/currentcolumn', debugMessages)
        # Return a list of ('feed title', article list) tuples. Each feed
        # works when enabled individually (Talking Points and No Spin News,
        # TV Shows and Stratfor Weekly, Daily Briefing and Current Column);
        # no idea why only the first two show up in the TOC now.
        ans = [("Talking Points Memos", talkingPoints), ("No Spin News", articleList), ("TV Shows", showList),
               ("Stratfor Weekly", stratList), ("Daily Briefing", dailyBriefs), ("Current Column", currentColumn)]
        if debugMessages:
            print(ans)
        return ans

    def preprocess_html(self, soup):
        # Some pages come back as a client-side redirect; follow the meta
        # refresh target and re-parse the fetched page.
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
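
# A redirecting page carries a tag like the following (URL illustrative);
# partition('=')[2] takes everything after the first '=', i.e. the
# site-relative target that preprocess_html re-fetches:
#
#   <meta http-equiv="refresh" content="0;URL=/pg/jsp/member/memberhome.jsp">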