calibre/recipes/oreilly_premium.recipe

import os, re, time, traceback
# traceback is used for debugging via stack traces.
from calibre.web.feeds.recipes import BasicNewsRecipe
# BeautifulSoup makes parsing the downloaded HTML easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To do: strip ads and graphics; the Current Column lacks a title.
# The newsletter archive at https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos are covered by category 12.
# Usage: ./ebook-convert --username xxx --password xxx
# This recipe is derived from BasicNewsRecipe, so it can only override what that class defines.
# Some of what we need is otherwise in the article itself, so there is more copying to do than usual.
class OReillyPremium(BasicNewsRecipe):
    title = u"Bill O'Reilly Premium"
    __author__ = 'TMcN'
    description = "Retrieves Premium and Newsletter content from BillOReilly.com. Requires a Bill O'Reilly Premium membership."
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = "Bill O'Reilly Premium - " + time.strftime('%d %b %Y')
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    language = 'en'
    no_stylesheets = True
    needs_subscription = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 20
    debugMessages = True
    # Each row: Name, URL, soup findAll tag, findAll attrs (last two are special-cased), articleList.
    catList = [
        ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class': ['showLinks', 'homeLinks']}, []],
        # ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class': ['blogBody'], 'style': ['padding-top:10px;']}, []],
        # ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class': ['defaultHeaderSmallLinks']}, []],
        # ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class': ['blogLinks']}, []],
        # ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
        ["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class': ['defaultHeader']}, []]
    ]
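    # Each row is consumed in parseGeneric() roughly as soup.findAll(row[2], row[3]);
    # for the TV Archives row, that is soup.findAll('a', {'class': ['showLinks', 'homeLinks']}).
    # The trailing empty list is later replaced by the scraped article list.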
    feeds = [
        (u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is Word of the Day.
    # Note: Talking Points is broken in the scraping model above; the site moved to a more
    # Ajax-heavy design, so it is now fetched via RSS instead.
    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # Form and field names come from the site's sign-in page.
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br
    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
            if printText:
                tag = printText.parent
                tagURL = baseURL + tag['href']
        else:
            print("Failed to find print string " + printString + " in " + pageURL)
        return tagURL
    # Remove single quotes from a string.
    def stripBadChars(self, inString):
        return inString.replace("'", "")
    def parseGeneric(self, baseURL):
        # Generic parsing of the articles. catList rows are:
        # Name, URL, findAll tag, findAll attrs (last two are special-cased), articleList.
        # No Spin and TV are generic.
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            print("In " + self.catList[i][0] + ", index: " + str(i))
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults.
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # Category 0 (TV Archives) yields many entries per page, so it needs
            # a for loop; Current Column yields exactly one.
            if i == 0:
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print("Next DIV:")
                    print(div)
                    a = div
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL + re.sub(r'\?.*', '', a['href'])
                    url = baseURL + a['href']
                    url = self.extractPrintURL(baseURL, url, "Print this entry")
                    title = self.tag_to_string(a, use_alt=True).strip()
                    articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            else:  # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None:
                    print("No Current Column Title Span")
                    print(soup)
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
                if i == 1:
                    if self.debugMessages:
                        print(self.catList[i][0] + " Title:" + title + " at url: " + url)
                    # Search the page soup for the summary (no div exists in this branch).
                    summary = soup.find(True, attrs={'class': 'summary'})
                    print("At Summary")
                    print(summary)
                    if summary is not None:
                        description = self.tag_to_string(summary, use_alt=False)
                print("At append")
                articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
            # Store the article list in the last slot of the category row.
            self.catList[i][4] = articleList
            fullReturn.append((self.catList[i][0], articleList))
        print("Returning")
        # print(fullReturn)
        return fullReturn
    # build_index() starts with:
    #   try:
    #       feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
    #                                max_articles_per_feed=self.max_articles_per_feed,
    #                                log=self.log)
    #       self.report_progress(0, _('Got feeds from index page'))
    #   except NotImplementedError:
    #       feeds = self.parse_feeds()
    # feeds_from_index() in turn comes from calibre.web.feeds.__init__.py:
    #   def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
    #                        log=default_log):
    #       '''
    #       @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
    #       @return: A list of L{Feed} objects.
    #       @rtype: list
    #       '''
    #       feeds = []
    #       for title, articles in index:
    #           pfeed = Feed(log=log)
    #           pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
    #                                              max_articles_per_feed=max_articles_per_feed)
    #           feeds.append(pfeed)
    #       return feeds
    # use_embedded_content defaults to None; in that case, if an article's feed content is
    # longer than 2K, it is used as the article.
    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
    # It returns a list of tuples ('feed title', list of articles), where each article is a dict:
    #   {
    #       'title'       : article title,
    #       'url'         : URL of print version,
    #       'date'        : the publication date of the article as a string,
    #       'description' : a summary of the article,
    #       'content'     : the full article (can be an empty string; used by FullContentProfile)
    #   }
    # This is used instead of BasicNewsRecipe.parse_feeds(); it is called by download().
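    # A minimal sketch of the return shape (illustrative values only, not actual site data):
    #   [
    #       ('TV Archives', [
    #           {'title': 'Example segment', 'url': 'https://www.billoreilly.com/...',
    #            'date': 'Mon, 02 Apr', 'description': 'One-line summary', 'content': ''},
    #       ]),
    #       ('Current Column', [one article dict in the same form]),
    #   ]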
    def parse_index(self):
        # Parse the category pages into (feed title, article list) tuples.
        print("Entering recipe parse_index from:")
        traceback.print_stack()
        print("web")
        baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
        # print(masterList)
        return masterList
    def preprocess_html(self, soup):
        print("In preprocess_html")
        # If the page is just a meta-refresh redirect, follow it to the real article.
        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com' + content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
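    # Illustrative example of the redirect markup handled above (not taken from
    # an actual page):
    #   <meta http-equiv="refresh" content="0; url=/printarticle?id=123">
    # partition('=')[2] extracts '/printarticle?id=123', which is then fetched
    # relative to https://www.billoreilly.com.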
    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
        self.report_progress(0, _('Fetching feeds...'))
        # Unlike the base class, build the scraped index feeds unconditionally...
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, _('Got feeds from index page'))
        # ...then add the regular RSS feeds on top of them.
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type " + feedsRSS.__class__.__name__)
        for articles in feedsRSS:
            print("articles is type " + articles.__class__.__name__)
            print("Title:" + articles.title)
            feeds.append(articles)
        if not feeds:
            raise ValueError('No articles found, aborting')
        # feeds = FeedCollection(feeds)
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, _('Generating masthead...'))
        self.masthead_path = None
        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None
        if murl is not None:
            # Try downloading the user-supplied masthead_url.
            # Failure sets self.masthead_path to None.
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None
        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')
        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)
        self.jobs = []
        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()
        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)
            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception('Failed to find print version for: ' + article.url)
                    url = None
                if not url:
                    continue
                # Pick the fetch function: embedded feed content, an obfuscated
                # article, or a plain article fetch.
                func, arg = (self.fetch_embedded_article, article) \
                    if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
                    else \
                    ((self.fetch_obfuscated_article if self.articles_are_obfuscated
                      else self.fetch_article), url)
                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)
        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)
        self.report_progress(0, _('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break
        for f, feed in enumerate(feeds):
            print("Writing feeds for " + feed.title)
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s') % index)
        return index