from __future__ import print_function

import os
import time
import traceback
# traceback is imported for debugging via stack dumps.

from calibre.web.feeds.recipes import BasicNewsRecipe
# BeautifulSoup is the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

# To Do: strip ads and graphics; the Current Column entry lacks a title.
# The newsletter archive https://www.billoreilly.com/newsletterarchive is
# covered by other entries.
# Newsletters: Talking Points Memos are covered by category 12.
# Run with: ./ebook-convert  --username xxx --password xxx
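# A fuller invocation might look like this (the recipe filename and output
# path are hypothetical examples, not part of this recipe):
#   ebook-convert oreilly_premium.recipe oreilly_premium.epub --username xxx --password xxx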

# This recipe is derived from BasicNewsRecipe, so it can only overload that
# class's methods.  Some of what we need is otherwise held in the article
# objects, so we have more copying to do than we otherwise would.
class OReillyPremium(BasicNewsRecipe):
    __author__ = 'TMcN'
    description = 'Retrieves Premium and News Letter content from BillOReilly.com.  Requires a Bill OReilly Premium Membership.'
    cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
    custom_title = 'Bill O\'Reilly Premium - ' + time.strftime('%d %b %Y')
    title = 'Bill O\'Reilly Premium'
    auto_cleanup = True
    conversion_options = {'linearize_tables': True}
    encoding = 'utf8'
    language = 'en'
    no_stylesheets = True
    needs_subscription = True
    oldest_article = 31
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't recurse into linked pages.
    recursions = 0
    max_articles_per_feed = 20

    debugMessages = True

    # Each catList entry: Name, URL, Soup findAll tag name and attrs (the
    # last two fields are special cases), articleList.
    catList = [["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive',
                'a', {'class': ['showLinks', 'homeLinks']}, []],
               ["Current Column", 'https://www.billoreilly.com/currentcolumn',
                'span', {'class': ['defaultHeader']}, []]
               ]
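    # For example, catList[0][2] and catList[0][3] are passed straight to
    # soup.findAll() in parseGeneric(), and catList[0][4] receives the list
    # of article dicts built for that category.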

    feeds = [
        (u'No Spin',        u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
        (u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
        (u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
        (u'Blog',           u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
        (u'StratFor',       u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
    ]
    # http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8
    # is word for the day.

    # Note: Talking Points is broken in the scraping model above; the site
    # changed to be more Ajax-y, so it is now fetched via RSS instead.
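    # The RSS feeds above are fetched by the standard parse_feeds() call in
    # build_index() below and appended to the categories scraped by
    # parse_index().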

    def get_browser(self):
        print("In get_browser")
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
            br.select_form(name='login')
            br['formEmailField'] = self.username
            br['formPasswordField'] = self.password
            br.submit()
        return br

    # Returns the best-guess print URL.
    # The second parameter (pageURL) is returned if nothing is found.
    def extractPrintURL(self, baseURL, pageURL, printString):
        tagURL = pageURL
        printText = None
        soup = self.index_to_soup(pageURL)
        if soup:
            printText = soup.find('a', text=printString)
        else:
            print("Failed to find Print string " +
                  printString + " in " + pageURL)
        if printText:
            tag = printText.parent
            tagURL = baseURL + tag['href']
        return tagURL
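
    # For example (a hypothetical call, for illustration only):
    #   self.extractPrintURL('https://www.billoreilly.com',
    #                        'https://www.billoreilly.com/currentcolumn',
    #                        'Print This Article')
    # returns the target of the "Print This Article" link on that page, or
    # the page URL unchanged if no such link exists.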

    def stripBadChars(self, inString):
        return inString.replace("\'", "")

    def parseGeneric(self, baseURL):
        # Generic parsing of the catList categories.  The site originally
        # had six categories (0-5); only TV Archives and Current Column are
        # still scraped here, the rest now come from the RSS feeds.
        # Each entry: Name, URL, Soup findAll tag name and attrs, articleList.
        fullReturn = []
        for i in range(len(self.catList)):
            articleList = []
            print("In " + self.catList[i][0] + ", index: " + str(i))
            soup = self.index_to_soup(self.catList[i][1])
            # Set defaults
            description = 'None'
            pubdate = time.strftime('%a, %d %b')
            # TV Archives (i == 0) yields many articles per page, so it
            # loops over the matching tags; Current Column yields a single
            # article, so it does not.
            if i == 0:
                print("Starting TV Archives")
                for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
                    print("Next DIV:")
                    print(div)
                    a = div
                    summary = div.find(True, attrs={'class': 'summary'})
                    if summary:
                        description = self.tag_to_string(summary, use_alt=False)
                    if not a:
                        continue
                    # url = baseURL + re.sub(r'\?.*', '', a['href'])
                    url = baseURL + a['href']
                    url = self.extractPrintURL(baseURL, url, "Print this entry")
                    title = self.tag_to_string(a, use_alt=True).strip()
                    articleList.append(dict(
                        title=title, url=url, date=pubdate,
                        description=description, content=''))
            else:       # Current Column
                titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
                if titleSpan is None:
                    print("No Current Column Title Span")
                    print(soup)
                    continue
                title = titleSpan.contents[0]
                url = self.extractPrintURL(
                    baseURL, self.catList[i][1], "Print This Article")
            if i == 1:
                if self.debugMessages:
                    print(self.catList[i][0] + " Title:" +
                          title + " at url: " + url)
                summary = soup.find(True, attrs={'class': 'summary'})
                print("At Summary")
                print(summary)
                if summary is not None:
                    description = self.tag_to_string(summary, use_alt=False)
                print("At append")
                articleList.append(dict(
                    title=title, url=url, date=pubdate,
                    description=description, content=''))
            self.catList[i][4] = articleList
            fullReturn.append((self.catList[i][0], articleList))
            print("Returning")
            # print(fullReturn)
        return fullReturn

    def parse_index(self):
        # Parse the category pages into Python Soup
        print("Entering recipe parse_index from:")
        traceback.print_stack()
        print("web")
        baseURL = "https://www.billoreilly.com"
        masterList = self.parseGeneric(baseURL)
        # print(masterList)
        return masterList

    def preprocess_html(self, soup):
        print("In preprocess_html")
        refresh = soup.find('meta', {'http-equiv': 'refresh'})
        if refresh is None:
            return soup
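        # A meta refresh looks like content="0; URL=/some/path"; everything
        # after the first '=' is the path to the real article, so fetch and
        # parse that page instead.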
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('https://www.billoreilly.com' + content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))

    def build_index(self):
        print("In OReilly build_index()\n\n")
        feedsRSS = []
        self.report_progress(0, ('Fetching feeds...'))
        # try:
        feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                 max_articles_per_feed=self.max_articles_per_feed,
                                 log=self.log)
        self.report_progress(0, ('Got feeds from index page'))
        # except NotImplementedError:
        #     feeds = self.parse_feeds()
        # Now add regular feeds.
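        # parse_feeds() is inherited from BasicNewsRecipe; it downloads the
        # RSS feeds listed in the class-level `feeds` attribute above.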
        feedsRSS = self.parse_feeds()
        print("feedsRSS is type " + feedsRSS.__class__.__name__)

        for articles in feedsRSS:
            print("articles is type " + articles.__class__.__name__)
            print("Title:" + articles.title)
            feeds.append(articles)
        if not feeds:
            raise ValueError('No articles found, aborting')

        self.report_progress(0, ('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, ('Generating masthead...'))
        self.masthead_path = None

        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None

        if murl is not None:
            # Try downloading the user-supplied masthead_url
            # Failure sets self.masthead_path to None
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(
                self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None

        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1

        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)

        self.jobs = []

        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()

        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                if a >= self.max_articles_per_feed:
                    break
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                except:
                    self.log.exception(
                        'Failed to find print version for: ' + article.url)
                    url = None
                if not url:
                    continue
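                # Choose the fetch function: embedded feed content when the
                # recipe (or the feed itself) says so, the obfuscation
                # handler when articles_are_obfuscated is set, otherwise a
                # plain article fetch.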
                func, arg = (self.fetch_embedded_article, article) \
                    if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
                    else \
                    ((self.fetch_obfuscated_article if self.articles_are_obfuscated
                      else self.fetch_article), url)
                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)
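
        # Each article download is queued as a WorkRequest and run on a
        # thread pool of simultaneous_downloads workers; poll() raises
        # NoResultsPending once every queued request has finished.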
        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(
            0, ('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break
        for f, feed in enumerate(feeds):
            print("Writing feeds for " + feed.title)
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, ('Feeds downloaded to %s') % index)

        return index