Updated OReilly Premium and Real Clear

This commit is contained in:
Kovid Goyal 2012-04-02 09:00:25 +05:30
parent ee108790db
commit bb443d01f1
2 changed files with 284 additions and 109 deletions

View File

@@ -1,45 +1,73 @@
# Talking Points is not grabbing everything.
# The look is right, but only the last one added?
import string, re
import re
import time
import traceback
# above for debugging via stack
from calibre.web.feeds.recipes import BasicNewsRecipe
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
# strip ads and graphics
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# Talking Points Memo - shorten title - Remove year and Bill's name
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# This is derived from BasicNewsRecipe, so it can only override what BasicNewsRecipe provides.
# Some of what we need is otherwise in the article, so we have more copying to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
__author__ = 'TMcN'
description = 'Retrieves Premium and News Letter content from BillOReilly.com. Requires a Bill OReilly Premium Membership.'
cover_url = 'http://images.billoreilly.com/images/headers/billgray_header.png'
custom_title = 'Bill O\'Reilly Premium - '+ time.strftime('%d %b %Y')
title = 'Bill O\'Reilly Premium'
auto_cleanup = True
conversion_options = {'linearize_tables': True}
encoding = 'utf8'
language = 'en'
no_stylesheets = True
needs_subscription = True
oldest_article = 31
remove_javascript = True
remove_tags = [dict(name='img', attrs={})]
# Don't go down
recursions = 0
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
# ["Daily Briefings", 'http://www.billoreilly.com/blog?categoryID=11', True, {'class':['defaultHeaderSmallLinks']}, []],
# ["Stratfor", 'http://www.billoreilly.com/blog?categoryID=5', 'a', {'class':['blogLinks']}, []],
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
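# Sketch only (not part of the recipe): each catList row is consumed generically by
# parseGeneric() below, roughly as
# name, url, tag, attrs, _ = self.catList[i]
# soup = self.index_to_soup(url)
# soup.findAll(tag, attrs)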
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is the Word of the Day feed.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
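# All of the RSS entries above share one URL pattern (blogArchive&rss=true&categoryID=N).
# As a sketch only (not part of this recipe), the Word of the Day feed noted above could
# be added the same way:
# (u'Word of the Day', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8'),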
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
@@ -66,6 +94,7 @@ class OReillyPremium(BasicNewsRecipe):
def stripBadChars(self, inString) :
return inString.replace("\'", "")
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
@@ -73,6 +102,7 @@ class OReillyPremium(BasicNewsRecipe):
fullReturn = []
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
# Set defaults
description = 'None'
@@ -81,14 +111,12 @@ class OReillyPremium(BasicNewsRecipe):
# 3-5 create one.
# So no for-div for 3-5
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
print("Next DIV:")
print(div)
if i == 1:
a = div.find('a', href=True)
else :
a = div
print(a)
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
@@ -96,82 +124,63 @@ class OReillyPremium(BasicNewsRecipe):
continue
# url = baseURL+re.sub(r'\?.*', '', a['href'])
url = baseURL+a['href']
if i < 2 :
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = self.tag_to_string(a, use_alt=True).strip()
elif i == 2 :
# Daily Briefs
url = self.extractPrintURL(baseURL, url, "Print this entry")
title = div.contents[0]
if self.debugMessages :
print(title+" @ "+url)
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
elif i == 3 : # Stratfor
a = soup.find('a', self.catList[i][3])
if a is None :
continue
url = baseURL+a['href']
title = self.tag_to_string(a, use_alt=True).strip()
# Get Stratfor contents so we can get the real title.
stratSoup = self.index_to_soup(url)
title = stratSoup.html.head.title.string
stratIndex = title.find('Stratfor.com:', 0)
if (stratIndex > -1) :
title = title[stratIndex+14:-1]
# Look for first blogBody <td class="blogBody"
# Changed 12 Jan 2012 - new page format
#stratBlogTable = stratSoup.find('td', {'class':['blogBody']}).findParent('table')
#stratBody = stratSoup.find('td', {'class':['blogBody']})
elif i == 4 : # Talking Points
topDate = soup.find("td", "blogBody")
if not topDate :
print("Failed to find date in Talking Points")
# This page has the contents in double-wrapped tables!
myTable = topDate.findParents('table')[0]
if myTable is not None:
upOneTable = myTable.findParents('table')[0]
if upOneTable is not None:
upTwo = upOneTable.findParents('table')[0]
if upTwo is None:
continue
# Now navigate rows of upTwo
if self.debugMessages :
print("Entering rows")
for rows in upTwo.findChildren("tr", recursive=False):
# Inside top level table, each row is an article
rowTable = rows.find("table")
articleTable = rowTable.find("table")
# This looks wrong.
articleTable = rows.find("tr")
# The middle table is just for formatting the article buffer... but this means we can skip the inner table.
blogDate = articleTable.find("a","blogDate").contents[0]
# Skip to second blogBody for this.
blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
url = baseURL+re.sub(r'\?.*', '', blogURL)
title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
if self.debugMessages :
print("Talking Points Memo title "+title+" at url: "+url)
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description='None', content=''))
else : # Current Column
titleSpan = soup.find(self.catList[i][2], self.catList[i][3])
if titleSpan is None :
print("No Current Column Title Span")
print(soup)
continue
title = titleSpan.contents[0]
url = self.extractPrintURL(baseURL, self.catList[i][1], "Print This Article")
if i == 1 :
if self.debugMessages :
print(self.catList[i][0]+" Title:"+title+" at url: "+url)
summary = div.find(True, attrs={'class':'summary'})
print("At Summary")
print(summary)
if summary is not None:
description = self.tag_to_string(summary, use_alt=False)
print("At append")
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
self.catList[i][3] = articleList
fullReturn.append((self.catList[i][0], articleList))
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
# max_articles_per_feed=self.max_articles_per_feed,
# log=self.log)
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
#'''
#@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
#@return: A list of L{Feed} objects.
#@rtype: list
#'''
#feeds = []
#for title, articles in index:
# pfeed = Feed(log=log)
# pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
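# Illustration only -- the build_index() override near the end of this recipe effectively does:
# feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article, ...)
# feeds.extend(self.parse_feeds())   # then append the RSS feeds listed at the top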
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@@ -182,12 +191,19 @@ class OReillyPremium(BasicNewsRecipe):
# 'content' : The full article (can be an empty string). This is used by FullContentProfile
# }
# this is used instead of BasicNewsRecipe.parse_feeds().
# it is called by download
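# Illustration only -- a hypothetical sketch of the shape parse_index()/parseGeneric()
# hand back (titles and URLs below are made up, not taken from the site):
# [
#   ('TV Archives', [
#     {'title': 'Example show page', 'url': 'https://www.billoreilly.com/example-print-url',
#      'date': 'Mon, 02 Apr', 'description': 'None', 'content': ''}
#   ]),
#   ('Current Column', [
#     {'title': 'Example column', 'url': 'https://www.billoreilly.com/example-column-print-url',
#      'date': 'Mon, 02 Apr', 'description': 'None', 'content': ''}
#   ])
# ]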
def parse_index(self):
# Parse the page into Python Soup
print("Entering recipe print_index from:")
traceback.print_stack()
print("web")
baseURL = "https://www.billoreilly.com"
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
if refresh is None:
return soup
@@ -195,3 +211,128 @@ class OReillyPremium(BasicNewsRecipe):
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
feeds.append(articles)
if not feeds:
raise ValueError('No articles found, aborting')
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.masthead_path = None
try:
murl = self.get_masthead_url()
except:
self.log.exception('Failed to get masthead url')
murl = None
if murl is not None:
# Try downloading the user-supplied masthead_url
# Failure sets self.masthead_path to None
self.download_masthead(murl)
if self.masthead_path is None:
self.log.info("Synthesizing mastheadImage")
self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
try:
self.default_masthead_image(self.masthead_path)
except:
self.log.exception('Failed to generate default masthead image')
self.masthead_path = None
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
with open(index, 'wb') as fi:
fi.write(html)
self.jobs = []
if self.reverse_article_order:
for feed in feeds:
if hasattr(feed, 'reverse'):
feed.reverse()
self.feed_objects = feeds
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
except:
self.log.exception('Failed to find print version for: '+article.url)
url = None
if not url:
continue
func, arg = (self.fetch_embedded_article, article) \
if self.use_embedded_content or (self.use_embedded_content == None and feed.has_embedded_content()) \
else \
((self.fetch_obfuscated_article if self.articles_are_obfuscated \
else self.fetch_article), url)
req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
for f, feed in enumerate(feeds):
print("Writing feeds for "+feed.title)
html = self.feed2index(f,feeds)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index

View File

@@ -1,7 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@@ -20,12 +22,13 @@ class RealClear(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 400
debugMessages = True
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
["Science", "http://www.realclearscience.com/index.xml", 0],
["Tech", "http://www.realcleartechnology.com/index.xml", 0],
# The feedburner is essentially the same as the top feed, politics.
# ["Politics Burner", "http://feeds.feedburner.com/realclearpolitics/qlMj", 1],
@@ -37,7 +40,9 @@ class RealClear(BasicNewsRecipe):
]
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
@@ -48,11 +53,24 @@ class RealClear(BasicNewsRecipe):
# usatoday - just prints with all current crap anyhow
]
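# A minimal sketch (hypothetical, standalone) of how extractPrintURL() below reads one
# printhints row via the named indices; the hint values here mirror the realclear row above:
# hint = ["realclear", "", '', 'printpage']
# if len(hint[phHrefSearch]) > 0 and len(hint[phLinkText]) == 0:
#     # realclear-style row: find a link whose href mentions "printpage"
#     printFind = soup.find(href=re.compile(hint[phHrefSearch]))
# else:
#     # billoreilly/politico-style rows: find the tag by its visible link text
#     printFind = soup.find(hint[phMainSearch], text=hint[phLinkText])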
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append), so maybe try that first?
# http://www.realclearpolitics.com/printpage/?url=http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879-full.html
# Single page articles don't have a _full; e.g. http://www.realclearpolitics.com/articles/2012/01/25/obamas_green_robber_barons_112897.html
# Use the FULL PRINTPAGE URL; it formats it better too!
#
# NYT - try single page...
# Need special code - is it one page or several? Which URL?
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
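# Sketch of the transform described above (illustration only; the recipe itself locates
# the print link through the printhints search rather than building the URL by hand,
# and single-page articles have no "-full" variant):
# print_url = baseURL + "/printpage/?url=" + pageURL.replace(".html", "-full.html")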
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
tagURL = pageURL
baseParse = urlparse(pageURL)
baseURL = baseParse[0]+"://"+baseParse[1]
hintsCount =len(self.printhints)
for x in range(0,hintsCount):
if pageURL.find(self.printhints[x][0])== -1 :
@@ -62,23 +80,37 @@ class RealClear(BasicNewsRecipe):
soup = self.index_to_soup(pageURL)
if soup is None:
return pageURL
if len(self.printhints[x][self.phHrefSearch])>0 and len(self.printhints[x][self.phLinkText]) == 0:
# e.g. RealClear
if self.debugMessages == True :
print("Search by href: "+self.printhints[x][self.phHrefSearch])
printFind = soup.find(href=re.compile(self.printhints[x][self.phHrefSearch]))
elif len(self.printhints[x][3])>0 and len(self.printhints[x][1]) == 0:
if self.debugMessages == True :
print("Search 1: "+self.printhints[x][2]+" Attributes: ")
print(self.printhints[x][3])
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3])
elif len(self.printhints[x][3])>0 :
if self.debugMessages == True :
print("search2")
printFind = soup.find(self.printhints[x][2], attrs=self.printhints[x][3], text=self.printhints[x][1])
else :
if self.debugMessages == True:
print("Default Search: "+self.printhints[x][2]+" Text: "+self.printhints[x][1])
printFind = soup.find(self.printhints[x][2], text=self.printhints[x][1])
if printFind is None:
if self.debugMessages == True :
print("Not Found")
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
print("Check "+printFind['href']+" for base of "+baseURL)
if printFind['href'].find("http")!=0 :
return baseURL+printFind['href']
return printFind['href']
tag = printFind.parent
print(tag)
@@ -158,6 +190,7 @@ class RealClear(BasicNewsRecipe):
def parse_index(self):
# Parse the page into Python Soup
articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@@ -168,3 +201,4 @@ class RealClear(BasicNewsRecipe):
print(ans)
return ans