OReilly Premium by TechnoCat

2026-05-29 18:22:37 -04:00 · 2012-01-08 08:58:59 +05:30
parent 2dd5346104
commit 554a351966
1 changed files with 263 additions and 0 deletions
@@ -0,0 +1,263 @@
+import re
+import time
+from calibre.web.feeds.recipes import BasicNewsRecipe
+# Allows the Python soup converter, which makes parsing easier.
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class OReillyPremium(BasicNewsRecipe):
+    """Recipe for the Premium and News Letter content of BillOReilly.com.
+
+    The article index is assembled by parse_index() from several site pages
+    instead of from feeds, and get_browser() signs in with the configured
+    username and password when both are present.
+    """
+    # Metadata describing the recipe.
+    title           = u'OReilly Premium'
+    __author__      = 'TMcN'
+    description     = 'Retrieves Premium and News Letter content from BillOReilly.com.  Requires a Bill OReilly Premium Membership.'
+    cover_url       = 'http://images.billoreilly.com/images/headers/billgray_header.png'
+    auto_cleanup    = True
+    encoding        = 'utf8'
+    # The content is members-only; see the sign-in performed by get_browser().
+    needs_subscription = True
+    no_stylesheets  = True
+    oldest_article  = 20
+    remove_javascript = True
+    # Selector for every <img> tag, with no attribute constraints.
+    remove_tags     = [dict(name='img', attrs={})]
+    # Don't go down: zero levels of recursion.
+    recursions      = 0
+    max_articles_per_feed = 2000
+    language = 'en'
+
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser()
+        if self.username is not None and self.password is not None:
+            br.open('https://www.billoreilly.com/pg/jsp/member/membersignin.jsp')
+            br.select_form(name='login')
+            br['formEmailField']   = self.username
+            br['formPasswordField'] = self.password
+            br.submit()
+        return br
+
+    def extractPrintURL(self, baseURL, pageURL, printString):
+        tagURL = pageURL
+        soup = self.index_to_soup(pageURL)
+        if soup :
+            printText = soup.find('a', text=printString)
+        else :
+            print("Failed to find Print string "+printString+ " in "+pageURL)
+
+        if printText:
+            tag = printText.parent
+            tagURL = baseURL+tag['href']
+        return tagURL
+
+    def stripBadChars(self, inString) :
+        return inString.replace("\'", "")
+
+
+    # returns a qualifying article list
+    def parseNoSpinArchives(self, baseURL, soupURL, debugMessages):
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        for div in soup.findAll(True, attrs={'class':['blogBody'], 'style':['padding-top:10px;']}):
+             a = div.find('a', href=True)
+             if not a:
+                 continue
+             # re == regex. [href] is the link
+             url = baseURL
+             url +=re.sub(r'\?.*', '', a['href'])
+             # Get print version
+             printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+             if printURL:
+                url = printURL
+             title = self.tag_to_string(a, use_alt=True).strip()
+             if debugMessages :
+                print("No Spin Archive Title:"+title+" at url: "+url)
+             description = 'None'
+             pubdate = time.strftime('%a, %d %b')
+             summary = div.find(True, attrs={'class':'summary'})
+             if summary:
+                 description = self.tag_to_string(summary, use_alt=False)
+             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        return articleList
+
+
+    def parseTVArchives(self, baseURL, soupURL, debugMessages):
+        # TV Archives page has some Ajax, so look for the static only.
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        if debugMessages :
+           print("In parseTVArchives")
+        for div in soup.findAll('a', {'class':['showLinks','homeLinks']}):
+             a = div
+             url = baseURL
+             url +=a['href']
+             printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+             if printURL:
+                url = printURL
+             title = self.tag_to_string(a, use_alt=True).strip()
+             title = self.stripBadChars(title)
+             if debugMessages :
+                print("TV Archive "+title+" at url: "+url)
+             description = 'None'
+             pubdate = time.strftime('%a, %d %b')
+             summary = div.find(True, attrs={'class':'summary'})
+             if summary:
+                 description = self.tag_to_string(summary, use_alt=False)
+             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        if debugMessages :
+            print("Leaving TV Parse ")
+        return articleList
+
+    # Get Daily Briefing Archives
+    def parseDailyBriefs(self, baseURL, soupURL, debugMessages) :
+        print("Starting daily briefs")
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        for div in soup.findAll(True, attrs={'class':['defaultHeaderSmallLinks']}):
+             # re == regex. [href] is the link
+             url = baseURL
+             url +=re.sub(r'\?.*', '', div['href'])
+             printURL = self.extractPrintURL(baseURL, url, "Print this entry")
+             if printURL:
+                url = printURL
+             title = div.contents[0]
+             if debugMessages :
+                print("Daily Brief - title:"+title+" at url: "+url)
+             description = 'None'
+             pubdate = time.strftime('%a, %d %b')
+             summary = div.find(True, attrs={'class':'summary'})
+             if summary:
+                 description = self.tag_to_string(summary, use_alt=False)
+             articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        print("Leaving daily briefs")
+        return articleList
+
+    # Get the weekly Stratfor intelligence report
+    def parseStratfor(self, baseURL, soupURL, debugMessages):
+        # http://www.billoreilly.com/blog?categoryID=5
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        if debugMessages :
+           print("In parseStratfor")
+        a = soup.find('a', {'class':['blogLinks']})
+        url = baseURL
+        url +=a['href']
+        title = self.tag_to_string(a, use_alt=True).strip()
+        if debugMessages :
+            print("url: "+url)
+            print("title:"+title)
+        # Get Stratfor contents so we can get the real title.
+        stratSoup = self.index_to_soup(url)
+        title = stratSoup.html.head.title.string
+        stratIndex = title.find('Stratfor.com:', 0)
+        if (stratIndex > -1) :
+            title = title[stratIndex+14:-1]
+        # Look for first blogBody  <td class="blogBody"
+        stratBody = stratSoup.find('td', {'class':['blogBody']})
+        if debugMessages :
+            print("Strat content title:"+title)
+            print("Strat body: "+ stratBody.contents[0])
+        description = 'None'
+        pubdate = time.strftime('%a, %d %b')
+        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        if debugMessages :
+           print("Leaving Stratfor Parse ")
+        return articleList
+
+    def parseTalkingPoints(self, baseURL, soupURL, debugMessages) :
+        # Look for blogDate.  That's got the date...  Then the next blogBody has the title.  and then an anchor with class "homeBlogReadMore bold" has the URL.
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        if debugMessages :
+            print("Starting Talking Points")
+        topDate =  soup.find("td", "blogBody")
+        if not topDate :
+            print("Failed to find date in Talking Points")
+        # This page has the contents in double-wrapped tables!
+        # tableParent = topDate.parent.parent
+        myTable = topDate.findParents('table')[0]
+        upOneTable = myTable.findParents('table')[0]
+        upTwo = upOneTable.findParents('table')[0]
+        # Now navigate rows of upTwo
+        if debugMessages :
+            print("Entering rows")
+        for rows in upTwo.findChildren("tr", recursive=False):
+            # Inside top level table, each row is an article
+            rowTable = rows.find("table")
+            articleTable = rowTable.find("table")
+            articleTable = rows.find("tr")
+            # The middle table is just for formatting the article buffer... but this means we can skip the inner table.
+            blogDate = articleTable.find("a","blogDate").contents[0]
+            # Skip to second blogBody for this.
+            blogTitle = articleTable.findAll("td", "blogBody")[1].contents[0]
+            blogURL = articleTable.find("a", "homeBlogReadMore bold")['href']
+            # re == regex. [href] is the link
+            url = baseURL
+            url +=re.sub(r'\?.*', '', blogURL)
+            title = blogDate+": "+self.stripBadChars(blogTitle.replace("Bill O'Reilly: ", ""))
+            if debugMessages :
+                print("Talking Points Memo title "+title+" at url: "+url)
+            description = 'None'
+            pubdate = time.strftime('%a, %d %b')
+            articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        print("Exiting parseTalkingPoints\n")
+        return articleList
+
+    def parseCurrentColumn(self, baseURL, soupURL, debugMessages) :
+        # Only needed to get the column title.  Otherwise it's all good already; there's only one column
+        articleList = []
+        soup = self.index_to_soup(soupURL)
+        titleSpan = soup.find('span', {'class':['defaultHeader']})
+        title = titleSpan.contents[0]
+        # Get Print URL since it's available
+        printURL = self.extractPrintURL(baseURL, soupURL, "Print This Article")
+        if printURL:
+            print("Found print URL")
+            url = printURL
+        if debugMessages :
+            print("url: "+url)
+            print("title:"+title)
+        description = 'None'
+        pubdate = time.strftime('%a, %d %b')
+        articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
+        if debugMessages :
+           print("Leaving Stratfor Parse ")
+        return articleList
+
+
+    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
+    # It returns a list of tuples ('feed title', list of articles), where each article is a dict:
+    # {
+    # 'title'       : article title,
+    # 'url'         : URL of print version,
+    # 'date'        : The publication date of the article as a string,
+    # 'description' : A summary of the article
+    # 'content'     : The full article (can be an empty string). This is used by FullContentProfile
+    # }
+    # this is used instead of BasicNewsRecipe.parse_feeds().
+    def parse_index(self):
+        # Parse the page into Python Soup
+        debugMessages = True
+        baseURL = "https://www.billoreilly.com"
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        # [] is list, {} is empty mapping.
+        articleList = []
+        ans = []
+        showList = self.parseTVArchives(baseURL, 'https://www.billoreilly.com/show?action=tvShowArchive', debugMessages)
+        articleList = self.parseNoSpinArchives(baseURL, 'https://www.billoreilly.com/blog?categoryID=7', debugMessages)
+        stratList = self.parseStratfor(baseURL, 'http://www.billoreilly.com/blog?categoryID=5', debugMessages)
+        dailyBriefs = self.parseDailyBriefs(baseURL, 'http://www.billoreilly.com/blog?categoryID=11', debugMessages)
+        talkingPoints = self.parseTalkingPoints(baseURL, 'https://www.billoreilly.com/blog?categoryID=12', debugMessages)
+        currentColumn = self.parseCurrentColumn(baseURL, 'https://www.billoreilly.com/currentcolumn', debugMessages)
+        # Below, { x:y, a:b } creates a dictionary.   We return a tuple of a title and list of dict...
+        # Lists are constructed with square brackets, separating items with commas: [a, b, c].  Tuples are constructed by the comma operator (not within square brackets), with or without enclosing parentheses, but an empty tuple must have the enclosing parentheses, such as a, b, c or (). A single item tuple must have a trailing comma, such as (d,).
+        # Shows first two if talking points and no spin news.  Also if they are TV Shows ande Stratfor Weekly, also if Daily Briefing and Curren Column
+        # So all work individually.  No idea why only getting first two in TOC now.
+        ans = [("Talking Points Memos", talkingPoints),("No Spin News", articleList),("TV Shows", showList),("Stratfor Weekly",stratList), ("Daily Briefing", dailyBriefs),("Current Column", currentColumn)]
+        if debugMessages :
+            print ans
+        return ans
+
+    def preprocess_html(self, soup):
+        refresh = soup.find('meta', {'http-equiv':'refresh'})
+        if refresh is None:
+            return soup
+        content = refresh.get('content').partition('=')[2]
+        raw = self.browser.open('https://www.billoreilly.com'+content).read()
+        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+