Update Orange County Register

2025-08-11 09:13:57 -04:00 · 2018-01-16 21:18:41 +05:30 · 2018-01-16 21:18:41 +05:30 · 1870bc10fa
commit 1870bc10fa
parent 1117ae2b59
1 changed file with 124 additions and 64 deletions
--- a/recipes/oc_register.recipe
+++ b/recipes/oc_register.recipe
@@ -1,77 +1,137 @@
-#!/usr/bin/env  python2
+#!/usr/bin/env python2
-__license__ = 'GPL v3'
+# vim:fileencoding=utf-8
-__author__ = 'Lorenzo Vigentini (updated by rrrrrrrrrrrryan at gmail.com)'
+from __future__ import unicode_literals, division, absolute_import, print_function
-__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
+import time
-description = 'News from Orange county - v1.02 (10, August 2014)'
+import json
-
+import urllib
-'''
+from pprint import pprint
 http://www.ocregister.com/
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
-class ocRegister(BasicNewsRecipe):
+class OrangeCountyRegister(BasicNewsRecipe):
    author = 'Lorenzo Vigentini'
    description = 'News from the Orange county'
    cover_url = 'http://images.onset.freedom.com/ocregister/logo.gif'
    title = u'Orange County Register'
-    publisher = 'Orange County Register Communication'
+    __author__ = 'TechnoCat'
-    category = 'News, finance, economy, politics'
+    description = 'The O.C. Register\nRecipe: Nov 2016'
-
+    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'OC Register - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    extra_css = 'div.metaAuthor { display:none;}\n'
    encoding = 'utf8'
    language = 'en'
-    timefmt = '[%a, %d %b, %Y]'
+    needs_subscription = False
    oldest_article = 1
    max_articles_per_feed = 25
    use_embedded_content = False
    recursion = 10
    # remove_javascript     = True
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down
    recursions = 0
    max_articles_per_feed = 400
    debugMessages = False
-    needs_subscription = "optional"
+    feeds = [('News', 'https://www.ocregister.com/news/'),
             ('Opinion', 'https://www.ocregister.com/opinion/'),
             ('Politics', 'https://www.ocregister.com/news/politics/'),
             ('Business', 'https://www.ocregister.com/business/')]
-    use_javascript_to_login = True
+    def parsePage(self, index):
        if self.debugMessages is True:
            print("\n\nStarting " + self.feeds[index][0])
        articleList = []
        soup = self.index_to_soup(self.feeds[index][1])
        # Have this index page now.
        # look for a.article-title
        # If any, the description is <div class="excerpt">
        for newsentry in soup.findAll("a", {"class": "article-title"}):
            print('Next up:')
            print(newsentry)
            title = newsentry["title"]
            url = newsentry['href']
            print("Title: ")
            print(title)
            print('URL')
            print(url)
            pubdate = time.strftime('%a, %d %b')
            articleList.append(
                dict(
                    title=title,
                    url=url,
                    date=pubdate,
                    description=title,
                    content=''
                )
            )
        return articleList
-    def javascript_login(self, browser, username, password):
+    def extract_readable_article(self, html, url):
-        browser.visit('http://www.ocregister.com/sections/login')
+        cleanedHTML = super(OrangeCountyRegister,
-        form = browser.select_form(nr=1)  # Select the second form on the page
+                            self).extract_readable_article(html, url)
-        form['username'] = username
+        print("Processing html for author")
-        form['password_temp'] = password
+        # Find the attribs...
-        # Submit the form and wait at most two minutes for loading to complete
+        attribDict = self.htmlToAttribsDict(html)
-        browser.submit(timeout=120)
+        print("dict is type...")
        print(type(attribDict))
        author = attribDict.get('Byline')
        if author is not None:
            # add author code after <body>
            print("Adding author in meta")
            print(author)
            cleanedHTML = cleanedHTML.replace(
                "<body>",
                "<body>\n<div class=\"metaAuthor\" value=\"" + author + "\"></div>\n"
            )
        else:
            print('no author found')
            print(html)
        # pubDate = attribDict.get('Publish Hour of Day')
        return cleanedHTML
-    def print_version(self, url):
+    def loadURL(self, url):
-        printUrl = 'http://www.ocregister.com/common/printer/view.php?db=ocregister&id='
+        socket = urllib.urlopen(url)
-        segments = url.split('/')
+        rawHTML = socket.read()
-        subSegments = (segments[4]).split('.')
+        return rawHTML
        subSubSegments = (subSegments[0]).split('-')
        myArticle = (subSubSegments[1])
        myURL = printUrl + myArticle
        return myURL
-    keep_only_tags = [
+    def htmlToAttribsDict(self, rawHTML):
-        dict(name='div', attrs={'id': 'ArticleContentWrap'})
+        tokenStart = 'dataLayer.push({'
-    ]
+        tokenEnd = '});'
        print("1")
        startJSON = rawHTML.find(tokenStart)
        if (startJSON < 0):
            return
        JSONBeginning = rawHTML[startJSON + len(tokenStart) - 1:]
        endJSON = JSONBeginning.find(tokenEnd)
        if (endJSON < 0):
            return
        JSON = JSONBeginning[:endJSON + 1]
        JSONQuoted = JSON.replace("'", "\"")
        try:
            metadata = json.loads(JSONQuoted)
            pprint(metadata)
            return metadata
        except ValueError:
            print("Could not decode JSON:")
            print(JSONQuoted)
        return None
-    remove_tags = [
+    # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
-        dict(name='div', attrs={'class': 'hideForPrint'}),
+    # returns a list of tuple ('feed title', list of articles)
-        dict(name='div', attrs={'id': 'ContentFooter'})
+    # {
-    ]
+    # 'title'       : article title,
-
+    # 'url'         : URL of print version,
-    feeds = [
+    # 'date'        : The publication date of the article as a string,
-        (u'News', u'http://www.ocregister.com/common/rss/rss.php?catID=18800'),
+    # 'description' : A summary of the article
-        (u'Top Stories',
+    # 'content'     : The full article (can be an empty string). This is used by FullContentProfile
-         u'http://www.ocregister.com/common/rss/rss.php?catID=23541'),
+    # }
-        (u'Business', u'http://www.ocregister.com/common/rss/rss.php?catID=18909'),
+    # this is used instead of BasicNewsRecipe.parse_feeds().
-        (u'Cars', u'http://www.ocregister.com/common/rss/rss.php?catID=20128'),
+    def parse_index(self):
-        (u'Entertainment',
+        # Parse the page into Python Soup
-         u'http://www.ocregister.com/common/rss/rss.php?catID=18926'),
+        # articleList = []
-        (u'Home', u'http://www.ocregister.com/common/rss/rss.php?catID=19142'),
+        ans = []
-        (u'Life', u'http://www.ocregister.com/common/rss/rss.php?catID=18936'),
+        feedsCount = len(self.feeds)
-        (u'Opinion', u'http://www.ocregister.com/common/rss/rss.php?catID=18963'),
+        for x in range(0, feedsCount - 1):  # should be ,4
-        (u'Sports', u'http://www.ocregister.com/common/rss/rss.php?catID=18901'),
+            feedarticles = self.parsePage(x)
-        (u'Travel', u'http://www.ocregister.com/common/rss/rss.php?catID=18959')
+            if feedarticles is not None:
-    ]
+                ans.append((self.feeds[x][0], feedarticles))
        if self.debugMessages is True:
            print(ans)
        return ans