diff --git a/recipes/oc_register.recipe b/recipes/oc_register.recipe index 0c2733f0dd..69f44d5ad1 100644 --- a/recipes/oc_register.recipe +++ b/recipes/oc_register.recipe @@ -1,77 +1,137 @@ -#!/usr/bin/env python2 -__license__ = 'GPL v3' -__author__ = 'Lorenzo Vigentini (updated by rrrrrrrrrrrryan at gmail.com)' -__copyright__ = '2009, Lorenzo Vigentini ' -description = 'News from Orange county - v1.02 (10, August 2014)' - -''' -http://www.ocregister.com/ -''' +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import unicode_literals, division, absolute_import, print_function +import time +import json +import urllib +from pprint import pprint from calibre.web.feeds.news import BasicNewsRecipe -class ocRegister(BasicNewsRecipe): - author = 'Lorenzo Vigentini' - description = 'News from the Orange county' - - cover_url = 'http://images.onset.freedom.com/ocregister/logo.gif' +class OrangeCountyRegister(BasicNewsRecipe): title = u'Orange County Register' - publisher = 'Orange County Register Communication' - category = 'News, finance, economy, politics' - + __author__ = 'TechnoCat' + description = 'The O.C. Register\nRecipe: Nov 2016' + cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif' + custom_title = 'OC Register - ' + time.strftime('%d %b %Y') + auto_cleanup = True + extra_css = 'div.metaAuthor { display:none;}\n' + encoding = 'utf8' language = 'en' - timefmt = '[%a, %d %b, %Y]' - - oldest_article = 1 - max_articles_per_feed = 25 - use_embedded_content = False - recursion = 10 - - # remove_javascript = True + needs_subscription = False no_stylesheets = True + oldest_article = 7 + remove_javascript = True + remove_tags = [dict(name='img', attrs={})] + # Don't go down + recursions = 0 + max_articles_per_feed = 400 + debugMessages = False - needs_subscription = "optional" + feeds = [('News', 'https://www.ocregister.com/news/'), + ('Opinion', 'https://www.ocregister.com/opinion/'), + ('Politics', 'https://www.ocregister.com/news/politics/'), + ('Business', 'https://www.ocregister.com/business/')] - use_javascript_to_login = True + def parsePage(self, index): + if self.debugMessages is True: + print("\n\nStarting " + self.feeds[index][0]) + articleList = [] + soup = self.index_to_soup(self.feeds[index][1]) + # Have this index page now. + # look for a.article-title + # If any, the description is
+ for newsentry in soup.findAll("a", {"class": "article-title"}): + print('Next up:') + print(newsentry) + title = newsentry["title"] + url = newsentry['href'] + print("Title: ") + print(title) + print('URL') + print(url) + pubdate = time.strftime('%a, %d %b') + articleList.append( + dict( + title=title, + url=url, + date=pubdate, + description=title, + content='' + ) + ) + return articleList - def javascript_login(self, browser, username, password): - browser.visit('http://www.ocregister.com/sections/login') - form = browser.select_form(nr=1) # Select the second form on the page - form['username'] = username - form['password_temp'] = password - # Submit the form and wait at most two minutes for loading to complete - browser.submit(timeout=120) + def extract_readable_article(self, html, url): + cleanedHTML = super(OrangeCountyRegister, + self).extract_readable_article(html, url) + print("Processing html for author") + # Find the attribs... + attribDict = self.htmlToAttribsDict(html) + print("dict is type...") + print(type(attribDict)) + author = attribDict.get('Byline') + if author is not None: + # add author code after + print("Adding author in meta") + print(author) + cleanedHTML = cleanedHTML.replace( + "", + "\n
\n" + ) + else: + print('no author found') + print(html) + # pubDate = attribDict.get('Publish Hour of Day') + return cleanedHTML - def print_version(self, url): - printUrl = 'http://www.ocregister.com/common/printer/view.php?db=ocregister&id=' - segments = url.split('/') - subSegments = (segments[4]).split('.') - subSubSegments = (subSegments[0]).split('-') - myArticle = (subSubSegments[1]) - myURL = printUrl + myArticle - return myURL + def loadURL(self, url): + socket = urllib.urlopen(url) + rawHTML = socket.read() + return rawHTML - keep_only_tags = [ - dict(name='div', attrs={'id': 'ArticleContentWrap'}) - ] + def htmlToAttribsDict(self, rawHTML): + tokenStart = 'dataLayer.push({' + tokenEnd = '});' + print("1") + startJSON = rawHTML.find(tokenStart) + if (startJSON < 0): + return + JSONBeginning = rawHTML[startJSON + len(tokenStart) - 1:] + endJSON = JSONBeginning.find(tokenEnd) + if (endJSON < 0): + return + JSON = JSONBeginning[:endJSON + 1] + JSONQuoted = JSON.replace("'", "\"") + try: + metadata = json.loads(JSONQuoted) + pprint(metadata) + return metadata + except ValueError: + print("Could not decode JSON:") + print(JSONQuoted) + return None - remove_tags = [ - dict(name='div', attrs={'class': 'hideForPrint'}), - dict(name='div', attrs={'id': 'ContentFooter'}) - ] - - feeds = [ - (u'News', u'http://www.ocregister.com/common/rss/rss.php?catID=18800'), - (u'Top Stories', - u'http://www.ocregister.com/common/rss/rss.php?catID=23541'), - (u'Business', u'http://www.ocregister.com/common/rss/rss.php?catID=18909'), - (u'Cars', u'http://www.ocregister.com/common/rss/rss.php?catID=20128'), - (u'Entertainment', - u'http://www.ocregister.com/common/rss/rss.php?catID=18926'), - (u'Home', u'http://www.ocregister.com/common/rss/rss.php?catID=19142'), - (u'Life', u'http://www.ocregister.com/common/rss/rss.php?catID=18936'), - (u'Opinion', u'http://www.ocregister.com/common/rss/rss.php?catID=18963'), - (u'Sports', u'http://www.ocregister.com/common/rss/rss.php?catID=18901'), - (u'Travel', u'http://www.ocregister.com/common/rss/rss.php?catID=18959') - ] + # calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles. + # returns a list of tuple ('feed title', list of articles) + # { + # 'title' : article title, + # 'url' : URL of print version, + # 'date' : The publication date of the article as a string, + # 'description' : A summary of the article + # 'content' : The full article (can be an empty string). This is used by FullContentProfile + # } + # this is used instead of BasicNewsRecipe.parse_feeds(). + def parse_index(self): + # Parse the page into Python Soup + # articleList = [] + ans = [] + feedsCount = len(self.feeds) + for x in range(0, feedsCount - 1): # should be ,4 + feedarticles = self.parsePage(x) + if feedarticles is not None: + ans.append((self.feeds[x][0], feedarticles)) + if self.debugMessages is True: + print(ans) + return ans