Update Orange County Register

This commit is contained in:
Kovid Goyal 2018-01-16 21:18:41 +05:30
parent 1117ae2b59
commit 1870bc10fa
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,77 +1,137 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
__license__ = 'GPL v3' # vim:fileencoding=utf-8
__author__ = 'Lorenzo Vigentini (updated by rrrrrrrrrrrryan at gmail.com)' from __future__ import unicode_literals, division, absolute_import, print_function
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>' import time
description = 'News from Orange county - v1.02 (10, August 2014)' import json
import urllib
''' from pprint import pprint
http://www.ocregister.com/
'''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class OrangeCountyRegister(BasicNewsRecipe):
    """Recipe that builds its article list from ocregister.com section pages."""

    # Recipe settings consumed by the BasicNewsRecipe base class; see the
    # calibre news-recipe API documentation for the meaning of each one.
    title = u'Orange County Register'
    __author__ = 'TechnoCat'
    description = 'The O.C. Register\nRecipe: Nov 2016'
    # NOTE(review): this image is hosted on realclearpolitics.com rather than
    # ocregister.com -- confirm it is the intended cover.
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    custom_title = 'OC Register - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    # Hides the div.metaAuthor element that extract_readable_article injects.
    extra_css = 'div.metaAuthor { display:none;}\n'
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down
    recursions = 0
    max_articles_per_feed = 400
    # Checked by the scraping methods before they print diagnostics.
    debugMessages = False

    # (section title, section index page URL) pairs walked by parse_index.
    feeds = [
        ('News', 'https://www.ocregister.com/news/'),
        ('Opinion', 'https://www.ocregister.com/opinion/'),
        ('Politics', 'https://www.ocregister.com/news/politics/'),
        ('Business', 'https://www.ocregister.com/business/'),
    ]
def parsePage(self, index):
    """Scrape one section index page into a list of article dicts.

    ``index`` selects the ``(title, url)`` entry of ``self.feeds`` to fetch.
    Every ``<a class="article-title">`` on that page becomes a dict with the
    keys ``title``, ``url``, ``date``, ``description`` and ``content``.
    Anchors missing a ``title`` or ``href`` attribute are skipped instead of
    raising ``KeyError`` and aborting the whole section.
    """
    debug = self.debugMessages is True
    section_name = self.feeds[index][0]
    section_url = self.feeds[index][1]
    if debug:
        print("\n\nStarting " + section_name)

    soup = self.index_to_soup(section_url)
    # Every entry is stamped with the current date, so format it only once.
    pubdate = time.strftime('%a, %d %b')

    articleList = []
    for newsentry in soup.findAll("a", {"class": "article-title"}):
        title = newsentry.get("title")
        url = newsentry.get("href")
        if debug:
            print('Next up:')
            print(newsentry)
            print("Title: ")
            print(title)
            print('URL')
            print(url)
        if not title or not url:
            # Without both attributes there is nothing usable to list.
            continue
        articleList.append(
            dict(
                title=title,
                url=url,
                date=pubdate,
                # The listing's description is simply the title again.
                description=title,
                content='',
            )
        )
    return articleList
def extract_readable_article(self, html, url):
    """Run the base-class clean-up, then tag the result with the byline.

    The byline is read from the page's ``dataLayer.push({...})`` metadata
    (key ``'Byline'``) via ``htmlToAttribsDict``. When present it is added
    right after ``<body>`` as ``<div class="metaAuthor" value="...">``.
    Returns the cleaned HTML, unchanged when no byline is available.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    cleanedHTML = super(OrangeCountyRegister,
                        self).extract_readable_article(html, url)
    debug = self.debugMessages is True

    # htmlToAttribsDict returns None when the metadata is missing or does not
    # parse; treat that the same as "no attributes" instead of crashing.
    attribDict = self.htmlToAttribsDict(html) or {}
    author = attribDict.get('Byline')
    if author is None:
        if debug:
            print('no author found')
        return cleanedHTML

    if debug:
        print("Adding author in meta")
        print(author)
    # The value lands inside a double-quoted attribute, so escape markup
    # characters and double quotes to keep the document well formed.
    safeAuthor = escape(author, {'"': '&quot;'})
    return cleanedHTML.replace(
        "<body>",
        "<body>\n<div class=\"metaAuthor\" value=\"" + safeAuthor + "\"></div>\n",
        1
    )
def loadURL(self, url):
    """Fetch ``url`` and return the raw response body.

    The response object is closed as soon as it has been read, including
    when ``read()`` raises.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import closing

    # urllib.urlopen is the Python 2 API, matching this file's python2 shebang.
    with closing(urllib.urlopen(url)) as response:
        return response.read()
def htmlToAttribsDict(self, rawHTML):
    """Extract the first ``dataLayer.push({...});`` payload from a page.

    Returns the decoded object, or ``None`` when the start token or the
    closing ``});`` is not found or the payload cannot be decoded as JSON.
    """
    tokenStart = 'dataLayer.push({'
    tokenEnd = '});'
    debug = self.debugMessages is True

    startJSON = rawHTML.find(tokenStart)
    if startJSON < 0:
        return None
    # Back up one character so the slice begins at the opening brace.
    JSONBeginning = rawHTML[startJSON + len(tokenStart) - 1:]
    endJSON = JSONBeginning.find(tokenEnd)
    if endJSON < 0:
        return None
    # "+ 1" keeps the closing brace of the object.
    payload = JSONBeginning[:endJSON + 1]

    # Try the payload untouched first: blindly swapping ' for " would break
    # valid JSON whose values contain apostrophes. Only fall back to the
    # quote swap for single-quoted JavaScript object literals.
    JSONQuoted = payload.replace("'", "\"")
    for candidate in (payload, JSONQuoted):
        try:
            metadata = json.loads(candidate)
        except ValueError:
            continue
        if debug:
            pprint(metadata)
        return metadata

    if debug:
        print("Could not decode JSON:")
        print(JSONQuoted)
    return None
def parse_index(self):
    """Build the article index that calibre uses instead of parse_feeds().

    Returns a list of ``(feed title, list of articles)`` tuples, one per
    entry of ``self.feeds``. Each article is a dict of the form::

        {
            'title'       : article title,
            'url'         : URL of print version,
            'date'        : The publication date of the article as a string,
            'description' : A summary of the article
            'content'     : The full article (can be an empty string).
        }
    """
    ans = []
    # Walk every configured feed; the previous bound of len(feeds) - 1
    # silently dropped the last section.
    for x in range(len(self.feeds)):
        feedarticles = self.parsePage(x)
        if feedarticles is not None:
            ans.append((self.feeds[x][0], feedarticles))
    if self.debugMessages is True:
        print(ans)
    return ans