mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Update Orange County Register
This commit is contained in:
parent
1117ae2b59
commit
1870bc10fa
@ -1,77 +1,137 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
__license__ = 'GPL v3'
|
# vim:fileencoding=utf-8
|
||||||
__author__ = 'Lorenzo Vigentini (updated by rrrrrrrrrrrryan at gmail.com)'
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||||
__copyright__ = '2009, Lorenzo Vigentini <l.vigentini at gmail.com>'
|
import time
|
||||||
description = 'News from Orange county - v1.02 (10, August 2014)'
|
import json
|
||||||
|
import urllib
|
||||||
'''
|
from pprint import pprint
|
||||||
http://www.ocregister.com/
|
|
||||||
'''
|
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
class ocRegister(BasicNewsRecipe):
|
class OrangeCountyRegister(BasicNewsRecipe):
    """Calibre recipe for the Orange County Register (ocregister.com).

    The section pages listed in ``feeds`` are HTML index pages, not RSS
    feeds, so ``parse_index`` scrapes them for ``a.article-title`` links
    instead of relying on the default feed parsing.
    """

    title = u'Orange County Register'
    __author__ = 'TechnoCat'
    description = 'The O.C. Register\nRecipe: Nov 2016'
    # NOTE(review): this logo is hosted on realclearpolitics.com, not
    # ocregister.com -- it looks copied from another recipe; confirm.
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    # Evaluated once, when the class body is executed.
    custom_title = 'OC Register - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    # Hides the byline <div> that extract_readable_article() injects.
    extra_css = 'div.metaAuthor { display:none;}\n'
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Do not follow links found inside the articles.
    recursions = 0
    max_articles_per_feed = 400
    # Set to True to print scraping diagnostics to stdout.
    debugMessages = False

    # (section title, URL of the HTML index page for that section)
    feeds = [
        ('News', 'https://www.ocregister.com/news/'),
        ('Opinion', 'https://www.ocregister.com/opinion/'),
        ('Politics', 'https://www.ocregister.com/news/politics/'),
        ('Business', 'https://www.ocregister.com/business/'),
    ]

    def _debug(self, *args):
        """Print ``args`` only when ``debugMessages`` is enabled."""
        if self.debugMessages:
            print(*args)

    def parsePage(self, index):
        """Scrape one section index page into a list of article dicts.

        :param index: position of the section in ``self.feeds``.
        :return: list of dicts with the keys ``title``, ``url``, ``date``,
            ``description`` and ``content``; empty when the page has no
            ``a.article-title`` links.
        """
        feed_title, feed_url = self.feeds[index]
        self._debug("\n\nStarting " + feed_title)

        soup = self.index_to_soup(feed_url)
        # The index page carries no per-article date, so every entry is
        # stamped with the date of the download.
        pubdate = time.strftime('%a, %d %b')

        articleList = []
        for newsentry in soup.findAll("a", {"class": "article-title"}):
            self._debug('Next up:')
            self._debug(newsentry)

            url = newsentry.get('href')
            if not url:
                # An anchor without a target cannot be downloaded.
                continue
            # Fall back to the link text when the title attribute is absent.
            title = newsentry.get('title') or self.tag_to_string(newsentry)

            self._debug("Title: ")
            self._debug(title)
            self._debug('URL')
            self._debug(url)

            articleList.append(
                dict(
                    title=title,
                    url=url,
                    date=pubdate,
                    description=title,
                    content=''
                )
            )
        return articleList

    def extract_readable_article(self, html, url):
        """Run the base-class cleanup, then prepend a hidden byline <div>.

        The byline is read from the ``Byline`` entry of the metadata that
        ``htmlToAttribsDict`` extracts from the raw page. When no metadata or
        no byline is available the cleaned HTML is returned unchanged.

        :param html: raw HTML of the article page.
        :param url: URL the article was downloaded from.
        :return: the cleaned HTML, possibly with the byline <div> added.
        """
        from xml.sax.saxutils import escape

        cleanedHTML = super(OrangeCountyRegister,
                            self).extract_readable_article(html, url)
        self._debug("Processing html for author")

        # htmlToAttribsDict() returns None when the page has no usable
        # metadata; treat that the same as "no byline".
        attribDict = self.htmlToAttribsDict(html) or {}
        author = attribDict.get('Byline')
        if not author:
            self._debug('no author found')
            return cleanedHTML

        self._debug("Adding author in meta")
        self._debug(author)
        # The byline ends up inside a double-quoted attribute, so markup
        # characters and double quotes must be escaped.
        safeAuthor = escape(author, {'"': '&quot;'})
        return cleanedHTML.replace(
            "<body>",
            "<body>\n<div class=\"metaAuthor\" value=\"" + safeAuthor + "\"></div>\n"
        )

    def loadURL(self, url):
        """Download ``url`` and return the raw response body.

        :param url: address to fetch.
        :return: whatever ``read()`` on the opened response returns.
        """
        from contextlib import closing
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        # closing() guarantees the connection is released even if read()
        # raises.
        with closing(urlopen(url)) as response:
            return response.read()

    def htmlToAttribsDict(self, rawHTML):
        """Extract the first ``dataLayer.push({...});`` payload from a page.

        The payload is parsed as JSON as-is first. Only when that fails are
        single quotes converted to double quotes and parsing retried, so that
        apostrophes inside otherwise valid JSON values are not corrupted.

        :param rawHTML: page source to search.
        :return: the decoded metadata, or ``None`` when the payload is
            missing, unterminated or cannot be decoded.
        """
        tokenStart = 'dataLayer.push({'
        tokenEnd = '});'
        if not rawHTML:
            return None

        startJSON = rawHTML.find(tokenStart)
        if startJSON < 0:
            return None
        # The "- 1" keeps the opening brace of the object literal.
        JSONBeginning = rawHTML[startJSON + len(tokenStart) - 1:]

        endJSON = JSONBeginning.find(tokenEnd)
        if endJSON < 0:
            return None
        # The "+ 1" keeps the closing brace and drops the trailing ");".
        JSON = JSONBeginning[:endJSON + 1]
        JSONQuoted = JSON.replace("'", "\"")

        for candidate in (JSON, JSONQuoted):
            try:
                metadata = json.loads(candidate)
            except ValueError:
                continue
            if self.debugMessages:
                pprint(metadata)
            return metadata

        self._debug("Could not decode JSON:")
        self._debug(JSONQuoted)
        return None

    def parse_index(self):
        """Build the article index that replaces the default feed parsing.

        :return: list of ``(feed title, list of articles)`` tuples, one per
            entry of ``self.feeds``. Each article is a dict with the keys
            ``title``, ``url``, ``date``, ``description`` and ``content``,
            as produced by ``parsePage``.
        """
        ans = []
        # Every configured feed is visited, including the last one.
        for index, feed in enumerate(self.feeds):
            feedarticles = self.parsePage(index)
            if feedarticles is not None:
                ans.append((feed[0], feedarticles))
        self._debug(ans)
        return ans
|
||||||
|
Loading…
x
Reference in New Issue
Block a user