Update LA Times

Kovid Goyal 2018-06-17 21:58:35 +05:30
parent 494c5a9b29
commit c57a493711


@@ -1,20 +1,55 @@
 #!/usr/bin/env python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
 from collections import defaultdict
 from pprint import pformat
 
+from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(
-        attrs={
-            'class': lambda x: x and frozenset(x.split()).intersection(q)
-        }
-    )
+DT_EPOCH = strptime('1970-01-01', '%Y-%m-%d', assume_utc=True)
+
+DIR_COLLECTIONS = [['world'],
+                   ['nation'],
+                   ['politics'],
+                   ['opinion', 'op-ed', 'opinion-la', 'editorials',
+                    'readersreact', 'topoftheticket', 'endorsements'],
+                   ['local', 'lanow', 'california', 'crime',
+                    'abcarian', 'education', 'weather'],
+                   ['business', 'hollywood', 'technology'],
+                   ['sports'],
+                   ['entertainment', 'movies', 'music',
+                    'tv', 'arts', 'gossip', 'envelope'],
+                   ['books'],
+                   ['food', 'jonathon-gold', 'dailydish'],
+                   ['health'],
+                   ['style', 'laaffairs', 'pets'],
+                   ['science', 'sciencenow'],
+                   ['home'],
+                   ['travel'],
+                   ['fashion']]
+SECTIONS = ['THE WORLD',
+            'THE NATION',
+            'POLITICS',
+            'OPINION',
+            'CALIFORNIA',
+            'OBITUARIES',
+            'BUSINESS',
+            'HOLLYWOOD',
+            'SPORTS',
+            'ENTERTAINMENT',
+            'MOVIES',
+            'TELEVISION',
+            'BOOKS',
+            'FOOD',
+            'HEALTH',
+            'SCIENCE AND TECHNOLOGY',
+            'HOME',
+            'TRAVEL',
+            'FASHION',
+            'NEWSLETTERS',
+            'OTHER']
 
 
 def absurl(url):
@@ -23,66 +58,146 @@ def absurl(url):
     return url
 
 
+def check_words(words):
+    return lambda x: x and frozenset(words.split()).intersection(x.split())
+
+
+def what_section(url):
+    if re.compile(r'^https?://www[.]latimes[.]com/local/obituaries').search(url):
+        return 'OBITUARIES'
+    elif re.compile(r'^https?://www[.]latimes[.]com/business/hollywood').search(url):
+        return 'HOLLYWOOD'
+    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/movies').search(url):
+        return 'MOVIES'
+    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/tv').search(url):
+        return 'TELEVISION'
+    elif re.compile(r'^https?://www[.]latimes[.]com/business/technology').search(url):
+        return 'SCIENCE AND TECHNOLOGY'
+    elif re.compile(r'^https?://www[.]latimes[.]com/world').search(url):
+        return 'THE WORLD'
+    elif re.compile(r'^https?://www[.]latimes[.]com/nation').search(url):
+        return 'THE NATION'
+    elif re.compile(r'^https?://www[.]latimes[.]com/politics').search(url):
+        return 'POLITICS'
+    elif re.compile(r'^https?://www[.]latimes[.]com/opinion').search(url):
+        return 'OPINION'
+    elif re.compile(r'^https?://www[.]latimes[.]com/(?:local|style)').search(url):
+        return 'CALIFORNIA'
+    elif re.compile(r'^https?://www[.]latimes[.]com/business').search(url):
+        return 'BUSINESS'
+    elif re.compile(r'^https?://www[.]latimes[.]com/sports').search(url):
+        return 'SPORTS'
+    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment').search(url):
+        return 'ENTERTAINMENT'
+    elif re.compile(r'^https?://www[.]latimes[.]com/books').search(url):
+        return 'BOOKS'
+    elif re.compile(r'^https?://www[.]latimes[.]com/food').search(url):
+        return 'FOOD'
+    elif re.compile(r'^https?://www[.]latimes[.]com/health').search(url):
+        return 'HEALTH'
+    elif re.compile(r'^https?://www[.]latimes[.]com/science').search(url):
+        return 'SCIENCE AND TECHNOLOGY'
+    elif re.compile(r'^https?://www[.]latimes[.]com/home').search(url):
+        return 'HOME'
+    elif re.compile(r'^https?://www[.]latimes[.]com/travel').search(url):
+        return 'TRAVEL'
+    elif re.compile(r'^https?://www[.]latimes[.]com/fashion').search(url):
+        return 'FASHION'
+    elif re.compile(r'^https?://www[.]latimes[.]com/newsletter').search(url):
+        return 'NEWSLETTERS'
+    else:
+        return 'OTHER'
 
 
 class LATimes(BasicNewsRecipe):
     title = 'Los Angeles Times'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Jose Ortiz'
     description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'  # noqa
     category = 'news, politics, USA, Los Angeles, world'
-    oldest_article = 2
+    oldest_article = 1
     max_articles_per_feed = 200
     no_stylesheets = True
     encoding = 'utf8'
     use_embedded_content = False
     language = 'en'
     remove_empty_feeds = True
+    ignore_duplicate_articles = {'url'}
     publication_type = 'newspaper'
     cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
 
     keep_only_tags = [
-        dict(name='h1'),
-        dict(attrs={
-            'class': 'trb_ar_main'
-        }),
+        dict(name='header', attrs={'id': 'top'}),
+        dict(name='article'),
+        dict(name='div', attrs={'id': 'liveblog-story-wrapper'})
     ]
 
-    remove_tags_after = [
-        dict(itemprop='articleBody'),
-    ]
-
     remove_tags = [
-        dict(attrs={
-            'data-content-type': 'blurb'
-        }),
-        classes('trb_ar_cont trb_gptAd trb_filmstrip trb_ar_sponsoredmod'),
+        dict(name='div', attrs={'class': check_words(
+            'hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')})
+    ]
+
+    remove_tags_after = [
+        dict(name='div', attrs={'class': check_words('pb-f-article-body')})
     ]
 
     def parse_index(self):
-        soup = self.index_to_soup('http://www.latimes.com')
+        index = 'http://www.latimes.com/'
+
+        pat = r'^(?:https?://www[.]latimes[.]com)?/[^#]+20[0-9]{6}-(?:html)?story[.]html'
+        articles = self.find_articles(index, pat)
+
+        for collection in DIR_COLLECTIONS:
+            topdir = collection.pop(0)
+            index = 'http://www.latimes.com/' + topdir + '/'
+            pat = r'^(?:https?://www[.]latimes[.]com)?/' + \
+                topdir + '/[^#]+20[0-9]{6}-(?:html)?story[.]html'
+            articles += self.find_articles(index, pat)
+            for subdir in collection:
+                sub_index = index + subdir + '/'
+                articles += self.find_articles(sub_index, pat)
+
         feeds = defaultdict(list)
-        for x in soup.findAll(
-            attrs={
-                'data-content-type': 'story',
-                'data-content-section': True,
-                'data-content-slug': True,
-            }
-        ):
-            a = x.find(
-                'a', attrs={
-                    'class': lambda x: not x or 'SectionHeading' not in x
-                }
-            )
-            if a is not None:
-                url = absurl(a['href'])
-                section = x['data-content-section'].capitalize()
-                title = self.tag_to_string(a)
-                feeds[section].append({'title': title, 'url': url})
+        for article in articles:
+            section = what_section(article['url'])
+            feeds[section].append(article)
+
+        keys = []
+        for key in SECTIONS:
+            if key in feeds.keys():
+                keys.append(key)
+
         self.log(pformat(dict(feeds)))
-        return [(k, feeds[k]) for k in sorted(feeds)]
+        return [(k, feeds[k]) for k in keys]
 
     def preprocess_html(self, soup):
-        for img in soup.findAll('img', attrs={'data-baseurl': True}):
-            img['src'] = img['data-baseurl'] + '/750'
-        for div in soup.findAll(itemprop='articleBody'):
-            div.extract()
-            soup.find('body').append(div)
+        for img in soup.findAll('img', attrs={'data-src': True}):
+            if img.findParent('a', href='http://www.latimes.com/opinion/la-letter-to-the-editor-htmlstory.html') \
+                    is img.parent and img['data-src'].endswith('/la-letter-to-the-editor'):
+                img.parent.extract()
+            else:
+                img['src'] = img['data-src']
         return soup
+
+    def find_articles(self, index, pattern):
+        self.log('Downloading and parsing index: ', index)
+        self.log('Pattern: ', pattern)
+        try:
+            soup = self.index_to_soup(index)
+        except Exception:
+            self.log('Failed to download ', index)
+            return []
+        if soup.main is not None:
+            alinks = soup.main.findAll('a', {'href': re.compile(pattern)})
+        else:
+            alinks = soup.findAll('a', {'href': re.compile(pattern)})
+        alinks = [a for a in alinks if len(
+            a.contents) == 1 and a.find(text=True, recursive=False)]
+        articles = [
+            {'title': a.find(text=True), 'url': absurl(a['href'])} for a in alinks]
+        date_rx = re.compile(
+            r'^https?://www[.]latimes[.]com/[^#]+-(?P<date>20[0-9]{6})-(?:html)?story[.]html')
+        for article in articles:
+            mdate = date_rx.match(article['url'])
+            if mdate is not None:
+                article['timestamp'] = (strptime(mdate.group(
+                    'date'), '%Y%m%d') - DT_EPOCH).total_seconds()
+                article['url'] = mdate.group(0)
+        self.log('Found: ', len(articles), ' articles.\n')
+        return articles
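
Assuming the updated module above is importable on its own, a minimal smoke test of the new URL-to-section mapping and the check_words() class matcher could look like the sketch below; the article URLs are invented examples that merely follow the 20YYMMDD-story.html pattern the regexes expect, not real LA Times links.

    # Hypothetical check of what_section(); URLs are made-up examples.
    for url in [
        'http://www.latimes.com/local/obituaries/la-me-example-20180617-story.html',
        'http://www.latimes.com/business/hollywood/la-fi-example-20180617-story.html',
        'http://www.latimes.com/sports/la-sp-example-20180617-story.html',
    ]:
        print(what_section(url))  # OBITUARIES, then HOLLYWOOD, then SPORTS

    # check_words() returns a predicate over a whitespace-separated class
    # string, truthy when any of the given words appears in the value.
    match = check_words('pb-f-article-body')
    print(bool(match('pb-f-article-body extra-class')))  # True
    print(bool(match('unrelated-class')))                # False

The recipe can also be exercised end to end with calibre's test mode, e.g. ebook-convert la_times.recipe out.epub --test -vv, assuming the file is saved as la_times.recipe.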