Update LA Times

This commit is contained in:
Kovid Goyal 2020-03-15 18:29:26 +05:30
parent ac0d67ee6f
commit a15acae96d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -2,13 +2,9 @@
import re import re
from collections import defaultdict from collections import defaultdict
from pprint import pformat
from calibre.utils.date import strptime, utcnow
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
DT_EPOCH = strptime('1970-01-01', '%Y-%m-%d', assume_utc=True)
DIR_COLLECTIONS = [['world'], DIR_COLLECTIONS = [['world'],
['nation'], ['nation'],
['politics'], ['politics'],
@@ -29,84 +25,22 @@ DIR_COLLECTIONS = [['world'],
['travel'], ['travel'],
['fashion']] ['fashion']]
SECTIONS=['THE WORLD',
'THE NATION', def classes(classes):
'POLITICS', q = frozenset(classes.split(' '))
'OPINION', return dict(attrs={
'CALIFORNIA', 'class': lambda x: x and frozenset(x.split()).intersection(q)})
'OBITUARIES',
'BUSINESS',
'HOLLYWOOD',
'SPORTS',
'ENTERTAINMENT',
'MOVIES',
'TELEVISION',
'BOOKS',
'FOOD',
'HEALTH',
'SCIENCE AND TECHNOLOGY',
'HOME',
'TRAVEL',
'FASHION',
'NEWSLETTERS'
'OTHER']
def absurl(url): def absurl(url):
if url.startswith('/'): if url.startswith('/'):
url = 'http://www.latimes.com' + url url = 'https://www.latimes.com' + url
return url return url
def check_words(words):
return lambda x: x and frozenset(words.split()).intersection(x.split())
def what_section(url): def what_section(url):
if re.compile(r'^https?://www[.]latimes[.]com/local/obituaries').search(url): parts = url.split('/')
return 'OBITUARIES' return parts[-4].capitalize()
elif re.compile(r'^https?://www[.]latimes[.]com/business/hollywood').search(url):
return 'HOLLYWOOD'
elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/movies').search(url):
return 'MOVIES'
elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/tv').search(url):
return 'TELEVISION'
elif re.compile(r'^https?://www[.]latimes[.]com/business/technology').search(url):
return 'SCIENCE AND TECHNOLOGY'
elif re.compile(r'^https?://www[.]latimes[.]com/world').search(url):
return 'THE WORLD'
elif re.compile(r'^https?://www[.]latimes[.]com/nation').search(url):
return 'THE NATION'
elif re.compile(r'^https?://www[.]latimes[.]com/politics').search(url):
return 'POLITICS'
elif re.compile(r'^https?://www[.]latimes[.]com/opinion').search(url):
return 'OPINION'
elif re.compile(r'^https?://www[.]latimes[.]com/(?:local|style)').search(url):
return 'CALIFORNIA'
elif re.compile(r'^https?://www[.]latimes[.]com/business').search(url):
return 'BUSINESS'
elif re.compile(r'^https?://www[.]latimes[.]com/sports').search(url):
return 'SPORTS'
elif re.compile(r'^https?://www[.]latimes[.]com/entertainment').search(url):
return 'ENTERTAINMENT'
elif re.compile(r'^https?://www[.]latimes[.]com/books').search(url):
return 'BOOKS'
elif re.compile(r'^https?://www[.]latimes[.]com/food').search(url):
return 'FOOD'
elif re.compile(r'^https?://www[.]latimes[.]com/health').search(url):
return 'HEALTH'
elif re.compile(r'^https?://www[.]latimes[.]com/science').search(url):
return 'SCIENCE AND TECHNOLOGY'
elif re.compile(r'^https?://www[.]latimes[.]com/home').search(url):
return 'HOME'
elif re.compile(r'^https?://www[.]latimes[.]com/travel').search(url):
return 'TRAVEL'
elif re.compile(r'^https?://www[.]latimes[.]com/fashion').search(url):
return 'FASHION'
elif re.compile(r'^https?://www[.]latimes[.]com/newsletter').search(url):
return 'NEWSLETTERS'
else:
return 'OTHER'
class LATimes(BasicNewsRecipe): class LATimes(BasicNewsRecipe):
@@ -126,32 +60,25 @@ class LATimes(BasicNewsRecipe):
cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf' cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
keep_only_tags = [ keep_only_tags = [
dict(name='header', attrs={'id': 'top'}), classes('ArticlePage-breadcrumbs ArticlePage-headline ArticlePage-mainContent'),
dict(name='article'),
dict(name='div', attrs={'id': 'liveblog-story-wrapper'})
] ]
remove_tags= [ remove_tags= [
dict(name='div', attrs={'class': check_words( classes('ArticlePage-actions Enhancement hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')
'hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')})
]
remove_tags_after = [
dict(name='div', attrs={'class': check_words('pb-f-article-body')})
] ]
def parse_index(self): def parse_index(self):
index = 'http://www.latimes.com/' index = 'https://www.latimes.com/'
pat = r'^(?:https?://www[.]latimes[.]com)?/[^#]+20[0-9]{6}-(?:html)?story[.]html' pat = r'^https://www\.latimes\.com/[^/]+?/story/20\d{2}-\d{2}-\d{2}/\S+'
articles = self.find_articles(index, pat) articles = self.find_articles(index, pat)
for collection in DIR_COLLECTIONS: for collection in DIR_COLLECTIONS:
if self.test:
continue
topdir = collection.pop(0) topdir = collection.pop(0)
index = 'http://www.latimes.com/' + topdir + '/' collection_index = index + topdir + '/'
pat = r'^(?:https?://www[.]latimes[.]com)?/' + \ articles += self.find_articles(collection_index, pat)
topdir + '/[^#]+20[0-9]{6}-(?:html)?story[.]html'
articles += self.find_articles(index, pat)
for subdir in collection: for subdir in collection:
sub_index = index + subdir + '/' sub_index = collection_index + subdir + '/'
articles += self.find_articles(sub_index, pat) articles += self.find_articles(sub_index, pat)
feeds = defaultdict(list) feeds = defaultdict(list)
@@ -159,12 +86,7 @@ class LATimes(BasicNewsRecipe):
section = what_section(article['url']) section = what_section(article['url'])
feeds[section].append(article) feeds[section].append(article)
keys = [] return [(k, feeds[k]) for k in sorted(feeds)]
for key in SECTIONS:
if key in feeds.keys():
keys.append(key)
self.log(pformat(dict(feeds)))
return [(k, feeds[k]) for k in keys]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-src': True}): for img in soup.findAll('img', attrs={'data-src': True}):
@@ -190,16 +112,6 @@ class LATimes(BasicNewsRecipe):
alinks = [a for a in alinks if len( alinks = [a for a in alinks if len(
a.contents) == 1 and a.find(text=True, recursive=False)] a.contents) == 1 and a.find(text=True, recursive=False)]
articles = [ articles = [
{'title': a.find(text=True), 'url': absurl(a['href'])} for a in alinks] {'title': self.tag_to_string(a), 'url': absurl(a['href'])} for a in alinks]
date_rx = re.compile(
r'^https?://www[.]latimes[.]com/[^#]+-(?P<date>20[0-9]{6})-(?:html)?story[.]html')
for article in articles:
mdate = date_rx.match(article['url'])
if mdate is not None:
try:
article['timestamp'] = (strptime(mdate.group('date'),'%Y%m%d') - DT_EPOCH).total_seconds()
except Exception:
article['timestamp'] = (utcnow() - DT_EPOCH).total_seconds()
article['url'] = mdate.group(0)
self.log('Found: ', len(articles), ' articles.\n') self.log('Found: ', len(articles), ' articles.\n')
return articles return articles