Update LA Times

parent ac0d67ee6f
commit a15acae96d
@@ -2,13 +2,9 @@
 import re
 from collections import defaultdict
-from pprint import pformat
 
-from calibre.utils.date import strptime, utcnow
 from calibre.web.feeds.news import BasicNewsRecipe
 
-DT_EPOCH = strptime('1970-01-01', '%Y-%m-%d', assume_utc=True)
-
 
 DIR_COLLECTIONS = [['world'],
                    ['nation'],
                    ['politics'],
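Note: the DT_EPOCH constant dropped here existed only to turn dates embedded in the old story URLs into epoch seconds, and its last use goes away in the final hunk below. A standalone sketch of what it computed, with stdlib datetime standing in for calibre.utils.date.strptime and an illustrative date:

```python
# Sketch of the removed timestamp computation: epoch seconds for a
# YYYYMMDD date taken from the old URL scheme. The date is made up.
from datetime import datetime, timezone

DT_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)
mdate = datetime.strptime('20190601', '%Y%m%d').replace(tzinfo=timezone.utc)
print((mdate - DT_EPOCH).total_seconds())  # 1559347200.0
```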
@@ -29,84 +25,22 @@ DIR_COLLECTIONS = [['world'],
                    ['travel'],
                    ['fashion']]
 
-SECTIONS=['THE WORLD',
-          'THE NATION',
-          'POLITICS',
-          'OPINION',
-          'CALIFORNIA',
-          'OBITUARIES',
-          'BUSINESS',
-          'HOLLYWOOD',
-          'SPORTS',
-          'ENTERTAINMENT',
-          'MOVIES',
-          'TELEVISION',
-          'BOOKS',
-          'FOOD',
-          'HEALTH',
-          'SCIENCE AND TECHNOLOGY',
-          'HOME',
-          'TRAVEL',
-          'FASHION',
-          'NEWSLETTERS'
-          'OTHER']
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 
 def absurl(url):
     if url.startswith('/'):
-        url = 'http://www.latimes.com' + url
+        url = 'https://www.latimes.com' + url
     return url
 
 
-def check_words(words):
-    return lambda x: x and frozenset(words.split()).intersection(x.split())
-
-
 def what_section(url):
-    if re.compile(r'^https?://www[.]latimes[.]com/local/obituaries').search(url):
-        return 'OBITUARIES'
-    elif re.compile(r'^https?://www[.]latimes[.]com/business/hollywood').search(url):
-        return 'HOLLYWOOD'
-    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/movies').search(url):
-        return 'MOVIES'
-    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment/tv').search(url):
-        return 'TELEVISION'
-    elif re.compile(r'^https?://www[.]latimes[.]com/business/technology').search(url):
-        return 'SCIENCE AND TECHNOLOGY'
-    elif re.compile(r'^https?://www[.]latimes[.]com/world').search(url):
-        return 'THE WORLD'
-    elif re.compile(r'^https?://www[.]latimes[.]com/nation').search(url):
-        return 'THE NATION'
-    elif re.compile(r'^https?://www[.]latimes[.]com/politics').search(url):
-        return 'POLITICS'
-    elif re.compile(r'^https?://www[.]latimes[.]com/opinion').search(url):
-        return 'OPINION'
-    elif re.compile(r'^https?://www[.]latimes[.]com/(?:local|style)').search(url):
-        return 'CALIFORNIA'
-    elif re.compile(r'^https?://www[.]latimes[.]com/business').search(url):
-        return 'BUSINESS'
-    elif re.compile(r'^https?://www[.]latimes[.]com/sports').search(url):
-        return 'SPORTS'
-    elif re.compile(r'^https?://www[.]latimes[.]com/entertainment').search(url):
-        return 'ENTERTAINMENT'
-    elif re.compile(r'^https?://www[.]latimes[.]com/books').search(url):
-        return 'BOOKS'
-    elif re.compile(r'^https?://www[.]latimes[.]com/food').search(url):
-        return 'FOOD'
-    elif re.compile(r'^https?://www[.]latimes[.]com/health').search(url):
-        return 'HEALTH'
-    elif re.compile(r'^https?://www[.]latimes[.]com/science').search(url):
-        return 'SCIENCE AND TECHNOLOGY'
-    elif re.compile(r'^https?://www[.]latimes[.]com/home').search(url):
-        return 'HOME'
-    elif re.compile(r'^https?://www[.]latimes[.]com/travel').search(url):
-        return 'TRAVEL'
-    elif re.compile(r'^https?://www[.]latimes[.]com/fashion').search(url):
-        return 'FASHION'
-    elif re.compile(r'^https?://www[.]latimes[.]com/newsletter').search(url):
-        return 'NEWSLETTERS'
-    else:
-        return 'OTHER'
+    parts = url.split('/')
+    return parts[-4].capitalize()
 
 
 class LATimes(BasicNewsRecipe):
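Two notes on this hunk: classes() builds an attrs dict whose 'class' entry is a predicate matching any tag that shares at least one class name with the query, and what_section() now reads the section straight out of the new URL layout instead of walking twenty-one regex branches. A short sketch of both, with an illustrative URL that is not taken from the commit:

```python
def classes(classes):
    # Predicate matcher: truthy when the tag's class list intersects the query.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

def what_section(url):
    # New URLs look like /<section>/story/<YYYY-MM-DD>/<slug>,
    # so the section is always the fourth component from the end.
    parts = url.split('/')
    return parts[-4].capitalize()

matcher = classes('ArticlePage-headline foo')['attrs']['class']
print(bool(matcher('ArticlePage-headline something-else')))  # True: one class in common
print(bool(matcher('unrelated')))                            # False: no overlap

print(what_section('https://www.latimes.com/politics/story/2019-06-01/some-slug'))
# -> 'Politics'
```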
@@ -126,32 +60,25 @@ class LATimes(BasicNewsRecipe):
     cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
 
     keep_only_tags = [
-        dict(name='header', attrs={'id': 'top'}),
-        dict(name='article'),
-        dict(name='div', attrs={'id': 'liveblog-story-wrapper'})
+        classes('ArticlePage-breadcrumbs ArticlePage-headline ArticlePage-mainContent'),
     ]
 
     remove_tags= [
-        dict(name='div', attrs={'class': check_words(
-            'hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')})
-    ]
-
-    remove_tags_after = [
-        dict(name='div', attrs={'class': check_words('pb-f-article-body')})
+        classes('ArticlePage-actions Enhancement hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')
     ]
 
     def parse_index(self):
-        index = 'http://www.latimes.com/'
-        pat = r'^(?:https?://www[.]latimes[.]com)?/[^#]+20[0-9]{6}-(?:html)?story[.]html'
+        index = 'https://www.latimes.com/'
+        pat = r'^https://www\.latimes\.com/[^/]+?/story/20\d{2}-\d{2}-\d{2}/\S+'
         articles = self.find_articles(index, pat)
         for collection in DIR_COLLECTIONS:
+            if self.test:
+                continue
             topdir = collection.pop(0)
-            index = 'http://www.latimes.com/' + topdir + '/'
-            pat = r'^(?:https?://www[.]latimes[.]com)?/' + \
-                topdir + '/[^#]+20[0-9]{6}-(?:html)?story[.]html'
-            articles += self.find_articles(index, pat)
+            collection_index = index + topdir + '/'
+            articles += self.find_articles(collection_index, pat)
             for subdir in collection:
-                sub_index = index + subdir + '/'
+                sub_index = collection_index + subdir + '/'
                 articles += self.find_articles(sub_index, pat)
 
         feeds = defaultdict(list)
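The new pat anchors on the current /section/story/YYYY-MM-DD/slug URL layout instead of the retired ...-YYYYMMDD-story.html scheme, which also makes the per-collection pattern rebuilding unnecessary. A quick sanity check against invented URLs:

```python
import re

# New story-URL pattern from the commit; both URLs below are made up for the test.
pat = r'^https://www\.latimes\.com/[^/]+?/story/20\d{2}-\d{2}-\d{2}/\S+'
print(bool(re.match(pat, 'https://www.latimes.com/california/story/2019-06-01/example-slug')))  # True
print(bool(re.match(pat, 'https://www.latimes.com/la-me-example-20190601-story.html')))         # False: old scheme
```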
@@ -159,12 +86,7 @@ class LATimes(BasicNewsRecipe):
             section = what_section(article['url'])
             feeds[section].append(article)
 
-        keys = []
-        for key in SECTIONS:
-            if key in feeds.keys():
-                keys.append(key)
-        self.log(pformat(dict(feeds)))
-        return [(k, feeds[k]) for k in keys]
+        return [(k, feeds[k]) for k in sorted(feeds)]
 
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'data-src': True}):
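With the hard-coded SECTIONS ordering gone, parse_index now emits feeds in plain alphabetical order of whatever section names what_section() produced. A tiny illustration with made-up data:

```python
from collections import defaultdict

# Group articles by section, then emit (section, articles) pairs sorted by name.
feeds = defaultdict(list)
for section, article in [('World', 'a1'), ('Books', 'b1'), ('World', 'a2')]:
    feeds[section].append(article)

print([(k, feeds[k]) for k in sorted(feeds)])
# -> [('Books', ['b1']), ('World', ['a1', 'a2'])]
```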
@@ -190,16 +112,6 @@
         alinks = [a for a in alinks if len(
             a.contents) == 1 and a.find(text=True, recursive=False)]
         articles = [
-            {'title': a.find(text=True), 'url': absurl(a['href'])} for a in alinks]
-        date_rx = re.compile(
-            r'^https?://www[.]latimes[.]com/[^#]+-(?P<date>20[0-9]{6})-(?:html)?story[.]html')
-        for article in articles:
-            mdate = date_rx.match(article['url'])
-            if mdate is not None:
-                try:
-                    article['timestamp'] = (strptime(mdate.group('date'),'%Y%m%d') - DT_EPOCH).total_seconds()
-                except Exception:
-                    article['timestamp'] = (utcnow() - DT_EPOCH).total_seconds()
-                article['url'] = mdate.group(0)
+            {'title': self.tag_to_string(a), 'url': absurl(a['href'])} for a in alinks]
         self.log('Found: ', len(articles), ' articles.\n')
         return articles
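self.tag_to_string(a) gathers all the text inside the link, whereas the old a.find(text=True) returned only the first text node. A rough standalone sketch of the difference, using bs4 directly as an assumption (the recipe itself goes through calibre's BeautifulSoup wrapper, and tag_to_string additionally handles alt text and whitespace normalization):

```python
from bs4 import BeautifulSoup

a = BeautifulSoup('<a href="/x"><b>Big</b> news</a>', 'html.parser').a
print(a.find(text=True))            # 'Big'      - first text node only
print(a.get_text(' ', strip=True))  # 'Big news' - roughly what tag_to_string returns
```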