Updated Arts and Letters Daily

Made the following changes:
1. Articles older than the oldest_article parameter are now ignored (a sketch of the date-filtering idea follows this list).
2. Articles are now grouped according to their section headers on the page.
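
For context, the new date filtering boils down to turning the oldest_article setting into a cutoff date, parsing the page's day headers, and keeping only the recent ones with itertools.compress. A minimal standalone sketch of that idea (the header strings and the 7-day cutoff below are illustrative placeholders, not values taken from the site):

    from datetime import date, datetime, timedelta
    from itertools import compress
    import re

    oldest_article = 7  # days, mirroring the recipe's oldest_article setting
    oldest_date = date.today() - timedelta(days=oldest_article)

    # Hypothetical day headers, formatted the way the recipe expects them.
    day_headers = [(date.today() - timedelta(days=n)).strftime('%b. %d, %Y') for n in (0, 3, 30)]

    # Strip punctuation so the strings match the '%b %d %Y' format, then
    # flag the headers that fall inside the cutoff window.
    cleaned = [re.sub(r'[^\w]', ' ', d) for d in day_headers]
    recent = [datetime.strptime(d, '%b %d %Y').date() >= oldest_date for d in cleaned]

    # Keep only the headers that are new enough (the 30-day-old one is dropped).
    print(list(compress(day_headers, recent)))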
PJ Paul 2018-11-05 17:29:20 +03:00 committed by GitHub
parent bd1db700fe
commit 58a51c94ef

@@ -4,11 +4,14 @@ __license__ = 'GPL v3'
 __copyright__ = '2018, PJ Paul'
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date as dt
+from datetime import timedelta
+from datetime import datetime
+from itertools import compress
+import re

 class ALD(BasicNewsRecipe):
     title = 'Arts and Letters Daily'
@@ -19,19 +22,50 @@ class ALD(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
     index = 'https://www.aldaily.com/alt/'
     cover_url = 'https://www.aldaily.com/static/images/header.gif'
     __author__ = 'https://github.com/pjpaulpj'
     language = 'en'
     encoding = 'utf-8'

+    # Define parser for the page
     def parse_index(self):
-        articles = []
+        articles_note = []
+        new_books = []
+        essays = []
         feeds = []
         soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
-            if x.find('a'):
-                title = self.tag_to_string(x)
-                url = x.find('a')['href']
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
-        feeds.append(('Articles', articles))
+        delta = timedelta(days=self.oldest_article)
+        now = dt.today()
+        oldest_date = now - delta
+
+        # Extract a list of dates from the page.
+        # Subset this out to the list of target dates for extraction.
+        date_list = []
+        for div in soup.findAll('div', attrs={'id': 'dayheader'}):
+            date_list.append(self.tag_to_string(div))
+        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
+        date_list_bool = [datetime.strptime(date, '%b %d %Y').date() >= oldest_date for date in date_list_clean]
+        compress_date = list(compress(date_list, date_list_bool))
+
+        # Process each paragraph one by one.
+        # Stop when the text of the previous div is not in the target date list.
+        for div in soup.findAll('div', attrs={'class': 'mobile-front'}):
+            for p in div.findAll('p'):
+                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
+                    if p.find('a'):
+                        title = self.tag_to_string(p)
+                        link = p.find('a')['href']
+                        if self.tag_to_string(p.findPreviousSibling('h3')) == "Articles of Note":
+                            articles_note.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        elif self.tag_to_string(p.findPreviousSibling('h3')) == "New Books":
+                            new_books.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        else:
+                            essays.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                else:
+                    break
+        feeds.append(('Articles of Note', articles_note))
+        feeds.append(('New Books', new_books))
+        feeds.append(('Essays and Opinions', essays))
         return feeds
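
For reference, parse_index hands calibre a list of (section title, article list) pairs, so with this change the returned value has roughly the shape below (the titles and URLs are placeholders for whatever the page lists that day):

    feeds = [
        ('Articles of Note', [
            {'title': 'An example article', 'url': 'https://example.com/a', 'description': '', 'date': ''},
        ]),
        ('New Books', [
            {'title': 'An example review', 'url': 'https://example.com/b', 'description': '', 'date': ''},
        ]),
        ('Essays and Opinions', [
            {'title': 'An example essay', 'url': 'https://example.com/c', 'description': '', 'date': ''},
        ]),
    ]

Each tuple becomes its own section in the generated e-book, which is what change 2 in the commit message refers to.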