Updated Arts and Letters Daily

Made the following changes:
1. Articles older than the `oldest_article` parameter are now ignored.
2. Articles are now grouped according to their section headers.
This commit is contained in:
PJ Paul 2018-11-05 17:29:20 +03:00 committed by GitHub
parent bd1db700fe
commit 58a51c94ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,14 +1,17 @@
#!/usr/bin/env python2
from __future__ import unicode_literals, division, absolute_import, print_function
__license__ = 'GPL v3'
__copyright__ = '2018, PJ Paul '
__copyright__ = '2018, PJ Paul'
'''
Recipe for Arts and Letters Daily website https://www.aldaily.com/
Recipe for Arts and Letters Daily website
'''
import re
from datetime import date as dt
from datetime import datetime
from datetime import timedelta
from itertools import compress

from calibre.web.feeds.news import BasicNewsRecipe
class ALD(BasicNewsRecipe):
    '''Calibre news recipe for Arts and Letters Daily (https://www.aldaily.com/).'''
    title = 'Arts and Letters Daily'
@ -19,19 +22,50 @@ class ALD(BasicNewsRecipe):
ignore_duplicate_articles = {'url'}  # drop repeated links across sections by URL
index = 'https://www.aldaily.com/alt/'  # text-only front page scraped by parse_index
cover_url = 'https://www.aldaily.com/static/images/header.gif'
__author__ = 'https://github.com/pjpaulpj'
language = 'en'
encoding = 'utf-8'
## Define parser for the page
def parse_index(self):
    '''Build the feed list for the ALD front page.

    Articles are grouped under their section headers ("Articles of Note",
    "New Books", everything else under "Essays and Opinions"), and any
    paragraph whose day header is older than ``self.oldest_article`` days
    is skipped.

    Returns a list of ``(section_title, article_dicts)`` tuples in the
    format expected by ``BasicNewsRecipe``.
    '''
    soup = self.index_to_soup(self.index)

    # Anything dated on or after (today - oldest_article days) is kept.
    oldest_date = dt.today() - timedelta(days=self.oldest_article)

    # Extract the day-header strings from the page, then subset them down
    # to the target dates for extraction.
    date_list = [self.tag_to_string(div)
                 for div in soup.findAll('div', attrs={'id': 'dayheader'})]
    # Replace punctuation with spaces (e.g. "Nov. 5, 2018" -> "Nov  5  2018")
    # so strptime's whitespace-tolerant '%b %d %Y' pattern can parse it.
    date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
    date_list_bool = [
        datetime.strptime(date, '%b %d %Y').date() >= oldest_date
        for date in date_list_clean
    ]
    # Keep the *original* header strings, since those are what the page
    # sibling lookups below will produce.
    compress_date = list(compress(date_list, date_list_bool))

    articles_note = []
    new_books = []
    essays = []
    # Process each paragraph one by one; stop a column as soon as the text
    # of the preceding day-header div is not in the target date list.
    for div in soup.findAll('div', attrs={'class': 'mobile-front'}):
        for p in div.findAll('p'):
            if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
                if p.find('a'):
                    title = self.tag_to_string(p)
                    link = p.find('a')['href']
                    entry = {'title': title, 'url': link,
                             'description': '', 'date': ''}
                    # The nearest preceding <h3> names the section.
                    section = self.tag_to_string(p.findPreviousSibling('h3'))
                    if section == "Articles of Note":
                        articles_note.append(entry)
                    elif section == "New Books":
                        new_books.append(entry)
                    else:
                        essays.append(entry)
            else:
                break
    feeds = [
        ('Articles of Note', articles_note),
        ('New Books', new_books),
        ('Essays and Opinions', essays),
    ]
    return feeds