diff --git a/recipes/ald.recipe b/recipes/ald.recipe
index c78c78d81a..bb3fe54195 100644
--- a/recipes/ald.recipe
+++ b/recipes/ald.recipe
@@ -1,37 +1,88 @@
 #!/usr/bin/env python2
 from __future__ import unicode_literals, division, absolute_import, print_function
 __license__ = 'GPL v3'
-__copyright__ = '2018, PJ Paul '
-
+__copyright__ = '2018, PJ Paul'
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
+import re
+from datetime import date as dt
+from datetime import timedelta
+from datetime import datetime
+from itertools import compress
 
 
 class ALD(BasicNewsRecipe):
-    title          = 'Arts and Letters Daily'
+    title = 'Arts and Letters Daily'
     oldest_article = 7
     max_articles_per_feed = 100
-    auto_cleanup   = True
+    auto_cleanup = True
     recursions = 0
     ignore_duplicate_articles = {'url'}
     index = 'https://www.aldaily.com/alt/'
     cover_url = 'https://www.aldaily.com/static/images/header.gif'
-    __author__ = 'https://github.com/pjpaulpj'
     language = 'en'
     encoding = 'utf-8'
 
     def parse_index(self):
-        articles = []
+        articles_note = []
+        new_books = []
+        essays = []
         feeds = []
         soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
-            if x.find('a'):
-                title = self.tag_to_string(x)
-                url = x.find('a')['href']
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
-        feeds.append(('Articles', articles))
+        delta = timedelta(days=self.oldest_article)
+        now = dt.today()
+        oldest_date = now - delta
+
+        # Extract a list of dates from the page.
+        # Subset this out to the list of target dates for extraction.
+        date_list = []
+        for div in soup.findAll('div', attrs={'id': "dayheader"}):
+            date_list.append(self.tag_to_string(div))
+        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
+        date_list_bool = [
+            datetime.strptime(date, '%b %d %Y').date() >= oldest_date
+            for date in date_list_clean
+        ]
+        compress_date = list(compress(date_list, date_list_bool))
+
+        # Process each paragraph one by one.
+        # Stop when the text of the previous div is not in the target date list.
+        for div in soup.findAll('div', attrs={'class': "mobile-front"}):
+            for p in div.findAll('p'):
+                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
+                    if p.find('a'):
+                        title = self.tag_to_string(p)
+                        link = p.find('a')['href']
+                        if self.tag_to_string(p.findPreviousSibling('h3')
+                                              ) == "Articles of Note":
+                            articles_note.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                        elif self.tag_to_string(p.findPreviousSibling('h3')
+                                                ) == "New Books":
+                            new_books.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                        else:
+                            essays.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                else:
+                    break
+        feeds.append(('Articles of Note', articles_note))
+        feeds.append(('New Books', new_books))
+        feeds.append(('Essays and Opinions', essays))
         return feeds
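
For reviewers, here is a minimal standalone sketch of the date-window filter the new `parse_index` builds with `re.sub`, `strptime`, and `itertools.compress`. The day-header strings and the fixed reference date below are invented for a repeatable demo; they are not taken from the live site.

```python
import re
from datetime import date, datetime, timedelta
from itertools import compress

# Hypothetical day-header strings, shaped like what the recipe expects
# to parse with '%b %d %Y' once punctuation is stripped.
raw_dates = ['Nov. 5, 2018', 'Oct. 29, 2018', 'Jan. 2, 2018']

# Fixed "today" instead of dt.today() so the demo output is stable.
oldest_date = date(2018, 11, 5) - timedelta(days=7)

# Replace every non-word character with a space. strptime treats literal
# spaces in the format as runs of whitespace, so 'Nov. 5, 2018' becomes
# 'Nov  5  2018' and still parses.
cleaned = [re.sub(r'[^\w]', ' ', d) for d in raw_dates]
keep = [datetime.strptime(d, '%b %d %Y').date() >= oldest_date for d in cleaned]

# compress() keeps only the headers whose flag is True.
print(list(compress(raw_dates, keep)))  # ['Nov. 5, 2018', 'Oct. 29, 2018']
```

One caveat worth noting: `%b` is locale-dependent, so the comparison assumes English month abbreviations.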
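
The feed classification relies on sibling order: each `<p>` is attributed to the nearest preceding `h3` section heading and `div` day header. Below is a toy sketch of that mechanism with invented markup mirroring the `div.mobile-front` / `div#dayheader` / `h3` / `p` structure the patch walks, using plain `bs4` in place of calibre's soup wrapper (the real page layout may differ).

```python
from bs4 import BeautifulSoup

# Invented stand-in for one day block on the index page.
html = '''
<div class="mobile-front">
  <div id="dayheader">Nov. 5, 2018</div>
  <h3>Articles of Note</h3>
  <p><a href="https://example.com/a">First link</a></p>
  <h3>New Books</h3>
  <p><a href="https://example.com/b">Second link</a></p>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
for p in soup.find_all('p'):
    # Walking backwards through siblings finds the day header and the
    # section heading this paragraph falls under.
    day = p.findPreviousSibling('div').get_text(strip=True)
    section = p.findPreviousSibling('h3').get_text(strip=True)
    print(day, '|', section, '|', p.a['href'])
# Nov. 5, 2018 | Articles of Note | https://example.com/a
# Nov. 5, 2018 | New Books | https://example.com/b
```

The `break` in the real loop presumably relies on day blocks appearing newest-first, so the scan can stop at the first paragraph whose day header falls outside the window. To exercise the recipe end to end, calibre's recipe docs suggest a test build along the lines of `ebook-convert ald.recipe .epub --test -vv`.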