From 58a51c94efd13b80458071319575d16d772dd6fc Mon Sep 17 00:00:00 2001
From: PJ Paul
Date: Mon, 5 Nov 2018 17:29:20 +0300
Subject: [PATCH] Updated Arts and Letters Daily

Made the following changes:
1. Articles older than the oldest_article parameter are now ignored.
2. Articles are now grouped by their section headers.
---
 recipes/ald.recipe | 54 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/recipes/ald.recipe b/recipes/ald.recipe
index c78c78d81a..f11d47e9f9 100644
--- a/recipes/ald.recipe
+++ b/recipes/ald.recipe
@@ -1,14 +1,18 @@
 #!/usr/bin/env python2
 from __future__ import unicode_literals, division, absolute_import, print_function
 __license__ = 'GPL v3'
-__copyright__ = '2018, PJ Paul '
+__copyright__ = '2018, PJ Paul'
 
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
+from datetime import date as dt
+from datetime import timedelta
+from datetime import datetime
+from itertools import compress
 
 class ALD(BasicNewsRecipe):
     title = 'Arts and Letters Daily'
@@ -19,19 +23,47 @@ class ALD(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
     index = 'https://www.aldaily.com/alt/'
     cover_url = 'https://www.aldaily.com/static/images/header.gif'
-
     __author__ = 'https://github.com/pjpaulpj'
     language = 'en'
     encoding = 'utf-8'
 
+    # Define the parser for the front page
     def parse_index(self):
-        articles = []
+        articles_note = []
+        new_books = []
+        essays = []
         feeds = []
         soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
-            if x.find('a'):
-                title = self.tag_to_string(x)
-                url = x.find('a')['href']
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
-        feeds.append(('Articles', articles))
+        delta = timedelta(days=self.oldest_article)
+        now = dt.today()
+        oldest_date = now - delta
+
+        # Extract the list of section dates from the page and keep
+        # only those no older than oldest_article days.
+        date_list = []
+        for div in soup.findAll('div', attrs={'id': 'dayheader'}):
+            date_list.append(self.tag_to_string(div))
+        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
+        date_list_bool = [datetime.strptime(date, '%b %d %Y').date() >= oldest_date for date in date_list_clean]
+        compress_date = list(compress(date_list, date_list_bool))
+
+        # Process the paragraphs one by one, sorting each into its section.
+        # Stop once the date of the preceding div falls outside the target dates.
+        for div in soup.findAll('div', attrs={'class': 'mobile-front'}):
+            for p in div.findAll('p'):
+                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
+                    if p.find('a'):
+                        title = self.tag_to_string(p)
+                        link = p.find('a')['href']
+                        if self.tag_to_string(p.findPreviousSibling('h3')) == 'Articles of Note':
+                            articles_note.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        elif self.tag_to_string(p.findPreviousSibling('h3')) == 'New Books':
+                            new_books.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        else:
+                            essays.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                else:
+                    break
+        feeds.append(('Articles of Note', articles_note))
+        feeds.append(('New Books', new_books))
+        feeds.append(('Essays and Opinions', essays))
         return feeds
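
For reviewers: a minimal standalone sketch of the date-cutoff filtering that
parse_index now performs, runnable outside calibre. The header strings and the
pinned "today" are hypothetical stand-ins for the site's actual dayheader text.

    # Sketch of the cutoff filtering used in parse_index above.
    import re
    from datetime import date, datetime, timedelta
    from itertools import compress

    oldest_article = 7  # days; BasicNewsRecipe defaults oldest_article to 7
    now = date(2018, 11, 5)  # pinned 'today' so the demo is reproducible
    cutoff = now - timedelta(days=oldest_article)

    headers = ['Nov. 5, 2018', 'Oct. 1, 2018']  # hypothetical dayheader text
    cleaned = [re.sub(r'[^\w]', ' ', h) for h in headers]
    recent = [datetime.strptime(c, '%b %d %Y').date() >= cutoff for c in cleaned]
    print(list(compress(headers, recent)))  # -> ['Nov. 5, 2018']

The recipe itself can be exercised end to end with calibre's documented recipe
test invocation: ebook-convert ald.recipe .epub --test -vv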