Updated Arts and Letters Daily

Made the following changes: 1. Articles older than the oldest_article parameter are ignored. 2. Articles are now grouped by their section headers.
2025-12-11 23:55:44 -05:00 · 2018-11-05 17:29:20 +03:00 · 2018-11-05 17:29:20 +03:00 · 58a51c94ef
commit 58a51c94ef
parent bd1db700fe
1 changed files with 45 additions and 11 deletions
--- a/recipes/ald.recipe
+++ b/recipes/ald.recipe
@ -1,14 +1,17 @@
 #!/usr/bin/env  python2
 from __future__ import unicode_literals, division, absolute_import, print_function
 __license__ = 'GPL v3'
-__copyright__ = '2018, PJ Paul '
+__copyright__ = '2018, PJ Paul'
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
-
+from datetime import date as dt
 from datetime import timedelta
 from datetime import datetime
 from itertools import compress
 class ALD(BasicNewsRecipe):
    title          = 'Arts and Letters Daily'
@ -19,19 +22,50 @@ class ALD(BasicNewsRecipe):
    ignore_duplicate_articles = {'url'}
    index = 'https://www.aldaily.com/alt/'
    cover_url = 'https://www.aldaily.com/static/images/header.gif'
    __author__ = 'https://github.com/pjpaulpj'
    language = 'en'
    encoding = 'utf-8'
    ## Define parser for the page
    def parse_index(self):
-        articles = []
+        articles_note = []
        new_books = []
        essays = []
        feeds = []
        soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
+        delta = timedelta(days = self.oldest_article)
-            if x.find('a'):
+        now = dt.today()
-                title = self.tag_to_string(x)
+        oldest_date = now - delta
-                url = x.find('a')['href']
+
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
+        # Extract a list of dates from the page. 
-        feeds.append(('Articles', articles))
+        # Subset this out to the list of target dates for extraction.
        date_list = [] 
        for div in soup.findAll('div', attrs = {'id': "dayheader"}):
            date_list.append(self.tag_to_string(div))
        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
        date_list_bool = [datetime.strptime(date, '%b %d %Y').date() >= oldest_date for date in date_list_clean]
        compress_date = list(compress(date_list, date_list_bool))
        # Process each paragraph one by one.
        # Stop when the text of the previous div is not in the target date list.
        for div in soup.findAll('div', attrs = {'class': "mobile-front"}):
            for p in div.findAll('p'):
                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
                    if p.find('a'):
                        title =self.tag_to_string(p)
                        link = p.find('a')['href']
                        if self.tag_to_string(p.findPreviousSibling('h3')) == "Articles of Note":
                            articles_note.append({'title':title, 'url':link, 'description': '', 'date':''})
                        elif self.tag_to_string(p.findPreviousSibling('h3')) == "New Books":
                            new_books.append({'title':title, 'url':link, 'description': '', 'date':''})
                        else: 
                            essays.append({'title':title, 'url':link, 'description': '', 'date':''})
                else:
                    break
        feeds.append(('Articles of Note', articles_note))
        feeds.append(('New Books', new_books))
        feeds.append(('Essays and Opinions', essays))
        return feeds