Updated Arts and Letters Daily

Made the following changes:
1. Articles older than the `oldest_article` parameter are now ignored.
2. Articles are now grouped according to their section headers.
This commit is contained in:
PJ Paul 2018-11-05 17:29:20 +03:00 committed by GitHub
parent bd1db700fe
commit 58a51c94ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,14 +1,17 @@
#!/usr/bin/env python2
from __future__ import unicode_literals, division, absolute_import, print_function
__license__ = 'GPL v3'
__copyright__ = '2018, PJ Paul '
__copyright__ = '2018, PJ Paul'
'''
Recipe for Arts and Letters Daily website https://www.aldaily.com/
Recipe for Arts and Letters Daily website
'''
import re
from datetime import date as dt
from datetime import datetime
from datetime import timedelta
from itertools import compress

from calibre.web.feeds.news import BasicNewsRecipe
class ALD(BasicNewsRecipe):
    '''Calibre news recipe for Arts and Letters Daily (https://www.aldaily.com/).'''
    title = 'Arts and Letters Daily'
@ -19,19 +22,50 @@ class ALD(BasicNewsRecipe):
ignore_duplicate_articles = {'url'}  # drop repeated links across sections by URL
index = 'https://www.aldaily.com/alt/'  # text-only front page scraped by parse_index
cover_url = 'https://www.aldaily.com/static/images/header.gif'
__author__ = 'https://github.com/pjpaulpj'
language = 'en'
encoding = 'utf-8'
## Define parser for the page
def parse_index(self):
    '''Build the feed list for the ALD front page.

    Articles are grouped under their section headers ("Articles of Note",
    "New Books", everything else under "Essays and Opinions"), and any
    paragraph whose day header is older than ``self.oldest_article`` days
    is skipped.

    Returns a list of ``(section_title, article_dicts)`` tuples in the
    format expected by ``BasicNewsRecipe``.
    '''
    soup = self.index_to_soup(self.index)

    # Anything dated on or after (today - oldest_article days) is kept.
    oldest_date = dt.today() - timedelta(days=self.oldest_article)

    # Extract the day-header strings from the page, then subset them down
    # to the target dates for extraction.
    date_list = [self.tag_to_string(div)
                 for div in soup.findAll('div', attrs={'id': 'dayheader'})]
    # Replace punctuation with spaces (e.g. "Nov. 5, 2018" -> "Nov  5  2018")
    # so strptime's whitespace-tolerant '%b %d %Y' pattern can parse it.
    date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
    date_list_bool = [
        datetime.strptime(date, '%b %d %Y').date() >= oldest_date
        for date in date_list_clean
    ]
    # Keep the *original* header strings, since those are what the page
    # sibling lookups below will produce.
    compress_date = list(compress(date_list, date_list_bool))

    articles_note = []
    new_books = []
    essays = []
    # Process each paragraph one by one; stop a column as soon as the text
    # of the preceding day-header div is not in the target date list.
    for div in soup.findAll('div', attrs={'class': 'mobile-front'}):
        for p in div.findAll('p'):
            if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
                if p.find('a'):
                    title = self.tag_to_string(p)
                    link = p.find('a')['href']
                    entry = {'title': title, 'url': link,
                             'description': '', 'date': ''}
                    # The nearest preceding <h3> names the section.
                    section = self.tag_to_string(p.findPreviousSibling('h3'))
                    if section == "Articles of Note":
                        articles_note.append(entry)
                    elif section == "New Books":
                        new_books.append(entry)
                    else:
                        essays.append(entry)
            else:
                break
    feeds = [
        ('Articles of Note', articles_note),
        ('New Books', new_books),
        ('Essays and Opinions', essays),
    ]
    return feeds