From 58a51c94efd13b80458071319575d16d772dd6fc Mon Sep 17 00:00:00 2001
From: PJ Paul
Date: Mon, 5 Nov 2018 17:29:20 +0300
Subject: [PATCH] Updated Arts and Letters Daily

Made the following changes:
1. Articles older than the oldest_article parameter are now ignored.
2. Articles are now grouped by their section headers.
---
 recipes/ald.recipe | 54 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/recipes/ald.recipe b/recipes/ald.recipe
index c78c78d81a..f11d47e9f9 100644
--- a/recipes/ald.recipe
+++ b/recipes/ald.recipe
@@ -1,14 +1,18 @@
 #!/usr/bin/env python2
 from __future__ import unicode_literals, division, absolute_import, print_function
 __license__ = 'GPL v3'
-__copyright__ = '2018, PJ Paul '
+__copyright__ = '2018, PJ Paul'
 
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
+from datetime import date as dt
+from datetime import timedelta
+from datetime import datetime
+from itertools import compress
 
 class ALD(BasicNewsRecipe):
     title = 'Arts and Letters Daily'
@@ -19,19 +23,47 @@ class ALD(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
     index = 'https://www.aldaily.com/alt/'
     cover_url = 'https://www.aldaily.com/static/images/header.gif'
-
     __author__ = 'https://github.com/pjpaulpj'
     language = 'en'
     encoding = 'utf-8'
 
+    # Define the parser for the front page
     def parse_index(self):
-        articles = []
+        articles_note = []
+        new_books = []
+        essays = []
         feeds = []
         soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
-            if x.find('a'):
-                title = self.tag_to_string(x)
-                url = x.find('a')['href']
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
-        feeds.append(('Articles', articles))
+        delta = timedelta(days=self.oldest_article)
+        now = dt.today()
+        oldest_date = now - delta
+
+        # Extract the list of section dates from the page and keep
+        # only those no older than oldest_article days.
+        date_list = []
+        for div in soup.findAll('div', attrs={'id': 'dayheader'}):
+            date_list.append(self.tag_to_string(div))
+        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
+        date_list_bool = [datetime.strptime(date, '%b %d %Y').date() >= oldest_date for date in date_list_clean]
+        compress_date = list(compress(date_list, date_list_bool))
+
+        # Process the paragraphs one by one, sorting each into its section.
+        # Stop once the date of the preceding div falls outside the target dates.
+        for div in soup.findAll('div', attrs={'class': 'mobile-front'}):
+            for p in div.findAll('p'):
+                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
+                    if p.find('a'):
+                        title = self.tag_to_string(p)
+                        link = p.find('a')['href']
+                        if self.tag_to_string(p.findPreviousSibling('h3')) == 'Articles of Note':
+                            articles_note.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        elif self.tag_to_string(p.findPreviousSibling('h3')) == 'New Books':
+                            new_books.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                        else:
+                            essays.append({'title': title, 'url': link, 'description': '', 'date': ''})
+                else:
+                    break
+        feeds.append(('Articles of Note', articles_note))
+        feeds.append(('New Books', new_books))
+        feeds.append(('Essays and Opinions', essays))
         return feeds
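
For reviewers: a minimal standalone sketch of the date-cutoff filtering that
parse_index now performs, runnable outside calibre. The header strings and the
pinned "today" are hypothetical stand-ins for the site's actual dayheader text.

    # Sketch of the cutoff filtering used in parse_index above.
    import re
    from datetime import date, datetime, timedelta
    from itertools import compress

    oldest_article = 7  # days; BasicNewsRecipe defaults oldest_article to 7
    now = date(2018, 11, 5)  # pinned 'today' so the demo is reproducible
    cutoff = now - timedelta(days=oldest_article)

    headers = ['Nov. 5, 2018', 'Oct. 1, 2018']  # hypothetical dayheader text
    cleaned = [re.sub(r'[^\w]', ' ', h) for h in headers]
    recent = [datetime.strptime(c, '%b %d %Y').date() >= cutoff for c in cleaned]
    print(list(compress(headers, recent)))  # -> ['Nov. 5, 2018']

The recipe itself can be exercised end to end with calibre's documented recipe
test invocation: ebook-convert ald.recipe .epub --test -vv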