Merge branch 'patch-2' of https://github.com/pjpaulpj/calibre

2025-12-10 23:25:01 -05:00 · 2018-11-05 20:59:59 +05:30 · 2018-11-05 20:59:59 +05:30 · 7de8c6da71
commit 7de8c6da71
parent bd1db700fe 58a51c94ef
1 changed files with 64 additions and 13 deletions
--- a/recipes/ald.recipe
+++ b/recipes/ald.recipe
@@ -1,13 +1,17 @@
 #!/usr/bin/env  python2
 from __future__ import unicode_literals, division, absolute_import, print_function
 __license__ = 'GPL v3'
-__copyright__ = '2018, PJ Paul '
-
+__copyright__ = '2018, PJ Paul'
 '''
-Recipe for Arts and Letters Daily website https://www.aldaily.com/
+Recipe for Arts and Letters Daily website
 '''

 from calibre.web.feeds.news import BasicNewsRecipe
+import re
+from datetime import date as dt
+from datetime import timedelta
+from datetime import datetime
+from itertools import compress


 class ALD(BasicNewsRecipe):
@@ -19,19 +23,66 @@ class ALD(BasicNewsRecipe):
    ignore_duplicate_articles = {'url'}
    index = 'https://www.aldaily.com/alt/'
    cover_url = 'https://www.aldaily.com/static/images/header.gif'
-
    __author__ = 'https://github.com/pjpaulpj'
    language = 'en'
    encoding = 'utf-8'

    def parse_index(self):
-        articles = []
+        articles_note = []
+        new_books = []
+        essays = []
        feeds = []
        soup = self.index_to_soup(self.index)
-        for x in soup.findAll('p'):
-            if x.find('a'):
-                title = self.tag_to_string(x)
-                url = x.find('a')['href']
-                articles.append({'title':title, 'url':url, 'description': '', 'date':''})
-        feeds.append(('Articles', articles))
+        delta = timedelta(days=self.oldest_article)
+        now = dt.today()
+        oldest_date = now - delta
+
+        # Extract a list of dates from the page.
+        # Subset this out to the list of target dates for extraction.
+        date_list = []
+        for div in soup.findAll('div', attrs={'id': "dayheader"}):
+            date_list.append(self.tag_to_string(div))
+        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
+        date_list_bool = [
+            datetime.strptime(date, '%b %d %Y').date() >= oldest_date
+            for date in date_list_clean
+        ]
+        compress_date = list(compress(date_list, date_list_bool))
+
+        # Process each paragraph one by one.
+        # Stop when the text of the previous div is not in the target date list.
+        for div in soup.findAll('div', attrs={'class': "mobile-front"}):
+            for p in div.findAll('p'):
+                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
+                    if p.find('a'):
+                        title = self.tag_to_string(p)
+                        link = p.find('a')['href']
+                        if self.tag_to_string(p.findPreviousSibling('h3')
+                                              ) == "Articles of Note":
+                            articles_note.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                        elif self.tag_to_string(p.findPreviousSibling('h3')
+                                                ) == "New Books":
+                            new_books.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                        else:
+                            essays.append({
+                                'title': title,
+                                'url': link,
+                                'description': '',
+                                'date': ''
+                            })
+                else:
+                    break
+        feeds.append(('Articles of Note', articles_note))
+        feeds.append(('New Books', new_books))
+        feeds.append(('Essays and Opinions', essays))
        return feeds