mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-26 00:02:25 -04:00 
			
		
		
		
	Updated Arts and Letters Daily
Made the following changes: 1. Articles older than the oldest_article parameter are now ignored. 2. Articles are now grouped by their section headers.
This commit is contained in:
		
							parent
							
								
									bd1db700fe
								
							
						
					
					
						commit
						58a51c94ef
					
				| @ -1,14 +1,17 @@ | ||||
| #!/usr/bin/env  python2 | ||||
| from __future__ import unicode_literals, division, absolute_import, print_function | ||||
| __license__ = 'GPL v3' | ||||
| __copyright__ = '2018, PJ Paul ' | ||||
| __copyright__ = '2018, PJ Paul' | ||||
| 
 | ||||
| ''' | ||||
| Recipe for Arts and Letters Daily website https://www.aldaily.com/ | ||||
| Recipe for Arts and Letters Daily website | ||||
| ''' | ||||
| 
 | ||||
| from calibre.web.feeds.news import BasicNewsRecipe | ||||
| 
 | ||||
| from datetime import date as dt | ||||
| from datetime import timedelta | ||||
| from datetime import datetime | ||||
| from itertools import compress | ||||
| 
 | ||||
| class ALD(BasicNewsRecipe): | ||||
|     title          = 'Arts and Letters Daily' | ||||
| @ -19,19 +22,50 @@ class ALD(BasicNewsRecipe): | ||||
|     ignore_duplicate_articles = {'url'} | ||||
|     index = 'https://www.aldaily.com/alt/' | ||||
|     cover_url = 'https://www.aldaily.com/static/images/header.gif' | ||||
| 
 | ||||
|     __author__ = 'https://github.com/pjpaulpj' | ||||
|     language = 'en' | ||||
|     encoding = 'utf-8' | ||||
| 
 | ||||
|     ## Define parser for the page | ||||
|     def parse_index(self): | ||||
|         articles = [] | ||||
|         articles_note = [] | ||||
|         new_books = [] | ||||
|         essays = [] | ||||
|         feeds = [] | ||||
|         soup = self.index_to_soup(self.index) | ||||
|         for x in soup.findAll('p'): | ||||
|             if x.find('a'): | ||||
|                 title = self.tag_to_string(x) | ||||
|                 url = x.find('a')['href'] | ||||
|                 articles.append({'title':title, 'url':url, 'description': '', 'date':''}) | ||||
|         feeds.append(('Articles', articles)) | ||||
|         delta = timedelta(days = self.oldest_article) | ||||
|         now = dt.today() | ||||
|         oldest_date = now - delta | ||||
| 
 | ||||
|         # Extract a list of dates from the page.  | ||||
|         # Subset this out to the list of target dates for extraction. | ||||
|         date_list = []  | ||||
|         for div in soup.findAll('div', attrs = {'id': "dayheader"}): | ||||
|             date_list.append(self.tag_to_string(div)) | ||||
|         date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list] | ||||
|         date_list_bool = [datetime.strptime(date, '%b %d %Y').date() >= oldest_date for date in date_list_clean] | ||||
|         compress_date = list(compress(date_list, date_list_bool)) | ||||
| 
 | ||||
|         # Process each paragraph one by one. | ||||
|         # Stop when the text of the previous div is not in the target date list. | ||||
|         for div in soup.findAll('div', attrs = {'class': "mobile-front"}): | ||||
|             for p in div.findAll('p'): | ||||
|                 if self.tag_to_string(p.findPreviousSibling('div')) in compress_date: | ||||
|                     if p.find('a'): | ||||
|                         title =self.tag_to_string(p) | ||||
|                         link = p.find('a')['href'] | ||||
|                         if self.tag_to_string(p.findPreviousSibling('h3')) == "Articles of Note": | ||||
|                             articles_note.append({'title':title, 'url':link, 'description': '', 'date':''}) | ||||
|                         elif self.tag_to_string(p.findPreviousSibling('h3')) == "New Books": | ||||
|                             new_books.append({'title':title, 'url':link, 'description': '', 'date':''}) | ||||
|                         else:  | ||||
|                             essays.append({'title':title, 'url':link, 'description': '', 'date':''}) | ||||
|                 else: | ||||
|                     break | ||||
|         feeds.append(('Articles of Note', articles_note)) | ||||
|         feeds.append(('New Books', new_books)) | ||||
|         feeds.append(('Essays and Opinions', essays)) | ||||
|         return feeds | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user