This commit is contained in:
Kovid Goyal 2018-11-05 20:59:59 +05:30
commit 7de8c6da71
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -1,37 +1,88 @@
#!/usr/bin/env python2
from __future__ import unicode_literals, division, absolute_import, print_function
__license__ = 'GPL v3'
__copyright__ = '2018, PJ Paul '
__copyright__ = '2018, PJ Paul'
'''
Recipe for Arts and Letters Daily website https://www.aldaily.com/
Recipe for Arts and Letters Daily website
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
from datetime import date as dt
from datetime import timedelta
from datetime import datetime
from itertools import compress
class ALD(BasicNewsRecipe):
    '''
    Calibre recipe for Arts and Letters Daily (https://www.aldaily.com/).

    Scrapes the mobile front page and builds three feeds matching the
    site's sections: "Articles of Note", "New Books", and
    "Essays and Opinions", keeping only entries no older than
    ``oldest_article`` days.
    '''
    title = 'Arts and Letters Daily'
    oldest_article = 7
    max_articles_per_feed = 100
    auto_cleanup = True
    recursions = 0
    ignore_duplicate_articles = {'url'}
    index = 'https://www.aldaily.com/alt/'
    cover_url = 'https://www.aldaily.com/static/images/header.gif'
    __author__ = 'https://github.com/pjpaulpj'
    language = 'en'
    encoding = 'utf-8'

    def parse_index(self):
        '''
        Return the list of (section title, articles) feeds.

        Each article is a dict with 'title', 'url', 'description' and
        'date' keys, as expected by BasicNewsRecipe.
        '''
        articles_note = []
        new_books = []
        essays = []
        feeds = []
        soup = self.index_to_soup(self.index)
        delta = timedelta(days=self.oldest_article)
        now = dt.today()
        oldest_date = now - delta
        # Extract a list of dates from the page.
        # Subset this out to the list of target dates for extraction.
        date_list = []
        for div in soup.findAll('div', attrs={'id': "dayheader"}):
            date_list.append(self.tag_to_string(div))
        # Replace punctuation with spaces (e.g. "Nov 5, 2018" -> "Nov 5  2018")
        # so the headers parse with '%b %d %Y'; strptime treats whitespace in
        # the format as matching any run of whitespace.
        date_list_clean = [re.sub(r'[^\w]', ' ', date) for date in date_list]
        date_list_bool = [
            datetime.strptime(date, '%b %d %Y').date() >= oldest_date
            for date in date_list_clean
        ]
        # Day headers recent enough to keep (original, unstripped strings,
        # so they can be compared against the page text below).
        compress_date = list(compress(date_list, date_list_bool))
        # Process each paragraph one by one.
        # Stop when the text of the previous div is not in the target date list.
        for div in soup.findAll('div', attrs={'class': "mobile-front"}):
            for p in div.findAll('p'):
                if self.tag_to_string(p.findPreviousSibling('div')) in compress_date:
                    if p.find('a'):
                        title = self.tag_to_string(p)
                        link = p.find('a')['href']
                        entry = {
                            'title': title,
                            'url': link,
                            'description': '',
                            'date': ''
                        }
                        # The nearest preceding <h3> names the site section
                        # this paragraph belongs to.
                        section = self.tag_to_string(p.findPreviousSibling('h3'))
                        if section == "Articles of Note":
                            articles_note.append(entry)
                        elif section == "New Books":
                            new_books.append(entry)
                        else:
                            essays.append(entry)
                else:
                    break
        feeds.append(('Articles of Note', articles_note))
        feeds.append(('New Books', new_books))
        feeds.append(('Essays and Opinions', essays))
        return feeds