calibre/recipes/letsgetcritical.recipe
2019-09-08 14:07:59 +05:30

96 lines
3.4 KiB
Plaintext

import re
from calibre.web.feeds.news import BasicNewsRecipe
class LetsGetCritical(BasicNewsRecipe):
    """Recipe that builds one feed per category of letsgetcritical.org.

    Each category listing is read page by page; every ``div.post_multi``
    entry on a listing page becomes one article whose URL is taken from the
    link inside the entry's ``div.title``.
    """

    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://www.letsgetcritical.org'
    # Listing pages are only read while the page number is below this bound,
    # so a category can never cause unbounded pagination.
    PAGE_LIMIT = 100
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10),
    ]

    # Captures the host name (without a leading "www.") of an article URL in
    # group 2; used to prefix descriptions with the article's source site.
    _SOURCE_HOST = re.compile(r'https?://(www\.)?([^/:]+)', re.I)

    def parse_index(self):
        """Scrape every configured category and return the list of feeds.

        Returns:
            A list of ``(category name, articles)`` tuples, where each
            article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``. Categories that yield no articles
            are left out. A URL is listed at most once across all
            categories (the first category that reaches it wins).
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set()
        for cat_name, tag, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = self._read_category(tag, max_articles, seen_urls)
            if articles:
                feeds.append((cat_name, articles))
        return feeds

    def _read_category(self, tag, max_articles, seen_urls):
        """Collect up to ``max_articles`` articles from one category.

        ``tag`` is the category's system name; an empty string selects the
        site's front page listing. ``seen_urls`` is shared between
        categories and is updated in place with every accepted URL.
        """
        tagurl = '/category/' + tag.lower() if tag else ''
        articles = []
        pageno = 1
        while len(articles) < max_articles and pageno < self.PAGE_LIMIT:
            # The first page has no "/page/N" suffix.
            if pageno > 1:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
            else:
                page = self.INDEX + tagurl
            pageno += 1
            self.log('\tReading page:', page)
            try:
                soup = self.index_to_soup(page)
            except Exception as err:
                # Best effort: any fetch/parse failure ends this category,
                # but it is logged so the reason is visible in the job log.
                self.log('\tCould not read page, stopping category:', page, err)
                break
            posts = soup.findAll('div', attrs={'class': 'post_multi'})
            if not posts:
                break
            for post in posts:
                article = self._parse_post(post, seen_urls)
                if article is None:
                    continue
                articles.append(article)
                if len(articles) >= max_articles:
                    break
        return articles

    def _parse_post(self, post, seen_urls):
        """Turn one ``div.post_multi`` element into an article dict.

        Returns ``None`` when the post must be skipped: it has no usable
        title link, it is a promotional/internal link, or its URL was
        already collected.
        """
        title_div = post.find('div', attrs={'class': 'title'})
        atag = title_div.find('a') if title_div is not None else None
        url = atag.get('href') if atag is not None else None
        if not url:
            # Malformed entry; skipping it keeps one bad post from aborting
            # the whole download.
            return None
        # skip promotionals and duplicate
        if url.startswith(('http://letsgetcritical', '/')) or url in seen_urls:
            return None
        seen_urls.add(url)

        title = self.tag_to_string(atag)
        self.log('\tFound article:', title)
        self.log('\t', url)

        quote = post.find('blockquote')
        desc = self.tag_to_string(quote) if quote else ''
        host = self._SOURCE_HOST.match(url)
        if host:
            desc = "[%s] %s" % (host.group(2), desc)

        return {
            'title': title,
            'url': url,
            'description': desc,
            'date': self._find_date(post),
        }

    def _find_date(self, post):
        """Return the text of the nearest preceding ``singledate`` sibling.

        Walks backwards through the siblings of ``post`` and returns the
        text of the first one whose class is exactly ``singledate``, or an
        empty string when there is none.
        """
        sibling = post.previousSibling
        while sibling is not None:
            # Text nodes between elements have no attribute accessor; only
            # real tags can carry the class we are looking for.
            get_attr = getattr(sibling, 'get', None)
            if get_attr is not None and ''.join(get_attr('class') or '') == 'singledate':
                return self.tag_to_string(sibling)
            sibling = sibling.previousSibling
        return ''