mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
96 lines
3.4 KiB
Plaintext
96 lines
3.4 KiB
Plaintext
import re
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class LetsGetCritical(BasicNewsRecipe):
    """Recipe that builds one feed per entry in ``CATEGORIES``.

    Each category is read from the listing pages under ``INDEX`` (paginated
    as ``.../page/<n>``); every ``div.post_multi`` on a page contributes one
    article whose URL is taken from the link in the post's title div.
    """

    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10),
    ]

    # Captures the host name (without a leading "www.") of an article URL in
    # group 2; it is used to prefix the article description.
    _HOST_RE = re.compile(r'https?://(www\.)?([^/:]+)', re.I)
    # Links starting with one of these are promotional / site-internal.
    # NOTE(review): "www."-prefixed links to the site itself are not skipped,
    # exactly as before -- confirm that this is intended.
    _SKIP_PREFIXES = ('http://letsgetcritical', 'https://letsgetcritical', '/')
    # Listing page numbers stay strictly below this value.
    _PAGE_LIMIT = 100

    def parse_index(self):
        """Return the feeds as a list of ``(category name, articles)`` tuples.

        ``articles`` is a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date``. Categories that yield no article are
        left out. A URL is reported only once, in the first category in
        which it is found. Also sets ``self.cover_url``.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set()  # shared by all categories to drop duplicates

        for cat_name, tag, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = self._read_category(tag, max_articles, seen_urls)
            if articles:
                feeds.append((cat_name, articles))

        return feeds

    def _read_category(self, tag, max_articles, seen_urls):
        """Collect up to ``max_articles`` articles from one category's pages."""
        # An empty tag means the site's front page listing.
        tagurl = '/category/' + tag.lower() if tag else ''
        articles = []
        pageno = 1

        while len(articles) < max_articles and pageno < self._PAGE_LIMIT:
            if pageno > 1:
                page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
            else:
                page = self.INDEX + tagurl
            pageno += 1

            self.log('\tReading page:', page)
            try:
                soup = self.index_to_soup(page)
            except Exception as err:
                # A page that cannot be fetched ends the category (best
                # effort), but KeyboardInterrupt/SystemExit still propagate.
                self.log('\tStopping category, page could not be read:', repr(err))
                break

            posts = soup.findAll('div', attrs={'class': 'post_multi'})
            if not posts:
                break

            for post in posts:
                article = self._parse_post(post, seen_urls)
                if article is None:
                    continue
                articles.append(article)
                if len(articles) >= max_articles:
                    break

        return articles

    def _parse_post(self, post, seen_urls):
        """Turn one post element into an article dict, or None to skip it."""
        title_div = post.find('div', attrs={'class': 'title'})
        atag = title_div.find('a') if title_div is not None else None
        url = atag.get('href') if atag is not None else None
        if not url:
            # Post without a usable title link: nothing to download.
            return None
        # skip promotionals and duplicate
        if url.startswith(self._SKIP_PREFIXES) or url in seen_urls:
            return None
        seen_urls.add(url)

        title = self.tag_to_string(atag)
        self.log('\tFound article:', title)
        self.log('\t', url)

        quote = post.find('blockquote')
        desc = self.tag_to_string(quote) if quote is not None else ''
        host = self._HOST_RE.match(url)
        if host:
            desc = "[%s] %s" % (host.group(2), desc)

        return {
            'title': title,
            'url': url,
            'description': desc,
            'date': self._find_post_date(post),
        }

    def _find_post_date(self, post):
        """Return the text of the nearest preceding ``singledate`` sibling.

        Returns an empty string when no such sibling exists.
        """
        node = post.previousSibling
        # navigate up sibling to find date
        while node is not None:
            # Siblings can be text nodes (e.g. whitespace between tags),
            # which carry no attributes and have no ``get``.
            classes = node.get('class') if hasattr(node, 'get') else None
            if classes:
                # Depending on the parser the value is a list of class names
                # or a single space-separated string.
                if not isinstance(classes, (list, tuple)):
                    classes = classes.split()
                if 'singledate' in classes:
                    return self.tag_to_string(node)
            node = node.previousSibling
        return ''
|