# calibre/recipes/letsgetcritical.recipe

import re
from calibre.web.feeds.news import BasicNewsRecipe


class LetsGetCritical(BasicNewsRecipe):
    """Recipe that builds one feed per category of letsgetcritical.org.

    The category listing pages are scraped in ``parse_index``; every post on
    a listing page contributes the link found in its title block. Links that
    point back at the site itself are treated as promotional and skipped.
    """

    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True
    INDEX = 'http://www.letsgetcritical.org'
    # Pagination stops before this page number, so a category whose listing
    # never runs dry cannot keep the recipe looping forever.
    PAGE_LIMIT = 100
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    def _page_url(self, tagurl, pageno):
        """Return the listing URL for page ``pageno`` of a category.

        The first page has no ``/page/N`` suffix; later pages do.
        """
        base = self.INDEX + tagurl
        if pageno > 1:
            return "%s/page/%d" % (base, pageno)
        return base

    def _post_date(self, post):
        """Return the text of the nearest preceding 'singledate' sibling.

        Walks backwards through the siblings of ``post``. Returns '' when no
        such sibling exists.
        """
        sibling = post.previousSibling
        while sibling is not None:
            # Text nodes (e.g. whitespace between tags) have no attribute
            # lookup; only real tags can carry a class.
            if hasattr(sibling, 'get'):
                if ''.join(sibling.get('class') or '') == 'singledate':
                    return self.tag_to_string(sibling)
            sibling = sibling.previousSibling
        return ''

    def _parse_post(self, post, seen_urls, domain_re):
        """Turn one 'post_multi' element into an article dict.

        Returns None when the post has no usable link, links back to the
        site itself (promotional), or repeats a URL already in ``seen_urls``.
        A URL that is accepted is added to ``seen_urls``.
        """
        heading = post.find('div', attrs={'class': 'title'})
        link = heading.find('a') if heading is not None else None
        url = link.get('href') if link is not None else None
        if not url:
            # Malformed post: skip it instead of aborting the whole index.
            return None
        # skip promotionals and duplicate
        if url.startswith(('http://letsgetcritical', '/')) or url in seen_urls:
            return None
        seen_urls.add(url)

        title = self.tag_to_string(link)
        self.log('\tFound article:', title)
        self.log('\t', url)

        quote = post.find('blockquote')
        desc = self.tag_to_string(quote) if quote else ''
        # Prefix the description with the article's host name.
        host = domain_re.match(url)
        if host:
            desc = "[%s] %s" % (host.group(2), desc)

        return {'title': title, 'url': url, 'description': desc,
                'date': self._post_date(post)}

    def parse_index(self):
        """Scrape the category listings and return the feeds.

        Returns a list of ``(category name, articles)`` tuples, where each
        article is a dict with 'title', 'url', 'description' and 'date'.
        Categories that yield no articles are left out. A URL is only used
        once across all categories. Also sets ``self.cover_url``.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set()
        # Accept both http and https so every article gets its host prefix.
        domain_re = re.compile(r'https?://(www\.)?([^/:]+)', re.I)

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/category/' + tag.lower() if tag else ''
            self.log('Reading category:', cat_name)

            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < self.PAGE_LIMIT:
                page = self._page_url(tagurl, pageno)
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception as err:
                    # Best effort: a page that cannot be fetched ends this
                    # category, but the reason is no longer hidden.
                    self.log('\tCould not read page, moving on:', err)
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if not posts:
                    break

                for post in posts:
                    article = self._parse_post(post, seen_urls, domain_re)
                    if article is None:
                        continue
                    articles.append(article)
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds