import re

from calibre.web.feeds.news import BasicNewsRecipe


class LetsGetCritical(BasicNewsRecipe):
    """Recipe that builds one feed per category of letsgetcritical.org.

    Each category index is read page by page; every post that links to an
    external article becomes one feed entry.
    """

    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticisms of the arts and culture '
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        """Scrape the category index pages and return the feed list.

        Returns a list of ``(category name, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``description``
        and ``date``. Categories that yield no articles are omitted. A URL
        is only ever reported once, in the first category where it is seen.
        """
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set()
        # Captures the host name (without a leading "www.") of an article URL.
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '' if tag == '' else '/category/' + tag.lower()
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            # pageno < 100 is a hard stop in case pagination never runs dry.
            while len(articles) < max_articles and pageno < 100:
                # The first page of a category has no "/page/N" suffix.
                if pageno > 1:
                    page = "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
                else:
                    page = self.INDEX + tagurl
                pageno += 1

                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception as err:
                    # Best effort: a page that cannot be fetched ends this
                    # category. Catching Exception (not a bare except) lets
                    # KeyboardInterrupt / SystemExit propagate.
                    self.log('\tFailed to read page, stopping category:', page, '-', err)
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if not posts:
                    break

                for post in posts:
                    # A post without a title link cannot become an article;
                    # skip it instead of aborting the whole recipe.
                    title_div = post.find('div', attrs={'class': 'title'})
                    atag = title_div.find('a') if title_div is not None else None
                    url = atag.get('href') if atag is not None else None
                    if not url:
                        continue

                    # skip promotionals and duplicate
                    if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)

                    quote = post.find('blockquote')
                    desc = self.tag_to_string(quote) if quote else ''
                    m = regex.match(url)
                    if m:
                        # Prefix the description with the source host name.
                        desc = "[%s] %s" % (m.group(2), desc)

                    articles.append({
                        'title': title,
                        'url': url,
                        'description': desc,
                        'date': self._find_post_date(post),
                    })
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds

    def _find_post_date(self, post):
        """Return the text of the nearest preceding 'singledate' sibling.

        Walks backwards through the siblings of *post* and returns '' when
        no sibling with class ``singledate`` is found.
        """
        p = post.previousSibling
        while p is not None:
            # Siblings without a ``get`` method (presumably text nodes
            # between tags) cannot carry a class attribute; step over them.
            get_attr = getattr(p, 'get', None)
            if get_attr is not None and ''.join(get_attr('class') or '') == 'singledate':
                return self.tag_to_string(p)
            p = p.previousSibling
        return ''