import re

from calibre.web.feeds.news import BasicNewsRecipe


class LetsGetCritical(BasicNewsRecipe):
    title = u"Let's Get Critical"
    description = 'Curation / aggregation of criticism of the arts and culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://www.letsgetcritical.org'
    CATEGORIES = [
        # comment out categories you don't want
        # (user-friendly name, system name, max number of articles to load)
        ('Architecture', 'architecture', 30),
        ('Art', 'art', 30),
        ('Books', 'books', 30),
        ('Design', 'design', 30),
        ('Digital', 'digital', 30),
        ('Food', 'food', 30),
        ('Movies', 'movies', 30),
        ('Music', 'music', 30),
        ('Television', 'television', 30),
        ('Other articles', '', 10)
    ]

    def parse_index(self):
        self.cover_url = 'http://www.letsgetcritical.org/wp-content/themes/lets_get_critical/images/lgc.jpg'
        feeds = []
        seen_urls = set()
        # capture the source site's domain so it can be prefixed to the description
        regex = re.compile(r'http://(www\.)?([^/:]+)', re.I)

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '' if tag == '' else '/category/' + tag.lower()
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1
            # page through the category listing; cap at 100 pages as a safety limit
            while len(articles) < max_articles and pageno < 100:
                page = '%s%s/page/%d' % (self.INDEX, tagurl, pageno) if pageno > 1 else self.INDEX + tagurl
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception:
                    break

                posts = soup.findAll('div', attrs={'class': 'post_multi'})
                if not posts:
                    break

                for post in posts:
                    dt = post.find('div', attrs={'class': 'title'})
                    atag = dt.find('a')
                    url = atag['href']
                    # skip promotional posts and duplicates
                    if url.startswith('http://letsgetcritical') or url.startswith('/') or url in seen_urls:
                        continue
                    seen_urls.add(url)
                    title = self.tag_to_string(atag)
                    self.log('\tFound article:', title)
                    self.log('\t', url)
                    desc = post.find('blockquote')
                    desc = self.tag_to_string(desc) if desc else ''
                    m = regex.match(url)
                    if m:
                        desc = '[%s] %s' % (m.group(2), desc)
                    # self.log('\t', desc)
                    date = ''
                    # walk back through previous siblings to find the date header
                    p = post.previousSibling
                    while p:
                        # only Tags have get(); NavigableStrings do not
                        if hasattr(p, 'get') and p.get('class') == 'singledate':
                            date = self.tag_to_string(p)
                            break
                        p = p.previousSibling
                    articles.append({'title': title, 'url': url, 'description': desc, 'date': date})
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds
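
# To try the recipe out (standard calibre workflow, not part of the recipe
# itself), save this file as lets_get_critical.recipe and run:
#
#   ebook-convert lets_get_critical.recipe output.epub --test -vv
#
# --test limits the download to a couple of articles per feed, and -vv makes
# calibre print the self.log() messages emitted by parse_index() above.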