import re

from calibre.web.feeds.news import BasicNewsRecipe


class GiveMeSomethingToRead(BasicNewsRecipe):
    """Recipe that scrapes article links from givemesomethingtoread.com.

    The site is a link aggregator: each category page lists ``<h2>`` headings
    that link to externally hosted articles. ``parse_index`` walks the
    paginated category listings and builds one feed per category.
    """

    title = u'Give Me Something To Read'
    description = 'Curation / aggregation of articles on diverse topics'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = False
    timefmt = ' [%a, %d %b, %Y]'
    oldest_article = 365
    auto_cleanup = True

    INDEX = 'http://givemesomethingtoread.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, system name, max number of articles to load)
        ('The Arts', 'arts', 25),
        ('Science', 'science', 30),
        ('Technology', 'technology', 30),
        ('Politics', 'politics', 20),
        ('Media', 'media', 30),
        ('Crime', 'crime', 15),
        ('Other articles', '', 10),
    ]

    # Upper bound (exclusive) on the page number requested per category, so a
    # listing that never runs dry cannot keep the recipe looping forever.
    MAX_PAGES = 100

    # Captures the host name (without a leading "www.") of an article URL;
    # group 2 is used as a "[source]" prefix in the article description.
    _DOMAIN_RE = re.compile(r'https?://(www\.)?([^/:]+)', re.I)

    # Links pointing back at the aggregator itself (absolute or site-relative)
    # are promotional entries, not articles.
    _SELF_LINK_PREFIXES = (
        'http://givemesomethingtoread',
        'https://givemesomethingtoread',
        '/',
    )

    def _category_page_url(self, tagurl, pageno):
        """Return the listing URL for page *pageno* (1-based) of a category."""
        if pageno > 1:
            return "%s%s/page/%d" % (self.INDEX, tagurl, pageno)
        # The first page has no "/page/N" suffix.
        return self.INDEX + tagurl

    def _preceding_date(self, header):
        """Return the text of the nearest ``<h3>`` before *header*'s parent.

        The listing puts the date in an ``<h3>`` that precedes the article
        containers, so walk backwards through the parent's siblings until one
        is found. Returns ``''`` when there is none.
        """
        node = header.parent.previousSibling
        while node:
            # Text nodes may lack a usable ``name``; only tags can be <h3>.
            if getattr(node, 'name', None) == 'h3':
                return self.tag_to_string(node)
            node = node.previousSibling
        return ''

    def parse_index(self):
        """Build the feed list by scraping every configured category.

        Returns:
            A list of ``(category name, articles)`` tuples, where each article
            is a dict with ``title``, ``url``, ``description`` and ``date``
            keys. Categories that yield no articles are omitted.
        """
        self.cover_url = 'http://thegretchenshow.files.wordpress.com/2009/12/well-read-cat-small.jpg'
        feeds = []
        # Shared across categories so an article tagged in several categories
        # is only listed under the first one that reaches it.
        seen_urls = set()

        for cat_name, tag, max_articles in self.CATEGORIES:
            tagurl = '/tagged/' + tag if tag else ''
            self.log('Reading category:', cat_name)
            articles = []
            pageno = 1

            while len(articles) < max_articles and pageno < self.MAX_PAGES:
                page = self._category_page_url(tagurl, pageno)
                pageno += 1
                self.log('\tReading page:', page)
                try:
                    soup = self.index_to_soup(page)
                except Exception as err:
                    # Best effort: a page that cannot be fetched ends this
                    # category, but articles collected so far are kept.
                    self.log('\tFailed to read page, stopping category:', page, err)
                    break

                headers = soup.findAll('h2')
                if not headers:
                    # Ran past the last page of the listing.
                    break

                for header in headers:
                    atag = header.find('a')
                    url = atag.get('href') if atag is not None else None
                    if not url:
                        # Heading without a usable link; nothing to download.
                        continue
                    # skip promotionals and duplicates
                    if url.startswith(self._SELF_LINK_PREFIXES) or url in seen_urls:
                        continue
                    seen_urls.add(url)

                    title = self.tag_to_string(header)
                    self.log('\tFound article:', title)

                    quote = header.parent.find('blockquote')
                    desc = self.tag_to_string(quote) if quote else ''
                    m = self._DOMAIN_RE.match(url)
                    if m:
                        # Prefix the description with the source domain.
                        desc = "[%s] %s" % (m.group(2), desc)

                    articles.append({
                        'title': title,
                        'url': url,
                        'description': desc,
                        'date': self._preceding_date(header),
                    })
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        return feeds