from datetime import date
import re

from calibre.web.feeds.news import BasicNewsRecipe


class VanityFair(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping Vanity Fair section pages.

    Each entry of ``CATEGORIES`` names one section page under ``INDEX``.
    ``parse_index`` downloads every such page, collects the headline links
    found on it and groups them into one feed per category, plus an optional
    "Featured" feed made of the first article of each category.
    """

    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365

    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    # Site root; section pages and site-relative article links hang off it.
    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood', 'hollywood', 10),
        ('Culture', 'culture', 10),
        ('Business', 'business', 10),
        ('Politics', 'politics', 10),
        ('Society', 'society', 10),
        ('Style', 'style', 10),
        ('VF Daily', 'online/daily', 10),
        ("James Wolcott's Blog", 'online/wolcott', 10),
        ("The Oscars", 'online/oscars', 10),
    ]
    # set this to False if you don't want to put the first article
    # that appears in each section to a "Featured" section
    FEATURED_CAT = True

    remove_tags = [
        {'name': ['nav']},
        {'class': re.compile(r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class': 'cn_blogpost'}, {'id': 'wrapper'}]

    def parse_index(self):
        """Scrape every configured category page and return the feed list.

        Side effect: sets ``self.cover_url`` to a cover image URL derived from
        the current year and month.

        Returns:
            A list of ``(feed title, articles)`` tuples, where ``articles`` is
            a list of dicts with the keys ``'title'``, ``'url'`` and
            ``'description'``. Categories that yield no articles are omitted.
            When ``FEATURED_CAT`` is true and at least one article was found,
            a ``'Featured'`` feed holding the first article of each category
            is placed first.
        """
        # The cover path embeds the issue as YYYYMM, taken from today's date.
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (date.today().strftime('%Y%m'))
        feeds = []
        # Shared across categories so an article linked from several section
        # pages is only listed once.
        seen_urls = set()
        features = []

        for cat_name, tag, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = []
            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            # NOTE: the trailing space in 'headline ' is deliberate; it is the
            # exact class attribute value being matched.
            headers = soup.findAll(attrs={'class': 'headline '})
            # Only the first accepted article of a category is "featured".
            add_featured = self.FEATURED_CAT

            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if len(atags) != 1:
                    continue
                atag = atags[0]

                # An anchor without an href cannot be fetched; skip it rather
                # than failing the whole download with a KeyError.
                url = atag.get('href')
                if not url:
                    continue
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)

                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)

                # Pages under 'online/' wrap each entry in <article>; the
                # other section pages use <section>.
                if tag.startswith('online/'):
                    par = header.findParent('article')
                else:
                    par = header.findParent('section')

                # Reset for every headline: otherwise a headline without an
                # enclosing container would either raise (first iteration) or
                # inherit the previous article's description.
                desc = ''
                if par is not None:
                    body = par.find(attrs={'class': 'body '})
                    if body:
                        desc = self.tag_to_string(body)
                    # self.log('\t', desc)

                entry = {'title': title, 'url': url, 'description': desc}
                if add_featured:
                    features.append(entry)
                    add_featured = False
                else:
                    articles.append(entry)
                    # The featured article does not count towards the
                    # per-category limit.
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        if features:
            feeds.insert(0, ('Featured', features))

        return feeds

    def print_version(self, url):
        """Return ``url`` with ``'.html'`` replaced by ``'.print'``."""
        return url.replace('.html', '.print')