from datetime import date
import re

from calibre.web.feeds.news import BasicNewsRecipe


class VanityFair(BasicNewsRecipe):
    """Recipe that builds a Vanity Fair (U.S.) issue from the site's section pages.

    Each entry of ``CATEGORIES`` is fetched from ``INDEX`` and scanned for
    headline links.  When ``FEATURED_CAT`` is true, the first usable article
    of every category is collected into a separate "Featured" section that is
    placed ahead of all other sections.
    """

    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365

    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    # Base URL; category suffixes and site-relative article links are
    # appended to it.
    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood', 'hollywood', 10),
        ('Culture', 'culture', 10),
        ('Business', 'business', 10),
        ('Politics', 'politics', 10),
        ('Society', 'society', 10),
        ('Style', 'style', 10),
        ('VF Daily', 'online/daily', 10),
        ("James Wolcott's Blog", 'online/wolcott', 10),
        ("The Oscars", 'online/oscars', 10),
    ]
    # set this to False if you don't want to put the first article
    # that appears in each section to a "Featured" section
    FEATURED_CAT = True

    remove_tags = [
        {'name': ['nav']},
        {'class': re.compile(
            r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class': 'cn_blogpost'}, {'id': 'wrapper'}]

    def parse_index(self):
        """Build the list of feeds by scraping every configured category page.

        Also sets ``self.cover_url`` to the cover image of the table of
        contents for the current year and month.

        Returns:
            A list of ``(section name, articles)`` tuples, where each article
            is a dict with ``title``, ``url`` and ``description`` keys.  A
            'Featured' section, when non-empty, is always the first element.
            Categories that end up with no articles are omitted.
        """
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (  # noqa
            date.today().strftime('%Y%m'))
        feeds = []
        # Shared across categories so an article is listed only once.
        seen_urls = set()
        features = []

        for cat_name, tag, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = []

            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            headers = soup.findAll(attrs={'class': 'headline '})
            # Reset per category: only the first usable article is featured.
            add_featured = self.FEATURED_CAT
            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if not atags or len(atags) > 1:
                    continue
                atag = atags[0]
                # An anchor without a usable href cannot be downloaded; skip
                # it instead of letting a KeyError abort the whole index.
                url = atag.get('href')
                if not url:
                    continue
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)

                # Pages under 'online/' are searched for an enclosing
                # <article>; all other pages for an enclosing <section>.
                if tag.startswith('online/'):
                    par = header.findParent('article')
                else:
                    par = header.findParent('section')
                if par is None:
                    # Headline outside any such container: not listed.
                    continue

                desc = par.find(attrs={'class': 'body '})
                desc = self.tag_to_string(desc) if desc else ''
                entry = {'title': title, 'url': url, 'description': desc}
                if add_featured:
                    features.append(entry)
                    add_featured = False
                else:
                    articles.append(entry)
                    if len(articles) >= max_articles:
                        break

            if articles:
                feeds.append((cat_name, articles))

        if features:
            feeds.insert(0, ('Featured', features))

        return feeds

    def print_version(self, url):
        """Return *url* with every '.html' replaced by '.print'."""
        return url.replace('.html', '.print')