From 23caca5f47fc09438d358879c7758572eb258edb Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 30 Nov 2011 08:20:32 +0530
Subject: [PATCH] Vanity Fair by Barty

---
 recipes/vanityfair.recipe | 98 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 recipes/vanityfair.recipe

diff --git a/recipes/vanityfair.recipe b/recipes/vanityfair.recipe
new file mode 100644
index 0000000000..61ec76e003
--- /dev/null
+++ b/recipes/vanityfair.recipe
@@ -0,0 +1,98 @@
+from datetime import date
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class VanityFair(BasicNewsRecipe):
+    title = u"Vanity Fair"
+    description = 'Vanity Fair Magazine (U.S.)'
+    language = 'en'
+    __author__ = 'Barty'
+    max_articles_per_feed = 100
+    no_stylesheets = False
+    auto_cleanup = False
+    timefmt = ' [%B %Y]'
+    oldest_article = 365
+
+    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'
+
+    INDEX = 'http://www.vanityfair.com'
+    CATEGORIES = [
+        # comment out categories you don't want
+        # (user friendly name, url suffix, max number of articles to load)
+        ('Hollywood','hollywood',10),
+        ('Culture','culture',10),
+        ('Business','business',10),
+        ('Politics','politics',10),
+        ('Society','society',10),
+        ('Style','style',10),
+        ('VF Daily','online/daily',10),
+        ("James Wolcott's Blog",'online/wolcott',10),
+        ("The Oscars",'online/oscars',10),
+        ]
+    # set this to False if you don't want to put the first article
+    # that appears in each section to a "Featured" section
+    FEATURED_CAT = True
+
+
+    remove_tags = [
+        {'name':['nav']},
+        {'class':re.compile(r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
+        ]
+    remove_tags_after = [{'class':'cn_blogpost'},{'id':'wrapper'}]
+
+    def parse_index(self):
+        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (date.today().strftime('%Y%m'))
+        feeds = []
+        seen_urls = set([])
+        features = []
+
+        for category in self.CATEGORIES:
+
+            (cat_name, tag, max_articles) = category
+            self.log('Reading category:', cat_name)
+            articles = []
+
+            page = "%s/%s" % (self.INDEX, tag)
+            soup = self.index_to_soup(page)
+            headers = soup.findAll(attrs={'class':'headline '})
+            add_featured = self.FEATURED_CAT
+
+            for header in headers:
+                self.log(self.tag_to_string(header))
+                atags = header.findAll('a')
+                # if there's more than one a tag, it's some kind of list, skip
+                if not atags or len(atags)>1:
+                    continue
+                atag = atags[0]
+                url = atag['href']
+                if url.startswith('/'):
+                    url = self.INDEX + url
+                if url in seen_urls:
+                    continue
+                seen_urls.add(url)
+                title = self.tag_to_string(atag)
+                self.log('\tFound article:', title)
+                self.log('\t', url)
+                par = header.findParent('article') if tag.startswith('online/') else header.findParent('section')
+                if par is not None:
+                    desc = par.find(attrs={'class':'body '})
+                    desc = self.tag_to_string(desc) if desc else ''
+                    #self.log('\t', desc)
+                if add_featured:
+                    features.append({'title':title,'url':url,'description':desc})
+                    add_featured = False
+                else:
+                    articles.append({'title':title,'url':url,'description':desc})
+                if len(articles) >= max_articles:
+                    break
+
+            if articles:
+                feeds.append((cat_name, articles))
+
+        if features:
+            feeds.insert(0,('Featured', features))
+
+        return feeds
+
+    def print_version(self, url):
+        return url.replace('.html', '.print')
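
A quick way to exercise a recipe like this after applying the patch is calibre's recipe test mode, which downloads only a couple of articles per feed and keeps the intermediate HTML for inspection. The command below is an illustrative sketch assuming a standard calibre install and the working directory of the checkout, not part of the patch itself:

    ebook-convert recipes/vanityfair.recipe .epub --test -vv --debug-pipeline debug

The -vv and --debug-pipeline output make it easy to check that remove_tags and parse_index are stripping and collecting the expected elements before building the full magazine.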