Vanity Fair by Barty

2025-07-09 03:04:10 -04:00 · 2011-11-30 08:20:32 +05:30 · 2011-11-30 08:20:32 +05:30 · 23caca5f47
commit 23caca5f47
parent 75be93397b
1 changed file with 98 additions and 0 deletions
--- a/recipes/vanityfair.recipe
+++ b/recipes/vanityfair.recipe
@ -0,0 +1,98 @@
 from datetime import date
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class VanityFair(BasicNewsRecipe):
 	"""Recipe that builds its feeds by scraping vanityfair.com category pages.
 
 	Each entry of CATEGORIES names one index page below INDEX. parse_index()
 	reads the headlines found on those pages and returns them grouped per
 	category, optionally preceded by a "Featured" section.
 	"""
 	title          = u"Vanity Fair"
 	description    = 'Vanity Fair Magazine (U.S.)'
 	language       = 'en'
 	__author__     = 'Barty'
 	max_articles_per_feed = 100
 	no_stylesheets = False
 	auto_cleanup   = False
 	timefmt        = ' [%B %Y]'
 	oldest_article = 365
 	masthead_url   = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'
 	# root of the site; category suffixes and relative article links are
 	# appended to it
 	INDEX          = 'http://www.vanityfair.com'
 	CATEGORIES     = [
 		# comment out categories you don't want
 		# (user friendly name, url suffix, max number of articles to load)
 		('Hollywood','hollywood',10),
 		('Culture','culture',10),
 		('Business','business',10),
 		('Politics','politics',10),
 		('Society','society',10),
 		('Style','style',10),
 		('VF Daily','online/daily',10),
 		("James Wolcott's Blog",'online/wolcott',10),
 		("The Oscars",'online/oscars',10),
 		]
 	# set this to False if you don't want to put the first article
 	# that appears in each section to a "Featured" section
 	FEATURED_CAT   = True
 	remove_tags    = [
 		{'name':['nav']},
 		{'class':re.compile(r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
 		]
 	remove_tags_after = [{'class':'cn_blogpost'},{'id':'wrapper'}]
 	def parse_index(self):
 		"""Scrape every category page and return the list of feeds.
 
 		Returns a list of ``(section name, articles)`` tuples where each
 		article is a dict with the keys 'title', 'url' and 'description'.
 		A URL is listed only once, under the first category that shows it.
 		When FEATURED_CAT is true, the first usable headline of every
 		category goes to a leading 'Featured' section instead of its own
 		category and does not count against that category's article limit.
 
 		As a side effect, cover_url is set to the cover image path derived
 		from the current year and month.
 		"""
 		self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (date.today().strftime('%Y%m'))
 		feeds = []
 		seen_urls = set()
 		features = []
 		for category in self.CATEGORIES:
 			(cat_name, tag, max_articles) = category
 			self.log('Reading category:', cat_name)
 			articles = []
 			page = "%s/%s" % (self.INDEX, tag)
 			soup = self.index_to_soup(page)
 			# the trailing space in the class name is intentional: it is
 			# what this lookup has always matched against
 			headers = soup.findAll(attrs={'class':'headline '})
 			add_featured = self.FEATURED_CAT
 			for header in headers:
 				self.log(self.tag_to_string(header))
 				atags = header.findAll('a')
 				# if there's more than one a tag, it's some kind of list, skip
 				if not atags or len(atags)>1:
 					continue
 				atag = atags[0]
 				# an anchor without href cannot be downloaded; skip it
 				# instead of aborting the whole run with a KeyError
 				url = atag.get('href')
 				if not url:
 					continue
 				if url.startswith('/'):
 					url = self.INDEX + url
 				if url in seen_urls:
 					continue
 				seen_urls.add(url)
 				title = self.tag_to_string(atag)
 				self.log('\tFound article:', title)
 				self.log('\t', url)
 				# 'online/...' categories wrap entries in <article>,
 				# the other categories in <section>
 				par = header.findParent('article') if tag.startswith('online/') else header.findParent('section')
 				# reset for every headline so that a missing container
 				# neither raises UnboundLocalError nor reuses the
 				# description of the previous article
 				desc_tag = None
 				if par is not None:
 					desc_tag = par.find(attrs={'class':'body '})
 				desc = self.tag_to_string(desc_tag) if desc_tag else ''
 				#self.log('\t', desc)
 				if add_featured:
 					features.append({'title':title,'url':url,'description':desc})
 					add_featured = False
 				else:
 					articles.append({'title':title,'url':url,'description':desc})
 					if len(articles) >= max_articles:
 						break
 			if articles:
 				feeds.append((cat_name, articles))
 		if features:
 			feeds.insert(0,('Featured', features))
 		return feeds
 	def print_version(self, url):
 		"""Return url with every '.html' replaced by '.print'."""
 		return url.replace('.html', '.print')