Vanity Fair by Barty

Kovid Goyal 2011-11-30 08:20:32 +05:30
parent 75be93397b
commit 23caca5f47

recipes/vanityfair.recipe (new file, 98 lines)

@@ -0,0 +1,98 @@
from datetime import date
import re
from calibre.web.feeds.news import BasicNewsRecipe

class VanityFair(BasicNewsRecipe):
    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365
    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood','hollywood',10),
        ('Culture','culture',10),
        ('Business','business',10),
        ('Politics','politics',10),
        ('Society','society',10),
        ('Style','style',10),
        ('VF Daily','online/daily',10),
        ("James Wolcott's Blog",'online/wolcott',10),
        ("The Oscars",'online/oscars',10),
    ]
    # set this to False if you don't want the first article that appears
    # in each section to be pulled out into a separate "Featured" section
    FEATURED_CAT = True
    remove_tags = [
        {'name':['nav']},
        {'class':re.compile(r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class':'cn_blogpost'},{'id':'wrapper'}]
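
    # parse_index() builds the section/article listing for the e-book by
    # scraping each category page in CATEGORIES and collecting headline links;
    # it must return a list of (section name, list of article dicts) tuples.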
    def parse_index(self):
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (date.today().strftime('%Y%m'))
        feeds = []
        seen_urls = set([])
        features = []

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            self.log('Reading category:', cat_name)
            articles = []
            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            headers = soup.findAll(attrs={'class':'headline '})
            add_featured = self.FEATURED_CAT

            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if not atags or len(atags)>1:
                    continue
                atag = atags[0]
                url = atag['href']
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)
                par = header.findParent('article') if tag.startswith('online/') else header.findParent('section')
                if par is not None:
                    desc = par.find(attrs={'class':'body '})
                    desc = self.tag_to_string(desc) if desc else ''
                    #self.log('\t', desc)
                    if add_featured:
                        features.append({'title':title,'url':url,'description':desc})
                        add_featured = False
                    else:
                        articles.append({'title':title,'url':url,'description':desc})
                        if len(articles) >= max_articles:
                            break

            if articles:
                feeds.append((cat_name, articles))

        if features:
            feeds.insert(0,('Featured', features))
        return feeds
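
    # print_version() gives calibre the URL to actually download; here the
    # print-friendly page is reached by swapping '.html' for '.print'.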
    def print_version(self, url):
        return url.replace('.html', '.print')
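
For anyone adapting this recipe: parse_index() only has to hand back a list of (section title, article list) tuples, where each article is a plain dict. A minimal sketch of the shape this recipe builds; the titles and URLs below are invented examples, not real articles:

    # Illustrative only: the data structure parse_index() returns.
    feeds = [
        ('Featured', [
            {'title': 'Made-up cover story', 'url': 'http://www.vanityfair.com/hollywood/example.html', 'description': ''},
        ]),
        ('Hollywood', [
            {'title': 'Made-up article', 'url': 'http://www.vanityfair.com/hollywood/another-example.html', 'description': ''},
        ]),
    ]

During development, the quickest way to try a recipe is usually ebook-convert's test mode, e.g. ebook-convert vanityfair.recipe out.epub --test -vv, which limits the download to a couple of articles per feed.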