Vanity Fair by Barty

Kovid Goyal 2011-11-30 08:20:32 +05:30
parent 75be93397b
commit 23caca5f47

recipes/vanityfair.recipe (new file, 98 lines)

@@ -0,0 +1,98 @@
from datetime import date
import re
from calibre.web.feeds.news import BasicNewsRecipe

class VanityFair(BasicNewsRecipe):
    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365
    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood','hollywood',10),
        ('Culture','culture',10),
        ('Business','business',10),
        ('Politics','politics',10),
        ('Society','society',10),
        ('Style','style',10),
        ('VF Daily','online/daily',10),
        ("James Wolcott's Blog",'online/wolcott',10),
        ("The Oscars",'online/oscars',10),
    ]
    # set this to False if you don't want the first article that appears
    # in each section to be pulled out into a separate "Featured" section
    FEATURED_CAT = True
    remove_tags = [
        {'name':['nav']},
        {'class':re.compile(r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class':'cn_blogpost'},{'id':'wrapper'}]
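
    # parse_index() builds the section/article listing for the e-book by
    # scraping each category page in CATEGORIES and collecting headline links;
    # it must return a list of (section name, list of article dicts) tuples.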
    def parse_index(self):
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (date.today().strftime('%Y%m'))
        feeds = []
        seen_urls = set([])
        features = []

        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            self.log('Reading category:', cat_name)
            articles = []
            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            headers = soup.findAll(attrs={'class':'headline '})
            add_featured = self.FEATURED_CAT

            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if not atags or len(atags)>1:
                    continue
                atag = atags[0]
                url = atag['href']
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)
                par = header.findParent('article') if tag.startswith('online/') else header.findParent('section')
                if par is not None:
                    desc = par.find(attrs={'class':'body '})
                    desc = self.tag_to_string(desc) if desc else ''
                    #self.log('\t', desc)
                    if add_featured:
                        features.append({'title':title,'url':url,'description':desc})
                        add_featured = False
                    else:
                        articles.append({'title':title,'url':url,'description':desc})
                        if len(articles) >= max_articles:
                            break

            if articles:
                feeds.append((cat_name, articles))

        if features:
            feeds.insert(0,('Featured', features))
        return feeds
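
    # print_version() gives calibre the URL to actually download; here the
    # print-friendly page is reached by swapping '.html' for '.print'.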
    def print_version(self, url):
        return url.replace('.html', '.print')
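
For anyone adapting this recipe: parse_index() only has to hand back a list of (section title, article list) tuples, where each article is a plain dict. A minimal sketch of the shape this recipe builds; the titles and URLs below are invented examples, not real articles:

    # Illustrative only: the data structure parse_index() returns.
    feeds = [
        ('Featured', [
            {'title': 'Made-up cover story', 'url': 'http://www.vanityfair.com/hollywood/example.html', 'description': ''},
        ]),
        ('Hollywood', [
            {'title': 'Made-up article', 'url': 'http://www.vanityfair.com/hollywood/another-example.html', 'description': ''},
        ]),
    ]

During development, the quickest way to try a recipe is usually ebook-convert's test mode, e.g. ebook-convert vanityfair.recipe out.epub --test -vv, which limits the download to a couple of articles per feed.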