# calibre/recipes/vanityfair.recipe

from datetime import date
import re

from calibre.web.feeds.news import BasicNewsRecipe


class VanityFair(BasicNewsRecipe):
    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365
    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood', 'hollywood', 10),
        ('Culture', 'culture', 10),
        ('Business', 'business', 10),
        ('Politics', 'politics', 10),
        ('Society', 'society', 10),
        ('Style', 'style', 10),
        ('VF Daily', 'online/daily', 10),
        ("James Wolcott's Blog", 'online/wolcott', 10),
        ("The Oscars", 'online/oscars', 10),
    ]
    # set this to False if you don't want to put the first article
    # that appears in each section into a "Featured" section
    FEATURED_CAT = True

    remove_tags = [
        {'name': ['nav']},
        {'class': re.compile(
            r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class': 'cn_blogpost'}, {'id': 'wrapper'}]

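    # Build the table of contents by scraping each category index page listed
    # in CATEGORIES, collecting each article's link, title and summary.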
    def parse_index(self):
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (  # noqa
            date.today().strftime('%Y%m'))
        feeds = []
        seen_urls = set()
        features = []
        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            self.log('Reading category:', cat_name)
            articles = []
            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            headers = soup.findAll(attrs={'class': 'headline '})
            add_featured = self.FEATURED_CAT
            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if not atags or len(atags) > 1:
                    continue
                atag = atags[0]
                url = atag['href']
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)
                par = header.findParent('article') if tag.startswith(
                    'online/') else header.findParent('section')
                if par is not None:
                    desc = par.find(attrs={'class': 'body '})
                    desc = self.tag_to_string(desc) if desc else ''
                    if add_featured:
                        features.append(
                            {'title': title, 'url': url, 'description': desc})
                        add_featured = False
                    else:
                        articles.append(
                            {'title': title, 'url': url, 'description': desc})
                    if len(articles) >= max_articles:
                        break
            if articles:
                feeds.append((cat_name, articles))
        if features:
            feeds.insert(0, ('Featured', features))
        return feeds
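
    # The recipe assumes the printer-friendly version of an article lives at
    # the same URL with '.html' replaced by '.print', which yields pages with
    # less site chrome to clean up.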
    def print_version(self, url):
        return url.replace('.html', '.print')
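
# A quick way to try this recipe locally (assuming the calibre command-line
# tools are installed) is to run it through ebook-convert, e.g.:
#
#   ebook-convert vanityfair.recipe vanityfair.epub --test
#
# The --test flag limits the download to a couple of articles per feed, which
# keeps the run short while checking that parse_index() and print_version()
# still match the site's markup.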