# calibre/recipes/vanityfair.recipe

from datetime import date
import re

from calibre.web.feeds.news import BasicNewsRecipe


class VanityFair(BasicNewsRecipe):
    title = u"Vanity Fair"
    description = 'Vanity Fair Magazine (U.S.)'
    language = 'en'
    __author__ = 'Barty'
    max_articles_per_feed = 100
    no_stylesheets = False
    auto_cleanup = False
    timefmt = ' [%B %Y]'
    oldest_article = 365
    masthead_url = 'http://www.vanityfair.com/etc/designs/vanityfair/images/shell/print-logo.png'

    INDEX = 'http://www.vanityfair.com'
    CATEGORIES = [
        # comment out categories you don't want
        # (user friendly name, url suffix, max number of articles to load)
        ('Hollywood', 'hollywood', 10),
        ('Culture', 'culture', 10),
        ('Business', 'business', 10),
        ('Politics', 'politics', 10),
        ('Society', 'society', 10),
        ('Style', 'style', 10),
        ('VF Daily', 'online/daily', 10),
        ("James Wolcott's Blog", 'online/wolcott', 10),
        ("The Oscars", 'online/oscars', 10),
    ]
    # set this to False if you don't want to put the first article
    # that appears in each section into a "Featured" section
    FEATURED_CAT = True

    remove_tags = [
        {'name': ['nav']},
        {'class': re.compile(
            r'_(header|rubric|share|subnav|leaderboard)|comments-count|ecom_placement')}
    ]
    remove_tags_after = [{'class': 'cn_blogpost'}, {'id': 'wrapper'}]

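    # Build the table of contents by scraping each category index page listed
    # in CATEGORIES, collecting each article's link, title and summary.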
    def parse_index(self):
        self.cover_url = 'http://www.vanityfair.com/magazine/toc/contents-%s/_jcr_content/par/cn_contentwell/par-main/cn_pagination_contai/cn_image.size.cover_vanityfair_300.jpg' % (  # noqa
            date.today().strftime('%Y%m'))
        feeds = []
        seen_urls = set()
        features = []
        for category in self.CATEGORIES:
            (cat_name, tag, max_articles) = category
            self.log('Reading category:', cat_name)
            articles = []
            page = "%s/%s" % (self.INDEX, tag)
            soup = self.index_to_soup(page)
            headers = soup.findAll(attrs={'class': 'headline '})
            add_featured = self.FEATURED_CAT
            for header in headers:
                self.log(self.tag_to_string(header))
                atags = header.findAll('a')
                # if there's more than one a tag, it's some kind of list, skip
                if not atags or len(atags) > 1:
                    continue
                atag = atags[0]
                url = atag['href']
                if url.startswith('/'):
                    url = self.INDEX + url
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                title = self.tag_to_string(atag)
                self.log('\tFound article:', title)
                self.log('\t', url)
                par = header.findParent('article') if tag.startswith(
                    'online/') else header.findParent('section')
                if par is not None:
                    desc = par.find(attrs={'class': 'body '})
                    desc = self.tag_to_string(desc) if desc else ''
                    if add_featured:
                        features.append(
                            {'title': title, 'url': url, 'description': desc})
                        add_featured = False
                    else:
                        articles.append(
                            {'title': title, 'url': url, 'description': desc})
                    if len(articles) >= max_articles:
                        break
            if articles:
                feeds.append((cat_name, articles))
        if features:
            feeds.insert(0, ('Featured', features))
        return feeds
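
    # The recipe assumes the printer-friendly version of an article lives at
    # the same URL with '.html' replaced by '.print', which yields pages with
    # less site chrome to clean up.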
    def print_version(self, url):
        return url.replace('.html', '.print')
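
# A quick way to try this recipe locally (assuming the calibre command-line
# tools are installed) is to run it through ebook-convert, e.g.:
#
#   ebook-convert vanityfair.recipe vanityfair.epub --test
#
# The --test flag limits the download to a couple of articles per feed, which
# keeps the run short while checking that parse_index() and print_version()
# still match the site's markup.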