# calibre/recipes/slate.recipe

#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

__license__ = 'GPL v3'

'''
calibre recipe for slate.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    '''
    Build a tag-matching spec keyed on CSS class names.

    ``classes`` is a single string of class names separated by spaces. The
    result is a dict of the form ``{'attrs': {'class': predicate}}`` where
    ``predicate`` receives a tag's ``class`` attribute string and returns a
    truthy value when that string shares at least one name with ``classes``.
    A falsy attribute value (``None`` or ``''``) is handed back unchanged.
    '''
    wanted = frozenset(classes.split(' '))

    def shares_a_class(value):
        # Tags without a class attribute can never match; return the falsy
        # value itself, exactly as a short-circuiting ``and`` would.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_a_class}}


class Slate(BasicNewsRecipe):
    '''
    Recipe that downloads articles from the section pages of slate.com.

    The article list is built by scraping a fixed set of section index pages
    (see ``parse_index``) rather than by reading RSS feeds.
    '''
    title = 'Slate'
    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    language = 'en'
    encoding = 'utf-8'
    remove_attributes = ['style', 'height', 'width']
    # Site root; section slugs are appended directly, so the trailing slash matters.
    INDEX = 'https://slate.com/'
    resolve_internal_links = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}

    extra_css = '''
        .article__rubric { font-size:small; color:#404040; }
        .article__byline, time { font-size:small; }
        .article__top-image, .image__credit, .image__meta { font-size:small; text-align:center; }
        em, blockquote, .alternativeHeadline { color:#202020; }
    '''

    keep_only_tags = [
        classes('article__header article__top-image article__content article-hotseats__body'),
    ]
    remove_tags = [
        dict(name=['aside', 'svg', 'iframe']),
        classes('social-share slate-ad newsletter-signup in-article-recirc'),
    ]

    def preprocess_html(self, soup):
        '''
        Tidy article markup before conversion.

        Every ``<h2>`` is demoted to ``<h4>``, and each image carrying a
        ``data-src`` attribute gets a ``src`` built from it with a
        ``&width=600`` suffix. Returns the same ``soup``, modified in place.
        '''
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        # Select on the attribute that is actually read. Filtering on
        # 'data-srcset' and then indexing 'data-src' raised KeyError for any
        # image that had the former but not the latter.
        for img in soup.findAll('img', attrs={'data-src': True}):
            # NOTE(review): '&width=600' presumes the data-src URL already
            # has a query string -- confirm against live markup.
            img['src'] = img['data-src'] + '&width=600'
        return soup

    def parse_index(self):
        '''
        Build the list of ``(section title, articles)`` pairs.

        Each section page under ``INDEX`` is scraped with
        ``slate_section_articles``; sections that yield no articles are
        left out of the result.
        '''
        ans = []
        for sectitle, slug in (
                ('News & Politics', 'news-and-politics'),
                ('Culture', 'culture'),
                ('Technology', 'technology'),
                ('Business', 'business'),
                ('Life', 'life'),
                ('Advice', 'advice'),
        ):
            section_url = self.INDEX + slug
            self.log('\nFound section:', sectitle, section_url)
            articles = self.slate_section_articles(section_url)
            if articles:
                ans.append((sectitle, articles))
        return ans

    def slate_section_articles(self, url):
        '''
        Collect this year's articles linked from the section page at ``url``.

        A link qualifies when its ``href`` starts with ``url`` followed by
        ``/<current year>/`` and it contains one of the known headline
        elements. Returns a list of ``{'title': ..., 'url': ...}`` dicts in
        page order.
        '''
        from datetime import date
        soup = self.index_to_soup(url)
        ans = []
        # Compute the prefix once, up front, so the href filter below does not
        # depend on any name that is rebound while iterating.
        prefix = url + date.today().strftime('/%Y/')
        headline_classes = [
            'section-feed-two-column__card-headline',
            'section-feed-three-column__teaser-headline',
            'section-feed-two-column__teaser-headline',
            'topic-story__hed'
        ]
        for a in soup.findAll('a', attrs={'href': lambda x: x and x.startswith(prefix)}):
            head = a.find(attrs={'class': headline_classes})
            if not head:
                # Links without a recognised headline element are skipped.
                continue
            article_url = a['href']
            title = self.tag_to_string(head).strip()
            self.log('\t' + title)
            self.log('\t\t' + article_url)
            ans.append({'title': title, 'url': article_url})
        return ans

    def populate_article_metadata(self, article, soup, first):
        '''
        Use the element with class ``article__dek``, when present, as the
        article's summary (both ``summary`` and ``text_summary``).
        '''
        summ = soup.find(attrs={'class':'article__dek'})
        if summ:
            article.summary = article.text_summary = self.tag_to_string(summ)