# calibre/recipes/vice.recipe

##
# Title:        Vice News recipe for calibre
# Author:       Adrian Tennessee
# Contact:      adrian.tennessee at domainthatnobodytakes.com
##
# License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright:    Copyright 2014 Adrian Tennessee
##
# Written:      2014-09-13
# Last Edited:  2014-09-13
##

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build BeautifulSoup-style keyword arguments matching any of the given CSS classes.

    ``classes`` is a single string of class names separated by spaces.  The
    result is a dict of the form ``{'attrs': {'class': predicate}}``.  The
    predicate receives an element's ``class`` attribute value and returns the
    (possibly empty) frozenset of names it shares with the wanted set; a
    falsy attribute value (``None`` or ``''``) is handed back unchanged.
    """
    wanted = frozenset(classes.split(' '))

    def shares_a_class(value):
        # Elements without a class attribute can never match; return the
        # falsy value itself, exactly as an ``x and ...`` expression would.
        if not value:
            return value
        present = frozenset(value.split())
        return present.intersection(wanted)

    return {'attrs': {'class': shares_a_class}}


class VICENews(BasicNewsRecipe):
    """Calibre recipe that turns the VICE News RSS feed into an ebook.

    The class only configures ``BasicNewsRecipe``: it names the feed, lists
    the page elements to keep and to drop (selected by CSS class), and
    rewrites responsive images so that each ``<img>`` carries a plain ``src``.
    """

    __author__ = 'Adrian Tennessee (adrian.tennessee at domainthatnobodytakes.com)'
    __license__ = 'GPLv3'
    __copyright__ = '2014, Adrian Tennessee <adrian.tennessee at domainthatnobodytakes.com)'

    # Metadata describing the generated ebook.
    title = u'VICE News'
    language = 'en'
    description = u'VICE News web site ebook'
    publisher = 'VICE Media'
    category = 'news, world'
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/d/dc/Vice_News_logo.jpg'

    # Fetch/cleanup options consumed by BasicNewsRecipe.
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'

    # article-title modifies h1-tag of article title
    extra_css = '.article-title { font-size:125%; font-weight:bold }'

    # Only elements carrying one of these classes are kept from each page.
    keep_only_tags = [
        classes('article__header__title contributors article__header__datebar__date--original short-form__body__article-body')
    ]

    # Elements carrying one of these classes are stripped from the kept content.
    remove_tags = [
        classes('lazy-vice-ad abc__article_embed article__tagged user-newsletter-signup article__embed-component'),
    ]

    def preprocess_html(self, soup):
        """Give responsive images a plain ``src`` and drop their ``<source>`` siblings.

        For every element with the ``responsive-image__img`` class, each
        preceding ``<source>`` sibling is visited: its ``srcset`` value, cut
        at the first ``?``, is written to the image's ``src``, and the
        ``<source>`` is removed from the tree.  When several siblings exist,
        the last one visited determines the final ``src``.

        A ``<source>`` with a missing or empty ``srcset`` no longer raises
        ``KeyError``; it is simply removed without touching the image.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        for img in soup.findAll(**classes('responsive-image__img')):
            for source in img.findPreviousSiblings('source'):
                # .get() instead of indexing: not every <source> is
                # guaranteed to carry a srcset attribute.
                srcset = source.get('srcset')
                if srcset:
                    # Keep only the part before the query string.
                    img['src'] = srcset.split('?')[0]
                source.extract()
        return soup

    feeds = [(u'VICE News', u'https://news.vice.com/rss')]