# calibre/recipes/unz.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe, classes


class Unz(BasicNewsRecipe):
    '''News recipe for The Unz Review, driven by the site's RSS feed.'''

    # --- Publication metadata -------------------------------------------
    __author__ = 'unkn0wn'
    title = 'The Unz Review'
    description = (
        'A Collection of Interesting, Important, and Controversial Perspectives '
        'Largely Excluded from the American Mainstream Media.'
    )
    language = 'en_US'
    encoding = 'utf-8'
    cover_url = 'https://www.unz.com/wp-content/themes/unzSite/IMAGES/unz_large_logo.png'

    # --- Feed selection ---------------------------------------------------
    feeds = ['https://www.unz.com/feed']
    # Default age limit in days; __init__ may replace it with a user value.
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    # browser_type = 'webengine'

    # --- Article clean-up -------------------------------------------------
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    extra_css = '.byline, .caption { font-size: small; }'

    # <div> containers (matched by class) that are retained from the page.
    keep_only_tags = [
        {
            'name': 'div',
            'attrs': {'class': ['head', 'byline', 'page-thumb', 'section-holder']},
        },
    ]

    # Embedded media elements and comment/reply links are dropped.
    remove_tags = [
        {'name': ['audio', 'iframe', 'svg']},
        classes('commentlink replylink'),
    ]

    remove_tags_after = [{'name': 'div', 'attrs': {'class': 'section-holder'}}]

    # --- User-tunable options ---------------------------------------------
    # The advertised default mirrors the class-level oldest_article above.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
    }

    def __init__(self, *args, **kwargs):
        '''Initialise the base recipe, then apply a 'days' override if present.

        All arguments are forwarded unchanged to BasicNewsRecipe.__init__.
        Afterwards, a non-empty string stored under 'days' in
        recipe_specific_options is converted with float() and assigned to
        oldest_article. Any other value (such as the option-description dict
        declared on the class) leaves oldest_article untouched.

        NOTE(review): presumably the base initialiser is what places the
        user-supplied string under 'days' -- confirm against calibre.

        Raises:
            ValueError: if the string cannot be parsed by float().
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        days_value = self.recipe_specific_options.get('days')
        if isinstance(days_value, str) and days_value:
            self.oldest_article = float(days_value)

    def preprocess_html(self, soup):
        '''Rename the element found via classes('head') to <h1>.

        The soup is modified in place and the same object is returned; when
        no such element is found the soup is returned unchanged.
        '''
        headline = soup.find(**classes('head'))
        if not headline:
            return soup
        headline.name = 'h1'
        return soup