calibre/recipes/science_x.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://sciencex.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class scix(BasicNewsRecipe):
    """Recipe that collects the Science X network RSS feeds.

    Articles are pulled from the techxplore.com, medicalxpress.com and
    phys.org feeds listed in ``feeds``. Only the ``<article
    class="news-article">`` element of each page is kept, and every
    ``<figure>`` is tagged so that ``extra_css`` can style it.
    """

    title = 'Science X'
    description = (
        'Science X is a network of high-quality websites that provides the most complete and comprehensive '
        'daily coverage of science, technology, and medical news. Articles from phys.org, medicalxpress.com '
        '& techxplore.com'
    )
    language = 'en'
    __author__ = 'unkn0wn'
    oldest_article = 1  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
    remove_attributes = ['height', 'width']
    ignore_duplicate_articles = {'url', 'title'}

    # '#figure' matches the id that preprocess_html() assigns to every
    # <figure> element.
    extra_css = '''
        #figure {text-align:center; font-size:small;}
        em, blockquote {color:#202020;}
        .article__info, .article-byline, .article-main__more, .d-print-block {font-size:small; color:#404040;}
    '''

    # Definition of the user-tunable 'days' option; its default mirrors the
    # class-level oldest_article above.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe, then apply the optional 'days' override.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.__init__``.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # The class-level entry for 'days' is a dict, so this branch only runs
        # when the attribute now maps 'days' to a non-empty string.
        # NOTE(review): presumably the base class swaps in the user-supplied
        # value during its __init__ -- confirm against calibre's documentation.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    resolve_internal_links = True
    remove_empty_feeds = True

    # Keep only the main article element of each downloaded page.
    keep_only_tags = [dict(name='article', attrs={'class':'news-article'})]

    feeds = [
            ('Tech Xplore', 'https://techxplore.com/rss-feed/'),
            ('Medical Xpress', 'https://medicalxpress.com/rss-feed/'),
            ('Phys.org', 'https://phys.org/rss-feed/')
            # https://medicalxpress.com/feeds/
            # https://techxplore.com/feeds/
        ]

    def preprocess_html(self, soup):
        """Give every ``<figure>`` the id ``figure`` and return the soup.

        The id is what the ``#figure`` rule in ``extra_css`` selects on.
        The soup is modified in place and the same object is returned.
        """
        for figure in soup.findAll('figure'):
            figure['id'] = 'figure'
        return soup