calibre/recipes/rtnews.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
rt.com
'''

from calibre.web.feeds.news import BasicNewsRecipe, classes


class RT_eng(BasicNewsRecipe):
    '''
    Calibre news recipe for the English-language rt.com RSS feeds.

    Articles are collected from the section feeds listed in ``feeds``, reduced
    to the ``div.article`` container, and cleaned of sharing widgets,
    breadcrumbs and similar page furniture before conversion.
    '''

    title = 'Russia Today'
    __author__ = 'unkn0wn'
    description = '''
        RT creates news with an edge for viewers who want to Question More. RT covers stories overlooked by the mainstream
        media, provides alternative perspectives on current affairs, and acquaints international audiences with a Russian
        viewpoint on major global events.
    '''
    publisher = 'Autonomous Nonprofit Organization "TV-Novosti"'
    category = 'news, politics, economy, finances, Russia, world'
    # Default article age limit, in days; can be overridden through the
    # 'days' recipe-specific option declared below.
    oldest_article = 1.2
    no_stylesheets = True
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url', 'title'}
    use_embedded_content = False
    remove_empty_feeds = True
    remove_javascript = True
    language = 'en'
    remove_attributes = ['height', 'width', 'style']
    publication_type = 'newsportal'

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        '''Initialise the base recipe, then apply the ``days`` option if one was given.'''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # Only a plain string is treated as an override: the class-level
        # declaration stores a dict under 'days', which this check skips.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    extra_css = '''
        img {display:block; margin:0 auto;}
        em { color:#202020; }
        .date { font-size:small; color:#404040; }
        .article__summary { font-style:italic; color:#202020; }
        .media__footer { font-size:small; text-align:center; }
    '''

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    # Keep only the main article container ...
    keep_only_tags = [
        dict(name='div', attrs={'class':'article'})
    ]

    # ... and strip non-content elements and the listed widget classes from it.
    remove_tags = [
        dict(name=['meta', 'link', 'svg', 'button', 'style', 'iframe', 'noscript']),
        classes(
            'update_date_visible breadcrumbs read-more Read-more-text-only article__share '
            'article__social-wrapper article__share_bottom'
        )
    ]

    feeds = [
        ('Russia', 'https://www.rt.com/rss/russia/'),
        ('India', 'https://www.rt.com/rss/india/'),
        ('Africa', 'https://www.rt.com/rss/africa/'),
        ('World News', 'https://www.rt.com/rss/news/'),
        ('Business', 'https://www.rt.com/rss/business/'),
        ('Opinion', 'https://www.rt.com/rss/op-ed/'),
        ('Culture', 'https://www.rt.com/rss/pop-culture/'),
        ('Others', 'https://www.rt.com/rss/')
    ]

    def get_article_url(self, article):
        '''
        Return the article URL with any query string removed.

        If the base implementation yields no URL (``None`` or an empty
        string), that value is returned unchanged instead of raising
        ``AttributeError`` on ``None.split``.
        '''
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            return url
        return url.split('?')[0]

    def preprocess_html(self, soup):
        '''
        Point each ``<img>`` at a URL taken from its preceding ``<source>``.

        For every image that has an earlier sibling ``<source>`` carrying a
        ``data-srcset`` attribute, the comma-separated candidates are scanned
        and ``img['src']`` is set to the URL of each candidate containing
        ``/l/`` (so the last such candidate wins). All ``<source>`` tags are
        removed afterwards.

        :param soup: parsed article document
        :return: the same ``soup`` object, modified in place
        '''
        for img in soup.findAll('img'):
            source = img.find_previous_sibling('source', attrs={'data-srcset':True})
            if not source:
                continue
            for candidate in source['data-srcset'].split(','):
                # A candidate looks like "<url> <descriptor>"; split() also
                # discards surrounding whitespace. Empty pieces (trailing or
                # doubled commas, empty attribute) have no URL token and are
                # skipped rather than indexed, which used to raise IndexError.
                tokens = candidate.split()
                if not tokens:
                    continue
                candidate_url = tokens[0]
                # '/l/' presumably marks the large rendition -- TODO confirm
                # against the site's image URL scheme.
                if '/l/' in candidate_url:
                    img['src'] = candidate_url
        for source in soup.findAll('source'):
            source.decompose()
        return soup