calibre/recipes/tjournal.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe


class TJournal(BasicNewsRecipe):
    """Calibre news recipe for TJournal (tjournal.ru), a Russian-language site.

    Articles are discovered through the RSS feeds listed in ``feeds``. The
    ``remove_tags*`` attributes describe which parts of each downloaded page
    are trimmed, and ``preprocess_html`` fixes up image sources.
    """

    # --- Recipe metadata -------------------------------------------------
    title = u'TJournal'
    __author__ = 'bug_me_not (with fixes by bugmen00t)'
    description = 'TJournal: издание о медиа, технологиях и трендах'
    publisher = 'tjournal.ru'
    category = 'news'
    language = 'ru'
    cover_url = 'https://tjournal.ru/static/build/tjournal.ru/images/search_logo.png'

    # --- Download / conversion options -----------------------------------
    no_stylesheets = False
    remove_javascript = True
    oldest_article = 30
    max_articles_per_feed = 100

    # --- Page trimming -----------------------------------------------------
    # Selector marking where the kept content starts.
    # NOTE: the class value used to be 'content-title"' (stray double quote
    # inside the literal), which cannot match a class named 'content-title'.
    remove_tags_before = dict(name='div', attrs={'class': 'content-title'})

    # Selector marking where the kept content ends (the article footer).
    remove_tags_after = dict(
        name='div',
        attrs={'class': 'content-footer content-footer--full l-island-a'}
    )

    # Elements stripped from the page wherever they appear. The footer used
    # as the ``remove_tags_after`` marker is listed here as well.
    remove_tags = [
        dict(
            name='div',
            attrs={'class': 'content-footer content-footer--full l-island-a'}
        ),
        dict(name='div', attrs={'air-module': 'module.distributionFloating'}),
        dict(name='span', attrs={'class': 'content-editorial-tick'}),
        dict(name='vue'),
        dict(name='div', attrs={'class': 'comments'}),
        dict(name='div', attrs={'class': 'propaganda'}),
        dict(name='div', attrs={'class': 'propaganda propaganda--with-footer'}),
        dict(name='div', attrs={'air-module': 'module.gallery'}),
        dict(name='div', attrs={'class': 'content-container'}),
        dict(
            name='div',
            attrs={'class': 'content-header__item content-header-number'}
        ),
        dict(name='span', attrs={'class': 'views__value'}),
        dict(name='span', attrs={'class': 'views__label'})
    ]

    # --- Feeds: (section title, RSS URL) ---------------------------------
    # Titles are escaped Cyrillic; the English meaning is given per entry.
    feeds = [
        # "Popular"
        (
            '\u041F\u043E\u043F\u0443\u043B\u044F\u0440\u043D\u043E\u0435',
            'https://tjournal.ru/rss'
        ),
        # "News"
        (
            '\u041D\u043E\u0432\u043E\u0441\u0442\u0438',
            'https://tjournal.ru/rss/news'
        ),
        # "Fresh"
        (
            '\u0421\u0432\u0435\u0436\u0435\u0435',
            'https://tjournal.ru/rss/new'
        ),
        # "Technology"
        (
            '\u0422\u0435\u0445\u043D\u043E\u043B\u043E\u0433\u0438\u0438',
            'https://tjournal.ru/rss/tech'
        ),
        # "Analysis"
        (
            '\u0420\u0430\u0437\u0431\u043E\u0440\u044B',
            'https://tjournal.ru/rss/analysis'
        ),
        # "Internet"
        (
            '\u0418\u043D\u0442\u0435\u0440\u043D\u0435\u0442',
            'https://tjournal.ru/rss/internet'
        ),
    ]

    def preprocess_html(self, soup):
        """Copy each image's ``data-image-src`` attribute into ``src``.

        Only ``<img>`` tags that carry a ``data-image-src`` attribute are
        touched. Presumably the site keeps the real image URL there for
        lazy loading -- TODO confirm against a live page.

        :param soup: parsed article page (BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-image-src': True}):
            img['src'] = img['data-image-src']
        return soup