# calibre/recipes/el_pais_uy.recipe

#!/usr/bin/env python2
##
# Last Edited:  2018-02-13 Carlos Alves <carlosalves90@gmail.com>
##

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.elpais.com.uy/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    """Calibre news recipe for the Uruguayan newspaper El Pais.

    Articles come from the site's RSS feed and are trimmed to the title,
    the captioned image and the content modules. The cover is looked up on
    the site's print-edition page.
    """

    title = 'El Pais - Uruguay'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay y el resto del mundo'
    publisher = 'EL PAIS S.A.'
    category = 'news, politics, Uruguay'
    language = 'es_UY'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    # NOTE(review): the attribute documented by calibre is ``recursions``;
    # ``recursion`` looks like a typo that is silently ignored. Left as-is
    # because renaming it would start following links and change downloads.
    recursion = 2
    encoding = 'utf-8'
    publication_type = 'newspaper'
    remove_javascript = True
    no_stylesheets = True

    oldest_article = 2
    max_articles_per_feed = 20

    # Only these elements of each article page are kept.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'title'}),
        dict(name='div', attrs={'class': 'composite-captioned-image'}),
        dict(name='div', attrs={'class': 'content-modules'})
    ]

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    # Elements stripped from whatever keep_only_tags retained.
    remove_tags = [
        dict(name='div', attrs={
             'class': ['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={
             'id': ['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class': 'FacebookLikeButton'}),
        # Previously two entries that both listed 'object'; merged.
        dict(name=['object', 'form', 'table'])]

    extra_css = '''
                h1{font-family: Georgia,"Times New Roman",Times,serif}
                h3{font-family: Georgia,"Times New Roman",Times,serif}
                h2{font-family: Georgia,"Times New Roman",Times,serif}
                p{font-family: Verdana,Arial,Helvetica,sans-serif}
                body{font-family: Verdana,Arial,Helvetica,sans-serif}
                img{margin-bottom: 0.4em; display:block;}
                '''

    feeds = [
        (u'Ultimo Momento',
         u'https://www.elpais.com.uy/rss/')
    ]

    def get_cover_url(self):
        """Return the cover URL found on the print-edition page.

        The first ``<a class="page-link link-module">`` on the page supplies
        the link. Returns None when no such anchor exists or when it has no
        usable ``href``, instead of failing on string concatenation.
        """
        site = 'https://www.elpais.com.uy'
        soup = self.index_to_soup(site + '/impresa/')
        link_item = soup.find('a', attrs={'class': 'page-link link-module'})
        if link_item is None:
            return None
        href = link_item.get('href')
        if not href:
            # Anchor without a target: there is nothing to build a URL from.
            return None
        if href.startswith(('http://', 'https://')):
            # Already absolute; prefixing the host would corrupt it.
            return href
        return site + href

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute from the article soup.

        The soup is modified in place and returned.
        """
        for item in soup.findAll(style=True):
            del item['style']
        return soup