# calibre/recipes/lanacion_chile.recipe

#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
lanacion.cl
'''
try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote

from calibre.web.feeds.news import BasicNewsRecipe


class LaNacionChile(BasicNewsRecipe):
    """News recipe for lanacion.cl (La Nacion, Chile).

    Articles are discovered through the site's RSS feed, fetched via the
    site's ``imprimir.cgi`` (print) endpoint, and reduced to the
    ``div.bloque`` elements before conversion.
    """

    # --- Publication metadata -------------------------------------------
    title = 'La Nacion Chile'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'
    publisher = 'La Nacion'
    category = 'news, politics, Chile'
    language = 'es_CL'
    cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif'

    # --- Download / clean-up settings -----------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    remove_javascript = True

    # Conversion options built from the metadata defined above.
    html2lrf_options = [
        '--comment', description, '--category', category, '--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + \
        '"\ncomments="' + description + '"\ntags="' + category + '"'

    # Only the div.bloque containers are kept from each downloaded page.
    keep_only_tags = [dict(name='div', attrs={'class': 'bloque'})]

    feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')]

    def print_version(self, url):
        """Return the URL of the print view for the article at ``url``.

        The article URL is percent-encoded (``:`` and ``/`` are left as-is)
        and passed as the ``_URL`` query parameter of ``imprimir.cgi``.
        """
        toprint = quote(url, ':/')
        return u'http://www.lanacion.cl/cgi-bx/imprimir.cgi?_URL=' + toprint

    def preprocess_html(self, soup):
        """Clean up a downloaded page and return the modified ``soup``.

        Removes the body ``onload`` handler, the ``<base>`` element, the
        "close window" link and every inline ``style`` attribute, and adds
        a ``Content-Language`` meta element to the head.  Pages lacking a
        ``<body>``, ``<head>`` or ``<base>`` are tolerated instead of
        raising ``AttributeError``/``TypeError``.
        """
        body = soup.body
        if body is not None:
            del body['onload']

        head = soup.head
        if head is not None:
            base = head.base
            if base is not None:
                base.extract()

        close_link = soup.find('a', attrs={'href': 'javascript:window.close()'})
        if close_link is not None:
            close_link.extract()

        if head is not None:
            make_tag = getattr(soup, 'new_tag', None)
            if make_tag is not None:
                # Build a real element: a raw markup string inserted into a
                # Beautiful Soup 4 tree becomes escaped text, not a tag.
                meta = make_tag('meta')
                meta['http-equiv'] = 'Content-Language'
                meta['content'] = 'es-CL'
            else:
                # Soup implementations without new_tag: keep the original
                # behaviour of inserting the markup as a string.
                meta = '<meta http-equiv="Content-Language" content="es-CL"/>'
            head.insert(0, meta)

        for styled in soup.findAll(style=True):
            del styled['style']
        return soup