# calibre/recipes/laprensa_ni.recipe

#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
laprensa.com.ni
'''

import datetime
import time
from calibre.web.feeds.news import BasicNewsRecipe


class LaPrensa_ni(BasicNewsRecipe):
    """Recipe for the daily edition of La Prensa (laprensa.com.ni).

    The article list is not taken from an RSS feed: ``parse_index`` downloads
    the dated archive page configured in ``feeds`` and collects the headline
    links found on it.
    """

    title = 'La Prensa - Nicaragua'
    __author__ = 'Darko Miletic'
    description = 'LA PRENSA - EL Diario de los Nicaraguenses'
    publisher = 'La Prensa'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    remove_javascript = True
    language = 'es_NI'

    # Spanish month names indexed by (month number - 1); the archive URL
    # built below spells the month out instead of using a number.
    months_es = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio',
                 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre']
    # Read the clock exactly once so the month name and the year/day parts of
    # the URL always describe the same calendar day, even if this class body
    # happens to be evaluated across midnight.
    _today = datetime.date.today()
    current_month = months_es[_today.month - 1]
    current_index = _today.strftime(
        "http://www.laprensa.com.ni/archivo/%Y/" + current_month + "/%d/noticias/")

    feeds = [(u'Portada', current_index + 'portada/')]

    def print_version(self, url):
        """Return ``url`` with every '.shtml' replaced by '_print.shtml'."""
        return url.replace('.shtml', '_print.shtml')

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the same soup.

        Drops the body ``onload`` attribute, the first ``mas_noticias`` span,
        the first link pointing at ``/archivo`` and every inline ``style``
        attribute, and puts a Content-Language marker at the top of the head.
        Pages lacking a body or head element are tolerated.
        """
        body = soup.body
        if body is not None:
            del body['onload']
        head = soup.head
        if head is not None:
            # NOTE(review): the marker is inserted as a plain string; whether
            # it ends up as a real <meta> element or as text depends on the
            # BeautifulSoup version in use -- confirm against calibre's soup.
            mtag = '<meta http-equiv="Content-Language" content="es-NI"/>'
            head.insert(0, mtag)
        atag = soup.find('span', attrs={'class': 'mas_noticias'})
        if atag:
            atag.extract()
        btag = soup.find('a', attrs={'href': '/archivo'})
        if btag:
            btag.extract()
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def parse_index(self):
        """Scrape each feed's index page into a list of articles.

        Returns a list of ``(feed title, articles)`` tuples; every article is
        a dict with the keys 'title', 'date', 'url' and 'description'.
        """
        totalfeeds = []
        # Every article gets the same date string, so format it only once.
        date = time.strftime(self.timefmt)
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed') + ' %s...' %
                                 (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('a', attrs={'class': ['titular', 'titulonotamed']}):
                href = item.get('href')
                if not href:
                    # A headline anchor without a target cannot be fetched;
                    # skip it rather than abort the whole feed.
                    continue
                articles.append({
                    'title': self.tag_to_string(item),
                    'date': date,
                    # Links on the index page are appended to the index URL.
                    'url': feedurl + href,
                    'description': '',
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds