calibre/recipes/levante.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    If *soup* exposes a non-None ``new_tag`` attribute, that callable is
    used and *attrs* is handed over as a ``dict``.  Otherwise the tag is
    built by calling ``Tag`` directly, with empty *attrs* passed as ``None``.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: construct the Tag ourselves.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))


class LevanteRecipe(BasicNewsRecipe):
    """Recipe for Levante - El Mercantil Valenciano (levante-emv.com).

    Declares the RSS feeds to download, restricts each article to its
    headline, subtitle and body containers, and swaps the page's original
    ``<head>`` for an empty one.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'es'
    description = u'El Mercantil Valenciano'
    title = u'Levante'

    oldest_article = 2
    max_articles_per_feed = 100
    encoding = 'latin1'
    no_stylesheets = True
    remove_javascript = True

    # Feeds taken from http://www.levante-emv.com/servicios/rss/rss.jsp?pServicio=rss
    # Feed titles are written without accented characters for now; the hope
    # is to resolve this in the future.
    feeds = [
        (u'Portada Valencia', u'http://www.levante-emv.com/elementosInt/rss/1'),
        (u'Portada Castello', u'http://www.levante-emv.com/elementosInt/rss/2'),
        (u'Portada Alacant', u'http://www.levante-emv.com/elementosInt/rss/3'),
        (u'Lo Mas Leido', u'http://www.levante-emv.com/elementosInt/rss/LoMas'),
        (u'Seccion al minuto', u'http://www.levante-emv.com/elementosInt/rss/AlMinuto'),
        (u'Comunidad Valenciana', u'http://www.levante-emv.com/elementosInt/rss/19'),
        (u'Valencia', u'http://www.levante-emv.com/elementosInt/rss/16'),
        (u'Castello', u'http://www.levante-emv.com/elementosInt/rss/4'),
        (u'Alacant', u'http://www.levante-emv.com/elementosInt/rss/17'),
        (u'Comarcas', u'http://www.levante-emv.com/elementosInt/rss/12'),
        (u'Espana', u'http://www.levante-emv.com/elementosInt/rss/6'),
        (u'Internacional', u'http://www.levante-emv.com/elementosInt/rss/7'),
        (u'Opinion', u'http://www.levante-emv.com/elementosInt/rss/5'),
        (u'Economia', u'http://www.levante-emv.com/elementosInt/rss/8'),
        (u'Sociedad', u'http://www.levante-emv.com/elementosInt/rss/9'),
        (u'Sucesos', u'http://www.levante-emv.com/elementosInt/rss/10'),
        (u'Deportes', u'http://www.levante-emv.com/elementosInt/rss/11'),
        (u'Motor', u'http://www.levante-emv.com/elementosInt/rss/31'),
        (u'Panorama', u'http://www.levante-emv.com/elementosInt/rss/18'),
        (u'Salud y Vida', u'http://www.levante-emv.com/elementosInt/rss/20'),
        (u'Ciencia y Salud', u'http://www.levante-emv.com/elementosInt/rss/44'),
        (u'Ciencia e Investigacion', u'http://www.levante-emv.com/elementosInt/rss/23'),
        (u'Ensenanza', u'http://www.levante-emv.com/elementosInt/rss/22'),
        (u'Fiestas y Tradiciones', u'http://www.levante-emv.com/elementosInt/rss/24'),
        (u'Club Diario', u'http://www.levante-emv.com/elementosInt/rss/26'),
        (u'Juntos', u'http://www.levante-emv.com/elementosInt/rss/33'),
        (u'Integrados', u'http://www.levante-emv.com/elementosInt/rss/35'),
        (u'Agenda', u'http://www.levante-emv.com/elementosInt/rss/36'),
        (u'Cultura', u'http://www.levante-emv.com/elementosInt/rss/39'),
        (u'Tecnologia', u'http://www.levante-emv.com/elementosInt/rss/40'),
        (u'Gente', u'http://www.levante-emv.com/elementosInt/rss/41'),
        (u'Television', u'http://www.levante-emv.com/elementosInt/rss/42'),
        (u'Participa', u'http://www.levante-emv.com/elementosInt/rss/45'),
    ]

    # Article containers to keep: headline, subtitle and body text.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'noticia_titular'}),
        dict(name='div', attrs={'class': 'subtitulo'}),
        dict(name='div', attrs={'id': 'noticia_texto', 'class': 'noticia_texto'}),
    ]

    def preprocess_html(self, soup):
        """Replace the document's ``<head>`` with a freshly created empty one.

        The site's original head markup is discarded wholesale.  A document
        that has no ``<head>`` at all is tolerated: nothing is removed and
        the empty head is still inserted.

        :param soup: parsed article document; modified in place.
        :return: the same *soup* object.
        """
        original_head = soup.head
        # soup.head is None when the page has no <head>; calling extract()
        # on it unconditionally would raise AttributeError.
        if original_head is not None:
            original_head.extract()

        # Put the empty replacement head at the very start of the document.
        soup.insert(0, new_tag(soup, 'head'))

        return soup