# calibre/recipes/eluniversalimpresa.recipe

from calibre.web.feeds.news import BasicNewsRecipe


class ElUniversalImpresaRecipe(BasicNewsRecipe):
    """Recipe for the print edition ("Edicion Impresa") of El Universal.

    The index is built by scraping the print-edition page: the cells
    carrying the ``arnegro12`` class become the 'Primera Plana' feed, and
    every attribute-less cell becomes one feed whose first link is the feed
    title and whose remaining links are the articles.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'es_MX'
    version = 1

    title = u'El Universal (Edici\u00F3n Impresa)'
    publisher = u'El Universal'
    category = u'News, Mexico'
    description = u'News from Mexico'

    remove_empty_feeds = True
    remove_javascript = True

    # Site root; article hrefs found on the index page are appended to it.
    INDEX = 'http://www.eluniversal.com.mx'

    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                '''

    # The conversion language follows the recipe language instead of the
    # previously hard-coded 'en', which contradicted `language` above.
    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher, 'linearize_tables': True}

    @staticmethod
    def _has_class(tag, name):
        """Return True if *tag* carries the CSS class *name*.

        Works whether the parser exposes ``class`` as a plain string or as
        a list of class names.
        """
        classes = tag.get('class')
        if classes is None:
            return False
        if isinstance(classes, (list, tuple)):
            return name in classes
        return name in classes.split()

    def _absolute_url(self, href):
        """Return *href* as an absolute URL, prefixing INDEX when needed."""
        if href.startswith(('http://', 'https://')):
            return href
        return self.INDEX + href

    def parse_index(self):
        """Scrape the print-edition page into a list of feeds.

        Returns a list of ``(feed title, articles)`` tuples, where each
        article is a dict with the keys 'title', 'date', 'url' and
        'description'. An empty list is returned when the index table
        cannot be found.
        """
        soup = self.index_to_soup(
            'http://www.eluniversal.com.mx/edicion_impresa.html')
        index = []

        table = soup.find('table', attrs={'width': '500'})
        if table is None:
            # Page layout changed (or the download was truncated); bail out
            # cleanly instead of failing with an AttributeError below.
            self.log.warning('El Universal: index table not found')
            return index

        def is_front_page_cell(tag):
            return tag.name == 'td' and self._has_class(tag, 'arnegro12')

        def is_section_cell(tag):
            return tag.name == 'td' and not tag.attrs

        front_page = []
        for td in table.findAll(is_front_page_cell):
            a = td.a
            if a is None or not a.get('href'):
                continue
            # Detach the link so that the text left in the cell is only the
            # article summary.
            a.extract()
            front_page.append({'title': self.tag_to_string(a), 'date': None,
                               'url': self._absolute_url(a['href']),
                               'description': self.tag_to_string(td)})

        index.append(('Primera Plana', front_page))

        for td in table.findAll(is_section_cell):
            feed_title = None
            articles = []
            for a in td.findAll('a'):
                # The first link with text names the section; it is not an
                # article itself.
                if not feed_title:
                    feed_title = self.tag_to_string(a)
                    continue

                href = a.get('href')
                if not href:
                    continue
                articles.append({'title': self.tag_to_string(a), 'date': None,
                                 'url': self._absolute_url(href),
                                 'description': ''})

            # Skip cells that yielded no usable title or no articles.
            if feed_title and articles:
                index.append((feed_title, articles))

        return index

    def print_version(self, url):
        """Return the print-view URL for *url*.

        The last path component gets a ``vi_`` prefix. URLs containing
        'wcarton' return None.
        """
        if 'wcarton' in url:
            return None

        main, _sep, page = url.rpartition('/')

        return main + '/vi_' + page

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the soup.

        Removes the first table and all empty paragraphs, and turns the
        ``<font color="#0F046A">`` element into an ``<h1>``.
        """
        table = soup.find('table')
        if table is not None:
            table.extract()

        for p in soup.findAll('p'):
            if not self.tag_to_string(p).strip():
                p.extract()

        tag = soup.find('font', attrs={'color': '#0F046A'})
        if tag is not None:
            # NOTE(review): 'helvetica,' and 'sans-serif' look like stray
            # attributes produced by an unquoted font-face list in the
            # site's markup - confirm against a live page.
            for attr in ('color', 'face', 'helvetica,', 'sans-serif', 'size'):
                if tag.get(attr) is not None:
                    del tag[attr]
            tag.name = 'h1'

        return soup