# calibre/recipes/cio_magazine.recipe

from __future__ import print_function
# These first comments record the difficulties I ran into with Python.
# When a UTF-8 error is reported, check the comments (accented characters). In Notepad++ use Search > Go to > position to locate it.
# Edit with Notepad++. If it shows "-" where it should not, the indentation is wrong... Edit > Blank Operations > TAB to Space.
# I now understand what "from ..." means: these are paths inside pylib.zip...
# "from" imports a single symbol... "import" brings in the whole library.
from calibre.web.feeds.news import BasicNewsRecipe
# sys is not needed... I tried to use it to write to stderr
from calibre import strftime
# (strftime) used to convert the article's time
import string
import re
# (re) used for regular expressions
# As seen in pylib.zip... the first letter is uppercase
# These last two were a vague attempt at setting a cookie (not
# used)


class CIO_Magazine(BasicNewsRecipe):
    """Recipe that collects the articles of the latest CIO Magazine issue.

    ``parse_index`` looks up the newest issue on the magazine landing page
    and groups its articles by section heading; ``print_version`` rewrites
    each article URL to the single-page printable variant.
    """

    title = 'CIO Magazine'
    oldest_article = 14
    max_articles_per_feed = 100
    auto_cleanup = True
    __author__ = 'Julio Map'
    description = 'CIO is the leading information brand for today-s busy Chief information Officer - CIO Magazine bi-monthly '
    language = 'en'
    encoding = 'utf8'
    cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png'

    # Strictly speaking unnecessary, because the printable version of each
    # article is fetched instead (see print_version below).
    remove_tags_before = dict(name='div', attrs={'id': 'container'})

    # Page layout notes.
    # Inside a given issue:
    #   issue_details holds the title and the sections of that issue;
    #   DetailModule, nested in issue_details, holds the URLs and summaries.
    # Inside a given article:
    #   Article-default-body holds the text, but the printable version is
    #   used instead.

    no_stylesheets = True
    remove_javascript = True

    # Scheme and host prepended to the site-relative links used by cio.com.
    BASE_URL = 'http://www.cio.com'

    @staticmethod
    def _css_classes(tag):
        """Return the ``class`` attribute of ``tag`` as a list of names.

        The attribute may be a whitespace-separated string or already a
        list, depending on the parser that produced ``tag``; both shapes
        are normalised to a list. A missing attribute gives ``[]``.
        """
        classes = tag.get('class') or []
        if hasattr(classes, 'split'):
            classes = classes.split()
        return classes

    def print_version(self, url):
        """Return the printable URL for the article at ``url``.

        ``http://www.cio.com/article/<num>/<title>`` is rewritten to
        ``http://www.cio.com/article/print/<num>#``; that page carries every
        page of the article inside the div with id ``container``.

        The download framework invokes this hook itself. Calling it by hand
        as well would apply the conversion twice, so URLs that are already
        in print form are returned unchanged, as are URLs too short to
        contain an article number.
        """
        if url.startswith('/'):
            url = self.BASE_URL + url
        segments = url.split('/')
        # Expected: ['http:', '', 'www.cio.com', 'article', '<num>', ...]
        if len(segments) < 5 or segments[4] == 'print':
            return url
        return '/'.join(segments[0:4]) + '/print/' + segments[4] + '#'

    def parse_index(self):
        """Build the list of feeds from the most recent magazine issue.

        Returns:
            A list of ``(feed title, articles)`` tuples in page order. Each
            article is a dict with the keys ``title``, ``url``, ``date``,
            ``description`` and ``content`` (always an empty string here).
            Articles that appear before any section heading are grouped
            under ``'Uncategorized'``.

        Raises:
            ValueError: if the magazine landing page has no link to an
                issue.
        """
        # First find out which issue is the latest one: it is the first
        # link inside the element with class content_body.
        landing_url = self.BASE_URL + '/magazine'
        soupinicial = self.index_to_soup(landing_url)
        container = soupinicial.find(True, attrs={'class': 'content_body'})
        link = None
        if container is not None:
            link = container.find('a', href=True)
        if link is None:
            raise ValueError(
                'Could not find the latest issue link on ' + landing_url)

        INDEX = re.sub(r'\?.*', '', link['href'])
        # cio.com uses relative links, so prepend the domain name (guarding
        # against the day they stop doing so).
        if INDEX.startswith('/'):
            INDEX = self.BASE_URL + INDEX
        # Logged so the chosen issue can be checked.
        print("INDEX en parse_index: ", INDEX)

        # The issue is known; now process it.
        soup = self.index_to_soup(INDEX)

        articles = {}
        key = None
        feeds = []  # feed titles in order of first appearance
        # This is the collection date, not the publication date; it is the
        # same for every article, so compute it once.
        pubdate = strftime('%a, %d %b')

        # Only two kinds of element matter: 'heading' gives the section
        # names (key) and 'issue_item' gives the URLs and summaries.
        for div in soup.findAll(True,
                                attrs={'class': ['heading', 'issue_item']}):
            # Test membership instead of joining the class list, so that
            # elements carrying extra classes are still recognised.
            classes = self._css_classes(div)

            if 'heading' in classes:
                key = string.capwords(self.tag_to_string(div.span))
                print("Key: ", key)  # debugging aid
                # A repeated heading must neither discard the articles
                # already collected nor create a duplicate feed.
                if key not in articles:
                    articles[key] = []
                    feeds.append(key)

            elif 'issue_item' in classes:
                a = div.find('a', href=True)
                if not a:
                    continue
                url = re.sub(r'\?.*', '', a['href'])
                print("url: ", url)  # debugging aid
                # Possible refinement: drop the last two words of the title.
                title = self.tag_to_string(a, use_alt=True).strip()
                # The only paragraph inside an 'issue_item' is the summary;
                # without one the description stays empty.
                summary = div.find('p')
                description = ''
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                    print("Description = ", description)

                if 'podcasts' in url:
                    continue

                # Grouping logic borrowed from the NY Times recipe.
                feed = key if key is not None else 'Uncategorized'
                if feed not in articles:
                    # Register the fallback feed too, otherwise articles
                    # found before the first heading would be dropped.
                    articles[feed] = []
                    feeds.append(feed)
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        return [(k, articles[k]) for k in feeds]