CIO Magazine by Julio Map

2025-08-11 09:13:57 -04:00 · 2011-09-11 15:56:00 -06:00 · 2011-09-11 15:56:00 -06:00 · bd5ddfc7ed
commit bd5ddfc7ed
parent 7a3babf49e
1 changed files with 128 additions and 0 deletions
--- a/recipes/cio_magazine.recipe
+++ b/recipes/cio_magazine.recipe
@ -0,0 +1,128 @@
 # These first comments record the difficulties I had with Python.
 # When a UTF-8 error shows up, check the comments (accented characters). In Notepad++ use Search > Go to > position to see the offending spot.
 # Edit with Notepad++. If a "-" appears where it should not, the indentation is wrong: Edit > Blank Operations > TAB to Space.
 # What follows "from" is a path inside pylib.zip.
 # "from ... import" brings in a single symbol; a plain "import" brings in the whole library.
 from calibre.web.feeds.news import BasicNewsRecipe
 # sys is not needed; I tried to use it to write to stderr.
 from calibre import strftime
 # strftime (imported above) is used to format the article's time.
 import string, re
 # re (imported above) is used for regular expressions.
 # As seen in pylib.zip, the first letter is uppercase.
 # Those last two were a vague attempt at setting a cookie (not used).
 class CIO_Magazine(BasicNewsRecipe):
     """Recipe that collects the articles of the latest CIO Magazine issue.

     The issue index is scraped by ``parse_index`` and every article URL is
     rewritten to its printable version by ``print_version``.
     """

     title = 'CIO Magazine'
     oldest_article = 14
     max_articles_per_feed = 100
     auto_cleanup = True
     __author__ = 'Julio Map'
     description = 'CIO is the leading information brand for today-s busy Chief information Officer - CIO Magazine bi-monthly '
     language = 'en'
     encoding = 'utf8'
     cover_url = 'http://www.cio.com/homepage/images/hp-cio-logo-linkedin.png'
     remove_tags_before = dict(name='div', attrs={'id': 'container'})
     # Author's note: absolutely unnecessary... in the end a printable
     # version turned up (see print_version below).
     # Page structure notes:
     #   Within a given issue, issue_details holds the title and the sections
     #   of that issue; DetailModule, inside issue_details, holds the URLs
     #   and summaries.
     #   Within a given article, Article-default-body holds the text, but the
     #   printable version is used instead.
     no_stylesheets = True
     remove_javascript = True

     def print_version(self, url):
         """Return the printable-version URL for an article URL.

         The framework calls this method itself; calling it by hand would
         apply the rewrite twice.

         ``http://www.cio.com/article/<num>/<title>`` becomes
         ``http://www.cio.com/article/print/<num>#``, a single page holding
         every page of the article inside the div with id ``container``.
         Site-relative URLs (starting with ``/``) are made absolute first.
         A URL with too few path segments to contain an article number is
         returned (made absolute, but otherwise) unchanged instead of raising
         ``IndexError``.
         """
         if url.startswith('/'):
             url = 'http://www.cio.com' + url
         segments = url.split('/')
         if len(segments) < 5:
             # No <num> segment to build the print URL from.
             return url
         return '/'.join(segments[0:4]) + '/print/' + segments[4] + '#'

     @staticmethod
     def _css_classes(tag):
         """Return the CSS classes of ``tag`` as a list of names.

         Depending on the parser version the ``class`` attribute is either a
         whitespace-separated string or already a list; both are normalized
         to a list here so callers can use a plain ``in`` test.
         """
         classes = tag.get('class') or []
         if not isinstance(classes, (list, tuple)):
             classes = classes.split()
         return classes

     def parse_index(self):
         """Build the list of feeds for the most recent issue.

         Implemented instead of relying on feeds because the articles are
         listed on a web page. Returns a list of 2-element tuples
         ``(feed title, list of articles)`` where each article is a dict with
         the keys ``title``, ``url``, ``date``, ``description`` and
         ``content`` (left empty here).

         Raises ``ValueError`` when the link to the latest issue cannot be
         found on the magazine landing page.
         """
         # First find out which issue is the latest one: it is the first link
         # inside the element with class content_body.
         soupinicial = self.index_to_soup('http://www.cio.com/magazine')
         container = soupinicial.find(True, attrs={'class': 'content_body'})
         issue_link = None
         if container is not None:
             issue_link = container.find('a', href=True)
         if issue_link is None:
             raise ValueError(
                 'Could not find the latest issue link on '
                 'http://www.cio.com/magazine')
         index_url = re.sub(r'\?.*', '', issue_link['href'])
         # cio.com uses relative links, so prepend the domain name (guarded in
         # case the site stops using them).
         if index_url.startswith('/'):
             index_url = 'http://www.cio.com' + index_url
         # Leave a trace in the logs to confirm the right issue was picked.
         print("INDEX en parse_index: ", index_url)

         # The issue is known now; process it.
         soup = self.index_to_soup(index_url)
         articles = {}
         key = None
         feeds = []
         # This is the collection date, not the publication date; it is the
         # same for every article, so compute it once.
         pubdate = strftime('%a, %d %b')
         # Only two kinds of div matter: 'heading' gives the categories (key)
         # and 'issue_item' gives the URLs and summaries.
         for div in soup.findAll(True,
                                 attrs={'class': ['heading', 'issue_item']}):
             classes = self._css_classes(div)
             if 'heading' in classes:
                 key = string.capwords(self.tag_to_string(div.span))
                 print("Key: ", key)  # debugging aid
                 if key not in articles:
                     # A repeated heading keeps its earlier articles and is
                     # listed only once.
                     articles[key] = []
                     feeds.append(key)
             elif 'issue_item' in classes:
                 link = div.find('a', href=True)
                 if not link:
                     continue
                 url = re.sub(r'\?.*', '', link['href'])
                 print("url: ", url)  # debugging aid
                 # TODO: strip the last two words from the title.
                 title = self.tag_to_string(link, use_alt=True).strip()
                 # The only paragraph inside an 'issue_item' div is the
                 # summary; without one the description stays blank.
                 summary = div.find('p')
                 description = ''
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)
                     print("Description = ", description)
                 if 'podcasts' in url:
                     # Podcast entries are not articles; skip them.
                     continue
                 feed = key if key is not None else 'Uncategorized'
                 if feed not in articles:
                     # Register the bucket so items that appear before any
                     # heading are returned instead of silently dropped.
                     articles[feed] = []
                     feeds.append(feed)
                 articles[feed].append(
                     dict(title=title, url=url, date=pubdate,
                          description=description,
                          content=''))
         return [(name, articles[name]) for name in feeds]