Updated La Jornada

2025-08-11 09:13:57 -04:00 · 2010-08-30 18:59:15 -06:00 · 2010-08-30 18:59:15 -06:00 · a3bb2c06fb
commit a3bb2c06fb
parent 2d6009b45f
5 changed files with 55 additions and 111 deletions
--- a/resources/images/news/la_jornada.png
+++ b/resources/images/news/la_jornada.png
--- a/resources/recipes/clarin.recipe
+++ b/resources/recipes/clarin.recipe
@@ -18,7 +18,7 @@ class Clarin(BasicNewsRecipe):
    use_embedded_content  = False
    no_stylesheets        = True
    encoding              = 'utf8'
-    language              = 'es_AR'
+    language              = 'es'
    publication_type      = 'newspaper'
    INDEX                 = 'http://www.clarin.com'
    masthead_url          = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'
--- a/resources/recipes/europasur.recipe
+++ b/resources/recipes/europasur.recipe
@@ -20,7 +20,7 @@ class Europasur(BasicNewsRecipe):
    delay                 = 2
    no_stylesheets        = True
    encoding              = 'cp1252'
-    language              = 'es_ES'
+    language              = 'es'
    publication_type      = 'newspaper'
    extra_css             = """ body{font-family: Verdana,Arial,Helvetica,sans-serif}
                                h2{font-family: Georgia,Times New Roman,Times,serif}
--- a/resources/recipes/la_jornada.recipe
+++ b/resources/recipes/la_jornada.recipe
@@ -1,120 +1,64 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2010, Rogelio Dominguez <rogelio.dominguez at gmail.com>'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
 '''
 www.jornada.unam.mx
 '''
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-import re
+class LaJornada_mx(BasicNewsRecipe):
-
+    title                 = 'La Jornada (Mexico)'
-class LaJornada(BasicNewsRecipe):
+    __author__            = 'Darko Miletic'
-    title          = u'La Jornada'
+    description           = 'Noticias del diario mexicano La Jornada'
-    language       = 'es'
+    publisher             = 'DEMOS, Desarrollo de Medios, S.A. de C.V.'
-    oldest_article = 1
+    category              = 'news, Mexico'
-    __author__ = 'rogeliodh'
+    oldest_article        = 2
-    max_articles_per_feed = 100
+    max_articles_per_feed = 200
    remove_tags    = [dict(name='div', attrs={'class':['go gui','go gui top','comment-cont',]})]
    remove_tags_before = dict(id='article-cont')
    remove_tags_after = dict(id='article-cont')
    no_stylesheets        = True
-    extra_css      = ' .series{ \
+    encoding              = 'utf8'
-                               border-bottom: 1px solid #626366; \
+    use_embedded_content  = False
-                               font-weight: bold; \
+    language              = 'es'
-                               } \
+    remove_empty_feeds    = True
-                       .sumario{ \
+    cover_url             = strftime("http://www.jornada.unam.mx/%Y/%m/%d/planitas/portadita.jpg")
-                               font-weight: bold; \
+    masthead_url          = 'http://www.jornada.unam.mx/v7.0/imagenes/la-jornada-trans.png'
-                               margin-top: 2em; \
+    extra_css             = """
-                               text-align: center \
+                                body{font-family: "Times New Roman",serif }
-                               } \
+                                .cabeza{font-size: xx-large; font-weight: bold }
-                       p.sumario{ \
+                                .credito-articulo{font-size: 1.3em}
-                               text-align: center \
+                            """
                               } \
                       .sumarios{font-weight: bold} \
                       .cabeza{	font-size: 1.5em} \
                       .pie-foto { \
                               text-align: justify; \
                               font-size: 0.8em; \
                               text-align: justify; \
                               } \
                        .pie-foto .credito { \
                               font-weight: bold; \
                               display: block \
                               } \
                       .credito-autor{ \
                               margin-top: 1.5em; \
                               padding-left: 0.6em; \
                               border-bottom: 1px solid #626366; \
                               font-variant: small-caps; \
                               font-weight: bold \
                               } \
                       .credito-articulo{ \
                               margin-top: 1.5em; \
                               padding-left: 0.6em; \
                               border-bottom: 1px solid #626366; \
                               font-variant: small-caps; \
                               font-weight: bold \
                               } \
                       .credito-titulo{text-align: right} \
                       .hemero { \
                               text-align: right; \
                               font-size: 0.9em; \
                               margin-bottom: 8px; \
                               } \
                       .loc    { \
                               font-weight: bold; \
                               } \
                       .carton  { \
                               text-align: center; \
                               } \
                       .credit { \
                               font-weight: bold; \
                               } \
                              '
-    preprocess_regexps = [
+    conversion_options = {
-                         # Remove capitalized initial letter on some articles (editorial)
+                          'comment'   : description
-    		       	 (re.compile(r'<div class="inicial">(.*)</div><p class="s-s">', re.DOTALL|re.IGNORECASE),
+                        , 'tags'      : category
-        		  lambda match: match.group(1)),
+                        , 'publisher' : publisher
-                         # Cartons section uses a class instead of a div to identify the main content. Change it.
+                        , 'language'  : language
-    		       	 (re.compile(r'class="carton"', re.DOTALL|re.IGNORECASE),
+                        }
-        		  lambda match: 'id="article-cont" class="carton"'),
+
-                         # Remove <link rel="alternate"> as calibre has a bug (to report)
+    keep_only_tags = [
-    		       	 (re.compile(r'<link rel="alternate".*?/>', re.DOTALL|re.IGNORECASE),
+                         dict(name='div', attrs={'class':['documentContent','cabeza','sumarios','text']})
-        		  lambda match: ''),
+                        ,dict(name='div', attrs={'id':'renderComments'})
                     ]
    remove_tags = [dict(name='div', attrs={'class':'buttonbar'})]
    INDEX          = 'http://www.jornada.unam.mx/rss/edicion.xml'
    feeds = [
-                     (u'Opinion','http://www.jornada.unam.mx/rss/opinion.xml'),
+              (u'Ultimas noticias'    , u'http://www.jornada.unam.mx/ultimas/news/RSS'     )
-                     (u'Cartones','http://www.jornada.unam.mx/rss/cartones.xml'),
+             ,(u'Opinion'             , u'http://www.jornada.unam.mx/rss/opinion.xml'      )
-                     (u'Política','http://www.jornada.unam.mx/rss/politica.xml'),
+             ,(u'Politica'            , u'http://www.jornada.unam.mx/rss/politica.xml'     )
-                     (u'Economía','http://www.jornada.unam.mx/rss/economia.xml'),
+             ,(u'Economia'            , u'http://www.jornada.unam.mx/rss/economia.xml'     )
-                     (u'Mundo','http://www.jornada.unam.mx/rss/mundo.xml'),
+             ,(u'Mundo'               , u'http://www.jornada.unam.mx/rss/mundo.xml'        )
-                     (u'Estados','http://www.jornada.unam.mx/rss/estados.xml'),
+             ,(u'Estados'             , u'http://www.jornada.unam.mx/rss/estados.xml'      )
-                     (u'Capital','http://www.jornada.unam.mx/rss/capital.xml'),
+             ,(u'Capital'             , u'http://www.jornada.unam.mx/rss/capital.xml'      )
-                     (u'Sociedad','http://www.jornada.unam.mx/rss/sociedad.xml'),
+             ,(u'Sociedad y justicia' , u'http://www.jornada.unam.mx/rss/sociedad.xml'     )
-                     (u'Ciencias','http://www.jornada.unam.mx/rss/ciencias.xml'),
+             ,(u'Ciencias'            , u'http://www.jornada.unam.mx/rss/ciencias.xml'     )
-                     (u'Cultura','http://www.jornada.unam.mx/rss/cultura.xml'),
+             ,(u'Cultura'             , u'http://www.jornada.unam.mx/rss/cultura.xml'      )
-                     (u'Gastronomia','http://www.jornada.unam.mx/rss/gastronomia.xml'),
+             ,(u'Gastronomia'         , u'http://www.jornada.unam.mx/rss/gastronomia.xml'  )
-                     (u'Espectáculos','http://www.jornada.unam.mx/rss/espectaculos.xml'),
+             ,(u'Espectaculos'        , u'http://www.jornada.unam.mx/rss/espectaculos.xml' )
-                     (u'Deportes','http://www.jornada.unam.mx/rss/deportes.xml'),
+             ,(u'Deportes'            , u'http://www.jornada.unam.mx/rss/deportes.xml'     )
            ]
-    def get_cover_url(self):
+    def preprocess_html(self, soup):
-        '''
+        for item in soup.findAll(style=True):
-        Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf
+            del item['style']
-        '''
+        return soup
        cover_url = None
        soup = self.index_to_soup(self.INDEX)
        soupstone = BeautifulStoneSoup(str(soup))
        urlbase = str(soupstone('link')[0])
        r= re.compile(r'.*http://www.jornada.unam.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})', re.DOTALL|re.IGNORECASE)
        m = r.match(urlbase)
        if m:
              cover_url = 'http://www.jornada.unam.mx/' + m.groups()[0] + '/' + m.groups()[1] + '/' + m.groups()[2] + '/portada.pdf'
        return cover_url
--- a/src/calibre/manual/gui.rst
+++ b/src/calibre/manual/gui.rst
@@ -166,7 +166,7 @@ Search & Sort
 The Search & Sort section allows you to perform several powerful actions on your book collections.
-    * You can sort them by title, author, date, rating etc. by clicking on the column titles.
+    * You can sort them by title, author, date, rating etc. by clicking on the column titles. You can also sub-sort (i.e. sort on multiple columns). For example, if you click on the title column and then the author column, the books will be sorted by author, and then all the entries for the same author will be sorted by title.
    * You can search for a particular book or set of books using the search bar. More on that below.