#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2010, Rogelio Dominguez '
'''
www.jornada.unam.mx
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup

import re

# Captures the YYYY, MM and DD path components of an edition URL.
# The dots of the host name are escaped so they only match literal dots.
_EDITION_DATE_RE = re.compile(
    r'http://www\.jornada\.unam\.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})',
    re.IGNORECASE)


class LaJornada(BasicNewsRecipe):
    '''
    Recipe for the newspaper La Jornada (www.jornada.unam.mx).

    Articles come from the per-section RSS feeds listed in ``feeds``; the
    cover is derived from the edition date found in the ``INDEX`` feed.
    '''

    title = u'La Jornada'
    language = 'es'
    oldest_article = 1
    __author__ = 'rogeliodh'
    max_articles_per_feed = 100

    # Keep only the element with id="article-cont" and drop these divs.
    remove_tags = [
        dict(name='div',
             attrs={'class': ['go gui', 'go gui top', 'comment-cont']}),
    ]
    remove_tags_before = dict(id='article-cont')
    remove_tags_after = dict(id='article-cont')
    no_stylesheets = True

    extra_css = '''
        .series{
            border-bottom: 1px solid #626366;
            font-weight: bold;
        }
        .sumario{
            font-weight: bold;
            margin-top: 2em;
            text-align: center
        }
        p.sumario{
            text-align: center
        }
        .sumarios{font-weight: bold}
        .cabeza{ font-size: 1.5em}
        .pie-foto {
            text-align: justify;
            font-size: 0.8em;
        }
        .pie-foto .credito {
            font-weight: bold;
            display: block
        }
        .credito-autor{
            margin-top: 1.5em;
            padding-left: 0.6em;
            border-bottom: 1px solid #626366;
            font-variant: small-caps;
            font-weight: bold
        }
        .credito-articulo{
            margin-top: 1.5em;
            padding-left: 0.6em;
            border-bottom: 1px solid #626366;
            font-variant: small-caps;
            font-weight: bold
        }
        .credito-titulo{text-align: right}
        .hemero {
            text-align: right;
            font-size: 0.9em;
            margin-bottom: 8px;
        }
        .loc {
            font-weight: bold;
        }
        .carton {
            text-align: center;
        }
        .credit {
            font-weight: bold;
        }
    '''

    preprocess_regexps = [
        # Remove capitalized initial letter on some articles (editorial).
        # NOTE(review): this pattern appears to have lost its HTML markup
        # (tags stripped somewhere in transit); as visible it contains only
        # newlines around the capture group. Restore the original tags from
        # upstream before relying on it.
        (re.compile(r'\n(.*)\n\n', re.DOTALL | re.IGNORECASE),
         lambda match: match.group(1)),
        # Cartons section uses a class instead of a div to identify the
        # main content. Change it.
        (re.compile(r'class="carton"', re.DOTALL | re.IGNORECASE),
         lambda match: 'id="article-cont" class="carton"'),
        # Remove an element that triggers a calibre bug (to report).
        # NOTE(review): both the element name in the original comment and
        # the pattern itself are missing (apparently stripped markup); an
        # empty pattern replaced by '' leaves the text unchanged. Restore
        # the original pattern from upstream.
        (re.compile(r'', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    INDEX = 'http://www.jornada.unam.mx/rss/edicion.xml'
    feeds = [
        (u'Opinion', 'http://www.jornada.unam.mx/rss/opinion.xml'),
        (u'Cartones', 'http://www.jornada.unam.mx/rss/cartones.xml'),
        (u'Política', 'http://www.jornada.unam.mx/rss/politica.xml'),
        (u'Economía', 'http://www.jornada.unam.mx/rss/economia.xml'),
        (u'Mundo', 'http://www.jornada.unam.mx/rss/mundo.xml'),
        (u'Estados', 'http://www.jornada.unam.mx/rss/estados.xml'),
        (u'Capital', 'http://www.jornada.unam.mx/rss/capital.xml'),
        (u'Sociedad', 'http://www.jornada.unam.mx/rss/sociedad.xml'),
        (u'Ciencias', 'http://www.jornada.unam.mx/rss/ciencias.xml'),
        (u'Cultura', 'http://www.jornada.unam.mx/rss/cultura.xml'),
        (u'Gastronomia', 'http://www.jornada.unam.mx/rss/gastronomia.xml'),
        (u'Espectáculos', 'http://www.jornada.unam.mx/rss/espectaculos.xml'),
        (u'Deportes', 'http://www.jornada.unam.mx/rss/deportes.xml'),
    ]

    def get_cover_url(self):
        '''
        Build the cover URL from the edition date in the INDEX feed.

        Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf,
        where YYYY/MM/DD is taken from the first <link> element of the
        feed at ``self.INDEX``.

        Returns the cover URL as a string, or None when the feed has no
        <link> element or the link carries no dated edition URL.
        '''
        soup = self.index_to_soup(self.INDEX)
        # Re-parse the fetched index with BeautifulStoneSoup, as the
        # original code did (presumably because the index is RSS/XML).
        soupstone = BeautifulStoneSoup(str(soup))

        # find() returns None instead of raising when there is no <link>.
        link = soupstone.find('link')
        if link is None:
            return None

        m = _EDITION_DATE_RE.search(str(link))
        if m is None:
            return None

        return 'http://www.jornada.unam.mx/%s/%s/%s/portada.pdf' % m.groups()