La Jornada by rogeliodh

This commit is contained in:
Kovid Goyal 2010-08-19 21:07:54 -06:00
parent f61c197194
commit e947f60154

View File

@ -0,0 +1,120 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Rogelio Dominguez <rogelio.dominguez at gmail.com>'
'''
www.jornada.unam.mx
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import re
class LaJornada(BasicNewsRecipe):
    """Calibre news recipe for La Jornada (www.jornada.unam.mx).

    Articles are collected from the per-section RSS feeds listed in
    ``feeds``; the cover image URL is derived from the edition feed at
    ``INDEX`` (see ``get_cover_url``).
    """

    title = u'La Jornada'
    language = 'es'
    # Maximum article age; BasicNewsRecipe interprets this value in days.
    oldest_article = 1
    __author__ = 'rogeliodh'
    max_articles_per_feed = 100

    # Drop navigation/comment containers, then keep only the element whose
    # id is "article-cont" (everything before and after it is discarded).
    remove_tags = [dict(name='div', attrs={'class':['go gui','go gui top','comment-cont',]})]
    remove_tags_before = dict(id='article-cont')
    remove_tags_after = dict(id='article-cont')
    no_stylesheets = True

    # Replacement styling for the site's own classes, since the original
    # stylesheets are not downloaded (no_stylesheets above).
    extra_css = ' .series{ \
        border-bottom: 1px solid #626366; \
        font-weight: bold; \
        } \
        .sumario{ \
        font-weight: bold; \
        margin-top: 2em; \
        text-align: center \
        } \
        p.sumario{ \
        text-align: center \
        } \
        .sumarios{font-weight: bold} \
        .cabeza{ font-size: 1.5em} \
        .pie-foto { \
        text-align: justify; \
        font-size: 0.8em; \
        text-align: justify; \
        } \
        .pie-foto .credito { \
        font-weight: bold; \
        display: block \
        } \
        .credito-autor{ \
        margin-top: 1.5em; \
        padding-left: 0.6em; \
        border-bottom: 1px solid #626366; \
        font-variant: small-caps; \
        font-weight: bold \
        } \
        .credito-articulo{ \
        margin-top: 1.5em; \
        padding-left: 0.6em; \
        border-bottom: 1px solid #626366; \
        font-variant: small-caps; \
        font-weight: bold \
        } \
        .credito-titulo{text-align: right} \
        .hemero { \
        text-align: right; \
        font-size: 0.9em; \
        margin-bottom: 8px; \
        } \
        .loc { \
        font-weight: bold; \
        } \
        .carton { \
        text-align: center; \
        } \
        .credit { \
        font-weight: bold; \
        } \
        '

    preprocess_regexps = [
        # Remove the capitalized initial letter wrapper on some articles
        # (editorial), keeping its content. The match is non-greedy so that,
        # with DOTALL, it stops at the first closing sequence instead of
        # swallowing everything up to the last one in the page.
        (re.compile(r'<div class="inicial">(.*?)</div><p class="s-s">', re.DOTALL|re.IGNORECASE),
            lambda match: match.group(1)),
        # The cartoons section marks its main content with a class instead of
        # the "article-cont" id used elsewhere; add the id so the
        # remove_tags_before/after filters above find it.
        (re.compile(r'class="carton"', re.DOTALL|re.IGNORECASE),
            lambda match: 'id="article-cont" class="carton"'),
        # Remove <link rel="alternate"> as calibre has a bug (to report)
        (re.compile(r'<link rel="alternate".*?/>', re.DOTALL|re.IGNORECASE),
            lambda match: ''),
    ]

    # Edition feed; only used to discover the edition date for the cover.
    INDEX = 'http://www.jornada.unam.mx/rss/edicion.xml'

    # (section title, RSS feed URL) pairs, one per newspaper section.
    feeds = [
        (u'Opinion','http://www.jornada.unam.mx/rss/opinion.xml'),
        (u'Cartones','http://www.jornada.unam.mx/rss/cartones.xml'),
        (u'Política','http://www.jornada.unam.mx/rss/politica.xml'),
        (u'Economía','http://www.jornada.unam.mx/rss/economia.xml'),
        (u'Mundo','http://www.jornada.unam.mx/rss/mundo.xml'),
        (u'Estados','http://www.jornada.unam.mx/rss/estados.xml'),
        (u'Capital','http://www.jornada.unam.mx/rss/capital.xml'),
        (u'Sociedad','http://www.jornada.unam.mx/rss/sociedad.xml'),
        (u'Ciencias','http://www.jornada.unam.mx/rss/ciencias.xml'),
        (u'Cultura','http://www.jornada.unam.mx/rss/cultura.xml'),
        (u'Gastronomia','http://www.jornada.unam.mx/rss/gastronomia.xml'),
        (u'Espectáculos','http://www.jornada.unam.mx/rss/espectaculos.xml'),
        (u'Deportes','http://www.jornada.unam.mx/rss/deportes.xml'),
    ]

    def get_cover_url(self):
        '''
        Return the URL of the edition's front-page PDF, or None when the
        edition date cannot be determined from the feed.

        Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf; the
        YYYY/MM/DD part is extracted from the first <link> element of the
        edition feed (INDEX).
        '''
        soup = self.index_to_soup(self.INDEX)
        links = BeautifulStoneSoup(str(soup))('link')
        if not links:
            # Feed has no <link> element at all: no cover rather than a crash.
            return None
        # Dots in the host name are escaped so they only match literal dots.
        date_re = re.compile(
            r'.*http://www\.jornada\.unam\.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})',
            re.DOTALL|re.IGNORECASE)
        m = date_re.match(str(links[0]))
        if m is None:
            return None
        # m.groups() is (YYYY, MM, DD), already in URL path order.
        return 'http://www.jornada.unam.mx/' + '/'.join(m.groups()) + '/portada.pdf'