mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
La Jornada by rogeliodh
This commit is contained in:
parent
f61c197194
commit
e947f60154
120
resources/recipes/la_jornada.recipe
Normal file
120
resources/recipes/la_jornada.recipe
Normal file
@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Rogelio Dominguez <rogelio.dominguez at gmail.com>'
|
||||
'''
|
||||
www.jornada.unam.mx
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
|
||||
import re
|
||||
|
||||
class LaJornada(BasicNewsRecipe):
    # calibre news recipe for the Mexican newspaper La Jornada
    # (www.jornada.unam.mx). Articles come from the per-section RSS feeds
    # listed in ``feeds``; the cover is derived from the edition feed (INDEX).

    title = u'La Jornada'
    language = 'es'
    oldest_article = 1
    __author__ = 'rogeliodh'
    max_articles_per_feed = 100

    # Keep only the element with id="article-cont" and drop navigation and
    # comment containers found inside it.
    remove_tags = [dict(name='div', attrs={'class':['go gui','go gui top','comment-cont',]})]
    remove_tags_before = dict(id='article-cont')
    remove_tags_after = dict(id='article-cont')

    # The site's stylesheets are discarded; the rules below re-create the
    # formatting for the classes used in article markup.
    no_stylesheets = True
    extra_css = ' .series{ \
                    border-bottom: 1px solid #626366; \
                    font-weight: bold; \
                  } \
                  .sumario{ \
                    font-weight: bold; \
                    margin-top: 2em; \
                    text-align: center \
                  } \
                  p.sumario{ \
                    text-align: center \
                  } \
                  .sumarios{font-weight: bold} \
                  .cabeza{ font-size: 1.5em} \
                  .pie-foto { \
                    text-align: justify; \
                    font-size: 0.8em; \
                    text-align: justify; \
                  } \
                  .pie-foto .credito { \
                    font-weight: bold; \
                    display: block \
                  } \
                  .credito-autor{ \
                    margin-top: 1.5em; \
                    padding-left: 0.6em; \
                    border-bottom: 1px solid #626366; \
                    font-variant: small-caps; \
                    font-weight: bold \
                  } \
                  .credito-articulo{ \
                    margin-top: 1.5em; \
                    padding-left: 0.6em; \
                    border-bottom: 1px solid #626366; \
                    font-variant: small-caps; \
                    font-weight: bold \
                  } \
                  .credito-titulo{text-align: right} \
                  .hemero { \
                    text-align: right; \
                    font-size: 0.9em; \
                    margin-bottom: 8px; \
                  } \
                  .loc { \
                    font-weight: bold; \
                  } \
                  .carton { \
                    text-align: center; \
                  } \
                  .credit { \
                    font-weight: bold; \
                  } \
                '

    preprocess_regexps = [
        # Remove capitalized initial letter on some articles (editorial).
        # The capture is non-greedy: with DOTALL a greedy '.*' would run from
        # the first "inicial" div to the LAST '</div><p class="s-s">' in the
        # page and unwrap the wrong pair of tags.
        (re.compile(r'<div class="inicial">(.*?)</div><p class="s-s">', re.DOTALL|re.IGNORECASE),
            lambda match: match.group(1)),
        # Cartons section uses a class instead of an id to identify the main
        # content. Add the id so remove_tags_before/after can anchor on it.
        (re.compile(r'class="carton"', re.DOTALL|re.IGNORECASE),
            lambda match: 'id="article-cont" class="carton"'),
        # Remove <link rel="alternate"> as calibre has a bug (to report)
        (re.compile(r'<link rel="alternate".*?/>', re.DOTALL|re.IGNORECASE),
            lambda match: ''),
    ]

    # Edition feed; only used to discover the date of the current edition.
    INDEX = 'http://www.jornada.unam.mx/rss/edicion.xml'

    feeds = [
        (u'Opinion','http://www.jornada.unam.mx/rss/opinion.xml'),
        (u'Cartones','http://www.jornada.unam.mx/rss/cartones.xml'),
        (u'Política','http://www.jornada.unam.mx/rss/politica.xml'),
        (u'Economía','http://www.jornada.unam.mx/rss/economia.xml'),
        (u'Mundo','http://www.jornada.unam.mx/rss/mundo.xml'),
        (u'Estados','http://www.jornada.unam.mx/rss/estados.xml'),
        (u'Capital','http://www.jornada.unam.mx/rss/capital.xml'),
        (u'Sociedad','http://www.jornada.unam.mx/rss/sociedad.xml'),
        (u'Ciencias','http://www.jornada.unam.mx/rss/ciencias.xml'),
        (u'Cultura','http://www.jornada.unam.mx/rss/cultura.xml'),
        (u'Gastronomia','http://www.jornada.unam.mx/rss/gastronomia.xml'),
        (u'Espectáculos','http://www.jornada.unam.mx/rss/espectaculos.xml'),
        (u'Deportes','http://www.jornada.unam.mx/rss/deportes.xml'),
    ]

    def get_cover_url(self):
        '''
        Return the URL of the current edition's front page, or None.

        Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf
        The YYYY/MM/DD part is read from the first <link> element of the
        edition feed (INDEX). None is returned when the feed contains no
        <link> element or when no dated URL is found in it.
        '''
        soup = self.index_to_soup(self.INDEX)
        # Re-parse the feed as XML so <link> elements keep their text content.
        links = BeautifulStoneSoup(str(soup))('link')
        if not links:
            # Empty or unexpected feed: report "no cover" instead of raising
            # IndexError.
            return None

        # Dots in the host name are escaped so they only match literal dots.
        date_re = re.compile(
            r'.*http://www\.jornada\.unam\.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})',
            re.DOTALL|re.IGNORECASE)
        m = date_re.match(str(links[0]))
        if m is None:
            return None
        return 'http://www.jornada.unam.mx/' + '/'.join(m.groups()) + '/portada.pdf'
|
Loading…
x
Reference in New Issue
Block a user