Updated La Jornada

This commit is contained in:
Kovid Goyal 2010-08-30 18:59:15 -06:00
parent 2d6009b45f
commit a3bb2c06fb
5 changed files with 55 additions and 111 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 943 B

View File

@@ -18,7 +18,7 @@ class Clarin(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
encoding = 'utf8' encoding = 'utf8'
language = 'es_AR' language = 'es'
publication_type = 'newspaper' publication_type = 'newspaper'
INDEX = 'http://www.clarin.com' INDEX = 'http://www.clarin.com'
masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg' masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'

View File

@@ -20,7 +20,7 @@ class Europasur(BasicNewsRecipe):
delay = 2 delay = 2
no_stylesheets = True no_stylesheets = True
encoding = 'cp1252' encoding = 'cp1252'
language = 'es_ES' language = 'es'
publication_type = 'newspaper' publication_type = 'newspaper'
extra_css = """ body{font-family: Verdana,Arial,Helvetica,sans-serif} extra_css = """ body{font-family: Verdana,Arial,Helvetica,sans-serif}
h2{font-family: Georgia,Times New Roman,Times,serif} h2{font-family: Georgia,Times New Roman,Times,serif}

View File

@@ -1,120 +1,64 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Rogelio Dominguez <rogelio.dominguez at gmail.com>' __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.jornada.unam.mx www.jornada.unam.mx
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import re class LaJornada_mx(BasicNewsRecipe):
title = 'La Jornada (Mexico)'
class LaJornada(BasicNewsRecipe): __author__ = 'Darko Miletic'
title = u'La Jornada' description = 'Noticias del diario mexicano La Jornada'
language = 'es' publisher = 'DEMOS, Desarrollo de Medios, S.A. de C.V.'
oldest_article = 1 category = 'news, Mexico'
__author__ = 'rogeliodh' oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 200
remove_tags = [dict(name='div', attrs={'class':['go gui','go gui top','comment-cont',]})]
remove_tags_before = dict(id='article-cont')
remove_tags_after = dict(id='article-cont')
no_stylesheets = True no_stylesheets = True
extra_css = ' .series{ \ encoding = 'utf8'
border-bottom: 1px solid #626366; \ use_embedded_content = False
font-weight: bold; \ language = 'es'
} \ remove_empty_feeds = True
.sumario{ \ cover_url = strftime("http://www.jornada.unam.mx/%Y/%m/%d/planitas/portadita.jpg")
font-weight: bold; \ masthead_url = 'http://www.jornada.unam.mx/v7.0/imagenes/la-jornada-trans.png'
margin-top: 2em; \ extra_css = """
text-align: center \ body{font-family: "Times New Roman",serif }
} \ .cabeza{font-size: xx-large; font-weight: bold }
p.sumario{ \ .credito-articulo{font-size: 1.3em}
text-align: center \ """
} \
.sumarios{font-weight: bold} \
.cabeza{ font-size: 1.5em} \
.pie-foto { \
text-align: justify; \
font-size: 0.8em; \
text-align: justify; \
} \
.pie-foto .credito { \
font-weight: bold; \
display: block \
} \
.credito-autor{ \
margin-top: 1.5em; \
padding-left: 0.6em; \
border-bottom: 1px solid #626366; \
font-variant: small-caps; \
font-weight: bold \
} \
.credito-articulo{ \
margin-top: 1.5em; \
padding-left: 0.6em; \
border-bottom: 1px solid #626366; \
font-variant: small-caps; \
font-weight: bold \
} \
.credito-titulo{text-align: right} \
.hemero { \
text-align: right; \
font-size: 0.9em; \
margin-bottom: 8px; \
} \
.loc { \
font-weight: bold; \
} \
.carton { \
text-align: center; \
} \
.credit { \
font-weight: bold; \
} \
'
preprocess_regexps = [ conversion_options = {
# Remove capitalized initial letter on some articles (editorial) 'comment' : description
(re.compile(r'<div class="inicial">(.*)</div><p class="s-s">', re.DOTALL|re.IGNORECASE), , 'tags' : category
lambda match: match.group(1)), , 'publisher' : publisher
# Cartons section uses a class instead of a div to identify the main content. Change it. , 'language' : language
(re.compile(r'class="carton"', re.DOTALL|re.IGNORECASE), }
lambda match: 'id="article-cont" class="carton"'),
# Remove <link rel="alternate"> as calibre has a bug (to report) keep_only_tags = [
(re.compile(r'<link rel="alternate".*?/>', re.DOTALL|re.IGNORECASE), dict(name='div', attrs={'class':['documentContent','cabeza','sumarios','text']})
lambda match: ''), ,dict(name='div', attrs={'id':'renderComments'})
] ]
remove_tags = [dict(name='div', attrs={'class':'buttonbar'})]
INDEX = 'http://www.jornada.unam.mx/rss/edicion.xml'
feeds = [ feeds = [
(u'Opinion','http://www.jornada.unam.mx/rss/opinion.xml'), (u'Ultimas noticias' , u'http://www.jornada.unam.mx/ultimas/news/RSS' )
(u'Cartones','http://www.jornada.unam.mx/rss/cartones.xml'), ,(u'Opinion' , u'http://www.jornada.unam.mx/rss/opinion.xml' )
(u'Política','http://www.jornada.unam.mx/rss/politica.xml'), ,(u'Politica' , u'http://www.jornada.unam.mx/rss/politica.xml' )
(u'Economía','http://www.jornada.unam.mx/rss/economia.xml'), ,(u'Economia' , u'http://www.jornada.unam.mx/rss/economia.xml' )
(u'Mundo','http://www.jornada.unam.mx/rss/mundo.xml'), ,(u'Mundo' , u'http://www.jornada.unam.mx/rss/mundo.xml' )
(u'Estados','http://www.jornada.unam.mx/rss/estados.xml'), ,(u'Estados' , u'http://www.jornada.unam.mx/rss/estados.xml' )
(u'Capital','http://www.jornada.unam.mx/rss/capital.xml'), ,(u'Capital' , u'http://www.jornada.unam.mx/rss/capital.xml' )
(u'Sociedad','http://www.jornada.unam.mx/rss/sociedad.xml'), ,(u'Sociedad y justicia' , u'http://www.jornada.unam.mx/rss/sociedad.xml' )
(u'Ciencias','http://www.jornada.unam.mx/rss/ciencias.xml'), ,(u'Ciencias' , u'http://www.jornada.unam.mx/rss/ciencias.xml' )
(u'Cultura','http://www.jornada.unam.mx/rss/cultura.xml'), ,(u'Cultura' , u'http://www.jornada.unam.mx/rss/cultura.xml' )
(u'Gastronomia','http://www.jornada.unam.mx/rss/gastronomia.xml'), ,(u'Gastronomia' , u'http://www.jornada.unam.mx/rss/gastronomia.xml' )
(u'Espectáculos','http://www.jornada.unam.mx/rss/espectaculos.xml'), ,(u'Espectaculos' , u'http://www.jornada.unam.mx/rss/espectaculos.xml' )
(u'Deportes','http://www.jornada.unam.mx/rss/deportes.xml'), ,(u'Deportes' , u'http://www.jornada.unam.mx/rss/deportes.xml' )
] ]
def get_cover_url(self): def preprocess_html(self, soup):
''' for item in soup.findAll(style=True):
Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf del item['style']
''' return soup
cover_url = None
soup = self.index_to_soup(self.INDEX)
soupstone = BeautifulStoneSoup(str(soup))
urlbase = str(soupstone('link')[0])
r= re.compile(r'.*http://www.jornada.unam.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})', re.DOTALL|re.IGNORECASE)
m = r.match(urlbase)
if m:
cover_url = 'http://www.jornada.unam.mx/' + m.groups()[0] + '/' + m.groups()[1] + '/' + m.groups()[2] + '/portada.pdf'
return cover_url

View File

@@ -166,7 +166,7 @@ Search & Sort
The Search & Sort section allows you to perform several powerful actions on your book collections. The Search & Sort section allows you to perform several powerful actions on your book collections.
* You can sort them by title, author, date, rating etc. by clicking on the column titles. * You can sort them by title, author, date, rating etc. by clicking on the column titles. You can also sub-sort (i.e. sort on multiple columns). For example, if you click on the title column and then the author column, the books will be sorted by author and then all the entries for the same author will be sorted by title.
* You can search for a particular book or set of books using the search bar. More on that below. * You can search for a particular book or set of books using the search bar. More on that below.