Updated La Jornada

This commit is contained in:
Kovid Goyal 2010-08-30 18:59:15 -06:00
parent 2d6009b45f
commit a3bb2c06fb
5 changed files with 55 additions and 111 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 943 B

View File

@ -18,7 +18,7 @@ class Clarin(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
encoding = 'utf8' encoding = 'utf8'
language = 'es_AR' language = 'es'
publication_type = 'newspaper' publication_type = 'newspaper'
INDEX = 'http://www.clarin.com' INDEX = 'http://www.clarin.com'
masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg' masthead_url = 'http://www.clarin.com/static/CLAClarin/images/logo-clarin-print.jpg'

View File

@ -20,7 +20,7 @@ class Europasur(BasicNewsRecipe):
delay = 2 delay = 2
no_stylesheets = True no_stylesheets = True
encoding = 'cp1252' encoding = 'cp1252'
language = 'es_ES' language = 'es'
publication_type = 'newspaper' publication_type = 'newspaper'
extra_css = """ body{font-family: Verdana,Arial,Helvetica,sans-serif} extra_css = """ body{font-family: Verdana,Arial,Helvetica,sans-serif}
h2{font-family: Georgia,Times New Roman,Times,serif} h2{font-family: Georgia,Times New Roman,Times,serif}

View File

@ -1,120 +1,64 @@
#!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2010, Rogelio Dominguez <rogelio.dominguez at gmail.com>' __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
''' '''
www.jornada.unam.mx www.jornada.unam.mx
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import re class LaJornada_mx(BasicNewsRecipe):
title = 'La Jornada (Mexico)'
__author__ = 'Darko Miletic'
description = 'Noticias del diario mexicano La Jornada'
publisher = 'DEMOS, Desarrollo de Medios, S.A. de C.V.'
category = 'news, Mexico'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
cover_url = strftime("http://www.jornada.unam.mx/%Y/%m/%d/planitas/portadita.jpg")
masthead_url = 'http://www.jornada.unam.mx/v7.0/imagenes/la-jornada-trans.png'
extra_css = """
body{font-family: "Times New Roman",serif }
.cabeza{font-size: xx-large; font-weight: bold }
.credito-articulo{font-size: 1.3em}
"""
class LaJornada(BasicNewsRecipe): conversion_options = {
title = u'La Jornada' 'comment' : description
language = 'es' , 'tags' : category
oldest_article = 1 , 'publisher' : publisher
__author__ = 'rogeliodh' , 'language' : language
max_articles_per_feed = 100 }
remove_tags = [dict(name='div', attrs={'class':['go gui','go gui top','comment-cont',]})]
remove_tags_before = dict(id='article-cont')
remove_tags_after = dict(id='article-cont')
no_stylesheets = True
extra_css = ' .series{ \
border-bottom: 1px solid #626366; \
font-weight: bold; \
} \
.sumario{ \
font-weight: bold; \
margin-top: 2em; \
text-align: center \
} \
p.sumario{ \
text-align: center \
} \
.sumarios{font-weight: bold} \
.cabeza{ font-size: 1.5em} \
.pie-foto { \
text-align: justify; \
font-size: 0.8em; \
text-align: justify; \
} \
.pie-foto .credito { \
font-weight: bold; \
display: block \
} \
.credito-autor{ \
margin-top: 1.5em; \
padding-left: 0.6em; \
border-bottom: 1px solid #626366; \
font-variant: small-caps; \
font-weight: bold \
} \
.credito-articulo{ \
margin-top: 1.5em; \
padding-left: 0.6em; \
border-bottom: 1px solid #626366; \
font-variant: small-caps; \
font-weight: bold \
} \
.credito-titulo{text-align: right} \
.hemero { \
text-align: right; \
font-size: 0.9em; \
margin-bottom: 8px; \
} \
.loc { \
font-weight: bold; \
} \
.carton { \
text-align: center; \
} \
.credit { \
font-weight: bold; \
} \
'
preprocess_regexps = [ keep_only_tags = [
# Remove capitalized initial letter on some articles (editorial) dict(name='div', attrs={'class':['documentContent','cabeza','sumarios','text']})
(re.compile(r'<div class="inicial">(.*)</div><p class="s-s">', re.DOTALL|re.IGNORECASE), ,dict(name='div', attrs={'id':'renderComments'})
lambda match: match.group(1)),
# Cartons section uses a class instead of a div to identify the main content. Change it.
(re.compile(r'class="carton"', re.DOTALL|re.IGNORECASE),
lambda match: 'id="article-cont" class="carton"'),
# Remove <link rel="alternate"> as calibre has a bug (to report)
(re.compile(r'<link rel="alternate".*?/>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
INDEX = 'http://www.jornada.unam.mx/rss/edicion.xml'
feeds = [
(u'Opinion','http://www.jornada.unam.mx/rss/opinion.xml'),
(u'Cartones','http://www.jornada.unam.mx/rss/cartones.xml'),
(u'Política','http://www.jornada.unam.mx/rss/politica.xml'),
(u'Economía','http://www.jornada.unam.mx/rss/economia.xml'),
(u'Mundo','http://www.jornada.unam.mx/rss/mundo.xml'),
(u'Estados','http://www.jornada.unam.mx/rss/estados.xml'),
(u'Capital','http://www.jornada.unam.mx/rss/capital.xml'),
(u'Sociedad','http://www.jornada.unam.mx/rss/sociedad.xml'),
(u'Ciencias','http://www.jornada.unam.mx/rss/ciencias.xml'),
(u'Cultura','http://www.jornada.unam.mx/rss/cultura.xml'),
(u'Gastronomia','http://www.jornada.unam.mx/rss/gastronomia.xml'),
(u'Espectáculos','http://www.jornada.unam.mx/rss/espectaculos.xml'),
(u'Deportes','http://www.jornada.unam.mx/rss/deportes.xml'),
] ]
remove_tags = [dict(name='div', attrs={'class':'buttonbar'})]
def get_cover_url(self): feeds = [
''' (u'Ultimas noticias' , u'http://www.jornada.unam.mx/ultimas/news/RSS' )
Cover URL is http://www.jornada.unam.mx/YYYY/MM/DD/portada.pdf ,(u'Opinion' , u'http://www.jornada.unam.mx/rss/opinion.xml' )
''' ,(u'Politica' , u'http://www.jornada.unam.mx/rss/politica.xml' )
cover_url = None ,(u'Economia' , u'http://www.jornada.unam.mx/rss/economia.xml' )
soup = self.index_to_soup(self.INDEX) ,(u'Mundo' , u'http://www.jornada.unam.mx/rss/mundo.xml' )
soupstone = BeautifulStoneSoup(str(soup)) ,(u'Estados' , u'http://www.jornada.unam.mx/rss/estados.xml' )
urlbase = str(soupstone('link')[0]) ,(u'Capital' , u'http://www.jornada.unam.mx/rss/capital.xml' )
r= re.compile(r'.*http://www.jornada.unam.mx/([0-9]{4})/([0-9]{2})/([0-9]{2})', re.DOTALL|re.IGNORECASE) ,(u'Sociedad y justicia' , u'http://www.jornada.unam.mx/rss/sociedad.xml' )
m = r.match(urlbase) ,(u'Ciencias' , u'http://www.jornada.unam.mx/rss/ciencias.xml' )
if m: ,(u'Cultura' , u'http://www.jornada.unam.mx/rss/cultura.xml' )
cover_url = 'http://www.jornada.unam.mx/' + m.groups()[0] + '/' + m.groups()[1] + '/' + m.groups()[2] + '/portada.pdf' ,(u'Gastronomia' , u'http://www.jornada.unam.mx/rss/gastronomia.xml' )
,(u'Espectaculos' , u'http://www.jornada.unam.mx/rss/espectaculos.xml' )
,(u'Deportes' , u'http://www.jornada.unam.mx/rss/deportes.xml' )
]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
return cover_url

View File

@ -166,7 +166,7 @@ Search & Sort
The Search & Sort section allows you to perform several powerful actions on your book collections. The Search & Sort section allows you to perform several powerful actions on your book collections.
* You can sort them by title, author, date, rating etc. by clicking on the column titles. * You can sort them by title, author, date, rating etc. by clicking on the column titles. You can also sub-sort (i.e. sort on multiple columns). For example, if you click on the title column and then the author column, the books will be sorted by author and then all the entries for the same author will be sorted by title.
* You can search for a particular book or set of books using the search bar. More on that below. * You can search for a particular book or set of books using the search bar. More on that below.