From 43bbfaf7e75c314e3784ca9ccb68bd8163b13a4e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 30 May 2013 06:59:22 +0530 Subject: [PATCH] Update Folha de Sao Paolo --- recipes/folhadesaopaulo.recipe | 14 +++---- recipes/folhadesaopaulo_sub.recipe | 59 +++++++++++++++++------------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe index 3f63e80291..f76a01ebfc 100644 --- a/recipes/folhadesaopaulo.recipe +++ b/recipes/folhadesaopaulo.recipe @@ -6,7 +6,7 @@ from calibre.utils.magick import Image, PixelWand from urllib2 import Request, urlopen, URLError class FolhaOnline(BasicNewsRecipe): - THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here + THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here LANGUAGE = 'pt_br' language = 'pt_BR' LANGHTM = 'pt-br' @@ -63,14 +63,14 @@ class FolhaOnline(BasicNewsRecipe): ,dict(name='div', attrs={'class':[ 'openBox adslibraryArticle' - ,'toolbar' + ,'toolbar' ]}) ,dict(name='a') ,dict(name='iframe') ,dict(name='link') ,dict(name='script') - ,dict(name='li') + ,dict(name='li') ] remove_tags_after = dict(name='div',attrs={'id':'articleEnd'}) @@ -109,7 +109,6 @@ class FolhaOnline(BasicNewsRecipe): ,(u'Valdo Cruz', u'http://feeds.folha.uol.com.br/colunas/valdocruz/rss091.xml') ] - conversion_options = { 'title' : title ,'comments' : description @@ -131,8 +130,8 @@ class FolhaOnline(BasicNewsRecipe): return soup def postprocess_html(self, soup, first): - #process all the images. assumes that the new html has the correct path - for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + # process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and 'src' in tag): iurl = tag['src'] img = Image() img.open(iurl) @@ -141,7 +140,7 @@ class FolhaOnline(BasicNewsRecipe): if img < 0: raise RuntimeError('Out of memory') pw = PixelWand() - if( width > height and width > 590) : + if(width > height and width > 590) : print 'Rotate image' img.rotate(pw, -90) img.save(iurl) @@ -163,3 +162,4 @@ class FolhaOnline(BasicNewsRecipe): except URLError: cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' return cover_url + diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe index 238310edc1..53fc06bf1d 100644 --- a/recipes/folhadesaopaulo_sub.recipe +++ b/recipes/folhadesaopaulo_sub.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import re +import datetime class FSP(BasicNewsRecipe): @@ -9,7 +10,7 @@ class FSP(BasicNewsRecipe): description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \ u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]' - #found this to be the easiest place to find the index page (13-Nov-2011). + # found this to be the easiest place to find the index page (13-Nov-2011). # searching for the "Indice Geral" link HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/' masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' @@ -24,11 +25,11 @@ class FSP(BasicNewsRecipe): remove_tags = [dict(name='td', attrs={'align':'center'})] remove_attributes = ['height','width'] # fixes the problem with the section names - section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \ - 'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \ - 'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \ - 'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \ - 'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \ + section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', + 'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', + 'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', + 'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', + 'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', 'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'} # this solves the problem with truncated content in Kindle @@ -47,29 +48,31 @@ class FSP(BasicNewsRecipe): br['user'] = self.username br['pass'] = self.password br.submit().read() -## if 'Please try again' in raw: -## raise Exception('Your username and password are incorrect') + # if 'Please try again' in raw: + # raise Exception('Your username and password are incorrect') return br - def parse_index(self): - #Searching for the index page on the HOMEPAGE - hpsoup = self.index_to_soup(self.HOMEPAGE) - indexref = hpsoup.find('a', href=re.compile('^indices.*')) - self.log('--> tag containing the today s index: ', indexref) - INDEX = indexref['href'] - INDEX = 'http://www1.folha.uol.com.br/fsp/'+INDEX + # Searching for the index page on the HOMEPAGE + # hpsoup = self.index_to_soup(self.HOMEPAGE) + # indexref = hpsoup.find('a', href=re.compile('^indices.*')) + # self.log('--> tag containing the today s index: ', indexref) + # INDEX = indexref['href'] + # INDEX = 'http://www1.folha.uol.com.br/'+INDEX + today=datetime.date.today() + INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + str(today).replace('-','') + '.shtml' self.log('--> INDEX after extracting href and adding prefix: ', INDEX) # ... and taking the opportunity to get the cover image link - coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href'] + # coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href'] + coverurl = 'cp' + str(today.day).zfill(2) + str(today.month).zfill(2) + str(today.year) + '.shtml' if coverurl: self.log('--> tag containing the today s cover: ', coverurl) - coverurl = coverurl.replace('htm', 'jpg') + coverurl = coverurl.replace('shtml', 'jpg') coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl self.log('--> coverurl after extracting href and adding prefix: ', coverurl) self.cover_url = coverurl - #soup = self.index_to_soup(self.INDEX) + # soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(INDEX) feeds = [] @@ -78,24 +81,26 @@ class FSP(BasicNewsRecipe): for post in soup.findAll('a'): # if name=True => new section strpost = str(post) - if strpost.startswith('