From 5674dd141e417f4c55f9e4409e4a79726d307f0f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 22 Oct 2014 08:34:56 +0530
Subject: [PATCH] Update Folha de Sao Paolo

---
 recipes/folhadesaopaulo.recipe     | 26 ++-----------
 recipes/folhadesaopaulo_sub.recipe | 62 ++++++++++++------------------
 2 files changed, 28 insertions(+), 60 deletions(-)

diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe
index f76a01ebfc..225bf4ad96 100644
--- a/recipes/folhadesaopaulo.recipe
+++ b/recipes/folhadesaopaulo.recipe
@@ -49,30 +49,10 @@ class FolhaOnline(BasicNewsRecipe):
     cover_margins = (0,0,'white')
     masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
 
-    keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})]
-    remove_tags = [
-        dict(name='div',
-            attrs={'id':[
-                'articleButton'
-                ,'bookmarklets'
-                ,'ad-180x150-1'
-                ,'contextualAdsArticle'
-                ,'articleEnd'
-                ,'articleComments'
-            ]})
-        ,dict(name='div',
-            attrs={'class':[
-                'openBox adslibraryArticle'
-                ,'toolbar'
-            ]})
-
-        ,dict(name='a')
-        ,dict(name='iframe')
-        ,dict(name='link')
-        ,dict(name='script')
-        ,dict(name='li')
+    keep_only_tags = [
+        dict(name='div', attrs={'id':'articleNew'}),
+        dict(name='article', id='news'),
     ]
-    remove_tags_after = dict(name='div',attrs={'id':'articleEnd'})
 
     feeds = [
         (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe
index 70eea0d584..e2fc336685 100644
--- a/recipes/folhadesaopaulo_sub.recipe
+++ b/recipes/folhadesaopaulo_sub.recipe
@@ -10,7 +10,7 @@ class FSP(BasicNewsRecipe):
     description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                   u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'
 
-    #found this to be the easiest place to find the index page (13-Nov-2011).
+    # found this to be the easiest place to find the index page (13-Nov-2011).
     # searching for the "Indice Geral" link
     HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
     today=datetime.date.today()
@@ -26,17 +26,20 @@ class FSP(BasicNewsRecipe):
 #    remove_tags_before = dict(name='p')
 #    remove_tags_before = dict(name='div', id='articleNew')
 #    remove_tags_after = dict(name='div', id='articleNew')
-    keep_only_tags = [dict(name='div', id='articleNew'), dict(name='table', attrs={'class':'articleGraphic'})]
+    keep_only_tags = [
+        dict(name='div', id='articleNew'), dict(name='table', attrs={'class':'articleGraphic'}),
+        dict(name='article', id='news'),
+    ]
     publication_type = 'newspaper'
     simultaneous_downloads = 5
 #    remove_tags = [dict(name='td', attrs={'align':'center'})]
     remove_attributes = ['height','width']
     # fixes the problem with the section names
-    section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', \
-                    'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', \
-                    'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', \
-                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', \
-                    'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', \
+    section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada',
+                    'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o',
+                    'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade',
+                    'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio',
+                    'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios',
                     'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'}
 
     # this solves the problem with truncated content in Kindle
@@ -89,32 +92,19 @@ img { background: none !important; float: none; margin: 0px; }
         br['user'] = self.username
         br['pass'] = self.password
         br.submit().read()
-##        if 'Please try again' in raw:
-##            raise Exception('Your username and password are incorrect')
         return br
 
-#    def postprocess_html(self, soup, first_fetch):
-#        #Clean-up normal articles
-#        tags = soup.findAll('div', id='articleNew')
-#        if tags and tags[0]:
-#            return tags[0]
-#        #Clean-up first page
-#        tags = soup.findAll('div', attrs={'class':'double_column facsimile'})
-#        if tags and tags[0]:
-#            return tags[0]
-#        return soup
-
     def parse_index(self):
-        #Searching for the index page on the HOMEPAGE
+        # Searching for the index page on the HOMEPAGE
 #        hpsoup = self.index_to_soup(self.HOMEPAGE)
-        #indexref = hpsoup.find('a', href=re.compile('^indices.*'))
-        #self.log('--> tag containing the today s index: ', indexref)
-        #INDEX = indexref['href']
-        #INDEX = 'http://www1.folha.uol.com.br/'+INDEX
+        # indexref = hpsoup.find('a', href=re.compile('^indices.*'))
+        # self.log('--> tag containing the today s index: ', indexref)
+        # INDEX = indexref['href']
+        # INDEX = 'http://www1.folha.uol.com.br/'+INDEX
         INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + str(self.today).replace('-','') + '.shtml'
         self.log('--> INDEX after extracting href and adding prefix: ', INDEX)
         # ... and taking the opportunity to get the cover image link
-        #coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
+        # coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
         coverurl = self.FIRSTPAGE
         if coverurl:
             self.log('--> tag containing the today s cover: ', coverurl)
@@ -123,7 +113,7 @@
             self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
             self.cover_url = coverurl
 
-        #soup = self.index_to_soup(self.INDEX)
+        # soup = self.index_to_soup(self.INDEX)
         soup = self.index_to_soup(INDEX)
 
         feeds = []
@@ -132,13 +122,13 @@
         for post in soup.findAll('a'):
            # if name=True => new section
            strpost = str(post)
-            #if strpost.startswith('
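
Both files get the same substantive fix: Folha's redesigned pages wrap the
article body in an <article id="news"> element, so each recipe's
keep_only_tags list gains a spec for that tag while keeping the old
div#articleNew spec for pages still served with the previous markup. The
standalone sketch below is not part of the patch and its sample HTML is
invented for illustration; it approximates how such specs select content
(BasicNewsRecipe applies each spec dict as a BeautifulSoup findAll() call
during cleanup) and how parse_index() builds the date-based index URL seen
unchanged in the second hunk:

    # Standalone sketch; runs with beautifulsoup4 installed.
    import datetime
    from bs4 import BeautifulSoup

    # How parse_index() derives the printed-edition index URL from the date:
    today = datetime.date.today()
    INDEX = ('http://www1.folha.uol.com.br/fsp/indices/index-'
             + str(today).replace('-', '') + '.shtml')
    # e.g. .../fsp/indices/index-20141022.shtml on the day of this commit

    # keep_only_tags: each spec dict is expanded into a findAll() call, so
    # dict(name='article', id='news') means findAll('article', id='news').
    keep_only_tags = [
        dict(name='div', attrs={'id': 'articleNew'}),  # pre-redesign markup
        dict(name='article', id='news'),               # markup added here
    ]

    html = '''<body>
    <div id="header">site chrome, ads, navigation</div>
    <article id="news"><h1>Headline</h1><p>Story text.</p></article>
    </body>'''
    soup = BeautifulSoup(html, 'html.parser')

    kept = []
    for spec in keep_only_tags:
        # calibre's bundled BeautifulSoup spells this findAll()
        kept.extend(soup.find_all(**spec))
    print(kept)  # [<article id="news">...]; everything else is dropped

Keeping both specs presumably lets a single recipe handle articles in either
the old or the new layout while the site transition is under way.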