diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe index 53fc06bf1d..8a3461e3f5 100644 --- a/recipes/folhadesaopaulo_sub.recipe +++ b/recipes/folhadesaopaulo_sub.recipe @@ -3,73 +3,89 @@ from calibre.web.feeds.news import BasicNewsRecipe import re import datetime + class FSP(BasicNewsRecipe): - title = u'Folha de S\xE3o Paulo' - __author__ = 'fluzao' + title = u'Folha de S\xE3o Paulo' + __author__ = 'Joao Eduardo Bertacchi' description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \ u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]' # found this to be the easiest place to find the index page (13-Nov-2011). # searching for the "Indice Geral" link HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/' + today = datetime.date.today() + FIRSTPAGE = 'cp' + str(today.day).zfill(2) + str( + today.month).zfill(2) + str(today.year) + '.shtml' masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' language = 'pt_BR' no_stylesheets = True - max_articles_per_feed = 40 - remove_javascript = True + max_articles_per_feed = 40 + remove_javascript = True needs_subscription = True remove_tags_before = dict(name='p') - remove_tags = [dict(name='td', attrs={'align':'center'})] - remove_attributes = ['height','width'] + remove_tags = [dict(name='td', attrs={'align': 'center'})] + remove_attributes = ['height', 'width'] # fixes the problem with the section names - section_dict = {'cotidian' : 'cotidiano', 'ilustrad': 'ilustrada', - 'quadrin': 'quadrinhos' , 'opiniao' : u'opini\xE3o', - 'ciencia' : u'ci\xeancia' , 'saude' : u'sa\xfade', - 'ribeirao' : u'ribeir\xE3o' , 'equilibrio' : u'equil\xedbrio', - 'imoveis' : u'im\xf3veis', 'negocios' : u'neg\xf3cios', - 'veiculos' : u've\xedculos', 'corrida' : 'folha corrida'} + section_dict = {'cotidian': 'cotidiano', 'ilustrad': 'ilustrada', + 'quadrin': 'quadrinhos', 'opiniao': u'opini\xE3o', + 'ciencia': u'ci\xeancia', 'saude': u'sa\xfade', + 'ribeirao': u'ribeir\xE3o', 'equilibrio': u'equil\xedbrio', + 'imoveis': u'im\xf3veis', 'negocios': u'neg\xf3cios', + 'veiculos': u've\xedculos', 'corrida': 'folha corrida'} # this solves the problem with truncated content in Kindle - conversion_options = {'linearize_tables' : True} + conversion_options = {'linearize_tables': True} # this bit removes the footer where there are links for Proximo Texto, Texto Anterior, # Indice e Comunicar Erros preprocess_regexps = [(re.compile(r'.*Comunicar Erros', - re.DOTALL|re.IGNORECASE), lambda match: r'')] + re.DOTALL | re.IGNORECASE), lambda match: r'')] def get_browser(self): br = BasicNewsRecipe.get_browser(self) if self.username is not None and self.password is not None: br.open('https://acesso.uol.com.br/login.html') br.form = br.forms().next() - br['user'] = self.username + br['user'] = self.username br['pass'] = self.password br.submit().read() - # if 'Please try again' in raw: - # raise Exception('Your username and password are incorrect') +# if 'Please try again' in raw: +# raise Exception('Your username and password are incorrect') return br + def postprocess_html(self, soup, first_fetch): + # Clean-up normal articles + tags = soup.findAll('div', id='articleNew') + if tags and tags[0]: + return tags[0] + # Clean-up first page + tags = soup.findAll('div', attrs={'class': 'double_column facsimile'}) + if tags and tags[0]: + return tags[0] + return soup + def parse_index(self): # Searching for the index page on the HOMEPAGE - # hpsoup = self.index_to_soup(self.HOMEPAGE) + self.index_to_soup(self.HOMEPAGE) # indexref = hpsoup.find('a', href=re.compile('^indices.*')) # self.log('--> tag containing the today s index: ', indexref) # INDEX = indexref['href'] # INDEX = 'http://www1.folha.uol.com.br/'+INDEX - today=datetime.date.today() - INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + str(today).replace('-','') + '.shtml' + INDEX = 'http://www1.folha.uol.com.br/' + 'fsp/indices/index-' + \ + str(self.today).replace('-', '') + '.shtml' self.log('--> INDEX after extracting href and adding prefix: ', INDEX) # ... and taking the opportunity to get the cover image link # coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href'] - coverurl = 'cp' + str(today.day).zfill(2) + str(today.month).zfill(2) + str(today.year) + '.shtml' + coverurl = self.FIRSTPAGE if coverurl: self.log('--> tag containing the today s cover: ', coverurl) coverurl = coverurl.replace('shtml', 'jpg') - coverurl = 'http://www1.folha.uol.com.br/fsp/images/'+coverurl - self.log('--> coverurl after extracting href and adding prefix: ', coverurl) + coverurl = 'http://www1.folha.uol.com.br/fsp/images/' + coverurl + self.log( + '--> coverurl after extracting href and adding prefix: ', coverurl) self.cover_url = coverurl # soup = self.index_to_soup(self.INDEX) @@ -77,16 +93,17 @@ class FSP(BasicNewsRecipe): feeds = [] articles = [] - section_title = "Preambulo" + section_title = u'Primeira p\xe1gina' for post in soup.findAll('a'): # if name=True => new section strpost = str(post) # if strpost.startswith('