from calibre.web.feeds.news import BasicNewsRecipe
import re


class FSP(BasicNewsRecipe):
    """Calibre recipe for the printed edition of Folha de S.Paulo.

    Requires a UOL subscription (Folha-only subscriptions are not
    supported): the recipe logs in through UOL's login page and then
    scrapes today's "Indice Geral" page to build the feed list.
    """

    title = u'Folha de S\xE3o Paulo'
    __author__ = 'fluzao'
    description = u'Printed edition contents. UOL subscription required (Folha subscription currently not supported).' + \
                  u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes UOL.]'

    # found this to be the easiest place to find the index page (13-Nov-2011).
    # searching for the "Indice Geral" link
    HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

    language = 'pt_BR'
    no_stylesheets = True
    max_articles_per_feed = 40
    remove_javascript = True
    needs_subscription = True

    remove_tags_before = dict(name='p')
    remove_tags = [dict(name='td', attrs={'align': 'center'})]
    remove_attributes = ['height', 'width']

    # fixes the problem with the section names: maps the (often truncated)
    # anchor names used on the index page to human-readable section titles
    section_dict = {'cotidian': 'cotidiano', 'ilustrad': 'ilustrada',
                    'quadrin': 'quadrinhos', 'opiniao': u'opini\xE3o',
                    'ciencia': u'ci\xeancia', 'saude': u'sa\xfade',
                    'ribeirao': u'ribeir\xE3o', 'equilibrio': u'equil\xedbrio',
                    'imoveis': u'im\xf3veis', 'negocios': u'neg\xf3cios',
                    'veiculos': u've\xedculos', 'corrida': 'folha corrida'}

    # this solves the problem with truncated content in Kindle
    conversion_options = {'linearize_tables': True}

    # this bit removes the footer where there are links for Proximo Texto,
    # Texto Anterior, Indice e Comunicar Erros.
    # NOTE(review): the leading HTML literal of this pattern was garbled in
    # the file; reconstructed from the comment above and the page structure —
    # confirm against a saved article page.
    preprocess_regexps = [(re.compile(r'<!-- /NOTICIA -->.*Comunicar Erros</a>',
                                      re.DOTALL | re.IGNORECASE),
                           lambda match: r'')]

    def get_browser(self):
        """Return a browser logged in to UOL (when credentials are set).

        Without credentials the plain BasicNewsRecipe browser is returned
        unchanged; subscriber-only pages will then fail to download.
        """
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('https://acesso.uol.com.br/login.html')
            # the login page exposes a single form; grab it and fill it in
            br.form = next(br.forms())
            br['user'] = self.username
            br['pass'] = self.password
            br.submit().read()
            # TODO(review): no check that the login actually succeeded —
            # a wrong password fails later with confusing download errors.
        return br

    def parse_index(self):
        """Build the (section, articles) feed list from today's index page.

        Returns the list expected by calibre: [(section_title, [{'title':
        ..., 'url': ...}, ...]), ...], with the cover page inserted as the
        first pseudo-section.
        """
        # Searching for the index page on the HOMEPAGE
        hpsoup = self.index_to_soup(self.HOMEPAGE)
        indexref = hpsoup.find('a', href=re.compile('^indices.*'))
        self.log('--> tag containing the today s index: ', indexref)
        INDEX = indexref['href']
        INDEX = 'http://www1.folha.uol.com.br/fsp/' + INDEX
        self.log('--> INDEX after extracting href and adding prefix: ', INDEX)

        # ... and taking the opportunity to get the cover image link
        # (the cover thumbnail link 'cpN.htm' maps to an image 'cpN.jpg')
        coverurl = hpsoup.find('a', href=re.compile('^cp.*'))['href']
        if coverurl:
            self.log('--> tag containing the today s cover: ', coverurl)
            coverurl = coverurl.replace('htm', 'jpg')
            coverurl = 'http://www1.folha.uol.com.br/fsp/images/' + coverurl
            self.log('--> coverurl after extracting href and adding prefix: ', coverurl)
            self.cover_url = coverurl

        soup = self.index_to_soup(INDEX)

        feeds = []
        articles = []
        section_title = "Preambulo"
        for post in soup.findAll('a'):
            # an <a name=...> anchor marks the start of a new section;
            # an <a href=...> link is an article inside the current section
            strpost = str(post)
            if strpost.startswith('<a name'):
                if articles:
                    # flush the section collected so far before starting a new one
                    feeds.append((section_title, articles))
                    self.log('--> new section found, creating old section feed: ', section_title)
                section_title = post['name']
                if section_title in self.section_dict:
                    section_title = self.section_dict[section_title]
                articles = []
                self.log('--> new section title: ', section_title)
            if strpost.startswith('<a href'):
                url = post['href']
                # only keep links into the printed-edition tree
                if url.startswith('/fsp'):
                    url = 'http://www1.folha.uol.com.br' + url
                    title = self.tag_to_string(post)
                    self.log('--> post: ', post)
                    self.log('--> url: ', url)
                    self.log('--> title: ', title)
                    articles.append({'title': title, 'url': url})
        if articles:
            # flush the trailing section
            feeds.append((section_title, articles))

        # keeping the front page url (second link of the first section)
        minha_capa = feeds[0][1][1]['url']
        # removing the first section (now called 'top')
        del feeds[0]
        # inserting the cover page as the first article (nicer for kindle users)
        feeds.insert(0, (u'primeira p\xe1gina',
                         [{'title': u'Primeira p\xe1gina', 'url': minha_capa}]))
        return feeds