calibre/recipes/folhadesaopaulo_sub.recipe
2020-11-23 23:15:29 -03:00

142 lines
6.1 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
import re
import datetime
class FSP(BasicNewsRecipe):
    """Calibre recipe for the printed edition of Folha de S.Paulo.

    The recipe logs in with the subscriber credentials supplied to calibre
    (``needs_subscription`` is set), reads the edition index page and groups
    the article links it finds there into one feed per section.
    """

    title = u'Folha de S\xE3o Paulo'
    __author__ = 'Joao Eduardo Bertacchi - lc_addicted, 2020 Leonardo Amaral - leleobhz'
    description = (
        u'Printed edition contents. Folha subscription required (UOL subscription currently not supported).'
        u' [Conte\xfado completo da edi\xe7\xe3o impressa. Somente para assinantes Folha. N\xE3o suporta assinantes UOL]'
    )
    # Evaluated once, when the class body runs; not referenced elsewhere in
    # this class.
    today = datetime.date.today()
    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
    language = 'pt_BR'
    no_stylesheets = True
    max_articles_per_feed = 100
    remove_javascript = True
    needs_subscription = True
    # Only these containers are kept from each downloaded article page.
    keep_only_tags = [
        dict(name='div', id='articleNew'),
        dict(name='table', attrs={'class': 'articleGraphic'}),
        dict(name='article', id='news'),
    ]
    publication_type = 'newspaper'
    simultaneous_downloads = 5
    remove_attributes = ['height', 'width']

    # Maps a section name as scraped from the index (unaccented, sometimes
    # truncated) to its properly spelled display title.  Sections missing
    # from this table keep the scraped name, so newly added sections may
    # still show up without accents.
    section_dict = {
        'cotidian': 'cotidiano',
        'ilustrad': 'ilustrada',
        'quadrin': 'quadrinhos',
        'opiniao': u'opini\xE3o',
        'ciencia': u'ci\xeancia',
        'saude': u'sa\xfade',
        'ribeirao': u'ribeir\xE3o',
        'equilibrio': u'equil\xedbrio',
        'imoveis': u'im\xf3veis',
        'negocios': u'neg\xf3cios',
        'veiculos': u've\xedculos',
        'corrida': 'folha corrida',
        'turismo': 'turismo',
    }

    # Per the original author's note: linearizing tables avoids truncated
    # content on Kindle devices.
    conversion_options = {'linearize_tables': True}

    # The CSS text is kept flush-left so the string content is unchanged.
    extra_css = """
#articleNew { font: 18px Times New Roman,verdana,arial; }
img { background: none !important; float: none; margin: 0px; }
.newstexts { list-style-type: none; height: 20px; margin: 15px 0 10px 0; }
.newstexts.last { border-top: 1px solid #ccc; margin: 5px 0 15px 0; padding-top: 15px; }
.newstexts li { display: inline; padding: 0 5px; }
.newstexts li.prev { float: left; }
.newstexts li.next { float: right; }
.newstexts li span { width: 12px; height: 15px; display: inline-block; }
.newstexts li.prev span { background-position: -818px -46px; }
.newstexts li.next span { background-position: -832px -46px; }
.newstexts li a { font: bold 12px arial, verdana, sans-serif; text-transform: uppercase; color: #999; text-decoration: none !important; }
.newstexts li a:hover { text-decoration: underline !important }
.headerart { font-weight: bold; }
.title { font: bold 39px Times New Roman,verdana,arial; margin-bottom: 15px; margin-top: 10px; }
.creditart, .origin { font: bold 12px arial, verdana, sans-serif; color: #999; margin: 0px; display: block; }
.headerart p, .fine_line p { margin: 0 !important; }
.fine_line { font: bold 18px Times New Roman,verdana,arial; }
.fine_line p { margin-bottom: 18px !important; }
.fine_line p:first-child { font-weight: normal; font-style: italic; font-size: 20px !important; }
.eye { display: block; width: 317px; border-top: 2px solid #666; padding: 7px 0 7px; border-bottom: 2px solid #666; font-style: italic; font-weight: bold; }
.kicker { font-weight: bold; text-transform: uppercase; font-size: 18px; font-family: Times New Roman,verdana,arial !important; }
.blue { color: #000080; }
.red { color: #F00; }
.blue { color: #000080; }
.green { color: #006400; }
.orange { color: #FFA042; }
.violet { color: #8A2BE2; }
.text_footer { font-size: 15px; }
.title_end { font-size: 23px; font-weight: bold; }
.divisor { text-indent: -9999px; border-bottom: 1px solid #ccc; height: 1px; margin: 0; }
.star { background: none !important; height: 15px; }
.articleGraphic { margin-bottom: 20px; }
"""

    def get_browser(self):
        """Return the recipe's browser, logged in when credentials are set.

        The login form at ``https://login.folha.com.br/login`` is filled in
        and submitted only when both ``self.username`` and ``self.password``
        are not ``None``; otherwise the plain browser is returned.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://login.folha.com.br/login')
            br.select_form(action="https://login.folha.com.br/login")
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of feeds from the printed-edition index page.

        Every ``<a>`` element of the index is visited in document order:

        * an anchor matching the section pattern closes the section being
          collected and starts a new one, titled with the anchor text
          (translated through ``section_dict`` when listed there);
        * the first anchor whose markup starts with ``<a href="/fsp/cp``
          ends the scan;
        * any other anchor whose ``href`` points at ``www1.folha.uol.com.br``
          is recorded as an article of the current section.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``'title'`` and ``'url'`` keys.  The first
            collected feed is dropped; the list is empty when nothing, or
            only that first feed, was collected.
        """
        index_url = 'https://www1.folha.uol.com.br/fsp/'
        self.log('--> INDEX set ', index_url)
        soup = self.index_to_soup(index_url)

        # Compiled once, outside the loop; the pattern text is unchanged.
        section_link = re.compile(
            r'<a href="https://www1.folha.uol.com.br/.*/"><svg aria-hidden="true" class="icon icon--star"'
        )
        article_prefixes = (
            'http://www1.folha.uol.com.br/',
            'https://www1.folha.uol.com.br/',
        )

        feeds = []
        articles = []
        section_title = u'Primeira p\xe1gina'
        for post in soup.findAll('a'):
            strpost = str(post)
            if section_link.match(strpost):
                # A new section begins: flush what was gathered so far.
                if articles:
                    feeds.append((section_title, articles))
                    self.log()
                    self.log(
                        '--> new section found, creating old section feed: ', section_title)
                section_title = self.tag_to_string(post)
                section_title = self.section_dict.get(section_title, section_title)
                articles = []
                self.log('--> new section title: ', section_title)
            elif strpost.startswith('<a href="/fsp/cp'):
                # NOTE(review): presumably marks the end of the article
                # listing on the index page -- confirm against the live site.
                break
            elif strpost.startswith('<a href'):
                url = post['href']
                if url.startswith(article_prefixes):
                    title = self.tag_to_string(post)
                    self.log()
                    self.log('--> post: ', post)
                    self.log('--> url: ', url)
                    self.log('--> title: ', title)
                    articles.append({'title': title, 'url': url})
        if articles:
            feeds.append((section_title, articles))

        # Drop the first collected feed (presumably the links that precede
        # the first real section -- confirm).  Slicing rather than
        # ``del feeds[0]`` avoids an IndexError when nothing was collected.
        return feeds[1:]