calibre/recipes/il_post.recipe
2025-01-24 11:14:24 +01:00

95 lines
3.5 KiB
Python

#!/usr/bin/env python
##
# Title: Il Post recipe for calibre
# Author: Marco Scirea, based on a recipe by frafra
# Contact: marco.prolog at gmail.com
##
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: Copyright 2019 Marco Scirea
##
from __future__ import absolute_import, division, print_function, unicode_literals
from datetime import date, timedelta
from calibre.web.feeds.news import BasicNewsRecipe
# URL date fragments (``YYYY/MM/DD``) for today and yesterday; parse_page keeps
# only the links whose href contains one of them.
# The current day is read once so that both entries are derived from the same
# value even when the recipe happens to be loaded right at midnight.
_today = date.today()
dates = [(_today - timedelta(days=offset)).strftime('%Y/%m/%d') for offset in (0, 1)]
# ----------- CUSTOMIZATION OPTIONS START -----------
# Comment (add # in front) to disable the sections you are not interested in
# Commenta (aggiungi # davanti alla riga) per disabilitare le sezioni che non vuoi scaricare
# (section title, section index URL) pairs, scraped in this order by
# IlPost.parse_index. Keep one entry per line so a single section can be
# disabled by commenting its line out.
sections = [
    ('Italia', 'https://www.ilpost.it/italia/'),
    ('Mondo', 'https://www.ilpost.it/mondo/'),
    ('Politica', 'https://www.ilpost.it/politica/'),
    ('Tecnologia', 'https://www.ilpost.it/tecnologia/'),
    ('Internet', 'https://www.ilpost.it/internet/'),
    ('Scienza', 'https://www.ilpost.it/scienza/'),
    ('Cultura', 'https://www.ilpost.it/cultura/'),
    ('Economia', 'https://www.ilpost.it/economia/'),
    ('Sport', 'https://www.ilpost.it/sport/'),
    ('Media', 'https://www.ilpost.it/media/'),
    ('Moda', 'https://www.ilpost.it/moda/'),
    ('Libri', 'https://www.ilpost.it/libri/'),
    ('Auto', 'https://www.ilpost.it/auto/'),
    ('Konrad', 'https://www.ilpost.it/europa/'),
]
# ----------- CUSTOMIZATION OPTIONS END -----------
class IlPost(BasicNewsRecipe):
    """Calibre news recipe that collects recent Il Post articles per section.

    The index is built by scraping every page listed in the module-level
    ``sections`` and keeping only the links whose URL contains one of the
    module-level ``dates`` strings.
    """

    # --- Recipe metadata ---------------------------------------------------
    __author__ = 'Marco Scirea, unkn0wn'
    __license__ = 'GPL v3'
    __copyright__ = '2019, Marco Scirea <marco.prolog at gmail.com>'

    title = 'Il Post'
    language = 'it'
    # NOTE(review): the text below mentions grayscale image conversion, but no
    # image-related option is set in this class -- confirm it is still accurate.
    description = (
        'Puoi decidere quali sezioni scaricare modificando la ricetta.'
        ' Di default le immagini sono convertite in scala di grigio per risparmiare spazio,'
        " la ricetta puo' essere configurata per tenerle a colori"
    )
    tags = 'news'
    masthead_url = 'https://www.ilpost.it/error/images/ilpost.svg'

    # --- Download and clean-up options -------------------------------------
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    extra_css = ' .wp-caption-text { font-size:small; } '

    # Matches the <main> element whose id starts with the given prefix.
    keep_only_tags = [
        {
            'name': 'main',
            'attrs': {
                'id': lambda value: value and value.startswith('index_main-content__'),
            },
        },
    ]
    remove_tags_before = [{'name': 'article'}]
    remove_tags_after = [{'name': 'article'}]

    # Matches elements by class-name prefix, plus the element with the audio
    # player id.
    # NOTE(review): the comments prefix ends with three underscores while its
    # siblings end with two -- confirm this is intentional.
    remove_tags = [
        {
            'attrs': {
                'class': lambda value: value and value.startswith((
                    'index_actions__',
                    'index_il-post-comments___',
                    'index_art_tag__',
                )),
            },
        },
        {'attrs': {'id': 'audioPlayerArticle'}},
    ]

    def parse_page(self, name, url):
        """Scrape one section page and return ``(name, articles)``.

        Every ``<a href=...>`` found inside an ``<article>`` element is
        considered. A link is kept when its href contains one of the ``dates``
        strings and the ``<h2>`` inside it yields a non-empty title; the
        ``<p>`` inside the link supplies the description.

        :param name: section title, returned unchanged as the first item.
        :param url: address of the section page to fetch.
        :return: tuple of ``name`` and a list of dicts with the keys
            ``url``, ``title`` and ``description``.
        """
        self.log.debug(url)
        page = self.index_to_soup(url)
        found = []
        for art in page.findAll('article'):
            for anchor in art.findAll('a', href=True):
                href = anchor['href']
                is_recent = any(stamp in href for stamp in dates)
                if not is_recent:
                    continue
                headline = self.tag_to_string(anchor.h2)
                summary = self.tag_to_string(anchor.p)
                # Links without a headline are skipped.
                if headline:
                    self.log('\t', headline)
                    found.append(dict(url=href, title=headline, description=summary))
        return (name, found)

    def parse_index(self):
        """Return one ``(section name, articles)`` tuple per ``sections`` entry.

        The sections are scraped in the order in which they are listed.
        """
        return [self.parse_page(entry[0], entry[1]) for entry in sections]