calibre/recipes/il_post.recipe
Kovid Goyal 2838d7382c
pep8
2023-12-28 14:35:12 +05:30

93 lines
3.5 KiB
Python

#!/usr/bin/env python
##
# Title: Il Post recipe for calibre
# Author: Marco Scirea, based on a recipe by frafra
# Contact: marco.prolog at gmail.com
##
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: Copyright 2019 Marco Scirea
##
from __future__ import absolute_import, division, print_function, unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date, timedelta
# URL date fragments (YYYY/MM/DD) for today and yesterday; article links are
# kept only when their href contains one of these.
dates = [
    (date.today() - timedelta(days_back)).strftime('%Y/%m/%d')
    for days_back in (0, 1)
]
# ----------- CUSTOMIZATION OPTIONS START -----------
# Comment (add # in front) to disable the sections you are not interested in
# Commenta (aggiungi # davanti alla riga) per disabilitare le sezioni che non vuoi scaricare
# (section title, section index URL) pairs, built from (title, URL slug) pairs.
# Each entry becomes one feed; comment out a pair below to skip that section.
sections = [
    (label, "https://www.ilpost.it/{}/".format(slug))
    for label, slug in (
        ("Italia", "italia"),
        ("Mondo", "mondo"),
        ("Politica", "politica"),
        ("Tecnologia", "tecnologia"),
        ("Internet", "internet"),
        ("Scienza", "scienza"),
        ("Cultura", "cultura"),
        ("Economia", "economia"),
        ("Sport", "sport"),
        ("Media", "media"),
        ("Moda", "moda"),
        ("Libri", "libri"),
        ("Auto", "auto"),
        # The "Konrad" section is served from the /europa/ path.
        ("Konrad", "europa"),
    )
]
# ----------- CUSTOMIZATION OPTIONS OVER -----------
class IlPost(BasicNewsRecipe):
    """Calibre recipe for Il Post that builds one feed per configured section.

    The section list comes from the module-level ``sections`` and the
    accepted publication dates from the module-level ``dates``.
    """

    __author__ = 'Marco Scirea, unkn0wn'
    __license__ = 'GPL v3'
    __copyright__ = '2019, Marco Scirea <marco.prolog at gmail.com>'

    title = "Il Post"
    language = "it"
    # NOTE(review): the text below mentions grayscale image conversion, but
    # nothing in this class performs it -- confirm whether the text is stale.
    description = (
        'Puoi decidere quali sezioni scaricare modificando la ricetta.'
        ' Di default le immagini sono convertite in scala di grigio per risparmiare spazio,'
        ' la ricetta puo\' essere configurata per tenerle a colori'
    )
    tags = "news"
    masthead_url = 'https://www.ilpost.it/error/images/ilpost.svg'
    ignore_duplicate_articles = {"title", "url"}

    no_stylesheets = True
    extra_css = ' .wp-caption-text { font-size:small; } '

    # Article pages: match <main> by an id prefix rather than the full id.
    keep_only_tags = [
        dict(
            name='main',
            attrs={'id': lambda x: x and x.startswith('index_main-content__')},
        ),
    ]
    # Both trimming rules match the <article> element.
    remove_tags_before = [dict(name='article')]
    remove_tags_after = [dict(name='article')]
    # Elements matched by class prefix, plus the element with the audio
    # player id, are removed.
    remove_tags = [
        dict(attrs={
            'class': lambda x: x and x.startswith(
                ('index_actions__', 'index_il-post-comments___', 'index_art_tag__')
            ),
        }),
        dict(attrs={'id': 'audioPlayerArticle'}),
    ]

    def parse_page(self, name, url):
        """Collect the recent articles linked from one section index page.

        :param name: Section title, returned unchanged as the feed title.
        :param url: URL of the section index page to fetch and scan.
        :return: ``(name, entries)`` where ``entries`` is a list of dicts with
            the keys ``url``, ``title`` and ``description``.
        """
        self.log.debug(url)
        page = self.index_to_soup(url)
        found = []
        for block in page.findAll('article'):
            for anchor in block.findAll('a', href=True):
                href = anchor['href']
                # Keep only links whose URL carries one of the dates in `dates`.
                if any(day in href for day in dates):
                    headline = self.tag_to_string(anchor.h2)
                    summary = self.tag_to_string(anchor.p)
                    # Links that yield no headline text are skipped.
                    if headline:
                        self.log('\t', headline)
                        found.append({
                            "url": href,
                            "title": headline,
                            "description": summary,
                        })
        return (name, found)

    def parse_index(self):
        """Return one ``(title, articles)`` feed for every entry in ``sections``."""
        return [self.parse_page(entry[0], entry[1]) for entry in sections]