#!/usr/bin/env python ## # Title: Il Post recipe for calibre # Author: Marco Scirea, based on a recipe by frafra # Contact: marco.prolog at gmail.com ## # License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html # Copyright: Copyright 2019 Marco Scirea ## from __future__ import absolute_import, division, print_function, unicode_literals from calibre.web.feeds.news import BasicNewsRecipe from datetime import date, timedelta dates = [ date.today().strftime('%Y/%m/%d'), (date.today() - timedelta(1)).strftime('%Y/%m/%d') ] # ----------- CUSTOMIZATION OPTIONS START ----------- # Comment (add # in front) to disable the sections you are not interested in # Commenta (aggiungi # davanti alla riga) per disabilitare le sezioni che non vuoi scaricare sections = [ ("Italia", "https://www.ilpost.it/italia/"), ("Mondo", "https://www.ilpost.it/mondo/"), ("Politica", "https://www.ilpost.it/politica/"), ("Tecnologia", "https://www.ilpost.it/tecnologia/"), ("Internet", "https://www.ilpost.it/internet/"), ("Scienza", "https://www.ilpost.it/scienza/"), ("Cultura", "https://www.ilpost.it/cultura/"), ("Economia", "https://www.ilpost.it/economia/"), ("Sport", "https://www.ilpost.it/sport/"), ("Media", "https://www.ilpost.it/media/"), ("Moda", "https://www.ilpost.it/moda/"), ("Libri", "https://www.ilpost.it/libri/"), ("Auto", "https://www.ilpost.it/auto/"), ("Konrad", "https://www.ilpost.it/europa/"), ] # ----------- CUSTOMIZATION OPTIONS OVER ----------- class IlPost(BasicNewsRecipe): __author__ = 'Marco Scirea, unkn0wn' __license__ = 'GPL v3' __copyright__ = '2019, Marco Scirea ' title = "Il Post" language = "it" description = ( 'Puoi decidere quali sezioni scaricare modificando la ricetta.' ' Di default le immagini sono convertite in scala di grigio per risparmiare spazio,' ' la ricetta puo\' essere configurata per tenerle a colori' ) tags = "news" masthead_url = 'https://www.ilpost.it/error/images/ilpost.svg' ignore_duplicate_articles = {"title", "url"} no_stylesheets = True extra_css = ' .wp-caption-text { font-size:small; } ' keep_only_tags = [dict(name='main', attrs={'id':lambda x: x and x.startswith('index_main-content__')})] remove_tags_before = [dict(name='article')] remove_tags_after = [dict(name='article')] remove_tags = [ dict(attrs={'class':lambda x: x and x.startswith( ('index_actions__', 'index_il-post-comments___', 'index_art_tag__') )}), dict(attrs={'id':'audioPlayerArticle'}) ] def parse_page(self, name, url): self.log.debug(url) soup = self.index_to_soup(url) entries = [] for article in soup.findAll('article'): for link in article.findAll('a', href=True): if not any(x in link['href'] for x in dates): continue title = self.tag_to_string(link.h2) desc = self.tag_to_string(link.p) if not title: continue self.log('\t', title) entries.append({ "url": link["href"], "title": title, "description": desc }) return (name, entries) def parse_index(self): feeds = [] for section in sections: feeds.append(self.parse_page(section[0], section[1])) return feeds