#!/usr/bin/env python
##
# Title:        Il Post recipe for calibre
# Author:       Marco Scirea, based on a recipe by frafra
# Contact:      marco.prolog at gmail.com
##
# License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright:    Copyright 2019 Marco Scirea
##

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.magick import Image

# ----------- CUSTOMIZATION OPTIONS START -----------

# Comment out (add # in front of the line) the sections you are not
# interested in downloading. Each entry is (section title, index page URL).
sections = [
    ("Prima Pagina", "https://www.ilpost.it/prime-pagine"),
    ("Mondo", "https://www.ilpost.it/mondo/"),
    ("Politica", "https://www.ilpost.it/politica/"),
    ("Tecnologia", "https://www.ilpost.it/tecnologia/"),
    ("Internet", "https://www.ilpost.it/internet/"),
    ("Scienza", "https://www.ilpost.it/scienza/"),
    ("Cultura", "https://www.ilpost.it/cultura/"),
    ("Economia", "https://www.ilpost.it/economia/"),
    ("Sport", "https://www.ilpost.it/sport/"),
    ("Media", "https://www.ilpost.it/media/"),
    ("Moda", "https://www.ilpost.it/moda/"),
    ("Libri", "https://www.ilpost.it/libri/"),
    ("Auto", "https://www.ilpost.it/auto/"),
    ("Konrad", "https://www.ilpost.it/europa/"),
]

# Change this to True if you want grey images
convert_to_grayscale = False

# ----------- CUSTOMIZATION OPTIONS OVER -----------

# Boilerplate that link "title" attributes may start with; it is removed
# from the article title in IlPost.parse_page.
prefixes = {"Permalink to", "Commenta", "Link all'articolo"}


class IlPost(BasicNewsRecipe):
    """Calibre recipe that builds one feed per entry of ``sections``."""

    __author__ = 'Marco Scirea'
    __license__ = 'GPL v3'
    __copyright__ = '2019, Marco Scirea '

    title = "Il Post"
    language = "it"
    # NOTE(review): this text says images are converted to grayscale by
    # default, but convert_to_grayscale above defaults to False -- confirm
    # which of the two is intended.
    description = (
        'Puoi decidere quali sezioni scaricare modificando la ricetta.'
        ' Di default le immagini sono convertite in scala di grigio per risparmiare spazio,'
        ' la ricetta puo\' essere configurata per tenerle a colori'
    )
    tags = "news"
    cover_url = "https://www.ilpost.it/wp-content/themes/ilpost/images/ilpost.svg"
    ignore_duplicate_articles = {"title", "url"}
    no_stylesheets = True
    keep_only_tags = [dict(id=["expanding", "singleBody"])]

    def parse_page(self, name, url):
        """Collect the article links found on one section index page.

        Every ``<a>`` carrying both ``href`` and ``title`` inside an
        ``<article>`` element is considered; only links whose ``href`` starts
        with ``https://www.ilpost.it/20`` are kept.

        :param name: section title, returned unchanged as the feed name.
        :param url: URL of the section index page to fetch.
        :return: ``(name, entries)`` where ``entries`` is a list of dicts
            with the keys ``"url"`` and ``"title"``.
        """
        self.log.debug(url)
        soup = self.index_to_soup(url)
        entries = []
        for article in soup.findAll('article'):
            for link in article.findAll('a', href=True, title=True):
                if not link["href"].startswith("https://www.ilpost.it/20"):
                    continue
                title = link["title"]
                for prefix in prefixes:
                    if title.startswith(prefix):
                        # Slice the literal prefix off. str.lstrip(prefix)
                        # would treat the prefix as a *set of characters* and
                        # also eat the beginning of the real title (e.g.
                        # "Permalink to Politica" -> "ca").
                        title = title[len(prefix):]
                        break
                title = title.strip()
                entries.append({
                    "url": link["href"],
                    "title": title,
                })
        return (name, entries)

    def populate_article_metadata(self, article, soup, first):
        """Set the article summaries from the page's description meta tag.

        Looks up the element with ``name="description"`` and copies its
        ``content`` attribute into ``article.summary`` and
        ``article.text_summary``. A placeholder text is used when the element
        or its ``content`` attribute is missing.
        """
        fallback = "No meta description given"
        description = soup.find(attrs={"name": "description"})
        # .get() avoids a KeyError for a meta tag without a content attribute.
        summary = description.get("content", fallback) if description else fallback
        article.summary = summary
        article.text_summary = summary

    def parse_index(self):
        """Return one ``(name, entries)`` feed per configured section."""
        return [self.parse_page(name, url) for name, url in sections]

    if convert_to_grayscale:
        # Only defined when grayscale conversion is enabled above.
        # Image conversion to greyscale by Starson17
        # https://www.mobileread.com/forums/showpost.php?p=1814815&postcount=15
        def postprocess_html(self, soup, first):
            """Rewrite every image referenced by ``soup`` as grayscale."""
            # process all the images
            for tag in soup.findAll('img', src=True):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                img.type = "GrayscaleType"
                img.save(iurl)
            return soup

    def preprocess_html(self, soup):
        """Abort the article if it contains gallery items, else pass it on."""
        gallery_items = soup.findAll("figure", {"class": "gallery-item"})
        if gallery_items:
            self.abort_article()
        return soup