calibre/recipes/internazionale.recipe
2025-04-13 13:51:09 +05:30

125 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
class Volkskrant(BasicNewsRecipe):
    """Calibre recipe that downloads the issue listed on Internazionale's index page.

    The index at ``current_number_url`` is scraped in ``parse_index``: every
    section heading found there starts a new feed and every article teaser
    that follows it becomes an article of that feed.

    NOTE(review): the class name does not match the publication and looks
    like a leftover from another recipe; it is kept so that anything
    referring to the class by name keeps working.
    """

    # --- Recipe metadata -------------------------------------------------
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall\u2019Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'

    # --- Article clean-up ------------------------------------------------
    resolve_internal_links = True
    # Keep only the <article> element of each downloaded page.
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                    'article_others_authors',  # Remove link of other articles at the bottom of the article
                    'item_note2',  # Remove 'lettere' and 'numero' at the bottom of the article
                    'article_spotlight',  # Remove 'Da non perdere' at the end of the article
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    # --- Site locations --------------------------------------------------
    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    # Filled in by parse_index() once the index page has been read.
    cover_url = None

    def extract_article(self, article):
        """Build an article entry from one teaser element of the index page.

        :param article: parsed teaser element; its first ``<a>`` supplies the
            link, and the ``abstract-article__tag`` / ``__title`` /
            ``__author`` / ``__content`` divs supply the texts.
        :return: dict with the keys ``title``, ``url``, ``date``,
            ``description`` and ``content``, or ``None`` when the teaser has
            no link with a non-empty ``href`` (nothing could be downloaded
            for it).
        """
        link = article.find('a')
        url = link.get('href') if link is not None else None
        if not url:
            return None
        # Site-relative links are resolved against the home page.
        if url.startswith('/'):
            url = self.home_url + url

        # Title: optional upper-cased tag, then the headline.
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag is not None:
            title_parts.append(self.tag_to_string(tag).upper())
        headline = article.find('div', {'class': 'abstract-article__title'})
        title_parts.append(self.tag_to_string(headline))

        # Description: optional author, then optional summary.
        description_parts = []
        for css_class in ('abstract-article__author', 'abstract-article__content'):
            element = article.find('div', {'class': css_class})
            if element is not None:
                description_parts.append(self.tag_to_string(element))

        return dict(
            title=' \u2022 '.join(title_parts),
            url=url,
            date='',
            description=' \u2022 '.join(description_parts),
            content='',
        )

    def parse_index(self):
        """Scrape the index page into a list of ``(section title, articles)``.

        Also records the cover image URL in ``self.cover_url`` when the page
        provides one.

        :return: list of ``(title, [article dict, ...])`` tuples in page order.
        :raises ValueError: if the page has no ``div.content_data`` container.
        """
        soup = self.index_to_soup(self.current_number_url)

        # A missing cover must not abort the download of the articles.
        cover = soup.find('span', {'class': 'img_expand'})
        cover_src = cover.get('data-src') if cover is not None else None
        if cover_src:
            self.cover_url = cover_src
        else:
            self.log.warning('Internazionale: cover image not found on the index page')

        main_container = soup.find('div', {'class': 'content_data'})
        if main_container is None:
            raise ValueError(
                'Internazionale: the index page has no div.content_data '
                'container; the site layout may have changed'
            )

        sections = []
        current_section = None
        for container in main_container.findAll('div', recursive=False):
            # A div without a class attribute is simply not of interest.
            classes = container.get('class') or []

            if 'abstract-testatina' in classes or 'abstract-testatina-cultura' in classes:
                # Section heading: start a new feed. The tuple is appended
                # right away; its article list is filled in afterwards.
                current_section = (self.tag_to_string(container), [])
                sections.append(current_section)
                continue

            if 'masonry-items' in classes:
                # Grid wrapper holding several teasers.
                teasers = container.findAll('div', {'class': 'abstract-article'})
            elif 'abstract-article' in classes:
                teasers = [container]
            else:
                continue

            if current_section is None:
                # Teasers placed before the first heading still need a feed.
                current_section = (self.title, [])
                sections.append(current_section)

            for teaser in teasers:
                entry = self.extract_article(teaser)
                if entry is not None:
                    current_section[1].append(entry)

        return sections

    def preprocess_html(self, soup):
        """Turn ``<figure>``/``<figcaption>`` markup into plain divs with an ``<img>``.

        Each ``<figure>`` is renamed to ``<div>``; the value of the first
        ``data-media*`` attribute present on it (in the order listed below)
        is used, when non-empty, as the source of an ``<img>`` inserted as
        its first child. Each ``<figcaption>`` is renamed to ``<div>``.

        :param soup: parsed article page; modified in place.
        :return: the same ``soup`` object.
        """
        image_attributes = (
            'data-media1024',
            'data-media1025',
            'data-media641',
            'data-media321',
            'data-media',
        )
        for figure in soup.findAll('figure'):
            img_src = next(
                (figure[attr] for attr in image_attributes if figure.has_attr(attr)),
                None,
            )
            figure.name = 'div'
            if img_src:
                figure.insert(0, soup.new_tag('img', src=img_src))

        for caption in soup.findAll('figcaption'):
            caption.name = 'div'

        return soup

    def get_cover_url(self):
        """Return the cover URL recorded by ``parse_index`` (``None`` if none was found)."""
        return self.cover_url