mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-10-25 07:48:55 -04:00
125 lines
4.6 KiB
Python
125 lines
4.6 KiB
Python
#!/usr/bin/env python
|
||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||
|
||
|
||
class Volkskrant(BasicNewsRecipe):
    """Fetch the current issue of Internazionale (www.internazionale.it).

    NOTE(review): the class name 'Volkskrant' looks like a copy/paste
    leftover from another recipe; kept unchanged because calibre loads
    recipes by class discovery and renaming would change the interface.
    """
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    # Keep only the <article> element of each fetched page.
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                    'article_others_authors',  # Remove link of other articles at the bottom of the article
                    'item_note2',  # Remove 'lettere' and 'numero' at the bottom of the article
                    'article_spotlight',  # Remove 'Da non perdere' at the end of the article
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None  # filled in by parse_index, read by get_cover_url

    def extract_article(self, article):
        """Build a calibre feed-entry dict from an 'abstract-article' node.

        Returns a dict with title/url/date/description/content keys as
        expected by BasicNewsRecipe.parse_index.
        """
        url = article.find('a')['href']
        # Site links are sometimes relative; make them absolute.
        # startswith() is safe on an empty href where url[0] would raise
        # IndexError.
        if url.startswith('/'):
            url = self.home_url + url
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        pubdate = ''  # publication date is not exposed on the summary page
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        description = ' \u2022 '.join(description_parts)
        return dict(
            title=article_title,
            url=url,
            date=pubdate,
            description=description,
            content=''
        )

    def parse_index(self):
        """Parse the issue summary page into [(section_title, [articles])].

        Sections are delimited by 'abstract-testatina*' header divs;
        articles appear either directly or inside 'masonry-items' grids.
        """
        soup = self.index_to_soup(self.current_number_url)
        # Guard against a missing cover element instead of raising TypeError.
        cover = soup.find('span', {'class': 'img_expand'})
        if cover and cover.has_attr('data-src'):
            self.cover_url = cover['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            classes = container['class']
            # A section header starts a new (title, articles) pair.
            if 'abstract-testatina' in classes or 'abstract-testatina-cultura' in classes:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in classes:
                if current_section is None:
                    # Article grid before any header: collect under a
                    # fallback section rather than crashing on None.
                    current_section = ('Articoli', [])
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in classes:
                if current_section is None:
                    current_section = ('Articoli', [])
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        """Turn lazy-loaded <figure> images into plain <img> tags."""
        for node in soup.findAll('figure'):
            img_src = None
            # Responsive variants, largest first; take the first present.
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the cover URL discovered by parse_index (or None)."""
        return self.cover_url
|