mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-10-25 07:48:55 -04:00
125 lines
4.6 KiB
Python
125 lines
4.6 KiB
Python
#!/usr/bin/env python
|
||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||
|
||
|
||
class Volkskrant(BasicNewsRecipe):
    """Fetch the current issue of Internazionale (www.internazionale.it).

    NOTE(review): the class name 'Volkskrant' looks like a copy/paste
    leftover from another recipe; kept unchanged because calibre loads
    recipes by class discovery and renaming would change the interface.
    """
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    # Keep only the <article> element of each fetched page.
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                    'article_others_authors',  # Remove link of other articles at the bottom of the article
                    'item_note2',  # Remove 'lettere' and 'numero' at the bottom of the article
                    'article_spotlight',  # Remove 'Da non perdere' at the end of the article
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None  # filled in by parse_index, read by get_cover_url

    def extract_article(self, article):
        """Build a calibre feed-entry dict from an 'abstract-article' node.

        Returns a dict with title/url/date/description/content keys as
        expected by BasicNewsRecipe.parse_index.
        """
        url = article.find('a')['href']
        # Site links are sometimes relative; make them absolute.
        # startswith() is safe on an empty href where url[0] would raise
        # IndexError.
        if url.startswith('/'):
            url = self.home_url + url
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        pubdate = ''  # publication date is not exposed on the summary page
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        description = ' \u2022 '.join(description_parts)
        return dict(
            title=article_title,
            url=url,
            date=pubdate,
            description=description,
            content=''
        )

    def parse_index(self):
        """Parse the issue summary page into [(section_title, [articles])].

        Sections are delimited by 'abstract-testatina*' header divs;
        articles appear either directly or inside 'masonry-items' grids.
        """
        soup = self.index_to_soup(self.current_number_url)
        # Guard against a missing cover element instead of raising TypeError.
        cover = soup.find('span', {'class': 'img_expand'})
        if cover and cover.has_attr('data-src'):
            self.cover_url = cover['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            classes = container['class']
            # A section header starts a new (title, articles) pair.
            if 'abstract-testatina' in classes or 'abstract-testatina-cultura' in classes:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in classes:
                if current_section is None:
                    # Article grid before any header: collect under a
                    # fallback section rather than crashing on None.
                    current_section = ('Articoli', [])
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in classes:
                if current_section is None:
                    current_section = ('Articoli', [])
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        """Turn lazy-loaded <figure> images into plain <img> tags."""
        for node in soup.findAll('figure'):
            img_src = None
            # Responsive variants, largest first; take the first present.
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the cover URL discovered by parse_index (or None)."""
        return self.cover_url
|