mirror of https://github.com/kovidgoyal/calibre.git
Added Parool (NL), Revista 22 (RO), Dilema (RO) and Internazionale (IT); updated Volkskrant to download cover
parent c581bad34e
commit 24befe49bb
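All four new recipes follow the same pattern: a BasicNewsRecipe subclass whose parse_index scrapes the publication's front or issue page and returns a list of (section_title, articles) tuples, each article being a dict with at least a title and a url. A minimal sketch of that shape for orientation (class name and URLs are placeholders, not part of this commit):

#!/usr/bin/env python
# Illustrative skeleton only -- names and URLs are placeholders.
from calibre.web.feeds.recipes import BasicNewsRecipe


class ExampleRecipe(BasicNewsRecipe):
    title = 'Example'
    language = 'en'
    no_stylesheets = True

    def parse_index(self):
        # Fetch and parse the front page, then group article dicts by section.
        soup = self.index_to_soup('https://example.com/')
        articles = [
            dict(title='An article', url='https://example.com/a',
                 date='', description='', content=''),
        ]
        return [('Front page', articles)]

    def get_cover_url(self):
        # Optional: point calibre at a cover image for the issue.
        return 'https://example.com/cover.jpg'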
BIN recipes/dilema.png Normal file (1.2 KiB)
recipes/dilema.recipe Normal file (107 lines)
@@ -0,0 +1,107 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Dilema(BasicNewsRecipe):
    title = 'Dilema'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'politics, culture, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'post'}
    remove_tags_after = {'class': 'post_content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'single_meta_category',
                    'avatar',
                    'jm-post-like',
                    'fa',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['like', 'dlik']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        homepage_url = 'https://www.dilema.ro/'
        soup = self.index_to_soup(homepage_url)

        articles = []

        # The lead article lives in .banner-container
        banner_container = soup.find('div', attrs={'class': 'banner-container'})
        container = banner_container.find('h5')
        a = container.find('a')
        url = homepage_url + a.attrs['href']
        articles.append(
            dict(
                title=self.tag_to_string(container).strip(),
                url=url,
                date=self.tag_to_string(banner_container.find(attrs={'class': 'post-date'})).strip(),
                description='',
                content=''
            )
        )

        # The remaining articles live in .homepage_builder_3grid_post grids
        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
        for container in containers:
            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
                continue
            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
                title_container = article.find('h3')
                if not title_container:
                    continue
                url = title_container.find('a')['href']
                url = homepage_url + url
                article_title = self.tag_to_string(title_container).strip()
                author = self.tag_to_string(
                    article.find('a', attrs={'rel': 'author'})
                ).strip()
                summary = self.tag_to_string(article.find('p')).strip()
                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
                description = author + ' - ' + summary
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=description,
                        content=''
                    )
                )

        sections = [('Numărul curent', articles)]
        return sections

    def preprocess_html(self, soup):
        # On the weekly cover page, keep only the carousel image.
        main_carousel = soup.find(attrs={'id': 'main-carousel'})
        if main_carousel:
            img = main_carousel.find('img')
            body = soup.find('body')
            body.clear()
            body.append(img)
        return soup

    def get_cover_url(self):
        url = 'https://www.dilema.ro/coperta-saptaminii/'
        soup = self.index_to_soup(url)
        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
        return url + img.attrs['src']
BIN recipes/internazionale.png Normal file (1.6 KiB)
recipes/internazionale.recipe Normal file (117 lines)
@@ -0,0 +1,117 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Internazionale(BasicNewsRecipe):
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None

    def extract_article(self, article):
        url = article.find('a')['href']
        if url[0] == '/':
            url = self.home_url + url
        # Build 'TAG • Title' style article titles.
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        pubdate = ''
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        description = ' \u2022 '.join(description_parts)
        return dict(
            title=article_title,
            url=url,
            date=pubdate,
            description=description,
            content=''
        )

    def parse_index(self):
        soup = self.index_to_soup(self.current_number_url)
        self.cover_url = soup.find('span', {'class': 'img_expand'})['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            # Section headings open a new section.
            if 'abstract-testatina' in container['class'] or 'abstract-testatina-cultura' in container['class']:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in container['class']:
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in container['class']:
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        # Replace lazy-loaded <figure> blocks with plain <img> tags,
        # preferring the largest available image variant.
        for node in soup.findAll('figure'):
            img_src = None
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        return self.cover_url
BIN recipes/parool.png Normal file (2.6 KiB)
recipes/parool.recipe Normal file (96 lines)
@@ -0,0 +1,96 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
from mechanize import Request
from contextlib import closing
import json


class Parool(BasicNewsRecipe):
    title = 'Het Parool'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Het Parool - Vrij, Onverveerd'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': [
            'article-footer__sharing',
            'artstyle__editorial-tips',
            'artstyle__advertisement',
            'artstyle__container__icon',
            'artstyle__disabled-embed',
            'container__title__icon',
        ]}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # A fresh UUID gets us past the privacy wall straight to today's edition.
        soup = self.index_to_soup(
            'https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                if url[0] == '/':
                    url = 'https://www.parool.nl' + url
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                # Skip the daily comic strips.
                ignore = {'dirkjan', 's1ngle', 'pukkels', 'hein de kort'}
                if teaser_label.lower() in ignore:
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                articles.append(dict(title=article_title,
                                     url=url,
                                     content=''))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        for tag in soup():
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.parool.nl' + tag['src']
        for tag in soup():
            if tag.name == 'picture':
                tag.replaceWith(tag.find('img'))
        # On comic overview pages, keep only the strip itself (the first figure).
        comic_articles = {
            'Alle strips van Dirkjan',
            'S1NGLE',
            'Pukkels',
            'Bekijk hier alle cartoons van Hein de Kort',
        }
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.parool.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # Use the teaser image of the first listed folder as the cover.
            return folders['objects'][0]['teaser_medium']
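Het Parool and de Volkskrant (updated below) share the same cover lookup against the e-pages API. A standalone sketch of that call, assuming only that the endpoint keeps the response shape the recipes rely on (the first entry in objects carrying a teaser_medium image URL):

import json
from contextlib import closing

from mechanize import Browser, Request


def epages_cover(publication):
    # publication is e.g. 'krant.parool.nl' or 'krant.volkskrant.nl'
    url = 'https://login-api.e-pages.dk/v1/' + publication + '/folders'
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    br = Browser()
    br.set_handle_robots(False)
    with closing(br.open(Request(url, None, headers))) as r:
        folders = json.loads(r.read())
    # Assumption: the first listed folder is the current issue.
    return folders['objects'][0]['teaser_medium']


if __name__ == '__main__':
    print(epages_cover('krant.parool.nl'))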
BIN recipes/revista22.png Normal file (3.7 KiB)
recipes/revista22.recipe Normal file (74 lines)
@@ -0,0 +1,74 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe


class Revista22(BasicNewsRecipe):
    title = 'Revista 22'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Revista 22'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'news, politics, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'col-span-8'}
    remove_tags_after = {'class': 'col-span-8'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'icons',
                    'float-left',
                    'samesection',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['comments']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # Follow the homepage link to the current issue.
        soup = self.index_to_soup('https://revista22.ro')
        url = soup.find('div', attrs={'class': 'uppercase'}).find('a').attrs['href']
        if url[0] == '/':
            url = 'https://revista22.ro' + url
        soup = self.index_to_soup(url)
        main_container = soup.find('div', attrs={'class': 'col-span-8'})
        containers = main_container.findAll(attrs={'class': 'mb-4'})
        articles = []
        for container in containers:
            if 'pb-4' not in container.attrs['class']:
                continue
            a = container.find('a')
            url = a['href']
            if url[0] == '/':
                url = 'https://revista22.ro' + url
            article_title = self.tag_to_string(a.find('h3')).strip()
            author = self.tag_to_string(
                container.find('span', attrs={'class': 'text-red'})
            ).strip()
            summary = self.tag_to_string(container.find('p')).strip()
            pubdate = self.tag_to_string(a.find('span'))
            description = author + ' - ' + summary
            articles.append(
                dict(
                    title=article_title,
                    url=url,
                    date=pubdate,
                    description=description,
                    content=''
                )
            )

        sections = [('Numărul curent', articles)]
        return sections
BIN recipes/volkskrant.png Normal file (12 KiB)
recipes/volkskrant.recipe
@@ -1,8 +1,9 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
from mechanize import Request
from contextlib import closing
import json


class Volkskrant(BasicNewsRecipe):
    title = 'Volkskrant'
@@ -95,4 +96,25 @@ class Volkskrant(BasicNewsRecipe):
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.volkskrant.nl' + tag['src']

        for tag in soup():
            if tag.name == 'picture':
                tag.replaceWith(tag.find('img'))

        # On comic overview pages, keep only the strip itself (the first figure).
        comic_articles = {'Bas van der Schot', 'Poldermodellen', 'Gummbah', 'Sigmund'}
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.volkskrant.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # Use the teaser image of the first listed folder as the cover.
            return folders['objects'][0]['teaser_medium']
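Any of these recipe files can be smoke-tested from the command line with, for example, ebook-convert recipes/parool.recipe .epub --test -vv; the --test flag restricts the run to a couple of articles per feed so a full issue is not downloaded.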