#!/usr/bin/env python

import json
import uuid
from contextlib import closing

from calibre.web.feeds.recipes import BasicNewsRecipe
from mechanize import Request


class Volkskrant(BasicNewsRecipe):
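    # Downloads de Volkskrant's daily edition: news, background stories and columns.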
    title = 'Volkskrant'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    language = 'nl'
    description = 'Volkskrant - Nieuws, achtergronden en columns'
    needs_subscription = False
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'article-footer__sharing',
                    'artstyle__editorial-tips',
                    'artstyle__advertisement',
                    'artstyle__container__icon',
                    'artstyle__disabled-embed',
                    'container__title__icon',
                ]
            }
        ),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

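    # Build the issue contents from today's edition overview; the privacy-wall
    # URL with a random authId redirects to /editie/vandaag/.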
    def parse_index(self):
        soup = self.index_to_soup(
            'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                if url[0] == '/':
                    url = 'https://www.volkskrant.nl' + url
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__label'})
                ).strip()
                teaser_sublabel = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})
                ).strip()
                teaser_title = self.tag_to_string(
                    header.find('h3').find(
                        'span', attrs={'class': 'teaser__title__value--short'}
                    )
                ).strip()
                if teaser_label.lower() == 'podcast':
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                pubdate = ''
                description = ''
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=description,
                        content=''
                    )
                )

            sections.append((section_title, articles))

        return sections

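    # Absolutize image URLs, unwrap <picture> elements and, for the comic
    # pages, keep only the figure itself.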
    def preprocess_html(self, soup):
        for tag in soup():
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.volkskrant.nl' + tag['src']

        for tag in soup():
            if tag.name == 'picture':
                tag.replaceWith(tag.find('img'))

        comic_articles = {'Bas van der Schot', 'Poldermodellen', 'Gummbah', 'Sigmund'}
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

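    # Fetch the folder list from the e-paper backend and use the first
    # issue's medium-sized teaser image as the cover.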
    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.volkskrant.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            return folders['objects'][0]['teaser_medium']
        return None