# calibre/recipes/faz_net.recipe

import json
import re

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


def format_tickaroo_liveblog(soup):
    """Rework Tickaroo live-blog markup in place.

    Four passes are made over ``soup``:

    1. Every ``img.tik4-media-image__img`` carrying a ``srcset`` gets a plain
       ``src``: the candidate listed with the ``960w`` descriptor when there
       is one, otherwise (and only if the image has no ``src`` yet) the first
       candidate. An empty ``srcset`` leaves the image untouched.
    2. A ``<br>`` is inserted before each position-2 rich-text content block.
    3. A ``<br>`` is inserted before every ``<time>`` element.
    4. In each author wrapper the author name is re-classed as
       ``tik4-media-body__title`` and, when the wrapper also holds a
       thumbnail, moved directly behind that thumbnail.

    :param soup: parsed article tree; ``findAll`` and ``new_tag`` are used
    :return: ``None``; the tree is modified in place
    """
    for img in soup.findAll('img', attrs={'class': 'tik4-media-image__img', 'srcset': True}):
        # Split on whitespace only: the tokens alternate between URL and
        # width descriptor, the descriptor possibly carrying a trailing comma.
        candidates = img['srcset'].split()
        for pos, token in enumerate(candidates):
            # pos > 0 guarantees there is a URL in front of the descriptor;
            # without it index -1 would silently pick the last token.
            if pos > 0 and token in ('960w,', '960w'):
                img['src'] = candidates[pos - 1]
                break
        if candidates and not img.has_attr('src'):
            img['src'] = candidates[0]

    rich_text = 'tik4-content-block tik4-content-block--rich-text tik4-content-block--position-2'
    for div in soup.findAll('div', attrs={'class': rich_text}):
        div.insert_before(soup.new_tag('br'))

    # format liveblogs
    for stamp in soup.findAll('time'):
        stamp.insert_before(soup.new_tag('br'))

    for wrapper in soup.findAll(class_='tik4-author__wrapper'):
        # Both lookups are scoped to this wrapper, so a name found in an
        # earlier wrapper can never leak into a later one.
        name = wrapper.find(class_='tik4-author__name')
        thumb = wrapper.find(class_='tik4-author__thumb')
        if name is None:
            continue
        name['class'] = 'tik4-media-body__title'
        if thumb is not None:
            thumb.insert_after(name.extract())


# process run of images
def bilderstrecke(soup, tag):
    """Build an inline image gallery from the JSON payload held by ``tag``.

    ``tag.contents[0]`` is parsed as JSON. The result must be a list that
    contains at least one dict with a ``caption`` key; otherwise nothing is
    done. In each dict that has both ``caption`` and ``defaultUrl``, the
    values of ``caption``, ``source`` and ``defaultUrl`` are used as indices
    into that same list to fetch the caption text, the credit text and the
    image URL.

    For every entry whose URL resolves to a string a
    ``<figure><img/><p>caption<span>credit</span></p></figure>`` is created.
    The figures are collected in a ``<div>`` that is inserted after the
    ``header-teaser`` element (the first such element is dropped beforehand
    when there is more than one). Finally all
    ``header-teaser__image--default`` elements are removed.

    The page is left untouched when the payload cannot be parsed, when no
    figure could be built, or when there is no ``header-teaser`` to anchor
    the gallery to.

    :param soup: parsed article tree, modified in place
    :param tag: element whose first child holds the JSON text
    :return: ``None``
    """
    # Best effort: an empty tag or malformed JSON simply means "no gallery".
    try:
        struct = json.loads(str(tag.contents[0]))
    except Exception:
        return

    if not isinstance(struct, list):
        return
    if not any(isinstance(item, dict) and 'caption' in item for item in struct):
        return

    figures = []
    for entry in struct:
        if not (isinstance(entry, dict) and 'caption' in entry and 'defaultUrl' in entry):
            continue
        url = _nuxt_string(struct, entry['defaultUrl'])
        if url is None:
            # Without an image there is nothing worth showing.
            continue

        caption = soup.new_tag('p')
        caption['class'] = 'body-elements__image-figcaption'
        caption_text = _nuxt_string(struct, entry['caption'])
        if caption_text is not None:
            caption.append(caption_text)

        if 'source' in entry:
            credit_text = _nuxt_string(struct, entry['source'])
            if credit_text is not None:
                credit = soup.new_tag('span')
                credit['class'] = 'body-elements__image-figcaption--source'
                credit.append(credit_text)
                caption.append(credit)

        image = soup.new_tag('img')
        image['src'] = url
        figure = soup.new_tag('figure')
        figure.append(image)
        figure.append(caption)
        figures.append(figure)

    if not figures:
        return

    teasers = soup.findAll(class_='header-teaser')
    if len(teasers) > 1:
        teasers[0].extract()
    # Look the anchor up again after the removal above.
    anchor = soup.find(class_='header-teaser')
    if anchor is None:
        return

    gallery = soup.new_tag('div')
    for figure in figures:
        gallery.append(figure)
    anchor.insert_after(gallery)

    # The gallery replaces the default teaser image(s).
    for default_image in soup.findAll(class_='header-teaser__image--default'):
        default_image.extract()


def _nuxt_string(pool, ref):
    """Return ``pool[int(ref)]`` if that is a string, otherwise ``None``."""
    try:
        value = pool[int(ref)]
    except (TypeError, ValueError, IndexError):
        return None
    return value if isinstance(value, str) else None


def story(soup, tag):
    """Move the first caption directly behind the first lazy-loaded image.

    Looks up the first ``<img loading="lazy">`` and the first
    ``<figcaption class="caption">`` in ``soup``. When both exist, the
    caption is detached from its current position and re-inserted right
    after the image; otherwise the tree is left as it is.

    :param soup: parsed article tree, modified in place
    :param tag: not used by this function
    :return: ``None``
    """
    image = soup.find('img', attrs={'loading': 'lazy'})
    caption = soup.find('figcaption', attrs={'class': 'caption'})
    if not image or not caption:
        return
    detached = caption.extract()
    image.insert_after(detached)


class FazNet(BasicNewsRecipe):
    """Recipe that downloads articles from the FAZ.NET RSS feeds.

    ``preprocess_html`` reshapes story pages, image galleries and Tickaroo
    live blogs, skips paywalled articles and strips images down to
    ``src``/``alt``/``title``. ``postprocess_html`` puts a full stop between
    an image caption and the credit that follows it.
    """

    # Version 9.1m
    # Update 2024-05
    # original by Armin Geller
    # overhaul to deal with changes in the faz.net websites

    title                       = 'FAZ.NET'
    __author__                  = 'Unknown'
    description                 = 'Frankfurter Allgemeine Zeitung'
    publisher                   = 'Frankfurter Allgemeine Zeitung GmbH'
    category                    = 'news, politics, Germany'
    cover_url                   = 'https://upload.wikimedia.org/wikipedia/commons/7/72/Frankfurter_Allgemeine_logo.svg'
    encoding                    = 'utf-8'
    language                    = 'de'
    ignore_duplicate_articles   = {'title', 'url'}
    max_articles_per_feed       = 30
    no_stylesheets              = True
    remove_javascript           = True
    scale_news_images           = (10,100)
    delay                       = 1

    test_feed = 'https://www.faz.net/rss/aktuell/feuilleton/kunst-und-architektur/berlinische-galerie-zeigt-edvard-munch-die-ganze-gefuehlsskala-des-lebens-19180631.html?printPagedArticle=true#pageIndex_2'

    # CSS lengths must not have whitespace between number and unit,
    # otherwise the whole declaration is invalid.
    extra_css = '''
        .header-title,.scrolly-title {font-size: 1.5em; font-weight:bold; text-align:left;}
        .quote {font-size: 1.5em; font-weight:bold; text-align:center;}
        .author {font-size: 0.7em; font-weight:bold; text-align:center; display:block;
            margin-bottom: 0.95em; color:grey;}
        .header-label__content {font-size: 0.7em; font-weight:bold; text-align:left; display:block;
            margin-bottom: 0.95em; color:grey;}
        h3 {font-size:1.3em;text-align:left;}
        .caption,.body-elements__image-figcaption,.header-teaser__image-details,.tik4-media-body__title,.scrolly-text {
                margin-top:0.05em;margin-bottom:1em; font-size: 0.85em; text-align:left;}
        .body-elements__image-figcaption--source,.header-teaser__image-details--source,.tik4-media-body__credit {
                font-size: 0.65em; font-style:italic; text-align:left;margin-left:0.4em;}
        .header-detail--bold {font-size:0.6em; font-weight:bold; margin-bottom:0.75em;text-align:left;}
        time {font-size:0.6em; font-weight: normal; margin-bottom:0.75em; text-align:left; display:block;}
        .header-teaser,.scrolly-intro {font-size:1em; font-style:italic; font-weight:bold;margin-bottom:1em;}
        .tik4-media-image {margin-bottom:1em;margin-top:1em;}
        '''

    # The __NUXT_DATA__ script is kept because bilderstrecke() reads the
    # gallery captions and image URLs from it in preprocess_html().
    keep_only_tags = [dict(name='article', attrs={'class':['article','storytelling']}),
                      dict(name='body'),
                      dict(name='div', attrs={'class':['imageGallery','image_only']}),
                      dict(name='div', attrs={'class':'tik4-live__container'}),
                      dict(name='script', attrs={'id':'__NUXT_DATA__'}),
                      ]

    remove_tags = [
                   dict(name='div', attrs={'class':[
                       'related-articles','consent-placeholder',
                       'article-footer content-container',
                       'tik4-sharing','tik4-load-more-bottom',
                       'tik4-by','header-detail__image','mm-adbox','upper-toolbar content-container'
                   ]}),
                   # dict(name ='script'),
                   dict(name='style'),
                   dict(name='svg'),
                   dict(name='div', attrs={'data-module':'teaser'}),

                  ]

    remove_attributes = ['onclick']

    # Development switch: False -> use the RSS feeds below,
    # True -> define parse_index() and fetch one hard-coded test article.
    test_article = False
    if not test_article:
        feeds = [
                 ('FAZ.NET Aktuell', 'https://www.faz.net/rss/aktuell/'),
                 ('Politik', 'https://www.faz.net/rss/aktuell/politik/'),
                 ('Wirtschaft', 'https://www.faz.net/rss/aktuell/wirtschaft/'),
                 ('Feuilleton', 'https://www.faz.net/rss/aktuell/feuilleton/'),
                 ('Sport', 'https://www.faz.net/rss/aktuell/sport/'),
                 ('Lebensstil', 'https://www.faz.net/rss/aktuell/lebensstil/'),
                 ('Gesellschaft', 'https://www.faz.net/rss/aktuell/gesellschaft/'),
                 ('Finanzen', 'https://www.faz.net/rss/aktuell/finanzen/'),
                 ('Technik & Motor', 'https://www.faz.net/rss/aktuell/technik-motor/'),
                 ('Wissen', 'https://www.faz.net/rss/aktuell/wissen/'),
                 ('Reise', 'https://www.faz.net/rss/aktuell/reise/'),
                 ('Karriere & Hochschule', 'https://www.faz.net/rss/aktuell/karriere-hochschule/'),
                 ('Rhein-Main', 'https://www.faz.net/rss/aktuell/rhein-main/')
                ]
    else:
        def parse_index(self):
            """Return a single-article index for testing one URL."""
            test_article = 'https://www.faz.net/aktuell/stil/mode-im-em-jahr-wir-zeigen-wie-fussball-und-mode-zusammengehoeren-19766969.html'
            # test_article = 'https://www.faz.net/aktuell/feuilleton/buecher/film-eruption-ein-thriller-aus-dem-nachlass-von-michael-crichton-19770491.html'
            # test_article = 'https://www.faz.net/aktuell/stil/mode-design/leonie-benesch-sandra-hueller-ist-eine-meiner-heldinnen-19671638.html'
            # test_article = 'https://www.faz.net/aktuell/feuilleton/medien/sabine-postel-zum-siebzigsten-die-briten-nannten-sie-german-traktor-19708409.html'
            # test_article = 'https://www.faz.net/aktuell/stil/mode-design/von-richert-beil-bis-william-fan-wer-kauft-denn-das-19666592.html'
            # test_article = 'https://www.faz.net/aktuell/feuilleton/buecher/rezensionen/sachbuch/tom-mustills-buch-die-sprache-der-wale-19657782.html'
            if test_article:
                return [('Articles', [{'title': 'Test article', 'url': test_article}])]
            # NOTE(review): unreachable while the local test_article above is
            # a non-empty string. The code below reads self.INDEX, which is
            # not defined in this class - confirm before relying on it.
            soup = self.index_to_soup(self.INDEX)
            img = soup.find(**prefixed_classes('IssueDescription_cover__'))
            if img is not None:
                self.cover_url = img['src']
            current_section, current_articles = 'Cover Story', []
            feeds = []
            for x in soup.findAll(**prefixed_classes('TocFeaturedSection_heading__ TocSection_heading__ TocHeroGridItem_hedLink___ TocGridItem_hedLink__')):
                cls = x['class']
                if not isinstance(cls, str):
                    cls = ' '.join(cls)
                title = self.tag_to_string(x).strip()
                if 'Section' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_section, current_articles = title, []
                    self.log(current_section)
                    continue
                url = x['href']
                current_articles.append({'title': title, 'url': url})
                self.log('\t', title, url)
            if current_articles:
                feeds.append((current_section, current_articles))
            return feeds

    def preprocess_html(self, soup):
        """Reshape a downloaded article before conversion.

        :param soup: parsed article, modified in place
        :return: the same ``soup``
        """
        # Format story-type article
        tag = soup.find(class_='storyContainer')
        if tag:
            story(soup,tag)

        # Extract images and text from image galleries. A paragraph that
        # consists solely of "<n> Bilder" marks a gallery page; only the first
        # such paragraph is acted upon.
        for par in soup.findAll('p'):
            if len(par.contents) == 1:
                cont = str(par.contents[0])
                if re.search(r'^[1-9]\d* Bilder$',cont):
                    nuxt = soup.find('script',attrs={'id':'__NUXT_DATA__','type':'application/json'})
                    if nuxt is not None:
                        bilderstrecke(soup,nuxt)
                    break

        # unwrap buttons
        for tag in soup.findAll('button'):
            tag.unwrap()

        # remove the screen-reader-only ":" from the header label
        tag = soup.find(class_='header-label__content')
        if tag:
            colon=tag.find(class_='sr-only')
            if colon:
                colon.extract()

        # Skip articles behind paywall
        if soup.find(id='faz-paywall'):
            self.abort_article('Skipping paywalled article')

        # Remove F.A.Z. ad. Empty paragraphs have no first child, so check
        # for contents before indexing.
        for tag in soup.findAll(attrs={'class': 'body-elements__paragraph'}):
            if tag.contents and tag.contents[0] and 'F.A.Z.-Newsletter' in tag.contents[0]:
                tag.extract()

        # format liveblog
        if soup.find(attrs={'class':'tik4-live__container'}):
            format_tickaroo_liveblog(soup)

        # remove sizes and calc attributes in images: every image with a src
        # is replaced by a fresh <img> carrying only src, alt and title
        for tag in soup.findAll('img'):
            if tag.has_attr('src'):
                new_img = soup.new_tag('img')
                new_img['src'] = tag['src']
                if tag.has_attr('alt'):
                    new_img['alt'] = tag['alt']
                if tag.has_attr('title'):
                    new_img['title'] = tag['title']
                tag.replace_with(new_img)
        return soup

    # Some last cleanup

    def postprocess_html(self, soup, first_fetch):
        """Terminate image captions with a full stop where one is missing.

        :param soup: parsed article, modified in place
        :param first_fetch: not used here
        :return: the result of ``self.adeify_images(soup)``
        """
        # Position point between figure caption and figure credit, where needed
        for tag in soup.findAll(attrs={'class':['body-elements__image-figcaption','header-teaser__image-details']}):
            # With several children tag.string is None; fall back to the
            # first child when there is one and it holds a single string.
            if tag.string is None and tag.contents and tag.contents[0].string:
                tag = tag.contents[0]
            if tag.string:
                text = str(tag.string).strip()
                if text != '' and text[-1] not in ['.','?','!',':']:
                    tag.string.replace_with(text + '.')
        return self.adeify_images(soup)