Align recipe with structure of updated NZZ website

2026-03-31 22:32:28 -04:00 · 2025-10-11 01:50:58 +02:00 · 2025-10-11 01:50:58 +02:00 · 5d9b64ea6c
commit 5d9b64ea6c
parent 13422da467
1 changed files with 32 additions and 5 deletions
--- a/recipes/nzz_ger.recipe
+++ b/recipes/nzz_ger.recipe
@ -29,10 +29,16 @@ class Nzz(BasicNewsRecipe):
    remove_tags = [
        dict(name='div', attrs={'class': 'progressbar__wrapper'}),               # Reading progress.
        dict(name='div', attrs={'class': 'headline__meta'}),                     # Article meta data.
-        dict(name='div', attrs={'class': 'nzzinteraction'}),
-        dict(name='section', attrs={'class': 'nzzinteraction'}),
        dict(name='span', attrs={'class': 'image-description__author-single'}),  # Photo accreditation.
+        dict(name='span', attrs={'data-nzz-tid': 'image-description-author'}),   # Photo accreditation.
        dict(name='div', attrs={'class': 'disabled-overlay'}),                   # "Please enable Javascript".
+        dict(name='section', attrs={'id': 'content-table'}),                     # Table of contents.
+        dict(name='div', attrs={'componenttype': 'articlelist'}),                # Article list inside the article.
+    ]
+
+    remove_tags_after = [
+        dict(name='p', attrs={'data-vars-danzz-last-article-element': 'true', 'class': 'articlecomponent'}),
+        dict(name='div', attrs={'componenttype': 'sharebox'}),  # Remove everything after the social media share box.
    ]

    # Center and reduce the size of images and image captions.
@ -44,7 +50,6 @@ class Nzz(BasicNewsRecipe):
    remove_attributes = ['style', 'font', 'class']

    feeds = [
-        ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
        ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
        ('International', 'https://www.nzz.ch/international.rss'),
        ('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
@ -114,12 +119,34 @@ class Nzz(BasicNewsRecipe):
        return br

    def preprocess_html(self, soup):
-        # Fix lazy-loading images
+        # Fix lazy-loading images.
        for img in soup.findAll('img', attrs={'srcset': True}):
            img['src'] = img['srcset'].split()[0]

        # To prevent image captions from being displayed as headers in the output, convert them from <h2> to <p>.
-        for caption in soup.findAll('h2', attrs={'class': 'image-description__caption'}):
+        for caption in soup.findAll('h2', attrs={'data-nzz-tid': 'image-description-caption'}):
            caption.name = 'p'

+        # Remove the article metadata block.
+        # First, find the first <hr> tag.
+        hr_tag = soup.find('hr')
+        if hr_tag:
+            self.log.debug(f"Found <hr> tag: {hr_tag}")
+            # Then, if the next direct sibling after the `hr_tag` is a <div> whose classes include "items-start",
+            # "justify-between" and "text-left", remove that <div>, as it is the metadata block.
+            next_sibling = hr_tag.find_next_sibling()
+            if next_sibling and next_sibling.name == 'div':
+                 self.log.debug(f"Found next sibling <div>: {next_sibling}")
+                 next_sibling_classes = next_sibling.get('class', [])
+                 if all(c in next_sibling_classes for c in ['items-start', 'justify-between', 'text-left']):
+                    self.log.debug("Removing the article metadata block.")
+                    next_sibling.decompose()
+
+        # Remove the social media share box, which should delimit the end of the article.
+        sharebox_div = soup.find('div', attrs={'componenttype': 'sharebox'})
+        self.log.debug(f"Searching for sharebox <div> with attrs {{'componenttype': 'sharebox'}}.")
+        if sharebox_div:
+            self.log.debug(f"Found sharebox <div> that will now be removed.")
+            sharebox_div.decompose()
+
        return soup