diff --git a/recipes/nzz_ger.recipe b/recipes/nzz_ger.recipe index b48cd63970..a0d82350f4 100644 --- a/recipes/nzz_ger.recipe +++ b/recipes/nzz_ger.recipe @@ -29,10 +29,16 @@ class Nzz(BasicNewsRecipe): remove_tags = [ dict(name='div', attrs={'class': 'progressbar__wrapper'}), # Reading progress. dict(name='div', attrs={'class': 'headline__meta'}), # Article meta data. - dict(name='div', attrs={'class': 'nzzinteraction'}), - dict(name='section', attrs={'class': 'nzzinteraction'}), dict(name='span', attrs={'class': 'image-description__author-single'}), # Photo accreditation. + dict(name='span', attrs={'data-nzz-tid': 'image-description-author'}), # Photo accreditation. dict(name='div', attrs={'class': 'disabled-overlay'}), # "Please enable Javascript". + dict(name='section', attrs={'id': 'content-table'}), # Table of contents. + dict(name='div', attrs={'componenttype': 'articlelist'}), # Article list inside the article. + ] + + remove_tags_after = [ + dict(name='p', attrs={'data-vars-danzz-last-article-element': 'true', 'class': 'articlecomponent'}), + dict(name='div', attrs={'componenttype': 'sharebox'}), # Remove everything after the social media share box. ] # Center and reduce the size of images and image captions. @@ -44,7 +50,6 @@ class Nzz(BasicNewsRecipe): remove_attributes = ['style', 'font', 'class'] feeds = [ - ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'), ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'), ('International', 'https://www.nzz.ch/international.rss'), ('Schweiz', 'https://www.nzz.ch/schweiz.rss'), @@ -114,12 +119,34 @@ class Nzz(BasicNewsRecipe): return br def preprocess_html(self, soup): - # Fix lazy-loading images + # Fix lazy-loading images. for img in soup.findAll('img', attrs={'srcset': True}): img['src'] = img['srcset'].split()[0] # To prevent image captions from being displayed as headers in the output, convert them from
<h2> to <p>. - for caption in soup.findAll('h2', attrs={'class': 'image-description__caption'}): + for caption in soup.findAll('h2', attrs={'data-nzz-tid': 'image-description-caption'}): caption.name = 'p' + # Remove the article metadata block. + # First, find the first