diff --git a/recipes/nzz_ger.recipe b/recipes/nzz_ger.recipe index b48cd63970..a0d82350f4 100644 --- a/recipes/nzz_ger.recipe +++ b/recipes/nzz_ger.recipe @@ -29,10 +29,16 @@ class Nzz(BasicNewsRecipe): remove_tags = [ dict(name='div', attrs={'class': 'progressbar__wrapper'}), # Reading progress. dict(name='div', attrs={'class': 'headline__meta'}), # Article meta data. - dict(name='div', attrs={'class': 'nzzinteraction'}), - dict(name='section', attrs={'class': 'nzzinteraction'}), dict(name='span', attrs={'class': 'image-description__author-single'}), # Photo accreditation. + dict(name='span', attrs={'data-nzz-tid': 'image-description-author'}), # Photo accreditation. dict(name='div', attrs={'class': 'disabled-overlay'}), # "Please enable Javascript". + dict(name='section', attrs={'id': 'content-table'}), # Table of contents. + dict(name='div', attrs={'componenttype': 'articlelist'}), # Article list inside the article. + ] + + remove_tags_after = [ + dict(name='p', attrs={'data-vars-danzz-last-article-element': 'true', 'class': 'articlecomponent'}), + dict(name='div', attrs={'componenttype': 'sharebox'}), # Remove everything after the social media share box. ] # Center and reduce the size of images and image captions. @@ -44,7 +50,6 @@ class Nzz(BasicNewsRecipe): remove_attributes = ['style', 'font', 'class'] feeds = [ - ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'), ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'), ('International', 'https://www.nzz.ch/international.rss'), ('Schweiz', 'https://www.nzz.ch/schweiz.rss'), @@ -114,12 +119,34 @@ class Nzz(BasicNewsRecipe): return br def preprocess_html(self, soup): - # Fix lazy-loading images + # Fix lazy-loading images. for img in soup.findAll('img', attrs={'srcset': True}): img['src'] = img['srcset'].split()[0] # To prevent image captions from being displayed as headers in the output, convert them from

<h2> to <p>. - for caption in soup.findAll('h2', attrs={'class': 'image-description__caption'}): + for caption in soup.findAll('h2', attrs={'data-nzz-tid': 'image-description-caption'}): caption.name = 'p' + # Remove the article metadata block. + # First, find the first


<hr> tag. + hr_tag = soup.find('hr') + if hr_tag: + self.log.debug(f"Found
<hr> tag: {hr_tag}") + # Then, if the next direct sibling after the `hr_tag` is a
<div> of class "mx-auto", "items-start", + # "justify-between" and "text-left", remove that
<div>, as it is the metadata block. + next_sibling = hr_tag.find_next_sibling() + if next_sibling and next_sibling.name == 'div': + self.log.debug(f"Found next sibling
<div>: {next_sibling}") + next_sibling_classes = next_sibling.get('class', []) + if all(c in next_sibling_classes for c in ['items-start', 'justify-between', 'text-left']): + self.log.debug("Removing the article metadata block.") + next_sibling.decompose() + + # Remove the social media share box, which should delimit the end of the article. + sharebox_div = soup.find('div', attrs={'componenttype': 'sharebox'}) + self.log.debug(f"Searching for sharebox
<div> with attrs {{'componenttype': 'sharebox'}}.") + if sharebox_div: + self.log.debug(f"Found sharebox
<div> that will now be removed.") + sharebox_div.decompose() + return soup