Align recipe with structure of updated NZZ website

This commit is contained in:
Henrik Holm 2025-10-11 01:50:58 +02:00
parent 13422da467
commit 5d9b64ea6c
No known key found for this signature in database

View File

@@ -29,10 +29,16 @@ class Nzz(BasicNewsRecipe):
# Tag matchers (tag name plus attribute filter) for page furniture that is stripped from articles.
remove_tags = [
    {'name': 'div', 'attrs': {'class': 'progressbar__wrapper'}},  # Reading progress.
    {'name': 'div', 'attrs': {'class': 'headline__meta'}},  # Article meta data.
    {'name': 'div', 'attrs': {'class': 'nzzinteraction'}},
    {'name': 'section', 'attrs': {'class': 'nzzinteraction'}},
    {'name': 'span', 'attrs': {'class': 'image-description__author-single'}},  # Photo accreditation.
    {'name': 'span', 'attrs': {'data-nzz-tid': 'image-description-author'}},  # Photo accreditation.
    {'name': 'div', 'attrs': {'class': 'disabled-overlay'}},  # "Please enable Javascript".
    {'name': 'section', 'attrs': {'id': 'content-table'}},  # Table of contents.
    {'name': 'div', 'attrs': {'componenttype': 'articlelist'}},  # Article list inside the article.
]
# Tag matchers marking the end of the article body.
remove_tags_after = [
    {
        'name': 'p',
        'attrs': {
            'data-vars-danzz-last-article-element': 'true',
            'class': 'articlecomponent',
        },
    },
    # Remove everything after the social media share box.
    {'name': 'div', 'attrs': {'componenttype': 'sharebox'}},
]
# Center and reduce the size of images and image captions.
@@ -44,7 +50,6 @@ class Nzz(BasicNewsRecipe):
# Attribute names to strip from the article markup (presumably applied by calibre to every tag — confirm against the BasicNewsRecipe docs).
remove_attributes = ['style', 'font', 'class']
feeds = [
('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
('International', 'https://www.nzz.ch/international.rss'),
('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
@@ -114,12 +119,34 @@ class Nzz(BasicNewsRecipe):
return br
def preprocess_html(self, soup):
    """Clean up a parsed NZZ article page.

    Works on ``soup`` in place:

    * gives images that carry a ``srcset`` a plain ``src``,
    * turns image captions from ``<h2>`` into ``<p>``,
    * removes the article metadata ``<div>`` that directly follows the first ``<hr>``,
    * removes the social media share box.

    :param soup: Parsed article tree offering ``find``/``findAll`` (BeautifulSoup-style).
    :return: The same ``soup`` object, after modification.
    """
    # Fix lazy-loading images.
    for img in soup.findAll('img', attrs={'srcset': True}):
        candidates = img['srcset'].split()
        if not candidates:
            # An empty or whitespace-only srcset offers no URL; leave the image alone
            # instead of failing with an IndexError.
            continue
        # The URL is the first token. A candidate without a width/density descriptor
        # is followed directly by the separating comma, which is not part of the URL.
        first_url = candidates[0].rstrip(',')
        if first_url:
            img['src'] = first_url

    # To prevent image captions from being displayed as headers in the output, convert them from <h2> to <p>.
    # Captions are recognised by either selector: the class-based one or the data-attribute one.
    caption_selectors = (
        {'class': 'image-description__caption'},
        {'data-nzz-tid': 'image-description-caption'},
    )
    for selector in caption_selectors:
        for caption in soup.findAll('h2', attrs=selector):
            caption.name = 'p'

    # Remove the article metadata block: the <div> that is the next sibling of the first <hr>
    # and carries all of the classes listed below.
    metadata_classes = ('items-start', 'justify-between', 'text-left')
    hr_tag = soup.find('hr')
    if hr_tag is not None:
        self.log.debug(f"Found <hr> tag: {hr_tag}")
        next_sibling = hr_tag.find_next_sibling()
        if next_sibling is not None and next_sibling.name == 'div':
            self.log.debug(f"Found next sibling <div>: {next_sibling}")
            next_sibling_classes = next_sibling.get('class') or []
            if all(c in next_sibling_classes for c in metadata_classes):
                self.log.debug("Removing the article metadata block.")
                next_sibling.decompose()

    # Remove the social media share box, which should delimit the end of the article.
    sharebox_attrs = {'componenttype': 'sharebox'}
    self.log.debug(f"Searching for sharebox <div> with attrs {sharebox_attrs}.")
    sharebox_div = soup.find('div', attrs=sharebox_attrs)
    if sharebox_div is not None:
        self.log.debug("Found sharebox <div> that will now be removed.")
        sharebox_div.decompose()
    return soup