mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-03-31 22:32:28 -04:00
Align recipe with structure of updated NZZ website
This commit is contained in:
parent
13422da467
commit
5d9b64ea6c
@ -29,10 +29,16 @@ class Nzz(BasicNewsRecipe):
|
||||
remove_tags = [
|
||||
dict(name='div', attrs={'class': 'progressbar__wrapper'}), # Reading progress.
|
||||
dict(name='div', attrs={'class': 'headline__meta'}), # Article meta data.
|
||||
dict(name='div', attrs={'class': 'nzzinteraction'}),
|
||||
dict(name='section', attrs={'class': 'nzzinteraction'}),
|
||||
dict(name='span', attrs={'class': 'image-description__author-single'}), # Photo accreditation.
|
||||
dict(name='span', attrs={'data-nzz-tid': 'image-description-author'}), # Photo accreditation.
|
||||
dict(name='div', attrs={'class': 'disabled-overlay'}), # "Please enable Javascript".
|
||||
dict(name='section', attrs={'id': 'content-table'}), # Table of contents.
|
||||
dict(name='div', attrs={'componenttype': 'articlelist'}), # Article list inside the article.
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='p', attrs={'data-vars-danzz-last-article-element': 'true', 'class': 'articlecomponent'}),
|
||||
dict(name='div', attrs={'componenttype': 'sharebox'}), # Remove everything after the social media share box.
|
||||
]
|
||||
|
||||
# Center and reduce the size of images and image captions.
|
||||
@ -44,7 +50,6 @@ class Nzz(BasicNewsRecipe):
|
||||
remove_attributes = ['style', 'font', 'class']
|
||||
|
||||
feeds = [
|
||||
('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
|
||||
('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
|
||||
('International', 'https://www.nzz.ch/international.rss'),
|
||||
('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
|
||||
@ -114,12 +119,34 @@ class Nzz(BasicNewsRecipe):
|
||||
return br
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
# Fix lazy-loading images
|
||||
# Fix lazy-loading images.
|
||||
for img in soup.findAll('img', attrs={'srcset': True}):
|
||||
img['src'] = img['srcset'].split()[0]
|
||||
|
||||
# To prevent image captions from being displayed as headers in the output, convert them from <h2> to <p>.
|
||||
for caption in soup.findAll('h2', attrs={'class': 'image-description__caption'}):
|
||||
for caption in soup.findAll('h2', attrs={'data-nzz-tid': 'image-description-caption'}):
|
||||
caption.name = 'p'
|
||||
|
||||
# Remove the article metadata block.
|
||||
# First, find the first <hr> tag.
|
||||
hr_tag = soup.find('hr')
|
||||
if hr_tag:
|
||||
self.log.debug(f"Found <hr> tag: {hr_tag}")
|
||||
# Then, if the next direct sibling after the `hr_tag` is a <div> whose classes include "items-start",
|
||||
# "justify-between" and "text-left", remove that <div>, as it is the metadata block.
|
||||
next_sibling = hr_tag.find_next_sibling()
|
||||
if next_sibling and next_sibling.name == 'div':
|
||||
self.log.debug(f"Found next sibling <div>: {next_sibling}")
|
||||
next_sibling_classes = next_sibling.get('class', [])
|
||||
if all(c in next_sibling_classes for c in ['items-start', 'justify-between', 'text-left']):
|
||||
self.log.debug("Removing the article metadata block.")
|
||||
next_sibling.decompose()
|
||||
|
||||
# Remove the social media share box, which should delimit the end of the article.
|
||||
sharebox_div = soup.find('div', attrs={'componenttype': 'sharebox'})
|
||||
self.log.debug(f"Searching for sharebox <div> with attrs {{'componenttype': 'sharebox'}}.")
|
||||
if sharebox_div:
|
||||
self.log.debug(f"Found sharebox <div> that will now be removed.")
|
||||
sharebox_div.decompose()
|
||||
|
||||
return soup
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user