calibre/recipes/nzz_ger.recipe

import json
from datetime import datetime
from mechanize import Request
from calibre.web.feeds.recipes import BasicNewsRecipe


class Nzz(BasicNewsRecipe):
    title = 'NZZ'
    __author__ = 'Claude Henchoz'
    description = 'Neue Zürcher Zeitung'
    publisher = 'Neue Zürcher Zeitung'
    category = 'news, politics'
    oldest_article = 7
    max_articles_per_feed = 15
    language = 'de'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    scale_news_images = (600, 400)
    scale_news_images_to_device = True
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/37/Neue_Z%C3%BCrcher_Zeitung.svg/800px-Neue_Z%C3%BCrcher_Zeitung.svg.png'

    keep_only_tags = [dict(name='section', attrs={'class': 'container--article'})]

    remove_tags = [
        dict(name='div', attrs={'class': 'progressbar__wrapper'}),  # Reading progress.
        dict(name='div', attrs={'class': 'headline__meta'}),  # Article meta data.
        dict(name='div', attrs={'class': 'nzzinteraction'}),
        dict(name='section', attrs={'class': 'nzzinteraction'}),
        dict(name='span', attrs={'class': 'image-description__author-single'}),  # Photo accreditation.
        dict(name='div', attrs={'class': 'disabled-overlay'}),  # "Please enable Javascript".
    ]

    # Center and reduce the size of images and image captions.
    extra_css = '''
        img { display: block; margin: auto; width: 50%; height: auto; }
        div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
    '''

    remove_attributes = ['style', 'font', 'class']

    feeds = [
        ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
        ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
        ('International', 'https://www.nzz.ch/international.rss'),
        ('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
        ('Wirtschaft', 'https://www.nzz.ch/wirtschaft.rss'),
        ('Finanznachrichten', 'https://www.nzz.ch/finanzen.rss'),
        ('Kultur', 'https://www.nzz.ch/feuilleton.rss'),
        ('Sport', 'https://www.nzz.ch/sport.rss'),
        ('Zürich', 'https://www.nzz.ch/zuerich.rss'),
        ('Panorama', 'https://www.nzz.ch/panorama.rss'),
        ('Wissenschaft', 'https://www.nzz.ch/wissenschaft.rss'),
        ('Auto', 'https://www.nzz.ch/mobilitaet/auto-mobil.rss'),
        ('Technologie', 'https://www.nzz.ch/technologie.rss'),
    ]

    def get_cover_url(self):
        # Query payload: today's edition, searching backwards from today's date.
        today_date = datetime.now().strftime('%Y-%m-%d')
        json_data = {
            'editions': [
                {
                    'publicationDate': today_date,
                    'defId': 6,
                },
            ],
            'startDate': today_date,
            'maxHits': 1,
            'direction': 'BACKWARD',
        }
        # Request headers.
        headers = {
            'Accept': 'application/json',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
            'Content-Type': 'application/json',
            'Origin': 'https://epaper.nzz.ch',
            'Referer': 'https://epaper.nzz.ch/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
        # Encode the JSON payload.
        encoded_data = json.dumps(json_data).encode('utf-8')
        # Build a mechanize Request with the target URL, encoded payload and headers.
        req = Request(url='https://epaper.nzz.ch/epaper/1.0/findEditionsFromDate',
                      data=encoded_data,
                      headers=headers,
                      method='POST')
        # Open the request and parse the JSON response.
        browser = self.get_browser()
        response = browser.open(req)
        response_data = json.loads(response.read())
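        # The response is assumed (based on the lookup below) to be shaped
        # roughly like:
        #   {'data': [{'pages': [{'pageDocUrl': {'PREVIEW': {'url': '...'}}}]}]}
        # i.e. the first page of the most recent edition exposes a preview
        # image URL, which serves as the cover.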
        # Extract the preview URL of the front page.
        url = response_data['data'][0]['pages'][0]['pageDocUrl']['PREVIEW']['url']
        return url
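
    # The custom browser below identifies itself as Google's crawler
    # (Googlebot user agent, a google.com referrer and an X-Forwarded-For
    # address from Google's crawl range), presumably so the site serves
    # full article text rather than a paywalled teaser.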
    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1')
        ]
        return br

    def preprocess_html(self, soup):
        # Fix lazy-loading images.
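        # An srcset attribute lists candidate URLs with width descriptors,
        # e.g. 'small.jpg 320w, large.jpg 640w' (illustrative values), so the
        # first whitespace-separated token is the first candidate URL.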
        for img in soup.findAll('img', attrs={'srcset': True}):
            img['src'] = img['srcset'].split()[0]
        # To prevent image captions from being displayed as headers in the
        # output, convert them from <h2> to <p>.
        for caption in soup.findAll('h2', attrs={'class': 'image-description__caption'}):
            caption.name = 'p'
        return soup
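
# To try this recipe from the command line, calibre's usual workflow is
# something like (output filename is illustrative):
#   ebook-convert nzz_ger.recipe output.epub --test -vv
# which downloads only a handful of articles per feed and prints verbose logs.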