# calibre/recipes/nzz_ger.recipe
# 2024-03-30 13:03:29 +05:30
#
# 114 lines
# 4.2 KiB
# Python

import json
from datetime import datetime
from calibre.web.feeds.recipes import BasicNewsRecipe
from mechanize import Request
class Nzz(BasicNewsRecipe):
    """Calibre news recipe for the Neue Zürcher Zeitung (German edition).

    Articles come from the public RSS feeds listed in ``feeds``; the cover
    image is looked up through the NZZ e-paper JSON endpoint.
    """

    title = 'NZZ'
    __author__ = 'Claude Henchoz'
    description = 'Neue Zürcher Zeitung'
    publisher = 'Neue Zürcher Zeitung'
    category = 'news, politics'
    oldest_article = 30
    max_articles_per_feed = 15
    language = 'de'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    scale_news_images = (600, 400)
    scale_news_images_to_device = True
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/37/Neue_Z%C3%BCrcher_Zeitung.svg/800px-Neue_Z%C3%BCrcher_Zeitung.svg.png'

    # Keep only the article container, then drop page chrome inside it.
    keep_only_tags = [dict(name='section', attrs={'class': 'container--article'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'progressbar__wrapper'}),
        dict(name='div', attrs={'class': 'headline__meta'}),
        dict(name='figcaption', attrs={'class': 'articlecomponent__description'}),
        dict(name='div', attrs={'class': 'nzzinteraction'}),
        dict(name='section', attrs={'class': 'nzzinteraction'}),
    ]
    remove_attributes = ['style', 'font', 'class']

    # (section title, RSS feed URL) pairs.
    feeds = [
        ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
        ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
        ('International', 'https://www.nzz.ch/international.rss'),
        ('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
        ('Wirtschaft', 'https://www.nzz.ch/wirtschaft.rss'),
        ('Finanznachrichten', 'https://www.nzz.ch/finanzen.rss'),
        ('Kultur', 'https://www.nzz.ch/feuilleton.rss'),
        ('Sport', 'https://www.nzz.ch/sport.rss'),
        ('Zürich', 'https://www.nzz.ch/zuerich.rss'),
        ('Panorama', 'https://www.nzz.ch/panorama.rss'),
        ('Wissenschaft', 'https://www.nzz.ch/wissenschaft.rss'),
        ('Auto', 'https://www.nzz.ch/mobilitaet/auto-mobil.rss'),
        ('Technologie', 'https://www.nzz.ch/technologie.rss'),
    ]

    def get_cover_url(self):
        """Return the preview-image URL of the most recent e-paper edition.

        POSTs a JSON query to the NZZ e-paper ``findEditionsFromDate``
        endpoint asking for a single edition, searching backwards from
        today's date, and returns the ``PREVIEW`` URL of that edition's
        first page.

        Returns:
            The cover image URL, or ``None`` when the response does not
            contain an edition in the expected shape (no cover is used).
        """
        # Uses the local date of the machine running the recipe.
        today_date = datetime.now().strftime('%Y-%m-%d')
        json_data = {
            'editions': [
                {
                    'publicationDate': today_date,
                    # NOTE(review): defId 6 presumably selects the NZZ
                    # print edition -- confirm against the e-paper API.
                    'defId': 6,
                },
            ],
            'startDate': today_date,
            'maxHits': 1,
            'direction': 'BACKWARD',
        }
        # Headers imitating the XHR call made by the e-paper web app.
        headers = {
            'Accept': 'application/json',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
            'Content-Type': 'application/json',
            'Origin': 'https://epaper.nzz.ch',
            'Referer': 'https://epaper.nzz.ch/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
        encoded_data = json.dumps(json_data).encode('utf-8')
        req = Request(
            url='https://epaper.nzz.ch/epaper/1.0/findEditionsFromDate',
            data=encoded_data,
            headers=headers,
            method='POST',
        )
        browser = self.get_browser()
        response = browser.open(req)
        try:
            response_data = json.loads(response.read())
        finally:
            # Release the connection even if the body is not valid JSON.
            response.close()
        try:
            return response_data['data'][0]['pages'][0]['pageDocUrl']['PREVIEW']['url']
        except (KeyError, IndexError, TypeError):
            # No edition found or the API shape changed: skip the cover
            # instead of failing with an opaque lookup error.
            self.log.warning('NZZ: no cover preview found in e-paper response')
            return None

    def get_browser(self, *args, **kwargs):
        """Return a browser that identifies itself as Googlebot.

        Forces a Googlebot ``user_agent`` and appends a Google ``Referer``
        plus an ``X-Forwarded-For`` header to the browser's default headers.
        NOTE(review): presumably this is what gets full article text served
        -- confirm it is still required.
        """
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1')
        ]
        return br

    def preprocess_html(self, soup):
        """Give lazy-loaded images a real ``src`` taken from ``srcset``.

        For every ``<img>`` carrying a ``srcset`` attribute, the URL of the
        first candidate is copied into ``src``. Images whose ``srcset`` is
        empty are left untouched.

        Args:
            soup: Parsed article document.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            if not candidates:
                # Empty/whitespace-only srcset: nothing to promote.
                continue
            # A candidate without a descriptor is directly followed by the
            # separating comma ("a.jpg, b.jpg 2x"); it is not part of the URL.
            first_url = candidates[0].rstrip(',')
            if first_url:
                img['src'] = first_url
        return soup