mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
114 lines
4.2 KiB
Python
114 lines
4.2 KiB
Python
import json
|
|
from datetime import datetime
|
|
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
from mechanize import Request
|
|
|
|
|
|
class Nzz(BasicNewsRecipe):
    """calibre news recipe for the Neue Zürcher Zeitung (NZZ).

    Articles come from the public nzz.ch RSS feeds listed in ``feeds``; the
    cover image is looked up through the NZZ e-paper JSON endpoint (see
    ``get_cover_url``).
    """

    # --- Publication metadata -------------------------------------------
    title = 'NZZ'
    __author__ = 'Claude Henchoz'
    description = 'Neue Zürcher Zeitung'
    publisher = 'Neue Zürcher Zeitung'
    category = 'news, politics'
    language = 'de'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 30
    max_articles_per_feed = 15
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'

    # --- Image handling -------------------------------------------------
    scale_news_images = (600, 400)
    scale_news_images_to_device = True

    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/37/Neue_Z%C3%BCrcher_Zeitung.svg/800px-Neue_Z%C3%BCrcher_Zeitung.svg.png'

    # --- HTML clean-up --------------------------------------------------
    # Only the article container is kept; the elements below are then
    # dropped from inside it.
    keep_only_tags = [dict(name='section', attrs={'class': 'container--article'})]

    remove_tags = [
        dict(name='div', attrs={'class': 'progressbar__wrapper'}),
        dict(name='div', attrs={'class': 'headline__meta'}),
        dict(name='figcaption', attrs={'class': 'articlecomponent__description'}),
        dict(name='div', attrs={'class': 'nzzinteraction'}),
        dict(name='section', attrs={'class': 'nzzinteraction'}),
    ]

    remove_attributes = ['style', 'font', 'class']

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        ('Neueste Artikel', 'https://www.nzz.ch/recent.rss'),
        ('Topthemen der Startseite', 'https://www.nzz.ch/startseite.rss'),
        ('International', 'https://www.nzz.ch/international.rss'),
        ('Schweiz', 'https://www.nzz.ch/schweiz.rss'),
        ('Wirtschaft', 'https://www.nzz.ch/wirtschaft.rss'),
        ('Finanznachrichten', 'https://www.nzz.ch/finanzen.rss'),
        ('Kultur', 'https://www.nzz.ch/feuilleton.rss'),
        ('Sport', 'https://www.nzz.ch/sport.rss'),
        ('Zürich', 'https://www.nzz.ch/zuerich.rss'),
        ('Panorama', 'https://www.nzz.ch/panorama.rss'),
        ('Wissenschaft', 'https://www.nzz.ch/wissenschaft.rss'),
        ('Auto', 'https://www.nzz.ch/mobilitaet/auto-mobil.rss'),
        ('Technologie', 'https://www.nzz.ch/technologie.rss'),
    ]

    def get_cover_url(self):
        """Return the preview-image URL of the most recent e-paper edition.

        POSTs a JSON query to the NZZ e-paper ``findEditionsFromDate``
        endpoint asking for a single edition, searching backwards from
        today's (local) date, and returns the ``PREVIEW`` URL of the first
        page of the first edition in the response.

        Raises:
            KeyError, IndexError: if the response JSON does not have the
                expected ``data[0].pages[0].pageDocUrl.PREVIEW.url`` shape.
            Exception: any error raised by the browser while performing the
                request, or by ``json.loads`` on a non-JSON response, is
                propagated unchanged.
        """
        # Prepare the date and the query payload.
        today_date = datetime.now().strftime('%Y-%m-%d')
        json_data = {
            'editions': [
                {
                    'publicationDate': today_date,
                    # NOTE(review): presumably the e-paper's id for the NZZ
                    # daily edition -- confirm against the e-paper API.
                    'defId': 6,
                },
            ],
            'startDate': today_date,
            'maxHits': 1,
            'direction': 'BACKWARD',
        }

        # Request headers; these are set explicitly on the request itself.
        headers = {
            'Accept': 'application/json',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
            'Content-Type': 'application/json',
            'Origin': 'https://epaper.nzz.ch',
            'Referer': 'https://epaper.nzz.ch/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

        # The endpoint is sent a JSON body, so serialise and encode it.
        encoded_data = json.dumps(json_data).encode('utf-8')

        # Build a mechanize Request with the target URL, body and headers.
        req = Request(
            url='https://epaper.nzz.ch/epaper/1.0/findEditionsFromDate',
            data=encoded_data,
            headers=headers,
            method='POST',
        )

        # Perform the request through the recipe's browser and decode it.
        browser = self.get_browser()
        response = browser.open(req)
        response_data = json.loads(response.read())

        # First edition -> first page -> preview image.
        url = response_data['data'][0]['pages'][0]['pageDocUrl']['PREVIEW']['url']

        return url

    def get_browser(self, *args, **kwargs):
        """Return the base recipe browser configured with Googlebot headers.

        Forces a Googlebot ``user_agent`` and appends a Google ``Referer``
        and an ``X-Forwarded-For`` header to every request made with the
        returned browser.
        """
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        # NOTE(review): the address below is presumably chosen to look like a
        # Google crawler -- confirm it is still needed.
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1')
        ]
        return br

    def preprocess_html(self, soup):
        """Give lazy-loaded images a usable ``src`` taken from ``srcset``.

        For every ``<img>`` carrying a ``srcset`` attribute, the URL of the
        first candidate is copied into ``src``. Images whose ``srcset`` is
        empty (or contains no URL) are left untouched instead of raising.

        Returns:
            The same ``soup`` object, modified in place.
        """
        # Fix lazy-loading images
        for img in soup.findAll('img', attrs={'srcset': True}):
            # Candidates are "URL [descriptor]" separated by commas. Split on
            # whitespace only, because a URL may itself contain commas.
            tokens = img['srcset'].split()
            if not tokens:
                # Empty / whitespace-only srcset: nothing to copy.
                continue
            # A candidate without a descriptor ("a.jpg, b.jpg 2x") leaves the
            # separator glued to the URL; drop it.
            first_url = tokens[0].rstrip(',')
            if first_url:
                img['src'] = first_url
        return soup