calibre/recipes/zeitde.recipe
2025-01-24 11:14:25 +01:00

149 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# vim:fileencoding=utf-8
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Zeit-Online.de
'''
import re
from datetime import datetime

# Prefer the Python 3 module names and fall back to the Python 2 ones.
try:
    from http.cookiejar import Cookie
except ImportError:
    from cookielib import Cookie
try:
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen
except ImportError:
    from urllib2 import HTTPError, Request, urlopen

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class ZeitDe(BasicNewsRecipe):
    """Recipe for the ZEIT ONLINE news feeds listed in ``feeds``.

    Besides the declarative settings below, the recipe:

    * sets a ``zonconsent`` cookie on the browser (``get_browser``),
    * prefers the single-page ``/komplettansicht`` variant of an article
      when a HEAD request for it succeeds (``print_version``),
    * aborts articles carrying a ``lp:paywall`` meta tag and reshapes the
      markup before cleanup (``preprocess_raw_html``).
    """

    __author__ = 'Armin Geller, Thomas Hollstegge'  # AGe 2014-02-26

    # Descriptive metadata.
    title = u'Zeit Online'
    description = u'German online portal of newspaper Die Zeit'
    publisher = 'ZEIT ONLINE GmbH'
    category = 'news, Germany'
    timefmt = ' [%a, %d %b %Y]'
    publication_type = 'newspaper'
    language = 'de'
    encoding = 'UTF-8'

    # Download limits; ``oldest_article`` (in days) can be overridden through
    # the 'days' recipe specific option, see ``__init__``.
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # Automatic cleanup, with XPath expressions for elements to keep.
    auto_cleanup = True
    auto_cleanup_keep = (
        '//header[@class="article-header"]'
        '|//div[@class="gallery__media-container"]'
        '|//div[@class="article__media-container"]'
    )

    masthead_url = 'https://static.zeit.de/p/zeit.web/3.714/images/structured-data-publisher-logo-zon.png'

    # Built from one literal per CSS rule so that code indentation does not
    # leak into the stylesheet text.
    extra_css = (
        '.figure__text { font-size: 0.9em; font-style: italic; }\n'
        '.figure__copyright { font-size: 0.9em; font-style: italic; color: #888; }\n'
        '.article-heading__kicker { font-size: 0.5em; display: block; margin-bottom: 1em; }\n'
        'p.summary { font-size: 1.3em; font-style: italic; } '
    )

    # Elements dropped from every article.
    remove_tags = [
        dict(name='aside', class_='topicbox'),
        dict(name='aside', class_='article-toc'),
        dict(class_='visually-hidden'),
        dict(name='a', class_='faq-link'),
    ]

    # Must stay below ``oldest_article``: the default is derived from it.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe, then apply the 'days' option.

        At class level ``recipe_specific_options['days']`` is a dict of
        help texts, which the ``isinstance(d, str)`` check ignores, leaving
        ``oldest_article`` at its class default. Only a non-empty string
        value is converted with ``float`` and used as ``oldest_article``.
        NOTE(review): presumably the base class stores the user-supplied
        value as a string under 'days' -- confirm against calibre.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Startseite Die wichtigsten Themen auf einen Blick',
         u'https://newsfeed.zeit.de/index'),
        (u'Politik Ausland und Deutschland',
         u'https://newsfeed.zeit.de/politik/index'),
        (u'Gesellschaft Gesellschaft und soziales Leben',
         u'https://newsfeed.zeit.de/gesellschaft/index'),
        (u'Wirtschaft Wirtschaft und Unternehmen',
         u'https://newsfeed.zeit.de/wirtschaft/index'),
        (u'Kultur Literatur, Kunst, Film und Musik',
         u'https://newsfeed.zeit.de/kultur/index'),
        (u'Wissen Wissenschaft, Gesundheit, Umwelt und Geschichte',
         u'https://newsfeed.zeit.de/wissen/index'),
        (u'Digital Hardware, Software, Internet, Datenschutz',
         u'https://newsfeed.zeit.de/digital/index'),
        (u'ZEIT Campus Online - Studieren, arbeiten, leben',
         u'https://newsfeed.zeit.de/campus/index'),
        (u'Arbeit - Umbruch, Neuanfang, Erfolg, Zweifel',
         u'https://newsfeed.zeit.de/arbeit/index'),
        (u'Entdecken - Die Poesie des Alltäglichen',
         u'https://newsfeed.zeit.de/entdecken/index'),
        (u'Mobilität Wie wir uns fortbewegen',
         u'https://newsfeed.zeit.de/mobilitaet/index'),  # AGe 2014-01-09
        (u'Sport Sieg und Niederlage',
         u'https://newsfeed.zeit.de/sport/index')
    ]

    def get_browser(self):
        """Return the base browser with a ``zonconsent`` cookie added.

        The cookie is set for ``.zeit.de`` with the current local time (ISO
        format) as its value, indicating that the cookie policy has been
        accepted.
        """
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted the cookie
        # policy
        ck = Cookie(
            version=0, name='zonconsent', value=datetime.now().isoformat(), port=None,
            port_specified=False, domain='.zeit.de',
            domain_specified=False, domain_initial_dot=True, path='/',
            path_specified=False, secure=True, expires=None, discard=False,
            comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        br.cookiejar.set_cookie(ck)
        return br

    def print_version(self, url):
        """Return the URL of the complete-page view if there is one.

        A HEAD request is sent to ``url + '/komplettansicht'``. If it
        succeeds that URL is returned; if it raises ``HTTPError`` the
        original ``url`` is returned. Other exceptions propagate unchanged.
        """
        # If there is a complete page, use that one
        full_url = url + '/komplettansicht'
        req = Request(url=full_url)
        req.get_method = lambda: 'HEAD'
        try:
            # Only the status matters; close the response right away so the
            # connection is not left open.
            urlopen(req).close()
        except HTTPError:
            return url
        return full_url

    def preprocess_raw_html(self, raw_html, url):
        """Adjust the raw article markup and return it as prettified HTML.

        :param raw_html: article markup as downloaded.
        :param url: article URL (not used here).
        :return: result of ``soup.prettify()`` on the modified document.
        """
        soup = BeautifulSoup(raw_html)

        # Skip articles behind paywall
        # NOTE(review): abort_article() is inherited; presumably it raises to
        # stop processing this article -- confirm against calibre.
        if soup.find('meta', property='lp:paywall'):
            self.abort_article()

        # Readability may strip the header for multipage articles, so simply
        # include it in the main tag
        page = soup.find('body')
        # A missing <body> or a missing data-page-type attribute simply means
        # "not an article page" instead of raising.
        if page is not None and page.get('data-page-type') == 'article':
            body = soup.find('div', {'class': 'article-body'})
            header = soup.find('header', {'class': 'article-header'})
            # Move the header only when there is a body to receive it, so it
            # is never detached and then lost.
            if body is not None and header is not None:
                header.extract()
                body.insert(0, header)

        # Add real img tags for images: the <noscript> inside each media
        # container is renamed to <div> and given an empty data-src.
        for container in soup.findAll(class_=re.compile(r'__media-container$')):
            img = container.find('noscript')
            if img is not None:
                img.name = 'div'
                img['data-src'] = ''

        return soup.prettify()