calibre/recipes/zeitde.recipe
jamierc 7646c6b1b9
Update masthead_url to correct image location
Address had changed and was slowing recipe down
2021-01-31 12:27:54 +11:00

131 lines
4.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetch Zeit-Online.de
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime
try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
try:
from urllib.request import Request, urlopen
from urllib.error import HTTPError
except ImportError:
from urllib2 import Request, urlopen, HTTPError
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class ZeitDe(BasicNewsRecipe):
    """Recipe that builds an e-book from the ZEIT ONLINE RSS feeds.

    The sections are taken from the ``newsfeed.zeit.de`` feeds listed in
    ``feeds``. For every article the recipe

    * sends a consent cookie so requests carry ``zonconsent`` (see
      ``get_browser``),
    * prefers the ``/komplettansicht`` variant of the article URL when the
      server answers a HEAD request for it (see ``print_version``),
    * aborts articles carrying the ``lp:paywall`` meta tag and rearranges the
      markup before clean-up (see ``preprocess_raw_html``).
    """

    __author__ = 'Armin Geller, Thomas Hollstegge'  # AGe 2014-02-26

    # Descriptive metadata
    title = u'Zeit Online'
    description = u'German online portal of newspaper Die Zeit'
    publisher = 'ZEIT ONLINE GmbH'
    category = 'news, Germany'
    timefmt = ' [%a, %d %b %Y]'
    publication_type = 'newspaper'
    language = 'de'
    encoding = 'UTF-8'

    # Download limits
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # Clean-up: rely on auto cleanup, but name (as an XPath expression) the
    # header and media containers that should be kept.
    auto_cleanup = True
    auto_cleanup_keep = '//header[@class="article-header"]|//div[@class="gallery__media-container"]|//div[@class="article__media-container"]'

    masthead_url = 'https://static.zeit.de/p/zeit.web/3.714/images/structured-data-publisher-logo-zon.png'

    extra_css = (
        '.figure__text { font-size: 0.9em; font-style: italic; } '
        '.figure__copyright { font-size: 0.9em; font-style: italic; color: #888; } '
        '.article-heading__kicker { font-size: 0.5em; display: block; margin-bottom: 1em; } '
        'p.summary { font-size: 1.3em; font-style: italic; } '
    )

    remove_tags = [
        dict(name='aside', class_='topicbox'),
        dict(name='aside', class_='article-toc'),
        dict(class_='visually-hidden'),
        dict(name='a', class_='faq-link'),
    ]

    # (section title, feed URL) pairs
    feeds = [
        (u'Startseite Die wichtigsten Themen auf einen Blick',
         u'https://newsfeed.zeit.de/index'),
        (u'Politik Ausland und Deutschland',
         u'https://newsfeed.zeit.de/politik/index'),
        (u'Gesellschaft Gesellschaft und soziales Leben',
         u'https://newsfeed.zeit.de/gesellschaft/index'),
        (u'Wirtschaft Wirtschaft und Unternehmen',
         u'https://newsfeed.zeit.de/wirtschaft/index'),
        (u'Kultur Literatur, Kunst, Film und Musik',
         u'https://newsfeed.zeit.de/kultur/index'),
        (u'Wissen Wissenschaft, Gesundheit, Umwelt und Geschichte',
         u'https://newsfeed.zeit.de/wissen/index'),
        (u'Digital Hardware, Software, Internet, Datenschutz',
         u'https://newsfeed.zeit.de/digital/index'),
        (u'ZEIT Campus Online - Studieren, arbeiten, leben',
         u'https://newsfeed.zeit.de/campus/index'),
        (u'Arbeit - Umbruch, Neuanfang, Erfolg, Zweifel',
         u'https://newsfeed.zeit.de/arbeit/index'),
        (u'Entdecken - Die Poesie des Alltäglichen',
         u'https://newsfeed.zeit.de/entdecken/index'),
        (u'Mobilität Wie wir uns fortbewegen',
         u'https://newsfeed.zeit.de/mobilitaet/index'),  # AGe 2014-01-09
        (u'Sport Sieg und Niederlage',
         u'https://newsfeed.zeit.de/sport/index'),
    ]

    def get_browser(self):
        """Return the default browser with a ``zonconsent`` cookie installed.

        The cookie value is the current local time in ISO format and is set
        for the ``.zeit.de`` domain, signalling that the cookie policy has
        been accepted.
        """
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted the cookie
        # policy
        ck = Cookie(
            version=0, name='zonconsent', value=datetime.now().isoformat(),
            port=None, port_specified=False,
            domain='.zeit.de', domain_specified=False, domain_initial_dot=True,
            path='/', path_specified=False,
            secure=True, expires=None, discard=False,
            comment=None, comment_url=None, rest={'HttpOnly': None},
            rfc2109=False)
        br.cookiejar.set_cookie(ck)
        return br

    def print_version(self, url):
        """Return the single-page URL of an article if the server offers one.

        A HEAD request is sent to ``url + '/komplettansicht'``. When the
        request succeeds that URL is returned; when it raises ``HTTPError``
        the original ``url`` is returned unchanged.
        """
        # If there is a complete page, use that one
        full_view = url + '/komplettansicht'
        req = Request(url=full_view)
        req.get_method = lambda: 'HEAD'
        try:
            response = urlopen(req)
        except HTTPError:
            return url
        # Only the status was of interest; close the response so the
        # connection is not left open for every probed article.
        response.close()
        return full_view

    def preprocess_raw_html(self, raw_html, url):
        """Rework the downloaded HTML before the clean-up step runs.

        * Calls ``self.abort_article()`` when the page carries a
          ``lp:paywall`` meta tag.
        * On pages whose ``<body>`` has ``data-page-type="article"``, moves
          the ``article-header`` element to the front of the
          ``article-body`` div.
        * Renames the ``<noscript>`` element inside each
          ``*__media-container`` to ``<div>`` and gives it an empty
          ``data-src`` attribute.

        Returns the prettified markup of the modified document.
        """
        soup = BeautifulSoup(raw_html)

        # Skip articles behind paywall
        if soup.find('meta', property='lp:paywall'):
            self.abort_article()

        # Readability may strip the header for multipage articles, so simply
        # include it in the main tag
        page = soup.find('body')
        # Pages without <body> or without the attribute are simply not
        # treated as articles instead of raising TypeError/KeyError.
        if page is not None and page.get('data-page-type') == 'article':
            body = soup.find('div', {'class': 'article-body'})
            header = soup.find('header', {'class': 'article-header'})
            # Detach the header only when there is a body to re-attach it
            # to; otherwise it would be lost (and body.insert would fail).
            if body is not None and header is not None:
                header.extract()
                body.insert(0, header)

        # Add real img tags for images: the <noscript> fallback becomes a
        # plain <div> (presumably so the markup it wraps is kept as content).
        for container in soup.findAll(class_=re.compile('__media-container$')):
            img = container.find('noscript')
            if img is not None:
                img.name = 'div'
                img['data-src'] = ''

        return soup.prettify()