calibre/recipes/zeitde.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

'''
Fetch Zeit-Online.de
'''
import re
from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe

try:
    from http.cookiejar import Cookie
except ImportError:
    from cookielib import Cookie

try:
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen
except ImportError:
    from urllib2 import HTTPError, Request, urlopen

from calibre.ebooks.BeautifulSoup import BeautifulSoup


class ZeitDe(BasicNewsRecipe):
    '''
    Recipe that downloads articles from the Zeit Online (zeit.de) news feeds.

    Articles flagged with a paywall marker are skipped, the single-page
    ("komplettansicht") variant of an article is used when the server
    offers one, and the article header is moved into the article body
    before clean-up runs.
    '''

    __author__ = 'Armin Geller, Thomas Hollstegge'    # AGe 2014-02-26
    title = u'Zeit Online'
    description = u'German online portal of newspaper Die Zeit'
    publisher = 'ZEIT ONLINE GmbH'
    category = 'news, Germany'
    timefmt = ' [%a, %d %b %Y]'
    publication_type = 'newspaper'
    language = 'de'
    encoding = 'UTF-8'

    # Default maximum article age in days; may be overridden through the
    # 'days' recipe specific option (see __init__).
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    auto_cleanup = True
    # XPath of elements that should be kept in addition to the cleaned-up text.
    auto_cleanup_keep = '//header[@class="article-header"]|//div[@class="gallery__media-container"]|//div[@class="article__media-container"]'

    masthead_url = 'https://static.zeit.de/p/zeit.web/3.714/images/structured-data-publisher-logo-zon.png'

    extra_css = '''.figure__text { font-size: 0.9em; font-style: italic; }
                   .figure__copyright { font-size: 0.9em; font-style: italic; color: #888; }
                   .article-heading__kicker { font-size: 0.5em; display: block; margin-bottom: 1em; }
                   p.summary { font-size: 1.3em; font-style: italic; } '''

    remove_tags = [
        dict(name='aside', class_='topicbox'),
        dict(name='aside', class_='article-toc'),
        dict(class_='visually-hidden'),
        dict(name='a', class_='faq-link'),
    ]

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        '''
        Initialise the recipe and apply the 'days' option when one is given.

        All arguments are passed through unchanged to BasicNewsRecipe.
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # The class-level entry for 'days' is a dict describing the option;
        # only a plain string is treated as an actual value to apply.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    feeds = [
        (u'Startseite – Die wichtigsten Themen auf einen Blick',
         u'https://newsfeed.zeit.de/index'),
        (u'Politik – Ausland und Deutschland',
         u'https://newsfeed.zeit.de/politik/index'),
        (u'Gesellschaft – Gesellschaft und soziales Leben',
         u'https://newsfeed.zeit.de/gesellschaft/index'),
        (u'Wirtschaft – Wirtschaft und Unternehmen',
         u'https://newsfeed.zeit.de/wirtschaft/index'),
        (u'Kultur – Literatur, Kunst, Film und Musik',
         u'https://newsfeed.zeit.de/kultur/index'),
        (u'Wissen – Wissenschaft, Gesundheit, Umwelt und Geschichte',
         u'https://newsfeed.zeit.de/wissen/index'),
        (u'Digital – Hardware, Software, Internet, Datenschutz',
         u'https://newsfeed.zeit.de/digital/index'),
        (u'ZEIT Campus Online - Studieren, arbeiten, leben',
         u'https://newsfeed.zeit.de/campus/index'),
        (u'Arbeit - Umbruch, Neuanfang, Erfolg, Zweifel',
         u'https://newsfeed.zeit.de/arbeit/index'),
        (u'Entdecken - Die Poesie des Alltäglichen',
         u'https://newsfeed.zeit.de/entdecken/index'),
        (u'Mobilität – Wie wir uns fortbewegen',
         u'https://newsfeed.zeit.de/mobilitaet/index'),  # AGe 2014-01-09
        (u'Sport – Sieg und Niederlage',
         u'https://newsfeed.zeit.de/sport/index')
    ]

    def get_browser(self, *args, **kwargs):
        '''
        Return the base class browser with a consent cookie pre-installed.

        Any arguments are forwarded unchanged to
        BasicNewsRecipe.get_browser.
        '''
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)

        # Add a cookie indicating we have accepted the cookie
        # policy
        ck = Cookie(
            version=0, name='zonconsent', value=datetime.now().isoformat(), port=None,
            port_specified=False, domain='.zeit.de',
            domain_specified=False, domain_initial_dot=True, path='/',
            path_specified=False, secure=True, expires=None, discard=False,
            comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        br.cookiejar.set_cookie(ck)
        return br

    def print_version(self, url):
        '''
        Return the single-page variant of ``url`` when the server has one.

        A HEAD request is sent to ``url + '/komplettansicht'``. If it
        succeeds that URL is returned; if the server answers with an HTTP
        error the original ``url`` is returned instead.
        '''
        # If there is a complete page, use that one
        complete_url = url + '/komplettansicht'
        req = Request(url=complete_url)
        req.get_method = lambda: 'HEAD'
        try:
            response = urlopen(req)
        except HTTPError:
            return url
        # Only the status matters; release the connection right away instead
        # of leaving the response open until it is garbage collected.
        response.close()
        return complete_url

    def preprocess_raw_html(self, raw_html, url):
        '''
        Adjust the downloaded markup before further processing.

        Calls ``abort_article`` for pages carrying the paywall meta tag,
        moves the article header to the start of the article body, renames
        ``noscript`` elements inside media containers to ``div``, and
        returns the prettified markup.
        '''
        soup = BeautifulSoup(raw_html)

        # Skip articles behind paywall
        if soup.find('meta', property='lp:paywall'):
            self.abort_article()

        # Readability may strip the header for multipage articles, so simply
        # include it in the main tag
        page = soup.find('body')
        # Pages without a <body> or without the page-type attribute are left
        # untouched instead of raising.
        if page is not None and page.get('data-page-type') == 'article':
            body = soup.find('div', {'class': 'article-body'})
            header = soup.find('header', {'class': 'article-header'})
            # The header can only be moved when both elements exist.
            if body is not None and header is not None:
                header.extract()
                body.insert(0, header)

        # Add real img tags for images
        for container in soup.findAll(class_=re.compile(r'__media-container$')):
            img = container.find('noscript')
            if img is not None:
                # Turning the <noscript> wrapper into a <div> keeps its
                # contents (presumably the real <img> markup) in the output.
                img.name = 'div'
                img['data-src'] = ''

        return soup.prettify()