# calibre/recipes/zeitde_sub.recipe

#!/usr/bin/env python
# -*- coding: utf-8 mode: python -*-
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__ = '1.5'

"""
Die Zeit EPUB
"""

import os
import zipfile
import re
import io
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to


class ZeitEPUBAbo(BasicNewsRecipe):
    """Recipe for the subscriber EPUB edition of "Die Zeit".

    Instead of assembling articles from feeds, the recipe logs in to the
    subscriber area, downloads the publisher's EPUB of the latest issue,
    unpacks it into ``self.output_dir``, runs the typography clean-up rules
    in ``preprocess_regexps`` over every (X)HTML file and finally attaches a
    cover image (taken from the first-page PDF when available).
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal), updated by Henning Losert'
    needs_subscription = True

    # login page that redirects to the e-paper section after authentication
    _zeit_login_url = "https://meine.zeit.de/anmelden?url=https%3A//epaper.zeit.de/abo/diezeit"

    conversion_options = {
        'no_default_epub_cover': True,
        # fixing the wrong left margin
        'mobi_ignore_margins': True,
        'keep_ligatures': True,
    }

    # Ordered list of (pattern, replacement function) pairs. The order
    # matters: several rules repair side effects of earlier rules.
    preprocess_regexps = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'),  # number-number
        (re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'),  # ,- Euro
        # fix the number dash number dash for the title image that was broken
        # by the previous line
        (re.compile(r'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'),  # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'),  # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'),  # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'),  # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'),  # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'),  # the obvious
        # the next two rules not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but
        # more is fixed than broken.
        # superfluous space before a connecting dash
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'),
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b)'),
         lambda match: '-'),  # superfluous space after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(r'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'),  # the obvious
        # number[no space]Euro: insert a (punctuation) space, not a dash
        (re.compile(r'(?<=\d)(?=\u20AC)'),
         lambda match: u'\u2008'),
        # missing space after colon
        (re.compile(r':(?=[^\d\s</])'), lambda match: ': '),
        # missing space after closing quotation
        (re.compile(r'\u00AB(?=[^\-\.:;,\?!<\)\s])'),
         lambda match: u'\u00AB '),
        # missing space before opening quotation
        (re.compile(r'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'),
        # missing space before Roman numeral
        (re.compile(
            r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '),
        # missing space after Roman numeral; one rule per numeral length
        # because look-behind assertions must have a fixed width
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '),
        (re.compile(
            r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '),
        (re.compile(
            r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '),
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '),
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '),
        (re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'),
         lambda match: ', '),  # missing space after comma
        # missing space after full-stop
        (re.compile(
            r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '),
        # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'),
        # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'),
        # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'),
        # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''),
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'),  # too many dots (....)
        # spaces before ...
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'),
        # spaces after ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '),
        # fix special cases of ... in brackets
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'),
        # fix special cases of ... after a quotation mark
        (re.compile(r'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'),
        # fix special cases of ... before a quotation mark or comma
        (re.compile(r'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'),
        # fix missing spaces between numbers and any sort of units, possibly
        # with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),  # noqa
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),  # noqa
        # fix wrong spaces
        # at beginning of paragraphs
        (re.compile(
            r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''),
        # before closing quotation
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '),
        # after opening quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'),
        # filtering for spaces in large numbers for better readability
        # end of the number with some character following
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'),
         lambda match: u'\u2008'),
        # end of the number with a literal full-stop following, then a space
        # is necessary (avoid file names)
        (re.compile(r'(?<=\d\d)(?=\d\d\d\. )'), lambda match: u'\u2008'),
        # each repetition groups the next three digits to the left
        (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'),
         lambda match: u'\u2008'),  # next level
        (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'),
         lambda match: u'\u2008'),  # next level
        (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'),
         lambda match: u'\u2008'),  # next level
        (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'),
         lambda match: u'\u2008'),  # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        # subscript-0
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'),
        # subscript-1
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'),
        # subscript-2
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'),
        # subscript-3
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'),
        # subscript-4
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'),
        # subscript-5
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'),
        # subscript-6
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'),
        # subscript-7
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'),
        # subscript-8
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'),
        # subscript-9
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'),
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'),  # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(r'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'),
         lambda match: ' 20'),  # proper year in edition number
        (re.compile(r'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'),
         lambda match: ' 20'),  # proper year in edition number
        # m-dash between category and DIE ZEIT, for one- and two-digit
        # edition numbers alike (matching the two year rules above)
        (re.compile(r'(?<=>)(?=DIE ZEIT N\u00B0 \d\d? / 20\d\d)'),
         lambda match: u' \u2014 '),
    ]

    def _open_latest_issue(self):
        """Log in and return a browser positioned on the latest issue's page.

        Raises whatever the browser raises when the login form or the
        "ZUR AKTUELLEN AUSGABE" link cannot be found.
        """
        browser = self.get_browser()

        # new login process
        browser.open(self._zeit_login_url)
        browser.select_form(nr=0)
        browser.form['email'] = self.username
        browser.form['pass'] = self.password
        browser.submit()
        # An extra hop into the abo section ('.*E-Paper.*', formerly
        # '.*Abo-Bereich.*') was not needed between late 2016 and May 2017,
        # and again starting from March 2020.
        # find page for latest issue
        latestlink = browser.find_link(text_regex=re.compile(
            '.*ZUR AKTUELLEN AUSGABE.*'))
        browser.follow_link(latestlink)
        return browser

    def _fix_typography(self, raw):
        """Return the text *raw* after applying every rule in ``preprocess_regexps`` in order."""
        for pat, func in self.preprocess_regexps:
            raw = pat.sub(func, raw)
        return raw

    def build_index(self):
        """Download, unpack and clean up the latest issue.

        Returns the path of ``content.opf`` inside ``self.output_dir``. As a
        side effect the cover is downloaded and registered in
        ``self.conversion_options['cover']``.
        """
        browser = self._open_latest_issue()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile(
            '.*EPUB F.*R E-READER LADEN.*'))  # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017
        response = browser.follow_link(epublink)
        self.report_progress(1, _('next step'))

        self.report_progress(0, _('downloading epub'))
        tmp = PersistentTemporaryFile(suffix='.epub')
        try:
            with closing(response):
                tmp.write(response.read())
        finally:
            # close exactly once, also when the download fails
            tmp.close()

        self.report_progress(0, _('extracting epub'))
        # closing() releases the archive handle even if extraction fails
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1, _('epub downloaded and extracted'))

        # doing regular expression filtering; walk the extraction directory
        # explicitly rather than relying on the current working directory
        for path in walk(self.output_dir):
            extension = os.path.splitext(path)[1]
            if extension.lower() not in ('.html', '.htm', '.xhtml'):
                continue
            with open(path, 'r+b') as f:
                raw = self._fix_typography(f.read().decode('utf-8'))
                # rewrite the file in place
                f.seek(0)
                f.truncate()
                f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(
            0, _('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return the location of the best available cover image.

        Three sources are tried in order:

        1. the first page of the issue's PDF, whose embedded cover is saved
           as ``cover.jpg`` in ``self.output_dir`` (a local path is returned);
        2. the low-resolution image on ``http://www.zeit.de/inhalt``;
        3. a static, old low-resolution cover URL.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            browser = self._open_latest_issue()
            # actual cover search
            pdflink = browser.find_link(text_regex=re.compile(
                '.*GESAMT-PDF LADEN.*'))
            # derive the URL of the first-page PDF from the full-PDF link
            base = urlparse(pdflink.base_url)
            first_page = urlparse(pdflink.url)[2].replace(
                'ePaper_', '').replace('.pdf', '_001.pdf')
            cover_url = base[0] + '://' + base[1] + first_page
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                pdf_data = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            mi = get_metadata(io.BytesIO(pdf_data))
            cdata = None
            if mi.cover_data and mi.cover_data[1]:
                cdata = mi.cover_data[1]
            if not cdata:
                # go to the fallbacks explicitly instead of handing None to
                # save_cover_data_to()
                raise ValueError('no cover image found in first-page PDF')

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(cdata, cpath)
            return cpath
        except Exception as err:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate
            self.log.warning('PDF-based cover failed: %r' % (err,))

        self.log.warning('Trying low-res cover')
        try:
            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
            cover_url = inhalt.find('div', attrs={'class': 'singlearchive clearfix'}).img[
                'src'].replace('icon_', '')
        except Exception as err:
            self.log.warning('Low-res cover failed: %r' % (err,))
            self.log.warning('Using static old low-res cover')
            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url