#!/usr/bin/env python # -*- coding: utf-8 mode: python -*- from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL v3' __copyright__ = '2010, Steffen Siebert ' __docformat__ = 'restructuredtext de' __version__ = '1.5' """ Die Zeit EPUB """ import os import zipfile import re import io from calibre.web.feeds.news import BasicNewsRecipe from calibre.ptempfile import PersistentTemporaryFile from calibre import walk try: from urllib.parse import urlparse except ImportError: from urlparse import urlparse from contextlib import closing from calibre.utils.magick.draw import save_cover_data_to class ZeitEPUBAbo(BasicNewsRecipe): title = u'Die Zeit' description = u'Das EPUB Abo der Zeit (needs subscription)' language = 'de' lang = 'de-DE' __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal), updated by Henning Losert' needs_subscription = True conversion_options = { 'no_default_epub_cover': True, # fixing the wrong left margin 'mobi_ignore_margins': True, 'keep_ligatures': True, } preprocess_regexps = [ # filtering for correct dashes ("Gedankenstrich" and "bis") (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'), (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number (re.compile(r'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro # fix the number dash number dash for the title image that was broken # by the previous line (re.compile(r'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'), # filtering for certain dash cases (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious # the next two lines not only fix errors but also create new ones. this is due to additional errors in # the typesetting such as missing commas or wrongly placed dashes. but # more is fixed than broken. # space too much before a connecting dash (re.compile(r'(?])\u00BB'), lambda match: u' \u00BB'), # missing space before Roman numeral (re.compile( r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space after Roman numeral (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral (re.compile( r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral (re.compile( r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), (re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma # missing space after full-stop (re.compile( r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # fix abbreviation that was potentially broken previously (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix e-mail address that was potentially broken previously (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'), (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....) # spaces before ... (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces after ... (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # fix special cases of ... in brackets (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... after a quotation mark (re.compile(r'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma (re.compile(r'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix missing spaces between numbers and any sort of units, possibly # with dot (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '), # noqa (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '), # noqa # fix wrong spaces # at beginning of paragraphs (re.compile( r'(?<=

[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # before closing quotation (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # after opening quotation (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # filtering for spaces in large numbers for better readability # end of the number with some character following (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary # (avoid file names) (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level (re.compile(r'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level # filtering for unicode characters that are missing on the Kindle, # try to replace them with meaningful work-arounds # subscript-0 (re.compile(u'\u2080'), lambda match: '0'), # subscript-1 (re.compile(u'\u2081'), lambda match: '1'), # subscript-2 (re.compile(u'\u2082'), lambda match: '2'), # subscript-3 (re.compile(u'\u2083'), lambda match: '3'), # subscript-4 (re.compile(u'\u2084'), lambda match: '4'), # subscript-5 (re.compile(u'\u2085'), lambda match: '5'), # subscript-6 (re.compile(u'\u2086'), lambda match: '6'), # subscript-7 (re.compile(u'\u2087'), lambda match: '7'), # subscript-8 (re.compile(u'\u2088'), lambda match: '8'), # subscript-9 (re.compile(u'\u2089'), lambda match: '9'), # always chance CO2 (re.compile(r'CO2'), lambda match: 'CO2'), # CO2 # remove *** paragraphs (re.compile(r'

\*\*\*

'), lambda match: ''), # better layout for the top line of each article (re.compile(r'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number (re.compile(r'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number (re.compile(r'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT ] def build_index(self): url = "https://meine.zeit.de/anmelden?url=https%3A//epaper.zeit.de/abo/diezeit" browser = self.get_browser() # new login process browser.open(url) browser.select_form(nr=0) browser.form['email'] = self.username browser.form['pass'] = self.password browser.submit() # change into abo section - not needed between late 2016 and May 2017, and again starting from March 2020 # browser.open(url) # abolink = browser.find_link(text_regex=re.compile( # '.*E-Paper.*')) # used to be '.*Abo-Bereich.*' # browser.follow_link(abolink) # find page for latest issue latestlink = browser.find_link(text_regex=re.compile( '.*ZUR AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # now find the correct file, we will still use the ePub file epublink = browser.find_link(text_regex=re.compile( '.*EPUB F.*R E-READER LADEN.*')) # change from '.*EPUB FÜR E-READER LADEN.*' in May 2017 response = browser.follow_link(epublink) self.report_progress(1, _('next step')) tmp = PersistentTemporaryFile(suffix='.epub') self.report_progress(0, _('downloading epub')) tmp.write(response.read()) tmp.close() zfile = zipfile.ZipFile(tmp.name, 'r') self.report_progress(0, _('extracting epub')) zfile.extractall(self.output_dir) tmp.close() index = os.path.join(self.output_dir, 'content.opf') self.report_progress(1, _('epub downloaded and extracted')) # doing regular expression filtering for path in walk('.'): (shortname, extension) = os.path.splitext(path) if extension.lower() in ('.html', '.htm', '.xhtml'): with open(path, 'r+b') as f: raw = f.read() raw = raw.decode('utf-8') for pat, func in self.preprocess_regexps: raw = pat.sub(func, raw) f.seek(0) f.truncate() f.write(raw.encode('utf-8')) # adding real cover self.report_progress( 0, _('trying to download cover image (titlepage)')) self.download_cover() self.conversion_options["cover"] = self.cover_path return index # getting url of the cover def get_cover_url(self): self.log.warning('Downloading cover') try: self.log.warning('Trying PDF-based cover') url = "https://meine.zeit.de/anmelden?url=https%3A//epaper.zeit.de/abo/diezeit" browser = self.get_browser() # new login process browser.open(url) browser.select_form(nr=0) browser.form['email'] = self.username browser.form['pass'] = self.password browser.submit() # change into abo section - not needed at the moment # browser.open(url) # abolink = browser.find_link(text_regex=re.compile( # '.*Abo-Bereich.*')) # browser.follow_link(abolink) # find page for latest issue latestlink = browser.find_link(text_regex=re.compile( '.*ZUR AKTUELLEN AUSGABE.*')) browser.follow_link(latestlink) # actual cover search pdflink = browser.find_link(text_regex=re.compile( '.*GESAMT-PDF LADEN.*')) cover_url = urlparse(pdflink.base_url)[0] + '://' + urlparse(pdflink.base_url)[1] + '' + ( urlparse(pdflink.url)[2]).replace('ePaper_', '').replace('.pdf', '_001.pdf') self.log.warning('PDF link found:') self.log.warning(cover_url) # download the cover (has to be here due to new login process) with closing(browser.open(cover_url)) as r: cdata = r.read() from calibre.ebooks.metadata.pdf import get_metadata stream = io.BytesIO(cdata) cdata = None mi = get_metadata(stream) if mi.cover_data and mi.cover_data[1]: cdata = mi.cover_data[1] cpath = os.path.join(self.output_dir, 'cover.jpg') save_cover_data_to(cdata, cpath) cover_url = cpath except: self.log.warning('Trying low-res cover') try: inhalt = self.index_to_soup('http://www.zeit.de/inhalt') cover_url = inhalt.find('div', attrs={'class': 'singlearchive clearfix'}).img[ 'src'].replace('icon_', '') except: self.log.warning('Using static old low-res cover') cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg' return cover_url