From a992cfc4638b67adac95a6b16da47b3840508d02 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 25 Feb 2011 14:01:47 -0700
Subject: [PATCH] Improve Die Zeit (subscription)

---
Review notes (not part of the commit message):

* What the change does: renames the recipe to 'Die Zeit', adds the
  'mobi_ignore_margins' conversion option, adds preprocess_regexps (dash and
  subscript-digit replacements), applies those regexps to every
  .html/.htm/.xhtml file after the EPUB is extracted, and downloads a cover
  whose URL comes from the new get_cover_url().
* The post-processing loop walks self.output_dir (the directory the EPUB is
  extracted into) instead of '.', so it no longer depends on the process
  working directory.
* The en-dash replacements are unicode literals (u'\u2013'). The files are
  decoded to unicode before the substitutions run, so a UTF-8 byte-string
  replacement only works if the interpreter's default encoding can decode it.
  NOTE(review): presumably calibre's runtime default encoding is what made
  the byte-string form work; confirm.
* get_cover_url() catches Exception instead of using a bare except, so
  KeyboardInterrupt/SystemExit are not swallowed. Any lookup failure still
  falls back to the static cover URL.
* NOTE(review): this patch text was re-flowed from a whitespace-collapsed
  copy. Indentation and the positions of blank context lines are inferred
  from the hunk line counts, and the address parts of the From/copyright
  fields were not present. Verify against the target file before applying.

 resources/recipes/zeitde_sub.recipe | 59 ++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/resources/recipes/zeitde_sub.recipe b/resources/recipes/zeitde_sub.recipe
index 5014837c5b..aa6447a457 100644
--- a/resources/recipes/zeitde_sub.recipe
+++ b/resources/recipes/zeitde_sub.recipe
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 mode: python -*-
 
 __license__ = 'GPL v3'
-__copyright__ = '2010, Steffen Siebert '
+__copyright__ = '2010-2011, Steffen Siebert '
 __docformat__ = 'restructuredtext de'
-__version__ = '1.1'
+__version__ = '1.2'
 
 """
 Die Zeit EPUB
@@ -13,21 +13,43 @@ Die Zeit EPUB
 import os, urllib2, zipfile, re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
+from calibre import walk
 
 class ZeitEPUBAbo(BasicNewsRecipe):
 
-    title = u'Zeit Online Premium'
+    title = u'Die Zeit'
     description = u'Das EPUB Abo der Zeit (needs subscription)'
     language = 'de'
     lang = 'de-DE'
 
-    __author__ = 'Steffen Siebert'
+    __author__ = 'Steffen Siebert and Tobias Isenberg'
     needs_subscription = True
 
     conversion_options = {
-        'no_default_epub_cover' : True
+        'no_default_epub_cover' : True,
+        # fixing the wrong left margin
+        'mobi_ignore_margins' : True,
     }
 
+    preprocess_regexps = [
+        # filtering for correct dashes
+        (re.compile(r' - '), lambda match: u' \u2013 '), # regular "Gedankenstrich"
+        (re.compile(r' -,'), lambda match: u' \u2013,'), # "Gedankenstrich" before a comma
+        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
+        # filtering for unicode characters that are missing on the Kindle,
+        # try to replace them with meaningful work-arounds
+        (re.compile(u'\u2080'), lambda match: '0'), # subscript-0
+        (re.compile(u'\u2081'), lambda match: '1'), # subscript-1
+        (re.compile(u'\u2082'), lambda match: '2'), # subscript-2
+        (re.compile(u'\u2083'), lambda match: '3'), # subscript-3
+        (re.compile(u'\u2084'), lambda match: '4'), # subscript-4
+        (re.compile(u'\u2085'), lambda match: '5'), # subscript-5
+        (re.compile(u'\u2086'), lambda match: '6'), # subscript-6
+        (re.compile(u'\u2087'), lambda match: '7'), # subscript-7
+        (re.compile(u'\u2088'), lambda match: '8'), # subscript-8
+        (re.compile(u'\u2089'), lambda match: '9'), # subscript-9
+    ]
+
     def build_index(self):
         domain = "http://premium.zeit.de"
         url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
@@ -55,9 +77,36 @@ class ZeitEPUBAbo(BasicNewsRecipe):
         zfile.extractall(self.output_dir)
 
         tmp.close()
+
         index = os.path.join(self.output_dir, 'content.opf')
 
         self.report_progress(1,_('epub downloaded and extracted'))
 
+        # doing regular expression filtering
+        for path in walk(self.output_dir):
+            (shortname, extension) = os.path.splitext(path)
+            if extension.lower() in ('.html', '.htm', '.xhtml'):
+                with open(path, 'r+b') as f:
+                    raw = f.read()
+                    raw = raw.decode('utf-8')
+                    for pat, func in self.preprocess_regexps:
+                        raw = pat.sub(func, raw)
+                    f.seek(0)
+                    f.truncate()
+                    f.write(raw.encode('utf-8'))
+
+        # adding real cover
+        self.report_progress(0,_('trying to download cover image (titlepage)'))
+        self.download_cover()
+        self.conversion_options["cover"] = self.cover_path
+
         return index
 
+    # getting url of the cover
+    def get_cover_url(self):
+        try:
+            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
+            cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
+        except Exception:
+            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
+        return cover_url