Improve Die Zeit (subscription)

Kovid Goyal 2011-02-25 14:01:47 -07:00
parent 586d97b2a6
commit a992cfc463


@@ -2,9 +2,9 @@
 # -*- coding: utf-8 mode: python -*-
 __license__ = 'GPL v3'
-__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
+__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
 __docformat__ = 'restructuredtext de'
-__version__ = '1.1'
+__version__ = '1.2'
 """
 Die Zeit EPUB
@@ -13,21 +13,43 @@ Die Zeit EPUB
 import os, urllib2, zipfile, re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
+from calibre import walk
 class ZeitEPUBAbo(BasicNewsRecipe):
-    title = u'Zeit Online Premium'
+    title = u'Die Zeit'
     description = u'Das EPUB Abo der Zeit (needs subscription)'
     language = 'de'
     lang = 'de-DE'
-    __author__ = 'Steffen Siebert'
+    __author__ = 'Steffen Siebert and Tobias Isenberg'
     needs_subscription = True
     conversion_options = {
-        'no_default_epub_cover' : True
+        'no_default_epub_cover' : True,
+        # fixing the wrong left margin
+        'mobi_ignore_margins' : True,
     }
+    preprocess_regexps = [
+        # filtering for correct dashes
+        (re.compile(r' - '), lambda match: ' '), # regular "Gedankenstrich"
+        (re.compile(r' -,'), lambda match: ' ,'), # "Gedankenstrich" before a comma
+        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: ''), # number-number
+        # filtering for unicode characters that are missing on the Kindle,
+        # try to replace them with meaningful work-arounds
+        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 50%;">0</span>'), # subscript-0
+        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 50%;">1</span>'), # subscript-1
+        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 50%;">2</span>'), # subscript-2
+        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 50%;">3</span>'), # subscript-3
+        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 50%;">4</span>'), # subscript-4
+        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 50%;">5</span>'), # subscript-5
+        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 50%;">6</span>'), # subscript-6
+        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 50%;">7</span>'), # subscript-7
+        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 50%;">8</span>'), # subscript-8
+        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 50%;">9</span>'), # subscript-9
+    ]
     def build_index(self):
         domain = "http://premium.zeit.de"
         url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
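The preprocess_regexps added above are plain (compiled pattern, replacement function) pairs, the shape BasicNewsRecipe expects; the filtering loop added to build_index in the next hunk runs them over the extracted HTML. A minimal standalone sketch of that substitution step (apply_pairs is a hypothetical helper used only for illustration, not part of the recipe):

import re

# same (compiled pattern, replacement function) shape as in the recipe above
subscript_two = (re.compile(u'\u2082'),
                 lambda match: '<span style="font-size: 50%;">2</span>')

def apply_pairs(raw, pairs):
    # illustrative helper: run every pair over the raw HTML, which is
    # what the filtering loop in build_index does with self.preprocess_regexps
    for pat, func in pairs:
        raw = pat.sub(func, raw)
    return raw

print(apply_pairs(u'H\u2082O', [subscript_two]))
# -> H<span style="font-size: 50%;">2</span>O

Since this recipe overrides build_index to download a ready-made EPUB instead of assembling articles from feeds, the usual fetch pipeline that would apply these pairs never runs, which is presumably why the loop below re-applies them by hand.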
@@ -55,9 +77,36 @@ class ZeitEPUBAbo(BasicNewsRecipe):
         zfile.extractall(self.output_dir)
         tmp.close()
         index = os.path.join(self.output_dir, 'content.opf')
         self.report_progress(1,_('epub downloaded and extracted'))
+        # doing regular expression filtering
+        for path in walk('.'):
+            (shortname, extension) = os.path.splitext(path)
+            if extension.lower() in ('.html', '.htm', '.xhtml'):
+                with open(path, 'r+b') as f:
+                    raw = f.read()
+                    raw = raw.decode('utf-8')
+                    for pat, func in self.preprocess_regexps:
+                        raw = pat.sub(func, raw)
+                    f.seek(0)
+                    f.truncate()
+                    f.write(raw.encode('utf-8'))
+        # adding real cover
+        self.report_progress(0,_('trying to download cover image (titlepage)'))
+        self.download_cover()
+        self.conversion_options["cover"] = self.cover_path
         return index
+    # getting url of the cover
+    def get_cover_url(self):
+        try:
+            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
+            cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
+        except:
+            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
+        return cover_url
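The new get_cover_url scrapes the current title page from the Zeit table of contents (the archive box links an icon-sized cover image; stripping the 'icon_' prefix gives the full-size file) and falls back to a fixed historic cover when the scrape fails. A minimal standalone sketch of the same scrape-and-fallback pattern, assuming calibre's bundled BeautifulSoup in place of the recipe's index_to_soup; zeit_cover_url and FALLBACK_COVER are names chosen here only for illustration:

import urllib2
from calibre.ebooks.BeautifulSoup import BeautifulSoup

FALLBACK_COVER = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'

def zeit_cover_url():
    try:
        html = urllib2.urlopen('http://www.zeit.de/inhalt').read()
        soup = BeautifulSoup(html)
        # the archive box holds an icon-sized cover; dropping 'icon_' yields the full image
        icon = soup.find('div', attrs={'class': 'singlearchive clearfix'}).img['src']
        return icon.replace('icon_', '')
    except Exception:
        # network errors or a changed page layout fall back to a known cover
        return FALLBACK_COVER

At the end of build_index the recipe then calls download_cover() and stores the result in conversion_options["cover"], so the downloaded title page replaces calibre's generated default cover.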