mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Improve Die Zeit (subscription)
This commit is contained in:
parent
586d97b2a6
commit
a992cfc463
@ -2,9 +2,9 @@
|
|||||||
# -*- coding: utf-8 mode: python -*-
|
# -*- coding: utf-8 mode: python -*-
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
|
__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
|
||||||
__docformat__ = 'restructuredtext de'
|
__docformat__ = 'restructuredtext de'
|
||||||
__version__ = '1.1'
|
__version__ = '1.2'
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Die Zeit EPUB
|
Die Zeit EPUB
|
||||||
@ -13,21 +13,43 @@ Die Zeit EPUB
|
|||||||
import os, urllib2, zipfile, re
|
import os, urllib2, zipfile, re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
from calibre import walk
|
||||||
|
|
||||||
class ZeitEPUBAbo(BasicNewsRecipe):
|
class ZeitEPUBAbo(BasicNewsRecipe):
|
||||||
|
|
||||||
title = u'Zeit Online Premium'
|
title = u'Die Zeit'
|
||||||
description = u'Das EPUB Abo der Zeit (needs subscription)'
|
description = u'Das EPUB Abo der Zeit (needs subscription)'
|
||||||
language = 'de'
|
language = 'de'
|
||||||
lang = 'de-DE'
|
lang = 'de-DE'
|
||||||
|
|
||||||
__author__ = 'Steffen Siebert'
|
__author__ = 'Steffen Siebert and Tobias Isenberg'
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
conversion_options = {
|
conversion_options = {
|
||||||
'no_default_epub_cover' : True
|
'no_default_epub_cover' : True,
|
||||||
|
# fixing the wrong left margin
|
||||||
|
'mobi_ignore_margins' : True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# (pattern, replacement-function) pairs that build_index applies, in order,
# to the decoded text of every .html/.htm/.xhtml file.  The replacements are
# explicit unicode strings (non-ASCII characters written as escapes) because
# they are substituted into text that has already been decoded from UTF-8;
# a byte-string replacement containing a non-ASCII dash cannot be joined
# with unicode text under Python 2.
preprocess_regexps = [
    # filtering for correct dashes (U+2013 EN DASH)
    (re.compile(r' - '), lambda match: u' \u2013 '),  # regular "Gedankenstrich"
    (re.compile(r' -,'), lambda match: u' \u2013,'),  # "Gedankenstrich" before a comma
    (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'),  # number-number
    # filtering for unicode characters that are missing on the Kindle,
    # try to replace them with meaningful work-arounds:
    # SUBSCRIPT ZERO..NINE (U+2080..U+2089) become a half-size ASCII digit.
    # The code points are contiguous, so the digit is the offset from U+2080.
    (re.compile(u'[\u2080-\u2089]'),
     lambda match: u'<span style="font-size: 50%%;">%d</span>'
                   % (ord(match.group(0)) - 0x2080)),
]
|
||||||
|
|
||||||
def build_index(self):
|
def build_index(self):
|
||||||
domain = "http://premium.zeit.de"
|
domain = "http://premium.zeit.de"
|
||||||
url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
|
url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
|
||||||
@ -55,9 +77,36 @@ class ZeitEPUBAbo(BasicNewsRecipe):
|
|||||||
zfile.extractall(self.output_dir)
|
zfile.extractall(self.output_dir)
|
||||||
|
|
||||||
tmp.close()
|
tmp.close()
|
||||||
|
|
||||||
index = os.path.join(self.output_dir, 'content.opf')
|
index = os.path.join(self.output_dir, 'content.opf')
|
||||||
|
|
||||||
self.report_progress(1,_('epub downloaded and extracted'))
|
self.report_progress(1,_('epub downloaded and extracted'))
|
||||||
|
|
||||||
|
# doing regular expression filtering
|
||||||
|
for path in walk('.'):
|
||||||
|
(shortname, extension) = os.path.splitext(path)
|
||||||
|
if extension.lower() in ('.html', '.htm', '.xhtml'):
|
||||||
|
with open(path, 'r+b') as f:
|
||||||
|
raw = f.read()
|
||||||
|
raw = raw.decode('utf-8')
|
||||||
|
for pat, func in self.preprocess_regexps:
|
||||||
|
raw = pat.sub(func, raw)
|
||||||
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
f.write(raw.encode('utf-8'))
|
||||||
|
|
||||||
|
# adding real cover
|
||||||
|
self.report_progress(0,_('trying to download cover image (titlepage)'))
|
||||||
|
self.download_cover()
|
||||||
|
self.conversion_options["cover"] = self.cover_path
|
||||||
|
|
||||||
return index
|
return index
|
||||||
|
|
||||||
|
# getting url of the cover
|
||||||
|
def get_cover_url(self):
    """Return the URL of the cover image (title page) to use.

    Loads http://www.zeit.de/inhalt through ``self.index_to_soup``, takes
    the ``src`` of the image inside the ``div`` with class
    ``singlearchive clearfix`` and removes ``icon_`` from it (presumably
    turning the thumbnail URL into the full-size one -- not verifiable
    from here).

    This is a best-effort lookup: if the page cannot be fetched or does
    not have the expected structure, a fixed fallback title-page URL is
    returned instead of raising.
    """
    fallback_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
    try:
        inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
        archive_div = inhalt.find('div', attrs={'class':'singlearchive clearfix'})
        # archive_div is None when the div is missing; the resulting
        # AttributeError is handled below like any other lookup failure.
        cover_url = archive_div.img['src'].replace('icon_','')
    except Exception:
        # Deliberately broad (network errors, changed page layout, ...),
        # but no longer a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        cover_url = fallback_url
    return cover_url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user