calibre/recipes/zeitde_sub.recipe

113 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 mode: python -*-
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__ = '1.2'
"""
Die Zeit EPUB
"""
import os, urllib2, zipfile, re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
class ZeitEPUBAbo(BasicNewsRecipe):
    """Recipe that fetches the subscriber EPUB edition of Die Zeit.

    Instead of building an index from feeds, ``build_index`` logs in to
    premium.zeit.de with the configured username/password, downloads the
    ready-made EPUB, unpacks it into ``self.output_dir`` and post-processes
    the contained HTML files with ``preprocess_regexps``.
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert and Tobias Isenberg'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
    }

    # (pattern, replacement function) pairs applied to every extracted
    # HTML file by build_index().
    preprocess_regexps = [
        # Filtering for correct dashes: an ASCII hyphen used as a dash is
        # replaced by a real en dash (U+2013).  The replacement is written
        # as an escape so the character cannot get lost or confused with a
        # plain hyphen again.
        (re.compile(r' - '), lambda match: u' \u2013 '),  # regular dash ("Gedankenstrich")
        (re.compile(r' -,'), lambda match: u' \u2013,'),  # dash before a comma
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'),  # number-number
        # Filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds: the subscript
        # digits U+2080..U+2089 become the plain digit at half font size.
        (re.compile(u'[\u2080-\u2089]'),
         lambda match: '<span style="font-size: 50%;">'
                       + str(ord(match.group()) - 0x2080)
                       + '</span>'),
    ]

    def build_index(self):
        """Download and unpack the current issue and return its OPF path.

        Steps: log in, follow the "DIE ZEIT als E-Paper" link and then the
        EPUB download link, store the response in a temporary ``.epub``
        file, extract it into ``self.output_dir``, run
        ``preprocess_regexps`` over the extracted HTML files and finally
        download the cover image and register it in
        ``conversion_options["cover"]``.

        Returns:
            Path of ``content.opf`` inside ``self.output_dir``.

        Raises:
            ValueError: if opening the login URL fails with an HTTP error.
        """
        domain = "http://premium.zeit.de"
        url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
        browser = self.get_browser()
        browser.add_password(domain, self.username, self.password)

        try:
            browser.open(url)
        except urllib2.HTTPError:
            self.report_progress(0,_("Can't login to download issue"))
            raise ValueError('Failed to login, check your username and password')

        # The first link is only followed for navigation; the EPUB itself
        # is the response of the second link.
        browser.follow_link(text="DIE ZEIT als E-Paper")
        response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        try:
            tmp.write(response.read())
        finally:
            # Close (and thereby flush) before the archive is reopened by name.
            tmp.close()

        zfile = zipfile.ZipFile(tmp.name, 'r')
        try:
            self.report_progress(0,_('extracting epub'))
            zfile.extractall(self.output_dir)
        finally:
            # Release the archive handle even if extraction fails.
            zfile.close()

        index = os.path.join(self.output_dir, 'content.opf')
        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering
        self._apply_preprocess_regexps(self.output_dir)

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    def _apply_preprocess_regexps(self, root):
        """Rewrite every HTML file below ``root`` using ``preprocess_regexps``.

        Files are read and written back as UTF-8, in place.
        """
        for path in walk(root):
            extension = os.path.splitext(path)[1]
            if extension.lower() not in ('.html', '.htm', '.xhtml'):
                continue
            with open(path, 'r+b') as f:
                raw = f.read().decode('utf-8')
                for pat, func in self.preprocess_regexps:
                    raw = pat.sub(func, raw)
                # Overwrite the file from the start; truncate so no stale
                # tail remains if the new content is shorter.
                f.seek(0)
                f.truncate()
                f.write(raw.encode('utf-8'))

    # getting url of the cover
    def get_cover_url(self):
        """Return the URL of the cover image of the current issue.

        The URL is taken from the ``src`` of the image inside the
        ``singlearchive clearfix`` div on http://www.zeit.de/inhalt, with
        ``icon_`` removed from it.  If the page cannot be fetched or does
        not have the expected structure, a fixed fallback image URL is
        returned instead (best effort; no error is raised).
        """
        try:
            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
            cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
        except Exception:
            # Deliberately broad: any download or parsing problem falls
            # back to the static image, but KeyboardInterrupt/SystemExit
            # are no longer swallowed as with a bare ``except:``.
            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url