calibre/recipes/zeitde_sub.recipe

#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.2'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk

class ZeitEPUBAbo(BasicNewsRecipe):

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert and Tobias Isenberg'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
    }

    preprocess_regexps    = [
        # filtering for correct dashes
        (re.compile(r' - '), lambda match: ' – '), # regular "Gedankenstrich"
        (re.compile(r' -,'), lambda match: ' –,'), # "Gedankenstrich" before a comma
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: '–'), # number-number
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 50%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 50%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 50%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 50%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 50%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 50%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 50%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 50%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 50%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 50%;">9</span>'), # subscript-9
    ]

    def build_index(self):
        domain = "http://premium.zeit.de"
        url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"

        browser = self.get_browser()
        browser.add_password("http://premium.zeit.de", self.username, self.password)

        try:
            browser.open(url)
        except urllib2.HTTPError:
            self.report_progress(0,_("Can't login to download issue"))
            raise ValueError('Failed to login, check your username and password')

        response = browser.follow_link(text="DIE ZEIT als E-Paper")
        response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        tmp.write(response.read())
        tmp.close()

        zfile = zipfile.ZipFile(tmp.name, 'r')
        self.report_progress(0,_('extracting epub'))

        zfile.extractall(self.output_dir)

        tmp.close()

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering
        for path in walk('.'):
            (shortname, extension) = os.path.splitext(path)
            if extension.lower() in ('.html', '.htm', '.xhtml'):
                with open(path, 'r+b') as f:
                    raw = f.read()
                    raw = raw.decode('utf-8')
                    for pat, func in self.preprocess_regexps:
                        raw = pat.sub(func, raw)
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        try:
            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
            cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
        except:
            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url