# calibre/recipes/sueddeutsche_mobil.recipe

__license__ = 'GPL v3'
__copyright__ = '2012, 2013 Andreas Zeiser <andreas.zeiser@web.de>'
'''
szmobil.sueddeutsche.de/
'''
# History
# 2015.01.02 Daily Cover from https://zeitung.sueddeutsche.de/webapp  by lala-rob (web@lala-rob.de)
# 2014.12.18 Fixing URL set Cover by lala-rob (web@lala-rob.de)
# 2014.10.06 Fixing Login URL and Article URL by lala-rob (web@lala-rob.de)
#
# 2013.01.09 Fixed bugs in article titles containing "strong" and
#            other small changes
# 2012.08.04 Initial release

from calibre import strftime
import datetime
from calibre.web.feeds.recipes import BasicNewsRecipe
import re


class SZmobil(BasicNewsRecipe):
    """Recipe for the subscription-only mobile e-paper of Sueddeutsche Zeitung.

    The recipe logs in through the form at ``id.sueddeutsche.de/login`` and
    then scrapes the section index found under ``root_url``. Every link to
    ``section.php`` on that index becomes a feed, and every link to
    ``article.php`` on a section page becomes an article of that feed.
    """

    title = u'Süddeutsche Zeitung mobil'
    __author__ = u'Andreas Zeiser'
    description = u'Nachrichten aus Deutschland. Zugriff auf kostenpflichtiges Abo SZ mobil.'
    publisher = u'Sueddeutsche Zeitung'
    masthead_url = 'http://pix.sueddeutsche.de/img/layout/header/SZ_solo288x31.gif'
    language = u'de'
    publication_type = u'newspaper'
    category = u'news, politics, Germany'
    # The issue id is the date at UTC+1h, evaluated once when the class body
    # is executed (not on every download).
    # NOTE(review): the fixed one-hour offset presumably approximates German
    # local time and ignores daylight saving time -- confirm.
    cover_url = 'https://zeitung.sueddeutsche.de/szdigital/public/issue/previewimage?size=l&issueId=' + \
        (datetime.datetime.utcnow() + datetime.timedelta(hours=1)
         ).strftime("%Y-%m-%d") + '&targetVersion=3&productId=sz'
    no_stylesheets = True
    oldest_article = 2
    encoding = 'iso-8859-1'
    needs_subscription = True
    remove_empty_feeds = True
    delay = 1

    # if you want to get rid of the date on the title page use
    # timefmt = ''
    timefmt = ' [%a, %d %b, %Y]'

    # Base URL of the mobile e-paper; section and article hrefs are relative
    # to it.
    root_url = 'http://epaper.sueddeutsche.de/app/service/epaper-mobil/'
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]

    # Extracts the numeric article id from an ``article.php`` href. ``\d+``
    # (rather than ``\d*``) makes an empty id a non-match instead of a value
    # that ``int()`` would choke on.
    _article_id_re = re.compile(r"id=(\d+)&etag=")

    def get_browser(self, *args, **kwargs):
        """Return a browser that is logged in to the SZ account, if possible.

        Positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``. The login form is only submitted
        when both ``self.username`` and ``self.password`` are set; otherwise
        the plain browser is returned.
        """
        browser = BasicNewsRecipe.get_browser(self, *args, **kwargs)

        if self.username is not None and self.password is not None:
            browser.open('https://id.sueddeutsche.de/login')
            browser.select_form(nr=0)  # the login form is the first form
            browser['login'] = self.username
            browser['password'] = self.password
            browser.submit()
        return browser

    def parse_index(self):
        """Build the feed list from the section index.

        Returns a list of ``(feed_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``, ``url``,
        ``date``, ``description`` and ``content``.
        """
        # if you want to get rid of date for each article use
        # pubdate = strftime('')
        pubdate = strftime('[%a, %d %b]')

        all_articles = []
        for feed_title, feed_href in self._find_sections():
            feed_url = self.root_url + feed_href

            self.report_progress(0, ('Fetching feed') + ' %s...' %
                                 (feed_title if feed_title else feed_url))

            articles, shorttitles = self._scan_section(feed_url)

            feed_articles = []
            for article_name, article_url, article_id in articles:
                description = shorttitles.get(article_id, '')
                # we do not want the flag ("Impressum")
                if "HERAUSGEGEBEN VOM" in description:
                    continue
                feed_articles.append(dict(
                    title=article_name, url=self.root_url + article_url,
                    date=pubdate, description=description, content=''))
            all_articles.append((feed_title, feed_articles))

        return all_articles

    def _find_sections(self):
        """Return ``(title, href)`` for every section link on the index page."""
        soup = self.index_to_soup(self.root_url)
        sections = []
        for link in soup.findAll('a', href=True):
            href = link['href']
            if not href.startswith('section.php?section'):
                continue
            text = link.string
            if text is None:
                # ``.string`` is None when the link holds nested markup;
                # fall back to the recursively extracted text.
                text = self.tag_to_string(link)
            # The last two characters of the link text are dropped.
            # NOTE(review): presumably a decorative suffix -- confirm.
            sections.append((text[0:-2], href))
        return sections

    def _scan_section(self, feed_url):
        """Collect the article links of one section page.

        Returns ``(articles, shorttitles)``. ``articles`` is a list of
        ``(name, href, article_id)`` tuples in page order; ``shorttitles``
        maps an article id to the text used as its description.
        """
        soup = self.index_to_soup(feed_url)
        articles = []
        shorttitles = {}
        for link in soup.findAll('a', href=True):
            href = link['href']
            if not href.startswith('article.php?id='):
                continue
            match = self._article_id_re.search(href)
            if match is None:
                # No usable numeric id in the link; skip it instead of
                # aborting the whole download.
                continue
            article_id = int(match.group(1))

            # first check if link is a special article in section
            # "Meinungsseite"
            if link.find('strong') is not None:
                article_name = link.strong.string
                if len(link.contents) > 1:
                    shorttitles[article_id] = link.contents[1]
                articles.append((article_name, href, article_id))
                continue

            # candidate for a general article
            article_name = '' if link.string is None else link.string

            # just another link ("mehr") to an article
            # NOTE(review): a parser that decodes entities yields U+00A0
            # instead of the literal '&nbsp;', so both forms are accepted.
            if article_name.startswith(('&nbsp;mehr', u'\xa0mehr')):
                continue

            # A link carrying an ``id`` attribute supplies the short title
            # (used as description); the link without one is the article.
            if link.get('id') is not None:
                shorttitles[article_id] = article_name
            else:
                articles.append((article_name, href, article_id))
        return articles, shorttitles