__license__ = 'GPL v3'
__copyright__ = '2012, 2013 Andreas Zeiser '
''' szmobil.sueddeutsche.de/ '''
# History
# 2015.01.02 Daily Cover from https://zeitung.sueddeutsche.de/webapp by lala-rob (web@lala-rob.de)
# 2014.12.18 Fixing URL set Cover by lala-rob (web@lala-rob.de)
# 2014.10.06 Fixing Login URL and Article URL by lala-rob (web@lala-rob.de)
#
# 2013.01.09 Fixed bugs in article titles containing "strong" and
#            other small changes
# 2012.08.04 Initial release

from calibre import strftime
import datetime
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

# Extracts the numeric article id from hrefs such as
# "article.php?id=12345&etag=...".  Compiled once because it is applied to
# every article link of every section.
_ARTICLE_ID_RE = re.compile(r'id=(\d+)&etag=')


class SZmobil(BasicNewsRecipe):
    """Recipe for the subscription-only mobile edition of Sueddeutsche Zeitung.

    Logs in with the user's credentials, reads the section list from the
    mobile e-paper index page and builds one feed per section.
    """

    title = u'Süddeutsche Zeitung mobil'
    __author__ = u'Andreas Zeiser'
    description = u'Nachrichten aus Deutschland. Zugriff auf kostenpflichtiges Abo SZ mobil.'
    publisher = u'Sueddeutsche Zeitung'
    masthead_url = 'http://pix.sueddeutsche.de/img/layout/header/SZ_solo288x31.gif'
    language = u'de'
    publication_type = u'newspaper'
    category = u'news, politics, Germany'
    # The issue id is the current date shifted to UTC+1.  It is computed once,
    # when the class body is executed, and the fixed one-hour offset does not
    # follow daylight saving time.
    cover_url = (
        'https://zeitung.sueddeutsche.de/szdigital/public/issue/previewimage?size=l&issueId='
        + (datetime.datetime.utcnow() + datetime.timedelta(hours=1)).strftime("%Y-%m-%d")
        + '&targetVersion=3&productId=sz'
    )

    no_stylesheets = True
    oldest_article = 2
    encoding = 'iso-8859-1'
    needs_subscription = True
    remove_empty_feeds = True
    delay = 1

    # if you want to get rid of the date on the title page use
    # timefmt = ''
    timefmt = ' [%a, %d %b, %Y]'

    root_url = 'http://epaper.sueddeutsche.de/app/service/epaper-mobil/'
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]

    def get_browser(self):
        """Return a browser on which the login form has been submitted.

        Opens the login page, fills the first form on it with
        ``self.username`` and ``self.password`` and submits it.
        """
        browser = BasicNewsRecipe.get_browser(self)

        # Fill out and submit the login request.
        url = 'https://id.sueddeutsche.de/login'
        browser.open(url)

        browser.select_form(nr=0)  # to select the first form
        browser['login'] = self.username
        browser['password'] = self.password
        browser.submit()

        return browser

    def parse_index(self):
        """Build the list of feeds from the mobile e-paper index page.

        Returns:
            A list of ``(feed_title, articles)`` tuples, one per section link
            found on the index page.  ``articles`` is a list of dicts with the
            keys ``title``, ``url``, ``date``, ``description`` and ``content``.
        """
        # find all sections
        index_soup = self.index_to_soup(
            'http://epaper.sueddeutsche.de/app/service/epaper-mobil/')
        feeds = []
        for link in index_soup.findAll('a', href=True):
            href = link['href']
            if not href.startswith('section.php?section'):
                continue
            label = link.string
            # The last two characters of the link text are dropped
            # (presumably a trailing decoration -- TODO confirm).  ``.string``
            # is None when the link has nested markup; fall back to an empty
            # title instead of crashing on the slice.
            feed_title = label[0:-2] if label is not None else ''
            feeds.append((feed_title, href))

        all_articles = []
        for feed_title, feed_href in feeds:
            feed_url = self.root_url + feed_href

            self.report_progress(
                0, 'Fetching feed %s...' % (feed_title or feed_url))

            section_soup = self.index_to_soup(feed_url)
            articles = []
            shorttitles = {}
            for link in section_soup.findAll('a', href=True):
                href = link['href']
                if not href.startswith('article.php?id='):
                    continue

                id_match = _ARTICLE_ID_RE.search(href)
                if id_match is None:
                    # No numeric id followed by "&etag=": the link cannot be
                    # keyed, so skip it rather than abort the whole download.
                    continue
                article_id = int(id_match.group(1))

                # first check if link is a special article in section
                # "Meinungsseite"
                if link.find('strong') is not None:
                    if len(link.contents) > 1:
                        shorttitles[article_id] = link.contents[1]
                    articles.append((link.strong.string, href, article_id))
                    continue

                # candidate for a general article
                article_name = '' if link.string is None else link.string

                if article_name.startswith(" mehr"):
                    # just another link ("mehr") to an article
                    continue

                # Links carrying an ``id`` attribute hold the short title of
                # an article; links without it are the article entries.
                # ``Tag.get`` replaces the deprecated ``Tag.has_key``.
                if link.get('id') is not None:
                    shorttitles[article_id] = article_name
                else:
                    articles.append((article_name, href, article_id))

            feed_articles = []
            for article_name, article_url, article_id in articles:
                # ``dict.has_key`` does not exist on Python 3; ``get`` with a
                # default yields the same description.
                description = shorttitles.get(article_id, '')
                # we do not want the flag ("Impressum")
                if "HERAUSGEGEBEN VOM" in description:
                    continue
                feed_articles.append(dict(
                    title=article_name,
                    url=self.root_url + article_url,
                    # if you want to get rid of date for each article use
                    # date=strftime('')
                    date=strftime('[%a, %d %b]'),
                    description=description,
                    content='',
                ))
            all_articles.append((feed_title, feed_articles))

        return all_articles