mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
132 lines
5.0 KiB
Plaintext
132 lines
5.0 KiB
Plaintext
__license__ = 'GPL v3'
|
|
__copyright__ = '2012, 2013 Andreas Zeiser <andreas.zeiser@web.de>'
|
|
'''
|
|
szmobil.sueddeutsche.de/
|
|
'''
|
|
# History
|
|
# 2015.01.02 Daily Cover from https://zeitung.sueddeutsche.de/webapp by lala-rob (web@lala-rob.de)
|
|
# 2014.12.18 Fixing URL set Cover by lala-rob (web@lala-rob.de)
|
|
# 2014.10.06 Fixing Login URL and Article URL by lala-rob (web@lala-rob.de)
|
|
#
|
|
# 2013.01.09 Fixed bugs in article titles containing "strong" and
|
|
# other small changes
|
|
# 2012.08.04 Initial release
|
|
|
|
from calibre import strftime
|
|
import datetime
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
import re
|
|
|
|
|
|
class SZmobil(BasicNewsRecipe):
    """Recipe for the subscription-only "SZ mobil" edition of the Sueddeutsche Zeitung.

    The recipe logs in with the subscriber's credentials, reads the list of
    sections from the mobile e-paper index page (``root_url``) and turns every
    section into one feed whose entries point at the individual article pages.
    """

    title = u'Süddeutsche Zeitung mobil'
    __author__ = u'Andreas Zeiser'
    description = u'Nachrichten aus Deutschland. Zugriff auf kostenpflichtiges Abo SZ mobil.'
    publisher = u'Sueddeutsche Zeitung'
    masthead_url = 'http://pix.sueddeutsche.de/img/layout/header/SZ_solo288x31.gif'
    language = u'de'
    publication_type = u'newspaper'
    category = u'news, politics, Germany'

    # The cover is the preview image of the issue whose id is the date one
    # hour ahead of UTC. The value is computed once, when the class body runs.
    # NOTE(review): the fixed one-hour offset presumably approximates German
    # local time and does not follow daylight saving time -- confirm.
    cover_url = 'https://zeitung.sueddeutsche.de/szdigital/public/issue/previewimage?size=l&issueId=' + \
        (datetime.datetime.utcnow() + datetime.timedelta(hours=1)
         ).strftime("%Y-%m-%d") + '&targetVersion=3&productId=sz'

    no_stylesheets = True
    oldest_article = 2
    encoding = 'iso-8859-1'
    needs_subscription = True
    remove_empty_feeds = True
    delay = 1

    # if you want to get rid of the date on the title page use
    # timefmt = ''
    timefmt = ' [%a, %d %b, %Y]'

    # Base address of the mobile e-paper; section and article links found on
    # the pages are relative to it.
    root_url = 'http://epaper.sueddeutsche.de/app/service/epaper-mobil/'
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]

    def get_browser(self):
        """Return a browser that has submitted the subscriber login form.

        Opens the login page, fills the first form on it with
        ``self.username`` and ``self.password`` and submits it.
        """
        browser = BasicNewsRecipe.get_browser(self)

        browser.open('https://id.sueddeutsche.de/login')

        browser.select_form(nr=0)  # the login form is the first form on the page
        browser['login'] = self.username
        browser['password'] = self.password
        browser.submit()
        return browser

    def parse_index(self):
        """Build one feed per section listed on the e-paper index page.

        Returns:
            A list of ``(section_title, articles)`` tuples. Each article is a
            dict with the keys ``title``, ``url``, ``date``, ``description``
            and ``content``.
        """
        index = self.index_to_soup(self.root_url)

        # find all sections
        sections = []
        for link in index.findAll('a', href=True):
            href = link['href']
            if href.startswith('section.php?section'):
                # The section title is the link text without its last two
                # characters. A link without a single text child has no
                # ``string``; use an empty title then (the progress message
                # below falls back to the URL) instead of failing on None.
                sections.append(((link.string or '')[:-2], href))

        # The same date stamp is attached to every article of this run.
        # if you want to get rid of date for each article use
        # pubdate = strftime('')
        pubdate = strftime('[%a, %d %b]')

        feeds = []
        for section_title, section_href in sections:
            section_url = self.root_url + section_href

            self.report_progress(
                0, 'Fetching feed' + ' %s...' % (section_title or section_url))

            articles, teasers = self._scan_section(
                self.index_to_soup(section_url))
            feeds.append(
                (section_title, self._build_feed(articles, teasers, pubdate)))

        return feeds

    def _scan_section(self, soup):
        """Collect the article links and teaser texts of one section page.

        Returns:
            ``(articles, teasers)`` where ``articles`` is a list of
            ``(name, href, article_id)`` tuples in page order and ``teasers``
            maps an article id to the short text found for it.
        """
        articles = []
        teasers = {}
        for link in soup.findAll('a', href=True):
            href = link['href']
            if not href.startswith('article.php?id='):
                continue

            match = re.search(r"id=(\d*)&etag=", href)
            if match is None or not match.group(1):
                # Without a numeric id the link cannot be paired with its
                # teaser; skip it rather than abort the whole download.
                self.log('Skipping article link without numeric id:', href)
                continue
            article_id = int(match.group(1))

            # first check if link is a special article in section
            # "Meinungsseite": the headline sits in <strong>, and the second
            # child of the link, when present, is stored as its teaser
            if link.find('strong') is not None:
                if len(link.contents) > 1:
                    teasers[article_id] = link.contents[1]
                articles.append((link.strong.string, href, article_id))
                continue

            # candidate for a general article
            name = '' if link.string is None else link.string

            if name.startswith(" mehr"):
                # just another link ("mehr") to an article
                continue

            # A link carrying an ``id`` attribute contributes the teaser text
            # for its article; a link without one is the article entry itself.
            if link.get('id') is not None:
                teasers[article_id] = name
            else:
                articles.append((name, href, article_id))

        return articles, teasers

    def _build_feed(self, articles, teasers, pubdate):
        """Turn the scanned links of one section into article dicts.

        Args:
            articles: ``(name, href, article_id)`` tuples from ``_scan_section``.
            teasers: mapping of article id to teaser text.
            pubdate: date string stored in every article dict.

        Returns:
            The list of article dicts for the section.
        """
        entries = []
        for name, href, article_id in articles:
            teaser = teasers.get(article_id)
            if teaser is None:
                teaser = ''
            elif "HERAUSGEGEBEN VOM" in teaser:
                # we do not want the flag ("Impressum")
                continue
            entries.append(dict(title=name, url=self.root_url + href,
                                date=pubdate, description=teaser, content=''))
        return entries
|