#!/usr/bin/env python2 # -*- coding: utf-8 -*- from __future__ import unicode_literals, division, absolute_import, print_function __license__ = 'GPL v3' __copyright__ = '2009, Gerhard Aigner ' ''' http://www.derstandard.at - Austrian Newspaper ''' import re import random from calibre.web.feeds.news import BasicNewsRecipe class DerStandardRecipe(BasicNewsRecipe): title = u'derStandard' __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer' description = u'Nachrichten aus Österreich' publisher = 'derStandard.at' category = 'news, politics, nachrichten, Austria' use_embedded_content = False remove_empty_feeds = True no_stylesheets = True encoding = 'utf-8' language = 'de_AT' oldest_article = 1 max_articles_per_feed = 100 ignore_duplicate_articles = {'title', 'url'} masthead_url = 'http://images.derstandard.at/2012/06/19/derStandardat_1417x274.gif' feeds = [ (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'), (u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'), (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'), (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'), (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'), (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'), (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'), (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'), (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'), (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'), (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'), (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'), (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'), (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'), (u'Familie', u'http://derstandard.at/?page=rss&ressort=Familie'), (u'Greenlife', u'http://derStandard.at/?page=rss&ressort=Greenlife'), (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'), (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'), (u'Automobil', u'http://derstandard.at/?page=rss&ressort=Automobil'), (u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'), (u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard') ] keep_only_tags = [ dict(name='div', attrs={'class': re.compile('^artikel')}) ] remove_tags = [ dict(name=['link', 'iframe', 'style', 'hr']), dict(attrs={'class': ['lookup-links', 'media-list']}), dict(name='form', attrs={'name': 'sitesearch'}), dict(name='div', attrs={'class': ['socialsharing', 'block video', 'blog-browsing section', 'diashow', 'supplemental']}), dict(name='div', attrs={'id': 'highlighted'}) ] remove_attributes = ['width', 'height'] preprocess_regexps = [ (re.compile(r'\[[\d]*\]', re.DOTALL | re.IGNORECASE), lambda match: ''), (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL | re.IGNORECASE), lambda match: '') ] filter_regexps = [r'/r[1-9]*'] def get_article_url(self, article): matchObj = re.search(re.compile( r'/r' + '[1-9]*', flags=0), article.link, flags=0) if matchObj: return None return article.link def preprocess_html(self, soup): if soup.find('div', {'class': re.compile('^artikel')}) is None: self.abort_article() for t in soup.findAll(['ul', 'li']): t.name = 'div' return soup def get_cover_url(self): base_url = 'https://epaper.derstandard.at/' url = base_url + 'shelf.act?s=' + str(random.random() * 10000) soup = self.index_to_soup(url) img = soup.find( 'img', {'class': re.compile('^thumbnailBig'), 'src': True}) if img and img['src']: cover_url = base_url + img['src'] return cover_url