diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index 191bf905ca..ef9f58b003 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -39,7 +39,7 @@ recipe_modules = ['recipe_' + r for r in (
           'nacional_cro', '24sata', 'dnevni_avaz', 'glas_srpske', '24sata_rs',
           'krstarica', 'krstarica_en', 'tanjug', 'laprensa_ni', 'azstarnet',
           'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
-          'moneynews',
+          'moneynews', 'der_standard',
           )]
 
 import re, imp, inspect, time, os
diff --git a/src/calibre/web/feeds/recipes/recipe_der_standard.py b/src/calibre/web/feeds/recipes/recipe_der_standard.py
new file mode 100644
index 0000000000..eec4c4e74d
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py
@@ -0,0 +1,54 @@
+
+''' http://www.derstandard.at - Austrian Newspaper '''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class DerStandardRecipe(BasicNewsRecipe):
+    '''Recipe for the RSS sections of derstandard.at.
+
+    Article URLs are rewritten to their ``txt/`` variant and links to
+    section index pages or picture galleries are dropped.'''
+
+    title = u'derStandard'
+    __author__ = 'Gerhard Aigner'
+
+    oldest_article = 1
+    max_articles_per_feed = 100
+    feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
+             (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
+             (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
+             (u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
+             (u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
+             (u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
+             (u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
+             (u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
+             (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
+             (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
+             (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
+
+    encoding = 'utf-8'
+    language = _('German')
+    recursions = 0
+    remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
+        dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
+    preprocess_regexps = [
+        # Strip bracketed numeric markers such as "[1]" or "[12]".  The digit
+        # run must be quantified outside a character class: "[\d*]" would only
+        # match a single digit or a literal "*".
+        (re.compile(r'\[\d*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
+        # Drop hard-coded bgcolor="#..." attributes from the markup.
+        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
+    ]
+
+    def print_version(self, url):
+        '''Return the ``txt/`` variant of an article URL.'''
+        return url.replace('?id=', 'txt/?id=')
+
+    def get_article_url(self, article):
+        '''Return the article link, or None to skip the article.
+
+        Links to an index page (ressort) and picture galleries
+        (ansichtssache) are not added.'''
+        if 'ressort' in article.link or 'ansichtssache' in article.title.lower():
+            return None
+        return article.link