diff --git a/src/calibre/web/feeds/recipes/recipe_der_standard.py b/src/calibre/web/feeds/recipes/recipe_der_standard.py index bb7935d496..effda75b47 100644 --- a/src/calibre/web/feeds/recipes/recipe_der_standard.py +++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py @@ -10,8 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe class DerStandardRecipe(BasicNewsRecipe): title = u'derStandard' - __author__ = 'Gerhard Aigner' - description = u'Nachrichten aus Österreich' + __author__ = 'Gerhard Aigner and Sujata Raman' + description = u'Nachrichten aus Österreich' publisher ='derStandard.at' category = 'news, politics, nachrichten, Austria' use_embedded_content = False @@ -21,18 +21,16 @@ class DerStandardRecipe(BasicNewsRecipe): encoding = 'utf-8' language = 'de' - recursions = 0 oldest_article = 1 max_articles_per_feed = 100 - - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - + + extra_css = ''' + .artikelBody{font-family:Arial,Helvetica,sans-serif;} + .artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + h4{color:#404450;font-size:x-small;} + h6{color:#404450; font-size:x-small;} + ''' feeds = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'), (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'), (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'), @@ -43,22 +41,33 @@ class DerStandardRecipe(BasicNewsRecipe): (u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'), (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'), (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'), - (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')] - remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'), 
dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')] + (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung') + ] + + keep_only_tags = [ + dict(name='div', attrs={'class':["artikel","artikelLeft","artikelBody"]}) , + ] + + remove_tags = [ + dict(name='link'), dict(name='meta'),dict(name='iframe'),dict(name='style'), + dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr'), + dict(name='div', attrs={'class':["diashow"]})] preprocess_regexps = [ (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '') ] - - def print_version(self, url): - return url.replace('?id=', 'txt/?id=') + + filter_regexps = [r'/r[1-9]*'] + + #def print_version(self, url): + # return url.replace('?id=', 'txt/?id=') def get_article_url(self, article): '''if the article links to a index page (ressort) or a picture gallery (ansichtssache), don't add it''' - if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0): + if ( article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0 ): return None + return article.link def preprocess_html(self, soup): @@ -66,4 +75,7 @@ class DerStandardRecipe(BasicNewsRecipe): soup.html['lang'] = self.lang mtag = '' soup.head.insert(0,mtag) - return soup \ No newline at end of file + + for t in soup.findAll(['ul', 'li']): + t.name = 'div' + return soup