Fix #3487 (Recipe standard.at defect)

2025-07-09 03:04:10 -04:00 · 2009-09-17 08:42:52 -06:00 · 2009-09-17 08:42:52 -06:00 · 9e95f7e7f6
commit 9e95f7e7f6
parent 2f5325e387
1 changed files with 31 additions and 19 deletions
--- a/src/calibre/web/feeds/recipes/recipe_der_standard.py
+++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py
@@ -10,8 +10,8 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class DerStandardRecipe(BasicNewsRecipe):
    title = u'derStandard'
-    __author__ = 'Gerhard Aigner'
-    description = u'Nachrichten aus Österreich' 
+    __author__ = 'Gerhard Aigner and Sujata Raman'
+    description = u'Nachrichten aus Österreich'
    publisher ='derStandard.at'
    category = 'news, politics, nachrichten, Austria'
    use_embedded_content = False
@@ -21,18 +21,16 @@ class DerStandardRecipe(BasicNewsRecipe):
    encoding = 'utf-8'
    language = 'de'

-    recursions = 0
    oldest_article = 1
    max_articles_per_feed = 100
-    
-    html2lrf_options = [
-                          '--comment'  , description
-                        , '--category' , category
-                        , '--publisher', publisher
-                        ]

-    html2epub_options  = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
-    
+
+    extra_css = '''
+                .artikelBody{font-family:Arial,Helvetica,sans-serif;}
+                .artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                h4{color:#404450;font-size:x-small;}
+                h6{color:#404450; font-size:x-small;}
+                '''
    feeds          = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
        (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
@@ -43,22 +41,33 @@ class DerStandardRecipe(BasicNewsRecipe):
        (u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
        (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
        (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
-        (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')]
-    remove_tags = [dict(name='div'), dict(name='a'), dict(name='link'), dict(name='meta'),
-        dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr')]
+        (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')
+                      ]
+
+    keep_only_tags = [
+                        dict(name='div', attrs={'class':["artikel","artikelLeft","artikelBody"]}) ,
+                         ]
+
+    remove_tags = [
+                    dict(name='link'), dict(name='meta'),dict(name='iframe'),dict(name='style'),
+                    dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr'),
+                    dict(name='div', attrs={'class':["diashow"]})]
    preprocess_regexps = [
        (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]
-    
-    def print_version(self, url):
-        return url.replace('?id=', 'txt/?id=')
+
+    filter_regexps = [r'/r[1-9]*']
+
+    #def print_version(self, url):
+    #    return url.replace('?id=', 'txt/?id=')

    def get_article_url(self, article):
        '''if the article links to a index page (ressort) or a picture gallery
           (ansichtssache), don't add it'''
-        if (article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0):
+        if ( article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0 ):
            return None
+
        return article.link

    def preprocess_html(self, soup):
@@ -66,4 +75,7 @@ class DerStandardRecipe(BasicNewsRecipe):
        soup.html['lang']     = self.lang
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
        soup.head.insert(0,mtag)
-        return soup  
+
+        for t in soup.findAll(['ul', 'li']):
+            t.name = 'div'
+        return soup