Fix #4947 (Problems with new scientist recipe)

2025-08-05 08:40:13 -04:00 · 2010-02-21 19:14:05 -07:00 · 2010-02-21 19:14:05 -07:00 · 1846300458
commit 1846300458
parent e3fac897f0
1 changed file with 21 additions and 9 deletions
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@@ -1,11 +1,11 @@
-#!/usr/bin/env  python

 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
 '''
 newscientist.com
 '''

+import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewScientist(BasicNewsRecipe):
@@ -15,12 +15,14 @@ class NewScientist(BasicNewsRecipe):
    language              = 'en'
    publisher             = 'New Scientist'
    category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
-    delay                 = 3
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
+    cover_url             = 'http://www.newscientist.com/currentcover.jpg'
+    masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding              = 'utf-8'
+    extra_css             = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '

    conversion_options = {
                          'comment'          : description
@@ -28,14 +30,18 @@ class NewScientist(BasicNewsRecipe):
                        , 'publisher'        : publisher
                        , 'language'         : language
                        }
+    preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]

-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]

    remove_tags = [
-                     dict(name='div', attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div', attrs={'id'   :['compnl','artIssueInfo','artTools']})
-                    ,dict(name='p'  , attrs={'class':['marker','infotext'               ]})
+                     dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
+                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
+                    ,dict(name='meta' , attrs={'name' :'description'                       })
                  ]
+    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_attributes = ['height','width']

    feeds          = [
                        (u'Latest Headlines'        , u'http://feeds.newscientist.com/science-news'              )
@@ -50,9 +56,15 @@ class NewScientist(BasicNewsRecipe):
                     ]

    def get_article_url(self, article):
-        url = article.get('guid',  None)
-        return url
+        return article.get('guid',  None)

    def print_version(self, url):
        return url + '?full=true&print=true'

+    def preprocess_html(self, soup):
+        for tg in soup.findAll('a'):
+            if tg.string == 'Home':
+                tg.parent.extract()
+                return self.adeify_images(soup)
+        return self.adeify_images(soup)
+