From 184630045812419fdbc51bd6cea35fb7fadd7eb9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 21 Feb 2010 19:14:05 -0700
Subject: [PATCH] Fix #4947 (Problems with new scientist recipe)

---
 resources/recipes/new_scientist.recipe | 30 ++++++++++++++++++--------
 1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe
index 86d2b31e1e..1727a926ed 100644
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@@ -1,11 +1,11 @@
-#!/usr/bin/env  python
 
 __license__   = 'GPL v3'
-__copyright__ = '2008-2009, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
 '''
 newscientist.com
 '''
 
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class NewScientist(BasicNewsRecipe):
@@ -15,12 +15,14 @@ class NewScientist(BasicNewsRecipe):
     language              = 'en'
     publisher             = 'New Scientist'
     category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
-    delay                 = 3
     oldest_article        = 7
     max_articles_per_feed = 100
     no_stylesheets        = True
     use_embedded_content  = False
+    cover_url             = 'http://www.newscientist.com/currentcover.jpg'
+    masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
     encoding              = 'utf-8'
+    extra_css             = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '
 
     conversion_options = {
                           'comment'          : description
@@ -28,14 +30,18 @@ class NewScientist(BasicNewsRecipe):
                         , 'publisher'        : publisher
                         , 'language'         : language
                         }
+    preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
 
-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
 
     remove_tags = [
-                     dict(name='div', attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div', attrs={'id'   :['compnl','artIssueInfo','artTools']})
-                    ,dict(name='p'  , attrs={'class':['marker','infotext'               ]})
+                     dict(name='div'  , attrs={'class':['hldBd','adline','pnl','infotext' ]})
+                    ,dict(name='div'  , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='p'    , attrs={'class':['marker','infotext'               ]})
+                    ,dict(name='meta' , attrs={'name' :'description'                       })
                   ]
+    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_attributes = ['height','width']
 
     feeds          = [
                         (u'Latest Headlines'        , u'http://feeds.newscientist.com/science-news'              )
@@ -50,9 +56,15 @@ class NewScientist(BasicNewsRecipe):
                      ]
 
     def get_article_url(self, article):
-        url = article.get('guid',  None)
-        return url
+        return article.get('guid',  None)
 
     def print_version(self, url):
         return url + '?full=true&print=true'
 
+    def preprocess_html(self, soup):
+        # Drop the parent of the first link whose text is exactly 'Home', if any.
+        for tg in soup.findAll('a'):
+            if tg.string == 'Home':
+                tg.parent.extract()
+                break
+        return self.adeify_images(soup)