Update New Scientist

2025-07-09 03:04:10 -04:00 · 2012-01-20 08:50:27 +05:30 · 2012-01-20 08:50:27 +05:30 · f50122805c
commit f50122805c
parent ffdeba9f17
1 changed files with 50 additions and 5 deletions
--- a/recipes/new_scientist.recipe
+++ b/recipes/new_scientist.recipe
@ -1,16 +1,35 @@
-__license__   = 'GPL v3'
-__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+##
+## Title:        New Scientist RSS recipe
+## Contact:      AprilHare, Darko Miletic <darko.miletic at gmail.com>
+##
+## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
+## Copyright:    2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
+##
+## Written:      2008
+## Last Edited:  Jan 2012
+##
+
+'''
+01-19-2012: Added grayscale image conversion and duplicate article removal
+'''
+
+__license__   = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
+__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
+__version__     = 'v0.5.0'
+__date__        = '2012-01-19'
+__author__      = 'Darko Miletic'
+
 '''
 newscientist.com
 '''

 import re
 import urllib
+from calibre.utils.magick import Image
 from calibre.web.feeds.news import BasicNewsRecipe

 class NewScientist(BasicNewsRecipe):
    title                 = 'New Scientist - Online News w. subscription'
-    __author__            = 'Darko Miletic'
    description           = 'Science news and science articles from New Scientist.'
    language              = 'en'
    publisher             = 'Reed Business Information Ltd.'
@ -39,6 +58,15 @@ class NewScientist(BasicNewsRecipe):

    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]

+    # Whether to omit duplicates of articles (typically arising when articles are indexed in
+    # more than one section). If True, only the first occurrence will be downloaded.
+    filterDuplicates = True
+
+    # Whether to convert images to grayscale for eInk readers.
+    Convert_Grayscale = False
+
+    url_list = []   # This list is used to check if an article had already been included.
+
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.open('http://www.newscientist.com/')
@ -80,6 +108,10 @@ class NewScientist(BasicNewsRecipe):
        return article.get('guid',  None)

    def print_version(self, url):
+        if self.filterDuplicates:
+            if url in self.url_list:
+                return
+        self.url_list.append(url)
        return url + '?full=true&print=true'

    def preprocess_html(self, soup):
@ -101,3 +133,16 @@ class NewScientist(BasicNewsRecipe):
                   tg.replaceWith(tstr)
        return soup

+    # Converts images to Gray Scale
+    def postprocess_html(self, soup, first):
+        if self.Convert_Grayscale:
+            #process all the images
+            for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+                iurl = tag['src']
+                img = Image()
+                img.open(iurl)
+                if img < 0:
+                    raise RuntimeError('Out of memory')
+                img.type = "GrayscaleType"
+                img.save(iurl)
+        return soup