Fix #6948 (Scientific American Recipe)

2026-01-02 02:00:20 -05:00 · 2010-09-25 09:24:15 -06:00 · 2010-09-25 09:24:15 -06:00 · 8ce7d6ca74
commit 8ce7d6ca74
parent 0ce2126cdc
1 changed files with 72 additions and 59 deletions
--- a/resources/recipes/scientific_american.recipe
+++ b/resources/recipes/scientific_american.recipe
@@ -1,78 +1,91 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
-__docformat__ = 'restructuredtext en'

-'''
-sciam.com
-'''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe

 class ScientificAmerican(BasicNewsRecipe):
-    title = u'Scientific American'
-    description = u'Popular science. Monthly magazine.'
-    __author__ = 'Kovid Goyal'
-    language = 'en'
-    remove_javascript   = True
-    encoding = 'utf-8'
+    title                 = u'Scientific American'
+    description           = u'Popular Science. Monthly magazine.'
+    category              = 'science'
+    __author__            = 'Starson17'
+    no_stylesheets        = True
+    use_embedded_content  = False
+    language              = 'en'
+    publisher             = 'Nature Publishing Group'
+    remove_empty_feeds    = True
+    remove_javascript     = True
+    oldest_article        = 30
+    max_articles_per_feed = 100

-    def print_version(self, url):
-        return url + '&print=true'
+    conversion_options = {'linearize_tables'  : True
+                        , 'comment'           : description
+                        , 'tags'              : category
+                        , 'publisher'         : publisher
+                        , 'language'          : language
+                        }
+
+    keep_only_tags = [
+                dict(name='h2', attrs={'class':'articleTitle'})
+                ,dict(name='p', attrs={'id':'articleDek'})
+                ,dict(name='p', attrs={'class':'articleInfo'})
+                ,dict(name='div', attrs={'id':['articleContent']})
+                ,dict(name='img', attrs={'src':re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}) 
+                ]
+
+    remove_tags = [dict(name='a', attrs={'class':'tinyCommentCount'})]

    def parse_index(self):
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
-        month = self.tag_to_string(soup.find('p',attrs={'id':'articleDek'}))
-        self.timefmt = ' [%s]'%(' '.join(month.strip().split()[:2]))
+        issuetag = soup.find('p',attrs={'id':'articleDek'})
+        self.timefmt = ' [%s]'%(self.tag_to_string(issuetag))
        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']
-
-        feeds = []
-        for div in soup.findAll('div', attrs={'class':['primaryCol',
-            'secondaryCol']}):
-            current_section = None
-            for tag in div.findAll(['h2', 'ul']):
-                if tag.name == 'h2':
-                    current_section = self.tag_to_string(tag).strip()
-                    self.log('\tFound section:', current_section)
-                elif current_section is not None and tag.name == 'ul':
-                    articles = []
-                    for li in tag.findAll('li'):
-                        t = li.findAll('a',
-                                attrs={'class':lambda x: x != 'thumb'},
-                                href=lambda x: x and 'article.cfm' in x)
-                        if not t:
-                            continue
-                        t = t[-1]
-                        title = self.tag_to_string(t)
-                        url = t['href']
-                        desc = ''
-                        p = li.find(attrs={'class':'dek'})
-                        if p is not None:
-                            desc = self.tag_to_string(p)
-                        articles.append({'title':title, 'url':url,
-                            'description':desc, 'date':''})
-                        self.log('\t\tFound article:', title, '\n\t\tat', url)
-                    if articles:
-                        feeds.append((current_section, articles))
-                    current_section = None
+        features, feeds = [], []
+        for a in soup.find(attrs={'class':'primaryCol'}).findAll('a',attrs={'title':'Feature'}):
+            if a is None: continue
+            desc = ''
+            s = a.parent.parent.find(attrs={'class':'dek'})
+            desc = self.tag_to_string(s)
+            article = {
+                    'url' : a['href'],
+                    'title' : self.tag_to_string(a),
+                    'date' : '',
+                    'description' : desc,
+                    }
+            features.append(article)
+        feeds.append(('Features', features))
+        department = []
+        title = None
+        for li in soup.find(attrs={'class':'secondaryCol'}).findAll('li'):
+            if 'department.cfm' in li.a['href']:
+                if department:
+                    feeds.append((title, department))
+                title = self.tag_to_string(li.a)
+                department = []
+            if 'article.cfm' in li.h3.a['href']:
+                article = {
+                        'url' : li.h3.a['href'],
+                        'title' : self.tag_to_string(li.h3.a),
+                        'date': '',
+                        'description': self.tag_to_string(li.p),
+                    }
+                department.append(article)
+        if department:
+            feeds.append((title, department))
        return feeds

    def postprocess_html(self, soup, first_fetch):
-        if soup is not None:
-            for span in soup.findAll('span', attrs={'class':'pagination'}):
-                span.extract()
-            if not first_fetch:
-                div = soup.find('div', attrs={'class':'headline'})
-                if div:
-                    div.extract()
-
+        for item in soup.findAll('a'):
+            if 'topic.cfm' in item['href']:
+                item.replaceWith(item.string)
        return soup

-    preprocess_regexps = [
-        (re.compile(r'Already a Digital subscriber.*Now</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'If your institution has site license access, enter.*here</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'to subscribe to our.*;.*\}', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        (re.compile(r'\)\(jQuery\);.*-->', re.DOTALL|re.IGNORECASE), lambda match: ''),
-        ]
+    extra_css = '''
+                p{font-weight: normal; font-size:small}
+                li{font-weight: normal; font-size:small}
+                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
+                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
+                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
+                '''