Fix #6948 (Scientific American Recipe)

2025-07-09 03:04:10 -04:00 · 2010-09-25 09:24:15 -06:00 · 2010-09-25 09:24:15 -06:00 · 8ce7d6ca74
commit 8ce7d6ca74
parent 0ce2126cdc
1 changed file with 72 additions and 59 deletions
--- a/resources/recipes/scientific_american.recipe
+++ b/resources/recipes/scientific_american.recipe
@@ -1,78 +1,91 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 sciam.com
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class ScientificAmerican(BasicNewsRecipe):
-    title = u'Scientific American'
+    title                 = u'Scientific American'
-    description = u'Popular science. Monthly magazine.'
+    description           = u'Popular Science. Monthly magazine.'
-    __author__ = 'Kovid Goyal'
+    category              = 'science'
-    language = 'en'
+    __author__            = 'Starson17'
-    remove_javascript   = True
+    no_stylesheets        = True
-    encoding = 'utf-8'
+    use_embedded_content  = False
    language              = 'en'
    publisher             = 'Nature Publishing Group'
    remove_empty_feeds    = True
    remove_javascript     = True
    oldest_article        = 30
    max_articles_per_feed = 100
-    def print_version(self, url):
+    conversion_options = {'linearize_tables'  : True
-        return url + '&print=true'
+                        , 'comment'           : description
                        , 'tags'              : category
                        , 'publisher'         : publisher
                        , 'language'          : language
                        }
    keep_only_tags = [
                dict(name='h2', attrs={'class':'articleTitle'})
                ,dict(name='p', attrs={'id':'articleDek'})
                ,dict(name='p', attrs={'class':'articleInfo'})
                ,dict(name='div', attrs={'id':['articleContent']})
                ,dict(name='img', attrs={'src':re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}) 
                ]
    remove_tags = [dict(name='a', attrs={'class':'tinyCommentCount'})]
    def parse_index(self):
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')
-        month = self.tag_to_string(soup.find('p',attrs={'id':'articleDek'}))
+        issuetag = soup.find('p',attrs={'id':'articleDek'})
-        self.timefmt = ' [%s]'%(' '.join(month.strip().split()[:2]))
+        self.timefmt = ' [%s]'%(self.tag_to_string(issuetag))
        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']
-
+        features, feeds = [], []
-        feeds = []
+        for a in soup.find(attrs={'class':'primaryCol'}).findAll('a',attrs={'title':'Feature'}):
-        for div in soup.findAll('div', attrs={'class':['primaryCol',
+            if a is None: continue
-            'secondaryCol']}):
+            desc = ''
-            current_section = None
+            s = a.parent.parent.find(attrs={'class':'dek'})
-            for tag in div.findAll(['h2', 'ul']):
+            desc = self.tag_to_string(s)
-                if tag.name == 'h2':
+            article = {
-                    current_section = self.tag_to_string(tag).strip()
+                    'url' : a['href'],
-                    self.log('\tFound section:', current_section)
+                    'title' : self.tag_to_string(a),
-                elif current_section is not None and tag.name == 'ul':
+                    'date' : '',
-                    articles = []
+                    'description' : desc,
-                    for li in tag.findAll('li'):
+                    }
-                        t = li.findAll('a',
+            features.append(article)
-                                attrs={'class':lambda x: x != 'thumb'},
+        feeds.append(('Features', features))
-                                href=lambda x: x and 'article.cfm' in x)
+        department = []
-                        if not t:
+        title = None
-                            continue
+        for li in soup.find(attrs={'class':'secondaryCol'}).findAll('li'):
-                        t = t[-1]
+            if 'department.cfm' in li.a['href']:
-                        title = self.tag_to_string(t)
+                if department:
-                        url = t['href']
+                    feeds.append((title, department))
-                        desc = ''
+                title = self.tag_to_string(li.a)
-                        p = li.find(attrs={'class':'dek'})
+                department = []
-                        if p is not None:
+            if 'article.cfm' in li.h3.a['href']:
-                            desc = self.tag_to_string(p)
+                article = {
-                        articles.append({'title':title, 'url':url,
+                        'url' : li.h3.a['href'],
-                            'description':desc, 'date':''})
+                        'title' : self.tag_to_string(li.h3.a),
-                        self.log('\t\tFound article:', title, '\n\t\tat', url)
+                        'date': '',
-                    if articles:
+                        'description': self.tag_to_string(li.p),
-                        feeds.append((current_section, articles))
+                    }
-                    current_section = None
+                department.append(article)
        if department:
            feeds.append((title, department))
        return feeds
    def postprocess_html(self, soup, first_fetch):
-        if soup is not None:
+        for item in soup.findAll('a'):
-            for span in soup.findAll('span', attrs={'class':'pagination'}):
+            if 'topic.cfm' in item['href']:
-                span.extract()
+                item.replaceWith(item.string)
            if not first_fetch:
                div = soup.find('div', attrs={'class':'headline'})
                if div:
                    div.extract()
        return soup
-    preprocess_regexps = [
+    extra_css = '''
-        (re.compile(r'Already a Digital subscriber.*Now</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                p{font-weight: normal; font-size:small}
-        (re.compile(r'If your institution has site license access, enter.*here</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                li{font-weight: normal; font-size:small}
-        (re.compile(r'to subscribe to our.*;.*\}', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
-        (re.compile(r'\)\(jQuery\);.*-->', re.DOTALL|re.IGNORECASE), lambda match: ''),
+                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
-        ]
+                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''