Update Nature

2026-01-05 19:50:21 -05:00 · 2019-01-18 06:14:22 +05:30 · 2019-01-18 06:14:22 +05:30 · 60e3bece0b
commit 60e3bece0b
parent e4b3f75c27
1 changed file with 59 additions and 23 deletions
--- a/recipes/nature.recipe
+++ b/recipes/nature.recipe
@@ -18,34 +18,44 @@ def check_words(words):
    return lambda x: x and frozenset(words.split()).intersection(x.split())


+def has_all_of(words):
+    return lambda x: x and frozenset(words.split()).issubset(x.split())
+
+
 class Nature(BasicNewsRecipe):
    title = 'Nature'
    __author__ = 'Jose Ortiz'
-    description = ('Nature is a weekly international multidisciplinary scientific journal'
-                   ' publishing peer-reviewed research in all fields of science and'
-                   ' technology on the basis of its originality, importance,'
-                   ' interdisciplinary interest, timeliness, accessibility, elegance and'
-                   ' surprising conclusions. Nature also provides rapid, authoritative,'
-                   ' insightful and arresting news and interpretation of topical and coming'
-                   ' trends affecting science, scientists and the wider public.')
+    description = (
+        'Nature is a weekly international multidisciplinary scientific journal'
+        ' publishing peer-reviewed research in all fields of science and'
+        ' technology on the basis of its originality, importance,'
+        ' interdisciplinary interest, timeliness, accessibility, elegance and'
+        ' surprising conclusions. Nature also provides rapid, authoritative,'
+        ' insightful and arresting news and interpretation of topical and coming'
+        ' trends affecting science, scientists and the wider public.'
+    )
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True

    keep_only_tags = [
-        dict(name='div',attrs={'data-component' : check_words('article-container')})
+        dict(name='div', attrs={'data-component': check_words('article-container')})
    ]

-    remove_tags = [
-        dict(attrs={'class' : check_words('hide-print')})
-    ]
+    remove_tags = [dict(attrs={'class': check_words('hide-print')})]

    def parse_index(self):
        soup = self.index_to_soup(BASE + '/nature/current-issue')
-        self.cover_url = 'https:' + soup.find('img',attrs={'data-test' : 'issue-cover-image'})['src']
-        section_tags = soup.find('div', {'data-container-type' : check_words('issue-section-list')})
-        section_tags = section_tags.findAll('div', {'class' : check_words('article-section')})
+        self.cover_url = 'https:' + soup.find(
+            'img', attrs={'data-test': check_words('issue-cover-image')}
+        )['src']
+        section_tags = soup.find(
+            'div', {'data-container-type': check_words('issue-section-list')}
+        )
+        section_tags = section_tags.findAll(
+            'div', {'class': check_words('article-section')}
+        )

        sections = defaultdict(list)
        ordered_sec_titles = []
@@ -55,23 +65,49 @@ class Nature(BasicNewsRecipe):
            sec_title = self.tag_to_string(sec.find('h2'))
            ordered_sec_titles.append(sec_title)
            for article in sec.findAll('article'):
-                title = self.tag_to_string(article.find('h3', {'itemprop' : check_words('name headline')}))
-                date = ' [' + self.tag_to_string(article.find('time', {'itemprop' : check_words('datePublished')})) + ']'
-                author = self.tag_to_string(article.find('li', {'itemprop' : check_words('creator')}))
-                url =  absurl(article.find('a',{'itemprop' : check_words('url')})['href'])
-                label = self.tag_to_string(article.find(attrs={'data-test' : check_words('article.type')}))
-                description = label + ': ' + self.tag_to_string(article.find('div', attrs={'itemprop' : check_words('description')}))
-                sections[sec_title].append(
-                    {'title' : title, 'url' : url, 'description' : description, 'date' : date, 'author' : author})
+                try:
+                    url = absurl(
+                        article.find('a', {'itemprop': check_words('url')})['href']
+                    )
+                except TypeError:
+                    continue
+                title = self.tag_to_string(
+                    article.find('h3', {'itemprop': has_all_of('name headline')})
+                )
+                date = ' [' + self.tag_to_string(
+                    article.find('time', {'itemprop': check_words('datePublished')})
+                ) + ']'
+                author = self.tag_to_string(
+                    article.find('li', {'itemprop': check_words('creator')})
+                )
+                description = self.tag_to_string(
+                    article.find(attrs={'data-test': check_words('article.type')})
+                ) + u' • '
+                description += self.tag_to_string(
+                    article.find(
+                        'div', attrs={'itemprop': check_words('description')}
+                    )
+                )
+                sections[sec_title].append({
+                    'title': title,
+                    'url': url,
+                    'description': description,
+                    'date': date,
+                    'author': author
+                })

        for k in ordered_sec_titles:
            index.append((k, sections[k]))
        return index

    def preprocess_html(self, soup):
-        for img in soup.findAll('img',{'data-src' : True}):
+        for img in soup.findAll('img', {'data-src': True}):
            if img['data-src'].startswith('//'):
                img['src'] = 'https:' + img['data-src']
            else:
                img['src'] = img['data-src']
+        for div in soup.findAll(
+            'div', {'data-component': check_words('article-container')}
+        )[1:]:
+            div.extract()
        return soup