Update Smithsonian Magazine

Kovid Goyal 2021-11-15 15:06:59 +05:30
parent e7d4e348ba
commit 876290c600

@@ -1,6 +1,15 @@
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from collections import OrderedDict
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+
+CATEGORIES = {
+    'smart-news': 'Smart News',
+    'history': 'History',
+    'science': 'Science',
+    'innovation': 'Innovation',
+    'arts-culture': 'Arts & Culture',
+    'travel': 'Travel',
+    'smithsonian-institution': 'At the Smithsonian'
+}
 
 
 class Smithsonian(BasicNewsRecipe):
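
The classes helper imported above replaces the hand-rolled matchers deleted in the next hunk. Its definition is not shown on this page, so the following is a hedged reconstruction inferred from the set-intersection lambda it supersedes, not a copy of calibre's source:

def classes(class_names):
    # Assumed behaviour: build a BeautifulSoup attrs matcher that fires
    # when a tag's class attribute shares at least one name with the
    # space-separated list in class_names.
    names = frozenset(class_names.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(names)})

Read this way, classes('article-header articleLeft') keeps any tag carrying either class, and the single space-separated string in the new remove_tags stands in for what used to take several separate dict() matchers.
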
@@ -12,59 +21,47 @@ class Smithsonian(BasicNewsRecipe):
     language = 'en'
     category = 'news'
     encoding = 'UTF-8'
-    keep_only_tags = [dict(name='main', attrs={'class': 'main'})]
-    remove_tags = [
-        dict(attrs={'class': lambda x: x and set(x.split()).intersection(
-            {'hidden-phone', 'hidden-tablet', 'hidden-desktop'})}),
-        dict(attrs={'class': ['slideshow-nav', 'associated-container']}),
+    keep_only_tags = [
+        classes('article-header articleLeft')
+    ]
+    remove_tags = [
+        classes(
+            'hidden-phone hidden-tablet hidden-desktop slideshow-nav associated-container'
+            ' widget-article-pixel tag-list recommended-videos comments'
+        )
     ]
+    remove_tags_after = dict(
+        name='div', attrs={'class': lambda x: x and 'article-body' in x.split()})
     no_javascript = True
     no_stylesheets = True
 
+    def parse_section(self, url):
+        soup = self.index_to_soup(url)
+        seen = set()
+        for al in soup.findAll(attrs={'class': 'article-list'}):
+            for article in al.findAll(attrs={'class': 'article-list-item'}):
+                div = article.find(attrs={'class': 'article-list-text'})
+                a = div.find('a')
+                title = self.tag_to_string(a)
+                if title in seen:
+                    continue
+                seen.add(title)
+                url = 'https://www.smithsonianmag.com/' + a['href'].lstrip('/')
+                if '/tag/' in url:
+                    continue
+                desc = ''
+                p = div.find(attrs={'class': 'article-list-text'})
+                if p is not None:
+                    desc = self.tag_to_string(p)
+                self.log('\t' + title)
+                yield {'title': title, 'url': url, 'description': desc}
+
     def parse_index(self):
-        # Go to the issue
-        soup = self.index_to_soup(
-            'http://www.smithsonianmag.com/issue/archive/?no-ist')
-        li = soup.find('li', attrs={'class': 'issue'})
-        url_prefix = 'http://www.smithsonianmag.com'
-        current_issue_url = url_prefix + \
-            li.find('a', href=True)['href'] + '?no-ist'
-        self.log('Downloading issue:', current_issue_url)
-        soup = self.index_to_soup(current_issue_url)
-        # Go to the main body
-        div = soup.find('div', id='Page-Content')
-        # Find date
-        date = re.sub(
-            r'.*\:\W*', "", self.tag_to_string(div.find('h1')).strip())
-        self.timefmt = u' [%s]' % date
-        # Find cover
-        self.cover_url = div.find(
-            'img', alt=lambda x: x and 'Cover' in x, src=True)['src']
-        feeds = OrderedDict()
-        section_title = ''
-        articles = []
-        for div in soup.findAll('div', attrs={'class': 'article-list'}):
-            section_title = self.tag_to_string(
-                div.find('h2', attrs={'class': 'headline'})).capitalize()
-            self.log('\n\nFound section:', section_title)
-            articles = feeds[section_title] = []
-            for sec in div.findAll('section', attrs={'class': lambda x: x and 'article-teaser' in x.split()}):
-                head = sec.find(attrs={'class': 'headline'})
-                url = head.find('a', href=True)['href'] + '?all&no-ist'
-                if url.startswith('/'):
-                    url = url_prefix + url
-                title = self.tag_to_string(head)
-                desc = sec.find(attrs={'class': 'sub-title'})
-                desc = '' if desc is None else self.tag_to_string(desc)
-                self.log('Found article:', title)
-                self.log('\t', url)
-                articles.append(
-                    {'title': title, 'url': url, 'description': desc})
-        ans = [(key, val) for key, val in feeds.items()]
+        ans = []
+        for slug, title in CATEGORIES.items():
+            url = 'https://www.smithsonianmag.com/category/' + slug + '/'
+            self.log('Parsing section:', title)
+            articles = list(self.parse_section(url))
+            if articles:
+                ans.append((title, articles))
+            if self.test and len(ans) >= self.test[0]:
+                break
         return ans
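
Downstream of this diff, the two methods form a small contract: parse_section is a generator yielding one plain dict per article, and parse_index packages those into the list of (section title, article list) tuples that BasicNewsRecipe expects back. A shape-only sketch, with placeholder title and URL rather than fetched data:

# One item as yielded by parse_section (values are illustrative):
article = {
    'title': 'Example headline',
    'url': 'https://www.smithsonianmag.com/history/example/',
    'description': 'Teaser text, or the empty string',
}

# What parse_index returns: one pair per CATEGORIES entry that
# produced at least one article, in dict-insertion order.
index = [
    ('Smart News', [article]),
    ('History', [article]),
]

Since plain dicts preserve insertion order on the Python 3 that calibre now targets, the old OrderedDict import could be dropped along with the issue-archive scraping. The self.test guard serves calibre's usual recipe-development loop: running ebook-convert on the recipe file with the --test flag sets self.test to a small (feeds, articles-per-feed) tuple, typically (2, 2), so the category loop above breaks after the first couple of sections instead of crawling all seven.
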