Merge branch 'scientific-american-recipe-fix' of https://github.com/ping/calibre

2025-08-30 23:00:21 -04:00 · 2023-04-21 10:31:10 +05:30 · 2023-04-21 10:31:10 +05:30 · ef8a6f05ac
commit ef8a6f05ac
parent 5ba24a04eb 9837ad8041
1 changed file with 73 additions and 80 deletions
--- a/recipes/scientific_american.recipe
+++ b/recipes/scientific_american.recipe
@@ -1,30 +1,30 @@
 #!/usr/bin/env python
-__license__ = 'GPL v3'
+__license__ = "GPL v3"
+
+import json
+from datetime import datetime
+from os.path import splitext
+from urllib.parse import urljoin

 from calibre.web.feeds.news import BasicNewsRecipe, classes
-from css_selectors import Select
-
-
-def absurl(url):
-    if url.startswith('/'):
-        url = 'https://www.scientificamerican.com' + url
-    return url


 class ScientificAmerican(BasicNewsRecipe):
-    title = u'Scientific American'
-    description = u'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
-    category = 'science'
-    __author__ = 'Kovid Goyal'
+    title = "Scientific American"
+    description = "Popular Science. Monthly magazine. Should be downloaded around the middle of each month."
+    category = "science"
+    __author__ = "Kovid Goyal"
    no_stylesheets = True
-    language = 'en'
-    publisher = 'Nature Publishing Group'
+    language = "en"
+    publisher = "Nature Publishing Group"
    remove_empty_feeds = True
    remove_javascript = True
-    timefmt = ' [%B %Y]'
-    remove_attributes = ['height','width']
-    masthead_url = 'https://static.scientificamerican.com/sciam/assets/Image/newsletter/salogo.png'
-    extra_css = '''
+    timefmt = " [%B %Y]"
+    remove_attributes = ["height", "width"]
+    masthead_url = (
+        "https://static.scientificamerican.com/sciam/assets/Image/newsletter/salogo.png"
+    )
+    extra_css = """
        .image-captioned{font-size:small;}
        .feature-article__byline-authors{font-size:small;}
        .article-header__inner__category{font-size:small; color:gray;}
@@ -33,85 +33,78 @@ class ScientificAmerican(BasicNewsRecipe):
        .opinion-article__byline-authors{font-size:small;}
        .article-author{font-size:small;}
        [role="presentation"]{font-size:small;}
-    '''
+    """

-    needs_subscription = 'optional'
+    needs_subscription = "optional"

    keep_only_tags = [
        classes(
-            'article-header article-content article-media article-author article-text feature-article--header'
-            ' feature-article--header-title opinion-article__header-title author-bio'),
+            "article-header article-content article-media article-author article-text feature-article--header"
+            " feature-article--header-title opinion-article__header-title author-bio"
+        ),
    ]
    remove_tags = [
-        classes('aside-banner moreToExplore article-footer flex-column--25 article-author__suggested medium-up-hide'),
-        dict(id=['seeAlsoLinks']),
+        classes(
+            "aside-banner moreToExplore article-footer flex-column--25 article-author__suggested medium-up-hide"
+        ),
+        dict(id=["seeAlsoLinks"]),
    ]

    def get_browser(self, *args):
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
-            br.open('https://www.scientificamerican.com/my-account/login/')
-            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
-            br['emailAddress'] = self.username
-            br['password'] = self.password
+            br.open("https://www.scientificamerican.com/my-account/login/")
+            br.select_form(predicate=lambda f: f.attrs.get("id") == "login")
+            br["emailAddress"] = self.username
+            br["password"] = self.password
            br.submit()
        return br

    def parse_index(self):
        # Get the cover, date and issue URL
-        root = self.index_to_soup(
-            'https://www.scientificamerican.com/sciammag/', as_tree=True)
-        select = Select(root)
-        self.cover_url = [x.get('src', '') for x in select('main .store-listing__img img')][0]
-        url = [x.get('href', '') for x in select('main .store-listing__img a')][0]
-        url = absurl(url)
+        fp_soup = self.index_to_soup("https://www.scientificamerican.com")
+        curr_issue_link = fp_soup.select(".tout_current-issue__cover a")
+        if not curr_issue_link:
+            self.abort_recipe_processing("Unable to find issue link")
+        issue_url = curr_issue_link[0]["href"]
+        soup = self.index_to_soup(issue_url)
+        script = soup.find("script", id="__NEXT_DATA__")
+        if not script:
+            self.abort_recipe_processing("Unable to find script")

-        # Now parse the actual issue to get the list of articles
-        select = Select(self.index_to_soup(url, as_tree=True))
-        self.cover_url = [x.get('src', '') for x in select('main .product-detail__image img')][0].split('?')[0]
-        self.cover_url += '?w=800'
-        feeds = []
-        for i, section in enumerate(select('#sa_body .toc-articles')):
-            if i == 0:
-                feeds.append(
-                    ('Features', list(self.parse_sciam_features(select, section))))
-            else:
-                feeds.extend(self.parse_sciam_departments(select, section))
+        issue_info = (
+            json.loads(script.contents[0])
+            .get("props", {})
+            .get("pageProps", {})
+            .get("issue", {})
+        )
+        if not issue_info:
+            self.abort_recipe_processing("Unable to find issue info")

-        return feeds
+        image_id, _ = splitext(issue_info["image"])
+        self.cover_url = f"https://static.scientificamerican.com/sciam/cache/file/{image_id}_source.jpg?w=800"

-    def parse_sciam_features(self, select, section):
-        for article in select('article[data-article-title]', section):
-            title = article.get('data-article-title')
-            url = 'https://www.scientificamerican.com/{}/'.format(article.get('id').replace('-', '/', 1))
-            desc = ''
-            for p in select('p.t_body', article):
-                desc += self.tag_to_string(p)
-                break
-            for p in select('.t_meta', article):
-                desc += ' ' + self.tag_to_string(p)
-                break
-            self.log('Found feature article: %s at %s' % (title, url))
-            self.log('\t' + desc)
-            yield {'title': title, 'url': url, 'description': desc}
+        edition_date = datetime.strptime(issue_info["issue_date"], "%Y-%m-%d")
+        self.timefmt = f" [{edition_date:%B %Y}]"

-    def parse_sciam_departments(self, select, section):
-        section_title, articles = 'Unknown', []
-        for li in select('li[data-article-title]', section):
-            for span in select('span.department-title', li):
-                if articles:
-                    yield section_title, articles
-                section_title, articles = self.tag_to_string(span), []
-                self.log('\nFound section: %s' % section_title)
-                break
-            url = 'https://www.scientificamerican.com/{}/'.format(li.get('id').replace('-', '/', 1))
-            for h2 in select('h2.t_listing-title', li):
-                title = self.tag_to_string(h2)
-                break
-            else:
-                continue
-            articles.append(
-                    {'title': title, 'url': url, 'description': ''})
-            self.log('\tFound article: %s at %s' % (title, url))
-        if articles:
-            yield section_title, articles
+        feeds = {}
+        for section in ("featured", "departments"):
+            for article in issue_info.get("article_previews", {}).get(section, []):
+                if section == "featured":
+                    feed_name = "Features"
+                else:
+                    feed_name = article["category"]
+                if feed_name not in feeds:
+                    feeds[feed_name] = []
+                feeds[feed_name].append(
+                    {
+                        "title": article["title"],
+                        "url": urljoin(
+                            "https://www.scientificamerican.com/article/",
+                            article["slug"],
+                        ),
+                        "description": article["summary"],
+                    }
+                )
+
+        return feeds.items()