#!/usr/bin/env python
__license__ = "GPL v3"

import json
from datetime import datetime
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


class ScientificAmerican(BasicNewsRecipe):
    title = "Scientific American"
    description = (
        "Popular Science. Monthly magazine. Should be downloaded around the middle of each month."
    )
    category = "science"
    __author__ = "Kovid Goyal"
    no_stylesheets = True
    language = "en"
    publisher = "Nature Publishing Group"
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = " [%B %Y]"
    remove_attributes = ["height", "width"]
    masthead_url = (
        "https://static.scientificamerican.com/sciam/assets/Image/newsletter/salogo.png"
    )
    extra_css = """
        [class^="article_dek-"] { font-style:italic; color:#202020; }
        [class^="article_authors-"] { font-size:small; color:#202020; }
        [class^="article__image-"] { font-size:small; text-align:center; }
        [class^="lead_image-"] { font-size:small; text-align:center; }
        [class^="bio-"] { font-size:small; color:#404040; }
        em { color:#202020; }
    """

    needs_subscription = "optional"

    # The site uses hashed CSS class names (e.g. article_hed-xYz12), so match
    # on the stable prefixes rather than exact class names.
    keep_only_tags = [
        prefixed_classes(
            "article_hed- article_dek- article_authors- lead_image- article__content- bio-"
        ),
    ]
    remove_tags = [dict(name=["button", "svg", "iframe", "source"])]

    def preprocess_html(self, soup):
        # Figure captions contain <p> tags; demote them to <span> so a
        # caption is not split into separate block-level paragraphs.
        for fig in soup.findAll("figcaption"):
            for p in fig.findAll("p"):
                p.name = "span"
        return soup

    def get_browser(self, *args):
        # Log in only when credentials are supplied; a subscription is optional.
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open("https://www.scientificamerican.com/account/login/")
            br.select_form(predicate=lambda f: f.attrs.get("id") == "login")
            br["emailAddress"] = self.username
            br["password"] = self.password
            br.submit()
        return br

    recipe_specific_options = {
        "issue_url": {
            "short": "The issue URL",
            "long": (
                "For example, https://www.scientificamerican.com/issue/sa/2024/07-01/"
                "\nYou can also download the special editions and the physics, health and"
                " mind magazines by pasting their URL here."
            ),
        }
    }

    def parse_index(self):
        # Get the issue URL, either from the recipe options or from the
        # "latest issue" link on the front page.
        d = self.recipe_specific_options.get("issue_url")
        if d and isinstance(d, str):
            issue = d
        else:
            fp_soup = self.index_to_soup("https://www.scientificamerican.com")
            curr_issue_link = fp_soup.find(**prefixed_classes("latest_issue_links-"))
            if not curr_issue_link:
                self.abort_recipe_processing("Unable to find issue link")
            issue = "https://www.scientificamerican.com" + curr_issue_link.a["href"]

        soup = self.index_to_soup(issue)
        # The issue metadata is embedded as a JSON.parse(`...`) call inside the
        # __DATA__ script tag; extract the template literal, unescape the
        # doubled backslashes and decode just that leading JSON value.
        script = soup.find("script", id="__DATA__")
        if not script:
            self.abort_recipe_processing("Unable to find script")
        JSON = script.contents[0].split("JSON.parse(`")[1].replace("\\\\", "\\")
        data = json.JSONDecoder().raw_decode(JSON)[0]
        issue_info = data.get("initialData", {}).get("issueData", {})
        if not issue_info:
            self.abort_recipe_processing("Unable to find issue info")

        # Cover and issue date
        self.cover_url = issue_info["image_url"] + "?w=800"
        edition_date = datetime.strptime(issue_info["issue_date"], "%Y-%m-%d")
        self.timefmt = f" [{edition_date:%B %Y}]"

        # Group the article previews into feeds: everything in "featured" goes
        # under Features; department articles are grouped by their category.
        feeds = {}
        for section in ("featured", "departments"):
            for article in issue_info.get("article_previews", {}).get(section, []):
                self.log("\t", article["title"])
                if section == "featured":
                    feed_name = "Features"
                else:
                    feed_name = article["category"]
                feeds.setdefault(feed_name, []).append(
                    {
                        "title": article["title"],
                        "url": urljoin(
                            "https://www.scientificamerican.com/article/",
                            article["slug"],
                        ),
                        "description": article["summary"],
                    }
                )
        return list(feeds.items())