From 1ad963aebde2980045fd8280ce49545cfb2210a6 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 30 Jul 2023 09:58:46 +0800 Subject: [PATCH] New recipe: The New Republic Magazine --- recipes/icons/newrepublicmag.png | Bin 0 -> 370 bytes recipes/newrepublicmag.recipe | 314 +++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 recipes/icons/newrepublicmag.png create mode 100644 recipes/newrepublicmag.recipe diff --git a/recipes/icons/newrepublicmag.png b/recipes/icons/newrepublicmag.png new file mode 100644 index 0000000000000000000000000000000000000000..dc756592a1e0210ae56aa528faf1af4afab27c47 GIT binary patch literal 370 zcmV-&0ge8NP)p^Xs}3RPzFjN87xv3 ziLxlgsly#feB{|5VDx(N@9!AfE%4g5*D1DbFII8Dw*A?KMX7wLNv;IL@%VUX|1N&f zM*Zf)&4@R+db(W4hu-5%W~51~64G@AzJr$>Xp$~{yc=tSs1$+WN9iP=&<@}}OQpQA zI#VDjMKGamNxSLv0Rz&!#T!o+5S1dB7Dgp4(wx!UI5p8>_z{&Nm=>6mG*NH$@64`s z@yoOnCZz{bHziGxb|~GJ@nc#F!*(^~9!X13c`t1lzAZ>|_+Iuhp**HFU_jC!pF9m` ziNs0eQ1WS`)}r<5yavpAA{lEpyz{uO b_index: + return 1 + if a["section"] == b["section"]: + return -1 if a["date"] < b["date"] else 1 + return -1 if a["section"] < b["section"] else 1 + + +class NewRepublicMagazine(BasicNewsRecipe): + title = "The New Republic Magazine" + language = "en" + __author__ = "ping" + description = ( + "Founded in 1914, The New Republic is a media organization dedicated to addressing " + "today’s most critical issues. https://newrepublic.com/magazine" + ) + publication_type = "magazine" + use_embedded_content = False + masthead_url = "https://images.newrepublic.com/f5acdc0030e3212e601040dd24d5c2c0c684b15f.png?w=512&q=65&dpi=1&fit=crop&crop=faces&h=256" + remove_attributes = ["height", "width"] + ignore_duplicate_articles = {"title", "url"} + remove_empty_feeds = True + compress_news_images_auto_size = 6 + requires_version = (5, 0, 0) + + BASE_URL = "https://newrepublic.com" + + extra_css = """ + h1.headline { margin-bottom: 0.4rem; } + h2.subheadline { font-style: italic; margin-bottom: 1rem; font-weight: normal; } + .article-meta { margin-bottom: 1rem; } + .article-meta span { display: inline-block; font-weight: bold; margin-right: 0.5rem; } + .article-meta span:last-child { font-weight: normal; } + div.pullquote { font-size: 1.25rem; margin-left: 0; text-align: center; } + .lede-media img, .article-embed img, img { + display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto; + box-sizing: border-box; + } + .lede-media .caption, .article-embed .caption { font-size: 0.8rem; } + div.author-bios { margin-top: 2rem; font-style: italic; border-top: solid 1px dimgray; } + """ + + def _article_endpoint(self, nid): + """ + Graphql endpoint to fetch full article + :param nid: + :return: + """ + query = """ +query ($id: ID, $nid: ID) { + Article(id: $id, nid: $nid) { + ...ArticlePageFields + } +} +fragment ArticlePageFields on Article { + id + nid + slug + title + cleanTitle + badge + frontPage { + id + slug + title + } + LinkedSeriesId + authors { + id + name + slug + blurb + meta { + twitter + } + } + body + publishedAt + displayAt + publicPublishedDate + status + ledeImage { + id + src + format + width + height + alt + } + ledeAltImage { + id + src + format + width + height + alt + } + url + urlFull + meta { + wordCount + template + navigationTheme + bigLede + hideLede + cropModeFronts + ledeOverrideSource + disableAds + } + ledeImageCredit + ledeImageCreditBottom + ledeImageRealCaption + bylines + deck + type + galleries { + id + galleryData { + captionText + creditText + image { + id + src + width + height + } + } + } + tags { + id + slug + label + } +}""" + params = {"query": query, "variables": json.dumps({"nid": str(nid)})} + return f"https://newrepublic.com/graphql?{urlencode(params)}" + + def _resize_image(self, image_url, width, height): + """ + Rewrite the image url to fetch a device appropriate sized one instead + of the full-res one + + :param image_url: + :param width: + :param height: + :return: + """ + crop_params = { + "auto": "compress", + "ar": f"{width}:{height}", + "fm": "jpg", + "fit": "crop", + "crop": "faces", + "ixlib": "react-9.0.2", + "dpr": 1, + "q": 65, + "w": self.scale_news_images[0] if self.scale_news_images else 800, + } + url_tuple = urlsplit(image_url) + return f"{url_tuple.scheme}://{url_tuple.netloc}{url_tuple.path}?{urlencode(crop_params)}" + + def populate_article_metadata(self, article, soup, first): + # pick up the og link from preprocess_raw_html() and set it as url instead of the api endpoint + og_link = soup.select("[data-og-link]") + if og_link: + article.url = og_link[0]["data-og-link"] + + def preprocess_raw_html(self, raw_html, url): + # formulate the api response into html + article = json.loads(raw_html)["data"]["Article"] + # Example: 2022-08-12T10:00:00.000Z + date_published_loc = parse_date(article["publishedAt"]) + # authors + author_bios_html = "" + post_authors = [] + try: + post_authors = [a["name"] for a in article.get("authors", [])] + if post_authors: + author_bios_html = "".join( + [a.get("blurb", "") for a in article.get("authors", [])] + ) + author_bios_html = f'
{author_bios_html}
' + except (KeyError, TypeError): + pass + + # lede image + lede_image_html = "" + if article.get("ledeImage"): + img = article["ledeImage"] + lede_img_url = self._resize_image( + urljoin(self.BASE_URL, img["src"]), img["width"], img["height"] + ) + lede_image_caption = "" + if article.get("ledeImageRealCaption"): + lede_image_caption = ( + f'{article["ledeImageRealCaption"]}>/span>' + ) + lede_image_html = f"""

+ {lede_image_caption} +

""" + + body_soup = BeautifulSoup(article["body"], features="html.parser") + for img in body_soup.find_all("img", attrs={"data-serialized": True}): + try: + img_info = json.loads(img["data-serialized"]) + img_src = self._resize_image( + urljoin(self.BASE_URL, img_info["src"]), + img_info["width"], + img_info["height"], + ) + img["src"] = img_src + del img["data-serialized"] + except: # noqa + pass + + return f""" + {article["cleanTitle"]} + +
+

{article["cleanTitle"]}

+ {('

' + article["deck"] + "

") if article.get("deck") else ""} + + {lede_image_html} + {str(body_soup)} + {author_bios_html} +
+ """ + + def parse_index(self): + br = self.get_browser() + params = "" + if _issue_url: + month = urlparse(_issue_url).path.split("/")[-1] + params = f'?{urlencode({"magazineTag": month})}' + res = br.open_novisit(f"https://newrepublic.com/api/content/magazine{params}") + magazine = json.loads(res.read().decode("utf-8"))["data"] + self.log.debug(f'Found issue: {magazine["metaData"]["issueTag"]["text"]}') + self.timefmt = f': {magazine["metaData"]["issueTag"]["text"]}' + self.cover_url = urljoin(self.BASE_URL, magazine["metaData"]["image"]["src"]) + + feed_articles = [] + for k, articles in magazine.items(): + if not (k.startswith("magazine") and articles): + continue + try: + for article in articles: + self.log.debug(f'Found article: {article["title"]}') + feed_articles.append( + { + "url": self._article_endpoint(article["nid"]), + "title": article["title"].replace("\n", " "), + "description": article.get("deck", ""), + "date": article["publishedAt"], + "section": k[len("magazine") :], + } + ) + except TypeError: + # not iterable + pass + + sort_sections = [ + "Cover", + "Editorsnote", + "Features", + "StateOfTheNation", + "ResPublica", + "Columns", + "Upfront", + "Backstory", + "SignsAndWonders", + "Usandtheworld", + "Booksandthearts", + "Poetry", + "Exposure", + ] + sort_category_key = cmp_to_key(lambda a, b: sort_section(a, b, sort_sections)) + return [ + ( + magazine["metaData"]["issueTag"]["text"], + sorted(feed_articles, key=sort_category_key), + ) + ]