"""
newrepublic.com
"""
import json
from functools import cmp_to_key
from urllib.parse import urljoin, urlencode, urlsplit, urlparse
from calibre import iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe

_issue_url = ""  # example: https://newrepublic.com/magazine/may-2023
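# Leave _issue_url blank to fetch the current issue; set it to an issue page
# (like the example above) to fetch that specific back issue instead.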


def sort_section(a, b, sections_sort):
    try:
        a_index = sections_sort.index(a["section"])
    except ValueError:
        a_index = 999
    try:
        b_index = sections_sort.index(b["section"])
    except ValueError:
        b_index = 999

    if a_index < b_index:
        return -1
    if a_index > b_index:
        return 1
    if a["section"] == b["section"]:
        return -1 if a["date"] < b["date"] else 1
    return -1 if a["section"] < b["section"] else 1


class NewRepublicMagazine(BasicNewsRecipe):
    title = "The New Republic Magazine"
    language = "en"
    __author__ = "ping"
    description = (
        "Founded in 1914, The New Republic is a media organization dedicated to addressing "
        "today's most critical issues. https://newrepublic.com/magazine"
    )
    publication_type = "magazine"
    use_embedded_content = False
    masthead_url = "https://images.newrepublic.com/f5acdc0030e3212e601040dd24d5c2c0c684b15f.png?w=512&q=65&dpi=1&fit=crop&crop=faces&h=256"
    remove_attributes = ["height", "width"]
    ignore_duplicate_articles = {"title", "url"}
    remove_empty_feeds = True
    compress_news_images_auto_size = 6
    requires_version = (5, 0, 0)

    BASE_URL = "https://newrepublic.com"

    extra_css = """
    h1.headline { margin-bottom: 0.4rem; }
    h2.subheadline { font-style: italic; margin-bottom: 1rem; font-weight: normal; }
    .article-meta { margin-bottom: 1rem; }
    .article-meta span { display: inline-block; font-weight: bold; margin-right: 0.5rem; }
    .article-meta span:last-child { font-weight: normal; }
    div.pullquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    .lede-media img, .article-embed img, img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    .lede-media .caption, .article-embed .caption { font-size: 0.8rem; }
    div.author-bios { margin-top: 2rem; font-style: italic; border-top: solid 1px dimgray; }
    """

    def _article_endpoint(self, nid):
        """
        Build the GraphQL endpoint URL that returns the full article.

        :param nid: Numeric article ID
        :return: GraphQL query URL for the article
        """
        query = """
        query ($id: ID, $nid: ID) {
          Article(id: $id, nid: $nid) {
            ...ArticlePageFields
          }
        }
        fragment ArticlePageFields on Article {
          id
          nid
          slug
          title
          cleanTitle
          badge
          frontPage {
            id
            slug
            title
          }
          LinkedSeriesId
          authors {
            id
            name
            slug
            blurb
            meta {
              twitter
            }
          }
          body
          publishedAt
          displayAt
          publicPublishedDate
          status
          ledeImage {
            id
            src
            format
            width
            height
            alt
          }
          ledeAltImage {
            id
            src
            format
            width
            height
            alt
          }
          url
          urlFull
          meta {
            wordCount
            template
            navigationTheme
            bigLede
            hideLede
            cropModeFronts
            ledeOverrideSource
            disableAds
          }
          ledeImageCredit
          ledeImageCreditBottom
          ledeImageRealCaption
          bylines
          deck
          type
          galleries {
            id
            galleryData {
              captionText
              creditText
              image {
                id
                src
                width
                height
              }
            }
          }
          tags {
            id
            slug
            label
          }
        }"""
        params = {"query": query, "variables": json.dumps({"nid": str(nid)})}
        return f"https://newrepublic.com/graphql?{urlencode(params)}"

    def _resize_image(self, image_url, width, height):
        """
        Rewrite the image URL to fetch a device-appropriately sized version
        instead of the full-resolution one.

        :param image_url: Original image URL
        :param width: Original image width
        :param height: Original image height
        :return: Resized image URL
        """
        # imgix-style crop/resize query parameters
        crop_params = {
            "auto": "compress",
            "ar": f"{width}:{height}",
            "fm": "jpg",
            "fit": "crop",
            "crop": "faces",
            "ixlib": "react-9.0.2",
            "dpr": 1,
            "q": 65,
            "w": self.scale_news_images[0] if self.scale_news_images else 800,
        }
        url_tuple = urlsplit(image_url)
        return f"{url_tuple.scheme}://{url_tuple.netloc}{url_tuple.path}?{urlencode(crop_params)}"

    def populate_article_metadata(self, article, soup, first):
        # pick up the og link saved by preprocess_raw_html() and use it as the
        # article URL instead of the API endpoint
        og_link = soup.select("[data-og-link]")
        if og_link:
            article.url = og_link[0]["data-og-link"]

    def preprocess_raw_html(self, raw_html, url):
        # render the JSON API response into HTML
        article = json.loads(raw_html)["data"]["Article"]
        # Example: 2022-08-12T10:00:00.000Z
        date_published_loc = parse_date(article["publishedAt"])

        # authors
        author_bios_html = ""
        post_authors = []
        try:
            post_authors = [a["name"] for a in article.get("authors", [])]
            if post_authors:
                author_bios_html = "".join(
                    [a.get("blurb", "") for a in article.get("authors", [])]
                )
                author_bios_html = f'<div class="author-bios">{author_bios_html}</div>'
        except (KeyError, TypeError):
            pass

        # lede image
        lede_image_html = ""
        if article.get("ledeImage"):
            img = article["ledeImage"]
            lede_img_url = self._resize_image(
                urljoin(self.BASE_URL, img["src"]), img["width"], img["height"]
            )
            lede_image_caption = ""
            if article.get("ledeImageRealCaption"):
                lede_image_caption = (
                    f'<span class="caption">{article["ledeImageRealCaption"]}</span>'
                )
            lede_image_html = f"""<p class="lede-media">
            <img src="{lede_img_url}">{lede_image_caption}
            </p>"""

        # rewrite body images to device-appropriate sizes
        body_soup = BeautifulSoup(article["body"], features="html.parser")
        for img in body_soup.find_all("img", attrs={"data-serialized": True}):
            try:
                img_info = json.loads(img["data-serialized"])
                img_src = self._resize_image(
                    urljoin(self.BASE_URL, img_info["src"]),
                    img_info["width"],
                    img_info["height"],
                )
                img["src"] = img_src
                del img["data-serialized"]
            except:  # noqa
                pass

        return f"""<html>
        <head><title>{article["cleanTitle"]}</title></head>
        <body>
          <article data-og-link="{article["urlFull"]}">
            <h1 class="headline">{article["cleanTitle"]}</h1>
            {('<h2 class="subheadline">' + article["deck"] + "</h2>") if article.get("deck") else ""}
            <div class="article-meta">
              {f'<span class="author">{", ".join(post_authors)}</span>' if post_authors else ""}
              <span class="published-dt">
                {date_published_loc:{"%b %d, %Y" if iswindows else "%b %-d, %Y"}}
              </span>
            </div>
            {lede_image_html}
            {str(body_soup)}
            {author_bios_html}
          </article>
        </body></html>"""

    def parse_index(self):
        br = self.get_browser()
        params = ""
        if _issue_url:
            month = urlparse(_issue_url).path.split("/")[-1]
            params = f'?{urlencode({"magazineTag": month})}'
        res = br.open_novisit(f"https://newrepublic.com/api/content/magazine{params}")
        magazine = json.loads(res.read().decode("utf-8"))["data"]
        self.log.debug(f'Found issue: {magazine["metaData"]["issueTag"]["text"]}')
        self.timefmt = f': {magazine["metaData"]["issueTag"]["text"]}'
        self.cover_url = urljoin(self.BASE_URL, magazine["metaData"]["image"]["src"])

        feed_articles = []
        # section article lists live in keys prefixed with "magazine",
        # e.g. "magazineFeatures" holds the "Features" section
        for k, articles in magazine.items():
            if not (k.startswith("magazine") and articles):
                continue
            try:
                for article in articles:
                    self.log.debug(f'Found article: {article["title"]}')
                    feed_articles.append(
                        {
                            "url": self._article_endpoint(article["nid"]),
                            "title": article["title"].replace("\n", " "),
                            "description": article.get("deck", ""),
                            "date": article["publishedAt"],
                            "section": k[len("magazine"):],
                        }
                    )
            except TypeError:
                # not iterable
                pass

        sort_sections = [
            "Cover",
            "Editorsnote",
            "Features",
            "StateOfTheNation",
            "ResPublica",
            "Columns",
            "Upfront",
            "Backstory",
            "SignsAndWonders",
            "Usandtheworld",
            "Booksandthearts",
            "Poetry",
            "Exposure",
        ]
        sort_category_key = cmp_to_key(lambda a, b: sort_section(a, b, sort_sections))
        return [
            (
                magazine["metaData"]["issueTag"]["text"],
                sorted(feed_articles, key=sort_category_key),
            )
        ]
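

# To preview this recipe from the command line (assuming the calibre CLI tools
# are installed), something like:
#   ebook-convert newrepublicmag.recipe output.epub --test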