mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Merge branch 'recipe-newrepublicmag' of https://github.com/ping/calibre
This commit is contained in:
commit
472c1f0a83
BIN
recipes/icons/newrepublicmag.png
Normal file
BIN
recipes/icons/newrepublicmag.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 370 B |
314
recipes/newrepublicmag.recipe
Normal file
314
recipes/newrepublicmag.recipe
Normal file
@ -0,0 +1,314 @@
|
|||||||
|
"""
|
||||||
|
newrepublic.com
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
from functools import cmp_to_key
|
||||||
|
from urllib.parse import urljoin, urlencode, urlsplit, urlparse
|
||||||
|
|
||||||
|
from calibre import iswindows
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
|
from calibre.utils.date import parse_date
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
|
# Optional URL of a specific issue to download. When left empty,
# parse_index() requests the magazine endpoint without a magazineTag
# parameter; otherwise the last path component of this URL is sent as the
# magazineTag.
_issue_url = "" # example: https://newrepublic.com/magazine/may-2023
|
||||||
|
|
||||||
|
|
||||||
|
def sort_section(a, b, sections_sort):
    """Three-way comparator ordering articles by section, then by date.

    Articles are ordered by the position of their ``"section"`` value in
    ``sections_sort``. Sections that are not listed sort after every listed
    one and are ordered among themselves by section name. Articles in the
    same section are ordered by their ``"date"`` value.

    :param a: article dict with ``"section"`` and ``"date"`` keys
    :param b: article dict with ``"section"`` and ``"date"`` keys
    :param sections_sort: list of section names in the preferred order
    :return: -1 if ``a`` sorts before ``b``, 1 if after, 0 if they tie
    """
    # Rank for sections missing from the list: always past the last real index.
    unknown_rank = max(len(sections_sort), 999)

    def _sort_key(article):
        section = article["section"]
        try:
            rank = sections_sort.index(section)
        except ValueError:
            rank = unknown_rank
        # Tuple comparison gives: rank first, then section name (only
        # relevant for unlisted sections), then date within a section.
        return rank, section, article["date"]

    key_a = _sort_key(a)
    key_b = _sort_key(b)
    # Return 0 on a tie so that cmp(a, b) == -cmp(b, a) always holds.
    return (key_a > key_b) - (key_a < key_b)
|
||||||
|
|
||||||
|
|
||||||
|
class NewRepublicMagazine(BasicNewsRecipe):
    """Recipe for an issue of The New Republic magazine.

    parse_index() reads the issue contents from the site's JSON magazine
    endpoint and points every article at the GraphQL endpoint built by
    _article_endpoint(). preprocess_raw_html() then turns each GraphQL JSON
    response into the HTML document that gets converted.
    """

    title = "The New Republic Magazine"
    language = "en"
    __author__ = "ping"
    description = (
        "Founded in 1914, The New Republic is a media organization dedicated to addressing "
        "today’s most critical issues. https://newrepublic.com/magazine"
    )
    publication_type = "magazine"
    use_embedded_content = False
    masthead_url = "https://images.newrepublic.com/f5acdc0030e3212e601040dd24d5c2c0c684b15f.png?w=512&q=65&dpi=1&fit=crop&crop=faces&h=256"
    remove_attributes = ["height", "width"]
    ignore_duplicate_articles = {"title", "url"}
    remove_empty_feeds = True
    compress_news_images_auto_size = 6
    requires_version = (5, 0, 0)

    # Base used to resolve relative image and cover paths returned by the API.
    BASE_URL = "https://newrepublic.com"

    extra_css = """
h1.headline { margin-bottom: 0.4rem; }
h2.subheadline { font-style: italic; margin-bottom: 1rem; font-weight: normal; }
.article-meta { margin-bottom: 1rem; }
.article-meta span { display: inline-block; font-weight: bold; margin-right: 0.5rem; }
.article-meta span:last-child { font-weight: normal; }
div.pullquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
.lede-media img, .article-embed img, img {
display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
box-sizing: border-box;
}
.lede-media .caption, .article-embed .caption { font-size: 0.8rem; }
div.author-bios { margin-top: 2rem; font-style: italic; border-top: solid 1px dimgray; }
"""

    def _article_endpoint(self, nid):
        """
        Graphql endpoint to fetch full article

        :param nid: article nid; sent as a string in the query variables
        :return: GraphQL URL with the url-encoded query and variables
        """
        query = """
query ($id: ID, $nid: ID) {
Article(id: $id, nid: $nid) {
...ArticlePageFields
}
}
fragment ArticlePageFields on Article {
id
nid
slug
title
cleanTitle
badge
frontPage {
id
slug
title
}
LinkedSeriesId
authors {
id
name
slug
blurb
meta {
twitter
}
}
body
publishedAt
displayAt
publicPublishedDate
status
ledeImage {
id
src
format
width
height
alt
}
ledeAltImage {
id
src
format
width
height
alt
}
url
urlFull
meta {
wordCount
template
navigationTheme
bigLede
hideLede
cropModeFronts
ledeOverrideSource
disableAds
}
ledeImageCredit
ledeImageCreditBottom
ledeImageRealCaption
bylines
deck
type
galleries {
id
galleryData {
captionText
creditText
image {
id
src
width
height
}
}
}
tags {
id
slug
label
}
}"""
        params = {"query": query, "variables": json.dumps({"nid": str(nid)})}
        return f"https://newrepublic.com/graphql?{urlencode(params)}"

    def _resize_image(self, image_url, width, height):
        """
        Rewrite the image url to fetch a device appropriate sized one instead
        of the full-res one

        Any query string already on ``image_url`` is discarded and replaced
        with the crop parameters built here.

        :param image_url: absolute url of the image
        :param width: source image width, used for the aspect ratio
        :param height: source image height, used for the aspect ratio
        :return: image url carrying the crop/compression query parameters
        """
        # Requested width follows scale_news_images when set, else 800.
        target_width = self.scale_news_images[0] if self.scale_news_images else 800
        crop_params = {
            "auto": "compress",
            "ar": f"{width}:{height}",
            "fm": "jpg",
            "fit": "crop",
            "crop": "faces",
            "ixlib": "react-9.0.2",
            "dpr": 1,
            "q": 65,
            "w": target_width,
        }
        url_tuple = urlsplit(image_url)
        return f"{url_tuple.scheme}://{url_tuple.netloc}{url_tuple.path}?{urlencode(crop_params)}"

    def populate_article_metadata(self, article, soup, first):
        """Replace the article's API url with the public page url.

        :param article: article object whose ``url`` is updated
        :param soup: parsed article html produced from preprocess_raw_html()
        :param first: unused here
        """
        # pick up the og link from preprocess_raw_html() and set it as url instead of the api endpoint
        og_link = soup.select("[data-og-link]")
        if og_link:
            article.url = og_link[0]["data-og-link"]

    def preprocess_raw_html(self, raw_html, url):
        """Formulate the GraphQL API response into an html document.

        :param raw_html: JSON text returned by the article GraphQL endpoint
        :param url: url the content was fetched from (unused here)
        :return: html string containing headline, meta, lede image, body and
            author bios
        """
        article = json.loads(raw_html)["data"]["Article"]
        # Example: 2022-08-12T10:00:00.000Z
        date_published_loc = parse_date(article["publishedAt"])

        # authors
        author_bios_html = ""
        post_authors = []
        try:
            authors = article.get("authors") or []
            post_authors = [a["name"] for a in authors]
            if post_authors:
                # A blurb can be present but null; treat it as empty so one
                # author without a bio does not discard everyone else's.
                author_bios_html = "".join(a.get("blurb") or "" for a in authors)
                author_bios_html = f'<div class="author-bios">{author_bios_html}</div>'
        except (KeyError, TypeError):
            pass

        # lede image
        lede_image_html = ""
        if article.get("ledeImage"):
            img = article["ledeImage"]
            lede_img_url = self._resize_image(
                urljoin(self.BASE_URL, img["src"]), img["width"], img["height"]
            )
            lede_image_caption = ""
            if article.get("ledeImageRealCaption"):
                lede_image_caption = (
                    f'<span class="caption">{article["ledeImageRealCaption"]}</span>'
                )
            lede_image_html = f"""<p class="lede-media">
<img src="{lede_img_url}">{lede_image_caption}
</p>"""

        # Point embedded body images at resized versions, using the image
        # details serialized in their data-serialized attribute.
        body_soup = BeautifulSoup(article["body"], features="html.parser")
        for img in body_soup.find_all("img", attrs={"data-serialized": True}):
            try:
                img_info = json.loads(img["data-serialized"])
                img_src = self._resize_image(
                    urljoin(self.BASE_URL, img_info["src"]),
                    img_info["width"],
                    img_info["height"],
                )
                img["src"] = img_src
                del img["data-serialized"]
            except (KeyError, TypeError, ValueError) as err:
                # Best effort only: keep the image as it is when its
                # serialized info is missing or malformed.
                self.log.debug(f"Unable to rewrite embedded image: {err}")

        return f"""<html>
<head><title>{article["cleanTitle"]}</title></head>
<body>
<article data-og-link="{article["urlFull"]}">
<h1 class="headline">{article["cleanTitle"]}</h1>
{('<h2 class="subheadline">' + article["deck"] + "</h2>") if article.get("deck") else ""}
<div class="article-meta">
{f'<span class="author">{", ".join(post_authors)}</span>' if post_authors else ""}
<span class="published-dt">
{date_published_loc:{"%b %d, %Y" if iswindows else "%b %-d, %Y"}}
</span>
</div>
{lede_image_html}
{str(body_soup)}
{author_bios_html}
</article>
</body></html>"""

    def parse_index(self):
        """Build the article index for the issue.

        Also sets ``timefmt`` and ``cover_url`` from the issue metadata.

        :return: a single-feed list ``[(issue title, [article dicts])]`` with
            the articles ordered by section and date
        """
        br = self.get_browser()
        params = ""
        if _issue_url:
            # Strip a trailing slash so the last path component is the issue
            # tag rather than an empty string.
            month = urlparse(_issue_url).path.rstrip("/").split("/")[-1]
            params = f'?{urlencode({"magazineTag": month})}'
        res = br.open_novisit(f"https://newrepublic.com/api/content/magazine{params}")
        magazine = json.loads(res.read().decode("utf-8"))["data"]
        issue_title = magazine["metaData"]["issueTag"]["text"]
        self.log.debug(f"Found issue: {issue_title}")
        self.timefmt = f": {issue_title}"
        self.cover_url = urljoin(self.BASE_URL, magazine["metaData"]["image"]["src"])

        feed_articles = []
        for k, articles in magazine.items():
            # Only non-empty "magazine*" entries are read as article lists;
            # the remainder of the key is used as the section name.
            if not (k.startswith("magazine") and articles):
                continue
            try:
                for article in articles:
                    self.log.debug(f'Found article: {article["title"]}')
                    feed_articles.append(
                        {
                            "url": self._article_endpoint(article["nid"]),
                            "title": article["title"].replace("\n", " "),
                            "description": article.get("deck", ""),
                            "date": article["publishedAt"],
                            "section": k[len("magazine") :],
                        }
                    )
            except TypeError:
                # not iterable
                pass

        sort_sections = [
            "Cover",
            "Editorsnote",
            "Features",
            "StateOfTheNation",
            "ResPublica",
            "Columns",
            "Upfront",
            "Backstory",
            "SignsAndWonders",
            "Usandtheworld",
            "Booksandthearts",
            "Poetry",
            "Exposure",
        ]
        sort_category_key = cmp_to_key(lambda a, b: sort_section(a, b, sort_sections))
        return [
            (
                issue_title,
                sorted(feed_articles, key=sort_category_key),
            )
        ]
|
Loading…
x
Reference in New Issue
Block a user