'''
newrepublic.com
'''
import json
from functools import cmp_to_key
from urllib.parse import urlencode, urljoin, urlparse, urlsplit

from calibre import iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe

_issue_url = ''  # example: https://newrepublic.com/magazine/may-2023


def sort_section(a, b, sections_sort):
    # Order articles by their section's position in sections_sort (unknown
    # sections sort last), then by date within the same section
    try:
        a_index = sections_sort.index(a['section'])
    except ValueError:
        a_index = 999
    try:
        b_index = sections_sort.index(b['section'])
    except ValueError:
        b_index = 999

    if a_index < b_index:
        return -1
    if a_index > b_index:
        return 1
    if a['section'] == b['section']:
        return -1 if a['date'] < b['date'] else 1
    return -1 if a['section'] < b['section'] else 1

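# A minimal usage sketch (hypothetical data, not part of the recipe's control flow):
#   key = cmp_to_key(lambda a, b: sort_section(a, b, ['Features', 'Poetry']))
#   sorted([{'section': 'Poetry', 'date': 1}, {'section': 'Features', 'date': 2}], key=key)
# puts the Features article ahead of the Poetry one.
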
class NewRepublicMagazine(BasicNewsRecipe):
    title = 'The New Republic Magazine'
    language = 'en'
    __author__ = 'ping'
    description = (
        'Founded in 1914, The New Republic is a media organization dedicated to addressing '
        'today’s most critical issues. https://newrepublic.com/magazine'
    )
    publication_type = 'magazine'
    use_embedded_content = False
    masthead_url = 'https://images.newrepublic.com/f5acdc0030e3212e601040dd24d5c2c0c684b15f.png?w=512&q=65&dpi=1&fit=crop&crop=faces&h=256'
    remove_attributes = ['height', 'width']
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    compress_news_images_auto_size = 6
    requires_version = (5, 0, 0)

    BASE_URL = 'https://newrepublic.com'

    extra_css = '''
    h1.headline { margin-bottom: 0.4rem; }
    h2.subheadline { font-style: italic; margin-bottom: 1rem; font-weight: normal; }
    .article-meta { margin-bottom: 1rem; }
    .article-meta span { display: inline-block; font-weight: bold; margin-right: 0.5rem; }
    .article-meta span:last-child { font-weight: normal; }
    div.pullquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    .lede-media img, .article-embed img, img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    .lede-media .caption, .article-embed .caption { font-size: 0.8rem; }
    div.author-bios { margin-top: 2rem; font-style: italic; border-top: solid 1px dimgray; }
    '''

    def _article_endpoint(self, nid):
        '''
        Build the GraphQL endpoint URL used to fetch the full article.

        :param nid: numeric article ID
        :return: GraphQL GET URL with the query and variables encoded
        '''
        query = '''
        query ($id: ID, $nid: ID) {
          Article(id: $id, nid: $nid) {
            ...ArticlePageFields
          }
        }
        fragment ArticlePageFields on Article {
          id
          nid
          slug
          title
          cleanTitle
          badge
          frontPage {
            id
            slug
            title
          }
          LinkedSeriesId
          authors {
            id
            name
            slug
            blurb
            meta {
              twitter
            }
          }
          body
          publishedAt
          displayAt
          publicPublishedDate
          status
          ledeImage {
            id
            src
            format
            width
            height
            alt
          }
          ledeAltImage {
            id
            src
            format
            width
            height
            alt
          }
          url
          urlFull
          meta {
            wordCount
            template
            navigationTheme
            bigLede
            hideLede
            cropModeFronts
            ledeOverrideSource
            disableAds
          }
          ledeImageCredit
          ledeImageCreditBottom
          ledeImageRealCaption
          bylines
          deck
          type
          galleries {
            id
            galleryData {
              captionText
              creditText
              image {
                id
                src
                width
                height
              }
            }
          }
          tags {
            id
            slug
            label
          }
        }'''
        params = {'query': query, 'variables': json.dumps({'nid': str(nid)})}
        return f'https://newrepublic.com/graphql?{urlencode(params)}'

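    # Illustrative only: for a made-up nid '123456' the generated URL has the shape
    # https://newrepublic.com/graphql?query=...&variables=%7B%22nid%22%3A+%22123456%22%7D
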
    def _resize_image(self, image_url, width, height):
        '''
        Rewrite the image URL to fetch an appropriately sized image
        instead of the full-resolution one.

        :param image_url: source image URL
        :param width: source image width
        :param height: source image height
        :return: rewritten URL with crop/resize query parameters
        '''
        crop_params = {
            'auto': 'compress',
            'ar': f'{width}:{height}',
            'fm': 'jpg',
            'fit': 'crop',
            'crop': 'faces',
            'ixlib': 'react-9.0.2',
            'dpr': 1,
            'q': 65,
            'w': self.scale_news_images[0] if self.scale_news_images else 800,
        }
        url_tuple = urlsplit(image_url)
        return f'{url_tuple.scheme}://{url_tuple.netloc}{url_tuple.path}?{urlencode(crop_params)}'

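    # Illustrative only: an image at /images/abc123.jpg with a 3000x2000 source
    # (path and dimensions made up) would be rewritten to something like
    # .../images/abc123.jpg?auto=compress&ar=3000%3A2000&fm=jpg&fit=crop&crop=faces&ixlib=react-9.0.2&dpr=1&q=65&w=800
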
    def populate_article_metadata(self, article, soup, first):
        # Pick up the og link saved by preprocess_raw_html() and use it as the
        # article url instead of the API endpoint
        og_link = soup.select('[data-og-link]')
        if og_link:
            article.url = og_link[0]['data-og-link']

    def preprocess_raw_html(self, raw_html, url):
        # Convert the API response into HTML
        article = json.loads(raw_html)['data']['Article']
        # Example date: 2022-08-12T10:00:00.000Z
        date_published_loc = parse_date(article['publishedAt'])
        # authors
        author_bios_html = ''
        post_authors = []
        try:
            post_authors = [a['name'] for a in article.get('authors', [])]
            if post_authors:
                author_bios_html = ''.join(
                    [a.get('blurb', '') for a in article.get('authors', [])]
                )
                author_bios_html = f'<div class="author-bios">{author_bios_html}</div>'
        except (KeyError, TypeError):
            pass

        # lede image
        lede_image_html = ''
        if article.get('ledeImage'):
            img = article['ledeImage']
            lede_img_url = self._resize_image(
                urljoin(self.BASE_URL, img['src']), img['width'], img['height']
            )
            lede_image_caption = ''
            if article.get('ledeImageRealCaption'):
                lede_image_caption = (
                    f'<span class="caption">{article["ledeImageRealCaption"]}</span>'
                )
            lede_image_html = f'''<p class="lede-media">
            <img src="{lede_img_url}">{lede_image_caption}
            </p>'''

        body_soup = BeautifulSoup(article['body'], features='html.parser')
        # Rewrite serialised body images to device-appropriate sizes
        for img in body_soup.find_all('img', attrs={'data-serialized': True}):
            try:
                img_info = json.loads(img['data-serialized'])
                img_src = self._resize_image(
                    urljoin(self.BASE_URL, img_info['src']),
                    img_info['width'],
                    img_info['height'],
                )
                img['src'] = img_src
                del img['data-serialized']
            except Exception:
                pass

        return f'''<html>
        <head><title>{article["cleanTitle"]}</title></head>
        <body>
            <article data-og-link="{article["urlFull"]}">
            <h1 class="headline">{article["cleanTitle"]}</h1>
            {('<h2 class="subheadline">' + article["deck"] + "</h2>") if article.get("deck") else ""}
            <div class="article-meta">
                {f'<span class="author">{", ".join(post_authors)}</span>' if post_authors else ""}
                <span class="published-dt">
                    {date_published_loc:{"%b %d, %Y" if iswindows else "%b %-d, %Y"}}
                </span>
            </div>
            {lede_image_html}
            {body_soup!s}
            {author_bios_html}
            </article>
        </body></html>'''

    def parse_index(self):
        br = self.get_browser()
        params = ''
        if _issue_url:
            month = urlparse(_issue_url).path.split('/')[-1]
            params = f'?{urlencode({"magazineTag": month})}'
        res = br.open_novisit(f'https://newrepublic.com/api/content/magazine{params}')
        magazine = json.loads(res.read().decode('utf-8'))['data']
        self.log.debug(f'Found issue: {magazine["metaData"]["issueTag"]["text"]}')
        self.timefmt = f': {magazine["metaData"]["issueTag"]["text"]}'
        self.cover_url = urljoin(self.BASE_URL, magazine['metaData']['image']['src'])

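        # Assumed response shape, inferred from the fields used in this method:
        #   {'data': {'metaData': {'issueTag': {'text': ...}, 'image': {'src': ...}},
        #             'magazineCover': [...], 'magazineFeatures': [...], ...}}
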
        feed_articles = []
        # Section keys look like 'magazineCover', 'magazineFeatures', etc.
        for k, articles in magazine.items():
            if not (k.startswith('magazine') and articles):
                continue
            try:
                for article in articles:
                    self.log.debug(f'Found article: {article["title"]}')
                    feed_articles.append(
                        {
                            'url': self._article_endpoint(article['nid']),
                            'title': article['title'].replace('\n', ' '),
                            'description': article.get('deck', ''),
                            'date': article['publishedAt'],
                            'section': k[len('magazine'):],
                        }
                    )
            except TypeError:
                # not iterable
                pass

        sort_sections = [
            'Cover',
            'Editorsnote',
            'Features',
            'StateOfTheNation',
            'ResPublica',
            'Columns',
            'Upfront',
            'Backstory',
            'SignsAndWonders',
            'Usandtheworld',
            'Booksandthearts',
            'Poetry',
            'Exposure',
        ]
        sort_category_key = cmp_to_key(lambda a, b: sort_section(a, b, sort_sections))
        return [
            (
                magazine['metaData']['issueTag']['text'],
                sorted(feed_articles, key=sort_category_key),
            )
        ]