Merge branch 'recipe-newrepublicmag' of https://github.com/ping/calibre

Kovid Goyal 2023-07-30 11:57:17 +05:30
commit 472c1f0a83
2 changed files with 314 additions and 0 deletions

Binary file not shown (new image, 370 B).
@@ -0,0 +1,314 @@
"""
newrepublic.com
"""
import json
from functools import cmp_to_key
from urllib.parse import urljoin, urlencode, urlsplit, urlparse
from calibre import iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe
_issue_url = "" # example: https://newrepublic.com/magazine/may-2023


def sort_section(a, b, sections_sort):
    try:
        a_index = sections_sort.index(a["section"])
    except ValueError:
        a_index = 999
    try:
        b_index = sections_sort.index(b["section"])
    except ValueError:
        b_index = 999

    if a_index < b_index:
        return -1
    if a_index > b_index:
        return 1
    if a["section"] == b["section"]:
        return -1 if a["date"] < b["date"] else 1
    return -1 if a["section"] < b["section"] else 1
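
# A minimal illustration of the comparator above, on hypothetical article dicts:
# with sections_sort = ["Cover", "Features"],
#   sort_section({"section": "Features", "date": "2023-05-01"},
#                {"section": "Cover", "date": "2023-05-02"}, sections_sort)
# returns 1, so "Cover" articles sort ahead of "Features". Sections not in the
# list fall back to index 999; within the same section, ties order by date,
# otherwise alphabetically by section name.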


class NewRepublicMagazine(BasicNewsRecipe):
    title = "The New Republic Magazine"
    language = "en"
    __author__ = "ping"
    description = (
        "Founded in 1914, The New Republic is a media organization dedicated to addressing "
        "today's most critical issues. https://newrepublic.com/magazine"
    )
    publication_type = "magazine"
    use_embedded_content = False
    masthead_url = "https://images.newrepublic.com/f5acdc0030e3212e601040dd24d5c2c0c684b15f.png?w=512&q=65&dpi=1&fit=crop&crop=faces&h=256"
    remove_attributes = ["height", "width"]
    ignore_duplicate_articles = {"title", "url"}
    remove_empty_feeds = True
    compress_news_images_auto_size = 6
    requires_version = (5, 0, 0)

    BASE_URL = "https://newrepublic.com"

    extra_css = """
    h1.headline { margin-bottom: 0.4rem; }
    h2.subheadline { font-style: italic; margin-bottom: 1rem; font-weight: normal; }
    .article-meta { margin-bottom: 1rem; }
    .article-meta span { display: inline-block; font-weight: bold; margin-right: 0.5rem; }
    .article-meta span:last-child { font-weight: normal; }
    div.pullquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    .lede-media img, .article-embed img, img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    .lede-media .caption, .article-embed .caption { font-size: 0.8rem; }
    div.author-bios { margin-top: 2rem; font-style: italic; border-top: solid 1px dimgray; }
    """

    def _article_endpoint(self, nid):
        """
        Build the GraphQL endpoint URL used to fetch a full article.

        :param nid: the article's numeric id
        :return: endpoint URL with query and variables encoded
        """
        query = """
        query ($id: ID, $nid: ID) {
          Article(id: $id, nid: $nid) {
            ...ArticlePageFields
          }
        }
        fragment ArticlePageFields on Article {
          id
          nid
          slug
          title
          cleanTitle
          badge
          frontPage {
            id
            slug
            title
          }
          LinkedSeriesId
          authors {
            id
            name
            slug
            blurb
            meta {
              twitter
            }
          }
          body
          publishedAt
          displayAt
          publicPublishedDate
          status
          ledeImage {
            id
            src
            format
            width
            height
            alt
          }
          ledeAltImage {
            id
            src
            format
            width
            height
            alt
          }
          url
          urlFull
          meta {
            wordCount
            template
            navigationTheme
            bigLede
            hideLede
            cropModeFronts
            ledeOverrideSource
            disableAds
          }
          ledeImageCredit
          ledeImageCreditBottom
          ledeImageRealCaption
          bylines
          deck
          type
          galleries {
            id
            galleryData {
              captionText
              creditText
              image {
                id
                src
                width
                height
              }
            }
          }
          tags {
            id
            slug
            label
          }
        }"""
        params = {"query": query, "variables": json.dumps({"nid": str(nid)})}
        return f"https://newrepublic.com/graphql?{urlencode(params)}"

    def _resize_image(self, image_url, width, height):
        """
        Rewrite an image URL to fetch a device-appropriately sized image
        instead of the full-resolution one.

        :param image_url: original image URL
        :param width: original image width
        :param height: original image height
        :return: rewritten URL
        """
        crop_params = {
            "auto": "compress",
            "ar": f"{width}:{height}",
            "fm": "jpg",
            "fit": "crop",
            "crop": "faces",
            "ixlib": "react-9.0.2",
            "dpr": 1,
            "q": 65,
            "w": self.scale_news_images[0] if self.scale_news_images else 800,
        }
        url_tuple = urlsplit(image_url)
        return f"{url_tuple.scheme}://{url_tuple.netloc}{url_tuple.path}?{urlencode(crop_params)}"

    def populate_article_metadata(self, article, soup, first):
        # Pick up the og link saved by preprocess_raw_html() and use it as the
        # article url instead of the GraphQL API endpoint.
        og_link = soup.select("[data-og-link]")
        if og_link:
            article.url = og_link[0]["data-og-link"]

    def preprocess_raw_html(self, raw_html, url):
        # Formulate the API response into html
        article = json.loads(raw_html)["data"]["Article"]

        # Example: 2022-08-12T10:00:00.000Z
        date_published_loc = parse_date(article["publishedAt"])

        # authors
        author_bios_html = ""
        post_authors = []
        try:
            post_authors = [a["name"] for a in article.get("authors", [])]
            if post_authors:
                author_bios_html = "".join(
                    [a.get("blurb", "") for a in article.get("authors", [])]
                )
                author_bios_html = f'<div class="author-bios">{author_bios_html}</div>'
        except (KeyError, TypeError):
            pass

        # lede image
        lede_image_html = ""
        if article.get("ledeImage"):
            img = article["ledeImage"]
            lede_img_url = self._resize_image(
                urljoin(self.BASE_URL, img["src"]), img["width"], img["height"]
            )
            lede_image_caption = ""
            if article.get("ledeImageRealCaption"):
                lede_image_caption = (
                    f'<span class="caption">{article["ledeImageRealCaption"]}</span>'
                )
            lede_image_html = f"""<p class="lede-media">
            <img src="{lede_img_url}">{lede_image_caption}
            </p>"""

        body_soup = BeautifulSoup(article["body"], features="html.parser")
        for img in body_soup.find_all("img", attrs={"data-serialized": True}):
            try:
                img_info = json.loads(img["data-serialized"])
                img_src = self._resize_image(
                    urljoin(self.BASE_URL, img_info["src"]),
                    img_info["width"],
                    img_info["height"],
                )
                img["src"] = img_src
                del img["data-serialized"]
            except:  # noqa
                pass

        return f"""<html>
        <head><title>{article["cleanTitle"]}</title></head>
        <body>
        <article data-og-link="{article["urlFull"]}">
        <h1 class="headline">{article["cleanTitle"]}</h1>
        {('<h2 class="subheadline">' + article["deck"] + "</h2>") if article.get("deck") else ""}
        <div class="article-meta">
        {f'<span class="author">{", ".join(post_authors)}</span>' if post_authors else ""}
        <span class="published-dt">
        {date_published_loc:{"%b %d, %Y" if iswindows else "%b %-d, %Y"}}
        </span>
        </div>
        {lede_image_html}
        {str(body_soup)}
        {author_bios_html}
        </article>
        </body></html>"""
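
    # Abridged sketch of the JSON this method consumes (field names taken from
    # the GraphQL query above; values are placeholders):
    #   {"data": {"Article": {
    #       "cleanTitle": "...", "deck": "...",
    #       "publishedAt": "2022-08-12T10:00:00.000Z",
    #       "authors": [{"name": "...", "blurb": "..."}],
    #       "ledeImage": {"src": "/...", "width": 1600, "height": 900},
    #       "ledeImageRealCaption": "...",
    #       "body": "<p>...</p>", "urlFull": "https://newrepublic.com/..."}}}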

    def parse_index(self):
        br = self.get_browser()
        params = ""
        if _issue_url:
            month = urlparse(_issue_url).path.split("/")[-1]
            params = f'?{urlencode({"magazineTag": month})}'
        res = br.open_novisit(f"https://newrepublic.com/api/content/magazine{params}")
        magazine = json.loads(res.read().decode("utf-8"))["data"]
        self.log.debug(f'Found issue: {magazine["metaData"]["issueTag"]["text"]}')
        self.timefmt = f': {magazine["metaData"]["issueTag"]["text"]}'
        self.cover_url = urljoin(self.BASE_URL, magazine["metaData"]["image"]["src"])

        feed_articles = []
        for k, articles in magazine.items():
            if not (k.startswith("magazine") and articles):
                continue
            try:
                for article in articles:
                    self.log.debug(f'Found article: {article["title"]}')
                    feed_articles.append(
                        {
                            "url": self._article_endpoint(article["nid"]),
                            "title": article["title"].replace("\n", " "),
                            "description": article.get("deck", ""),
                            "date": article["publishedAt"],
                            "section": k[len("magazine") :],
                        }
                    )
            except TypeError:
                # not iterable
                pass

        sort_sections = [
            "Cover",
            "Editorsnote",
            "Features",
            "StateOfTheNation",
            "ResPublica",
            "Columns",
            "Upfront",
            "Backstory",
            "SignsAndWonders",
            "Usandtheworld",
            "Booksandthearts",
            "Poetry",
            "Exposure",
        ]
        sort_category_key = cmp_to_key(lambda a, b: sort_section(a, b, sort_sections))
        return [
            (
                magazine["metaData"]["issueTag"]["text"],
                sorted(feed_articles, key=sort_category_key),
            )
        ]
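
# To try the recipe locally, one common calibre workflow is to save this file
# and run it through ebook-convert (the filename here is illustrative):
#   ebook-convert newrepublicmag.recipe output.epub --test
# --test limits the download to a couple of articles, which keeps iteration fast.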