New literature-related recipes (Bookforum, Kirkus Reviews, Poetry Magazine)

This commit is contained in:
ping 2023-10-24 13:23:16 +08:00
parent f316dea0ad
commit f83db42a8c
No known key found for this signature in database
GPG Key ID: 6CCF56BCEDD24084
6 changed files with 343 additions and 0 deletions

View File

@ -0,0 +1,78 @@
from urllib.parse import urljoin
from calibre.web.feeds.news import BasicNewsRecipe
# Leave empty to fetch the latest print issue; set to a specific
# https://www.bookforum.com/print/... issue URL to fetch a past issue.
_issue_url = ""
class BookforumMagazine(BasicNewsRecipe):
    """
    Fetch the current (or, via the module-level ``_issue_url`` override, a
    past) print issue of Bookforum from https://www.bookforum.com/print.
    """

    title = "Bookforum"
    description = (
        "Bookforum is an American book review magazine devoted to books and "
        "the discussion of literature. https://www.bookforum.com/print"
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    compress_news_images = True
    compress_news_images_auto_size = 8

    # Keep only the article body; drop the site's custom share/related widgets.
    keep_only_tags = [dict(class_="blog-article")]
    remove_tags = [dict(name=["af-share-toggle", "af-related-articles"])]

    extra_css = """
    .blog-article__header { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .blog-article__subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    .blog-article__writer { font-size: 1rem; font-weight: bold; color: #444; }
    .blog-article__book-info { margin: 1rem 0; }
    .article-image-container img, .blog-article__publication-media img {
        display: block; max-width: 100%; height: auto;
    }
    .blog-article__caption { font-size: 0.8rem; display: block; margin-top: 0.2rem; }
    """

    def preprocess_html(self, soup):
        # Strip links that are not needed inside the article header.
        for ele in soup.select(".blog-article__header a"):
            ele.unwrap()
        return soup

    def parse_index(self):
        """Build {section: [article dicts]} from the issue's table of contents."""
        soup = self.index_to_soup(
            _issue_url if _issue_url else "https://www.bookforum.com/print"
        )
        meta_ele = soup.find("meta", property="og:title")
        if meta_ele:
            # Append the issue edition (from og:title) to the recipe title.
            self.timefmt = f' [{meta_ele["content"]}]'

        cover_ele = soup.find("img", class_="toc-issue__cover")
        if cover_ele:
            # Reuse the element already found instead of re-searching the soup.
            self.cover_url = urljoin("https://www.bookforum.com", cover_ele["src"])

        articles = {}
        for sect_ele in soup.find_all("div", class_="toc-articles__section"):
            section_name = self.tag_to_string(
                sect_ele.find("a", class_="toc__anchor-links__link")
            )
            for article_ele in sect_ele.find_all("article"):
                title_ele = article_ele.find("h1")
                sub_title_ele = article_ele.find(class_="toc-article__subtitle")
                articles.setdefault(section_name, []).append(
                    {
                        "title": self.tag_to_string(title_ele),
                        "url": article_ele.find("a", class_="toc-article__link")[
                            "href"
                        ],
                        "description": self.tag_to_string(sub_title_ele)
                        if sub_title_ele
                        else "",
                    }
                )
        return articles.items()

Binary file not shown.

After

Width:  |  Height:  |  Size: 305 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 536 B

View File

@ -0,0 +1,130 @@
from urllib.parse import urljoin
from calibre.web.feeds.news import BasicNewsRecipe
class KirkusReviews(BasicNewsRecipe):
    """
    Fetch the current semi-monthly issue of Kirkus Reviews from
    https://www.kirkusreviews.com/magazine/current/.
    """

    title = "Kirkus Reviews"
    description = "Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus. The magazine is headquartered in New York City. Released twice monthly on the 1st/15th. https://www.kirkusreviews.com/magazine/current/"
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    masthead_url = (
        "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg"
    )
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = True
    compress_news_images_auto_size = 6
    max_articles_per_feed = 99

    # Keep the review body plus the author blurb elements that precede it.
    keep_only_tags = [
        dict(
            class_=[
                "article-author",
                "article-author-img-start",
                "article-author-description-start",
                "single-review",
            ]
        )
    ]
    # Drop sidebars, social-share widgets, rating/comment UI and other chrome.
    remove_tags = [
        dict(
            class_=[
                "sidebar-content",
                "article-social-share-desktop-first",
                "article-social-share-desktop-pagination",
                "article-social-share-mobile",
                "share-review-text",
                "like-dislike-article",
                "rate-this-book-text",
                "input-group",
                "user-comments",
                "show-all-response-text",
                "button-row",
                "hide-on-mobile",
                "related-article",
                "breadcrumb-row",
                "shop-now-dropdown",
            ]
        )
    ]
    remove_tags_after = [dict(class_="single-review")]

    extra_css = """
    .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }
    .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }
    .book-review-img .image-container { text-align: center; }
    .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    def preprocess_html(self, soup):
        # Convert the book-cover <ul>/<li> markup into <div>s for cleaner
        # rendering, and move the article title above the cover image.
        h1 = soup.find(class_="article-title")
        book_cover = soup.find("ul", class_="book-review-img")
        if book_cover:
            for li in book_cover.find_all("li"):
                li.name = "div"
            book_cover.name = "div"
            if h1:
                book_cover.insert_before(h1.extract())
        return soup

    def parse_index(self):
        """Build {section: [article dicts]} from the current issue page."""
        issue_url = "https://www.kirkusreviews.com/magazine/current/"
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name="article", class_="issue-container")

        cover_img = issue.select(".issue-header .cover-image img")
        if cover_img:
            self.cover_url = cover_img[0]["src"]

        h1 = issue.find("h1")
        if h1:
            self.timefmt = f" [{self.tag_to_string(h1)}]"  # edition

        articles = {}
        # Featured books highlighted at the top of the issue page.
        for book_ele in soup.find_all(name="div", class_="issue-featured-book"):
            link = book_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(book_ele.find("h3")).upper()
            articles.setdefault(section, []).append(
                {"url": urljoin(issue_url, link["href"]), "title": link["title"]}
            )
        # "More from this issue" posts.
        for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"):
            link = post_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(post_ele.find(class_="lead-text-type")).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    "title": self.tag_to_string(link),
                }
            )
        # Starred reviews, grouped by genre section.
        for section_ele in issue.select("section.reviews-section"):
            section_articles = []
            for review in section_ele.select("ul li.starred"):
                link = review.select("h4 a")
                if not link:
                    continue
                description = review.find("p")
                section_articles.append(
                    {
                        "url": urljoin(issue_url, link[0]["href"]),
                        "title": self.tag_to_string(link[0]),
                        "description": self.tag_to_string(description)
                        if description
                        else "",
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find("h3")).upper()
            # setdefault alone handles both new and existing sections, so no
            # separate "if section not in articles" pre-check is needed.
            articles.setdefault(section, []).extend(section_articles)
        return articles.items()

View File

@ -0,0 +1,135 @@
import re
from collections import OrderedDict
from urllib.parse import urlparse
from calibre.web.feeds.news import BasicNewsRecipe
# Leave empty to fetch the latest issue; set to a specific
# poetryfoundation.org issue URL to fetch a past issue.
_issue_url = ""
# Splits a srcset attribute into its comma-separated entries.
COMMA_SEP_RE = re.compile(r"\s*,\s*")
# Splits a srcset entry into its URL and width-descriptor parts.
SPACE_SEP_RE = re.compile(r"\s+")
# Strips non-digit characters from a width descriptor such as "800w".
NON_NUMERIC_RE = re.compile(r"[^\d]+")
class Poetry(BasicNewsRecipe):
    """
    Fetch the latest (or, via the module-level ``_issue_url`` override, a
    past) issue of Poetry Magazine from poetryfoundation.org.
    """

    title = "Poetry Magazine"
    __author__ = "ping"
    description = (
        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
    )
    publication_type = "magazine"
    language = "en"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = False
    remove_attributes = ["style", "font"]

    keep_only_tags = [dict(name="article")]
    # Drop share blocks, in-page indexes, audio players and slideshow chrome.
    remove_tags = [
        dict(name="button"),
        dict(
            attrs={
                "class": [
                    "c-socialBlocks",
                    "c-index",
                    "o-stereo",
                    "u-hideAboveSmall",
                    "c-slideTrigger",
                    "js-slideshow",
                ]
            }
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
    div.c-feature-bd { margin-bottom: 2rem; }
    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
    """

    def extract_from_img_srcset(self, srcset: str, max_width: int = 0) -> str:
        """
        Pick an image URL out of a srcset attribute.

        :param srcset: the img tag's ``srcset`` value, e.g.
            ``"a.jpg 480w, b.jpg 1000w"``
        :param max_width: if non-zero, return the widest source whose width
            descriptor does not exceed this value; otherwise return the
            widest source available.
        :return: the chosen image URL (without its width descriptor)
        :raises ValueError: if a multi-source srcset entry is malformed
        """
        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
        if len(sources) == 1:
            # Single entry: may be a bare URL or "url 800w" — return only the
            # URL token so a width descriptor is never used as the src.
            return SPACE_SEP_RE.split(sources[0])[0]
        parsed_sources = []
        for src in sources:
            src_n_width = [s.strip() for s in SPACE_SEP_RE.split(src) if s.strip()]
            if len(src_n_width) != 2:
                raise ValueError(f"Not a valid srcset: {srcset}")
            parsed_sources.append(
                (
                    src_n_width[0].strip(),
                    int(NON_NUMERIC_RE.sub("", src_n_width[1].strip())),
                )
            )
        # De-duplicate, then order widest first.
        parsed_sources = list(set(parsed_sources))
        parsed_sources = sorted(parsed_sources, key=lambda x: x[1], reverse=True)
        if not max_width:
            return parsed_sources[0][0]
        for img, width in parsed_sources:
            if width <= max_width:
                return img
        # Nothing fits under max_width: fall back to the narrowest source.
        return parsed_sources[-1][0]

    def preprocess_html(self, soup):
        # Replace responsive srcset images with a single reasonably-sized src.
        for img in soup.select("div.o-mediaEnclosure img"):
            if not img.get("srcset"):
                continue
            img["src"] = self.extract_from_img_srcset(img["srcset"], max_width=1000)
        return soup

    def parse_index(self):
        """Build {section: [article dicts]} from the issue's tabbed TOC."""
        if _issue_url:
            soup = self.index_to_soup(_issue_url)
        else:
            # Landing page links to the current issue via the cover image.
            soup = self.index_to_soup("https://www.poetryfoundation.org/poetrymagazine")
            current_issue = soup.select("div.c-cover-media a")
            if not current_issue:
                self.abort_recipe_processing("Unable to find latest issue")
            current_issue = current_issue[0]
            soup = self.index_to_soup(current_issue["href"])

        issue_edition = self.tag_to_string(soup.find("h1"))
        self.timefmt = f" [{issue_edition}]"

        # Use the largest cover source, stripped of its query string.
        cover_image = soup.select("div.c-issueBillboard-cover-media img")[0]
        parsed_cover_url = urlparse(
            cover_image["srcset"].split(",")[-1].strip().split(" ")[0]
        )
        self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"

        sectioned_feeds = OrderedDict()
        tabs = soup.find_all("div", attrs={"class": "c-tier_tabbed"})
        for tab in tabs:
            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
            if not (tab_title and tab_content):
                continue
            tab_title = self.tag_to_string(tab_title)
            sectioned_feeds[tab_title] = []
            for li in tab_content.select("ul.o-blocks > li"):
                author = self.tag_to_string(
                    li.find("span", attrs={"class": "c-txt_attribution"})
                )
                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
                    self.log("Found article:", self.tag_to_string(link))
                    sectioned_feeds[tab_title].append(
                        {
                            "title": self.tag_to_string(link),
                            "url": link["href"],
                            "author": author,
                            "description": author,
                        }
                    )
        return sectioned_feeds.items()