"""Calibre news-download recipes for Bookforum, Kirkus Reviews and Poetry Magazine.

Reconstructed from a whitespace-mangled patch; each class below originally
lived in its own ``recipes/*.recipe`` file. All three follow the same shape:
class-level configuration consumed by calibre's ``BasicNewsRecipe`` framework,
plus a ``parse_index`` that scrapes the magazine's table-of-contents page and
returns ``{section: [article-dict, ...]}.items()``.
"""
import re
from collections import OrderedDict
from urllib.parse import urljoin, urlparse

from calibre.web.feeds.news import BasicNewsRecipe

# Set to a specific back-issue URL to fetch that issue instead of the latest.
_issue_url = ""

# Pre-compiled helpers for srcset parsing (Poetry recipe).
COMMA_SEP_RE = re.compile(r"\s*,\s*")
SPACE_SEP_RE = re.compile(r"\s+")
NON_NUMERIC_RE = re.compile(r"[^\d]+")


class BookforumMagazine(BasicNewsRecipe):
    """Bookforum print-issue recipe (https://www.bookforum.com/print)."""

    title = "Bookforum"
    description = (
        "Bookforum is an American book review magazine devoted to books and "
        "the discussion of literature. https://www.bookforum.com/print"
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    compress_news_images = True
    compress_news_images_auto_size = 8

    keep_only_tags = [dict(class_="blog-article")]
    # Site uses custom elements for share/related widgets.
    remove_tags = [dict(name=["af-share-toggle", "af-related-articles"])]

    extra_css = """
    .blog-article__header { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .blog-article__subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    .blog-article__writer { font-size: 1rem; font-weight: bold; color: #444; }
    .blog-article__book-info { margin: 1rem 0; }
    .article-image-container img, .blog-article__publication-media img {
        display: block; max-width: 100%; height: auto;
    }
    .blog-article__caption { font-size: 0.8rem; display: block; margin-top: 0.2rem; }
    """

    def preprocess_html(self, soup):
        # Strip away links in the header that aren't needed in the e-book.
        for ele in soup.select(".blog-article__header a"):
            ele.unwrap()
        return soup

    def parse_index(self):
        """Scrape the print-issue TOC into {section: [articles]}."""
        soup = self.index_to_soup(
            _issue_url if _issue_url else "https://www.bookforum.com/print"
        )
        # og:title carries the issue name, e.g. "Dec/Jan/Feb 2023".
        meta_ele = soup.find("meta", property="og:title")
        if meta_ele:
            self.timefmt = f' [{meta_ele["content"]}]'

        cover_ele = soup.find("img", class_="toc-issue__cover")
        if cover_ele:
            # Reuse the element already found instead of querying the soup again.
            self.cover_url = urljoin("https://www.bookforum.com", cover_ele["src"])

        articles = {}
        for sect_ele in soup.find_all("div", class_="toc-articles__section"):
            section_name = self.tag_to_string(
                sect_ele.find("a", class_="toc__anchor-links__link")
            )
            for article_ele in sect_ele.find_all("article"):
                title_ele = article_ele.find("h1")
                sub_title_ele = article_ele.find(class_="toc-article__subtitle")
                articles.setdefault(section_name, []).append(
                    {
                        "title": self.tag_to_string(title_ele),
                        "url": article_ele.find("a", class_="toc-article__link")[
                            "href"
                        ],
                        "description": self.tag_to_string(sub_title_ele)
                        if sub_title_ele
                        else "",
                    }
                )
        return articles.items()


class KirkusReviews(BasicNewsRecipe):
    """Kirkus Reviews magazine recipe (https://www.kirkusreviews.com/magazine/current/)."""

    title = "Kirkus Reviews"
    description = (
        "Kirkus Reviews is an American book review magazine founded in 1933 by "
        "Virginia Kirkus. The magazine is headquartered in New York City. "
        "Released twice monthly on the 1st/15th. "
        "https://www.kirkusreviews.com/magazine/current/"
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    masthead_url = (
        "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg"
    )
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = True
    compress_news_images_auto_size = 6
    max_articles_per_feed = 99

    keep_only_tags = [
        dict(
            class_=[
                "article-author",
                "article-author-img-start",
                "article-author-description-start",
                "single-review",
            ]
        )
    ]
    remove_tags = [
        dict(
            class_=[
                "sidebar-content",
                "article-social-share-desktop-first",
                "article-social-share-desktop-pagination",
                "article-social-share-mobile",
                "share-review-text",
                "like-dislike-article",
                "rate-this-book-text",
                "input-group",
                "user-comments",
                "show-all-response-text",
                "button-row",
                "hide-on-mobile",
                "related-article",
                "breadcrumb-row",
                "shop-now-dropdown",
            ]
        )
    ]
    remove_tags_after = [dict(class_="single-review")]

    extra_css = """
    .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }
    .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }
    .book-review-img .image-container { text-align: center; }
    .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    def preprocess_html(self, soup):
        # Move the article title above the book-cover list, and flatten the
        # list markup (ul/li -> div) so it renders cleanly in the e-book.
        h1 = soup.find(class_="article-title")
        book_cover = soup.find("ul", class_="book-review-img")
        if book_cover:
            for li in book_cover.find_all("li"):
                li.name = "div"
            book_cover.name = "div"
            if h1:
                book_cover.insert_before(h1.extract())
        return soup

    def parse_index(self):
        """Scrape the current-issue page: featured books, posts, starred reviews."""
        issue_url = "https://www.kirkusreviews.com/magazine/current/"
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name="article", class_="issue-container")
        cover_img = issue.select(".issue-header .cover-image img")
        if cover_img:
            self.cover_url = cover_img[0]["src"]

        h1 = issue.find("h1")
        if h1:
            self.timefmt = f" [{self.tag_to_string(h1)}]"  # edition

        articles = {}
        for book_ele in soup.find_all(name="div", class_="issue-featured-book"):
            link = book_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(book_ele.find("h3")).upper()
            articles.setdefault(section, []).append(
                {"url": urljoin(issue_url, link["href"]), "title": link["title"]}
            )
        for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"):
            link = post_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(post_ele.find(class_="lead-text-type")).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    "title": self.tag_to_string(link),
                }
            )
        for section_ele in issue.select("section.reviews-section"):
            section_articles = []
            # Only starred (recommended) reviews are included.
            for review in section_ele.select("ul li.starred"):
                link = review.select("h4 a")
                if not link:
                    continue
                description = review.find("p")
                section_articles.append(
                    {
                        "url": urljoin(issue_url, link[0]["href"]),
                        "title": self.tag_to_string(link[0]),
                        "description": ""
                        if not description
                        else self.tag_to_string(description),
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find("h3")).upper()
            # setdefault already creates the list; no separate init needed.
            articles.setdefault(section, []).extend(section_articles)

        return articles.items()


class Poetry(BasicNewsRecipe):
    """Poetry Magazine recipe (https://www.poetryfoundation.org/poetrymagazine)."""

    title = "Poetry Magazine"
    __author__ = "ping"
    description = (
        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
    )
    publication_type = "magazine"
    language = "en"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = False

    remove_attributes = ["style", "font"]
    keep_only_tags = [dict(name="article")]

    remove_tags = [
        dict(name="button"),
        dict(
            attrs={
                "class": [
                    "c-socialBlocks",
                    "c-index",
                    "o-stereo",
                    "u-hideAboveSmall",
                    "c-slideTrigger",
                    "js-slideshow",
                ]
            }
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
    div.c-feature-bd { margin-bottom: 2rem; }
    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
    """

    def extract_from_img_srcset(self, srcset: str, max_width: int = 0) -> str:
        """Pick an image URL from a ``srcset`` attribute.

        Returns the largest candidate, or the largest candidate whose declared
        width is <= *max_width* when *max_width* is given (falling back to the
        smallest candidate if none fit).

        :raises ValueError: if a srcset entry is not "<url> <width>".
        """
        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
        if len(sources) == 1:
            # Just a regular img url, probably.
            return sources[0]
        parsed_sources = []
        for src in sources:
            src_n_width = [s.strip() for s in SPACE_SEP_RE.split(src) if s.strip()]
            if len(src_n_width) != 2:
                raise ValueError(f"Not a valid srcset: {srcset}")
            parsed_sources.append(
                (
                    src_n_width[0].strip(),
                    int(NON_NUMERIC_RE.sub("", src_n_width[1].strip())),
                )
            )
        # De-dupe, then order widest first in a single pass.
        parsed_sources = sorted(
            set(parsed_sources), key=lambda x: x[1], reverse=True
        )
        if not max_width:
            return parsed_sources[0][0]
        for img, width in parsed_sources:
            if width <= max_width:
                return img
        return parsed_sources[-1][0]

    def preprocess_html(self, soup):
        # Resolve responsive images to a concrete src (capped at 1000px wide).
        for img in soup.select("div.o-mediaEnclosure img"):
            if not img.get("srcset"):
                continue
            img["src"] = self.extract_from_img_srcset(img["srcset"], max_width=1000)
        return soup

    def parse_index(self):
        """Scrape the latest (or pinned) issue's tabbed TOC into sections."""
        if _issue_url:
            soup = self.index_to_soup(_issue_url)
        else:
            soup = self.index_to_soup("https://www.poetryfoundation.org/poetrymagazine")
            current_issue = soup.select("div.c-cover-media a")
            if not current_issue:
                self.abort_recipe_processing("Unable to find latest issue")
            current_issue = current_issue[0]
            soup = self.index_to_soup(current_issue["href"])

        issue_edition = self.tag_to_string(soup.find("h1"))
        self.timefmt = f" [{issue_edition}]"
        cover_image = soup.select("div.c-issueBillboard-cover-media img")[0]
        # Take the last (largest) srcset entry and strip any query string.
        parsed_cover_url = urlparse(
            cover_image["srcset"].split(",")[-1].strip().split(" ")[0]
        )
        self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"

        sectioned_feeds = OrderedDict()

        tabs = soup.find_all("div", attrs={"class": "c-tier_tabbed"})
        for tab in tabs:
            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
            if not (tab_title and tab_content):
                continue
            tab_title = self.tag_to_string(tab_title)
            sectioned_feeds[tab_title] = []
            for li in tab_content.select("ul.o-blocks > li"):
                author = self.tag_to_string(
                    li.find("span", attrs={"class": "c-txt_attribution"})
                )
                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
                    self.log("Found article:", self.tag_to_string(link))
                    sectioned_feeds[tab_title].append(
                        {
                            "title": self.tag_to_string(link),
                            "url": link["href"],
                            "author": author,
                            "description": author,
                        }
                    )

        return sectioned_feeds.items()