mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
New literature-related recipes (Bookforum, Kirkus Reviews, Poetry Magazine)
This commit is contained in:
parent
f316dea0ad
commit
f83db42a8c
78
recipes/bookforummagazine.recipe
Normal file
78
recipes/bookforummagazine.recipe
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
# Set to a specific print-issue URL to fetch that issue instead of the latest.
_issue_url = ""
|
||||||
|
|
||||||
|
|
||||||
|
class BookforumMagazine(BasicNewsRecipe):
    """Download the current (or ``_issue_url``-specified) print issue of Bookforum."""

    title = "Bookforum"
    description = (
        "Bookforum is an American book review magazine devoted to books and "
        "the discussion of literature. https://www.bookforum.com/print"
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    compress_news_images = True
    compress_news_images_auto_size = 8

    keep_only_tags = [dict(class_="blog-article")]
    remove_tags = [dict(name=["af-share-toggle", "af-related-articles"])]

    extra_css = """
    .blog-article__header { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .blog-article__subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    .blog-article__writer { font-size: 1rem; font-weight: bold; color: #444; }
    .blog-article__book-info { margin: 1rem 0; }
    .article-image-container img, .blog-article__publication-media img {
    display: block; max-width: 100%; height: auto;
    }
    .blog-article__caption { font-size: 0.8rem; display: block; margin-top: 0.2rem; }
    """

    def preprocess_html(self, soup):
        # Strip away links that are not needed: unwrap anchors in the header
        # so the title renders as plain text.
        for ele in soup.select(".blog-article__header a"):
            ele.unwrap()
        return soup

    def parse_index(self):
        """Build the feed structure from the issue's table-of-contents page.

        Returns dict_items of {section_name: [article dicts]} as calibre expects.
        """
        soup = self.index_to_soup(
            _issue_url if _issue_url else "https://www.bookforum.com/print"
        )
        # Use the issue title (og:title) as the edition label in the output name.
        meta_ele = soup.find("meta", property="og:title")
        if meta_ele:
            self.timefmt = f' [{meta_ele["content"]}]'

        cover_ele = soup.find("img", class_="toc-issue__cover")
        if cover_ele:
            # Reuse the element already found instead of searching the soup again.
            self.cover_url = urljoin("https://www.bookforum.com", cover_ele["src"])

        articles = {}
        for sect_ele in soup.find_all("div", class_="toc-articles__section"):
            section_name = self.tag_to_string(
                sect_ele.find("a", class_="toc__anchor-links__link")
            )
            for article_ele in sect_ele.find_all("article"):
                title_ele = article_ele.find("h1")
                sub_title_ele = article_ele.find(class_="toc-article__subtitle")
                articles.setdefault(section_name, []).append(
                    {
                        "title": self.tag_to_string(title_ele),
                        "url": article_ele.find("a", class_="toc-article__link")[
                            "href"
                        ],
                        "description": (
                            self.tag_to_string(sub_title_ele) if sub_title_ele else ""
                        ),
                    }
                )
        return articles.items()
|
BIN
recipes/icons/bookforummagazine.png
Normal file
BIN
recipes/icons/bookforummagazine.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 305 B |
BIN
recipes/icons/kirkusreviews.png
Normal file
BIN
recipes/icons/kirkusreviews.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.4 KiB |
BIN
recipes/icons/poetrymagazine.png
Normal file
BIN
recipes/icons/poetrymagazine.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 536 B |
130
recipes/kirkusreviews.recipe
Normal file
130
recipes/kirkusreviews.recipe
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
|
class KirkusReviews(BasicNewsRecipe):
    """Download the current issue of Kirkus Reviews magazine."""

    title = "Kirkus Reviews"
    description = "Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus. The magazine is headquartered in New York City. Released twice monthly on the 1st/15th. https://www.kirkusreviews.com/magazine/current/"
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    masthead_url = (
        "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg"
    )
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = True
    compress_news_images_auto_size = 6
    max_articles_per_feed = 99

    keep_only_tags = [
        dict(
            class_=[
                "article-author",
                "article-author-img-start",
                "article-author-description-start",
                "single-review",
            ]
        )
    ]
    remove_tags = [
        dict(
            class_=[
                "sidebar-content",
                "article-social-share-desktop-first",
                "article-social-share-desktop-pagination",
                "article-social-share-mobile",
                "share-review-text",
                "like-dislike-article",
                "rate-this-book-text",
                "input-group",
                "user-comments",
                "show-all-response-text",
                "button-row",
                "hide-on-mobile",
                "related-article",
                "breadcrumb-row",
                "shop-now-dropdown",
            ]
        )
    ]
    remove_tags_after = [dict(class_="single-review")]

    extra_css = """
    .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }
    .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }
    .book-review-img .image-container { text-align: center; }
    .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    def preprocess_html(self, soup):
        # Convert the book-cover <ul>/<li> markup into <div>s and move the
        # article title directly above the cover for a cleaner layout.
        h1 = soup.find(class_="article-title")
        book_cover = soup.find("ul", class_="book-review-img")
        if book_cover:
            for li in book_cover.find_all("li"):
                li.name = "div"
            book_cover.name = "div"
            if h1:
                book_cover.insert_before(h1.extract())
        return soup

    def parse_index(self):
        """Build the feed structure from the current-issue landing page.

        Collects featured books, "more posts" entries, and starred reviews
        into sections; returns dict_items of {section: [article dicts]}.
        """
        issue_url = "https://www.kirkusreviews.com/magazine/current/"
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name="article", class_="issue-container")
        cover_img = issue.select(".issue-header .cover-image img")
        if cover_img:
            self.cover_url = cover_img[0]["src"]

        h1 = issue.find("h1")
        if h1:
            self.timefmt = f" [{self.tag_to_string(h1)}]"  # edition

        articles = {}
        for book_ele in soup.find_all(name="div", class_="issue-featured-book"):
            link = book_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(book_ele.find("h3")).upper()
            articles.setdefault(section, []).append(
                {"url": urljoin(issue_url, link["href"]), "title": link["title"]}
            )
        for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"):
            link = post_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(post_ele.find(class_="lead-text-type")).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    "title": self.tag_to_string(link),
                }
            )
        for section_ele in issue.select("section.reviews-section"):
            section_articles = []
            # Only starred reviews are included in the feed.
            for review in section_ele.select("ul li.starred"):
                link = review.select("h4 a")
                if not link:
                    continue
                description = review.find("p")
                section_articles.append(
                    {
                        "url": urljoin(issue_url, link[0]["href"]),
                        "title": self.tag_to_string(link[0]),
                        "description": (
                            self.tag_to_string(description) if description else ""
                        ),
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find("h3")).upper()
            # setdefault alone handles a new section key; the previous
            # "if section not in articles: articles[section] = []" was redundant.
            articles.setdefault(section, []).extend(section_articles)

        return articles.items()
|
135
recipes/poetrymagazine.recipe
Normal file
135
recipes/poetrymagazine.recipe
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
import re
|
||||||
|
from collections import OrderedDict
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
# Set to a specific issue URL to fetch that issue instead of the latest.
_issue_url = ""
|
||||||
|
|
||||||
|
# Splits a srcset attribute into its comma-separated candidate entries.
COMMA_SEP_RE = re.compile(r"\s*,\s*")
# Splits one srcset entry into its URL and width-descriptor tokens.
SPACE_SEP_RE = re.compile(r"\s+")
# Strips non-digit characters, e.g. the trailing "w" in "1000w".
NON_NUMERIC_RE = re.compile(r"[^\d]+")
|
||||||
|
|
||||||
|
|
||||||
|
class Poetry(BasicNewsRecipe):
    """Download the latest (or ``_issue_url``-specified) issue of Poetry Magazine."""

    title = "Poetry Magazine"
    __author__ = "ping"
    description = (
        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
    )
    publication_type = "magazine"
    language = "en"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = False

    remove_attributes = ["style", "font"]
    keep_only_tags = [dict(name="article")]

    remove_tags = [
        dict(name="button"),
        dict(
            attrs={
                "class": [
                    "c-socialBlocks",
                    "c-index",
                    "o-stereo",
                    "u-hideAboveSmall",
                    "c-slideTrigger",
                    "js-slideshow",
                ]
            }
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
    div.c-feature-bd { margin-bottom: 2rem; }
    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
    """

    def extract_from_img_srcset(self, srcset: str, max_width: int = 0) -> str:
        """Pick one image URL out of a ``srcset`` attribute value.

        Returns the widest candidate no wider than ``max_width``; if
        ``max_width`` is 0, the widest candidate overall; if every candidate
        is wider than ``max_width``, the narrowest one.

        Raises ValueError if a multi-candidate srcset entry is not in the
        "url width" form.
        """
        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
        if len(sources) == 1:
            # BUGFIX: a single entry may still carry a width descriptor
            # ("img.jpg 1000w"); return only the URL token, not the whole
            # entry (which would produce a broken src).
            return sources[0].split()[0]
        parsed_sources = []
        for src in sources:
            src_n_width = [s.strip() for s in SPACE_SEP_RE.split(src) if s.strip()]
            if len(src_n_width) != 2:
                raise ValueError(f"Not a valid srcset: {srcset}")
            parsed_sources.append(
                (
                    src_n_width[0].strip(),
                    int(NON_NUMERIC_RE.sub("", src_n_width[1].strip())),
                )
            )
        # De-duplicate, then sort widest-first.
        parsed_sources = sorted(set(parsed_sources), key=lambda x: x[1], reverse=True)
        if not max_width:
            return parsed_sources[0][0]
        for img, width in parsed_sources:
            if width <= max_width:
                return img
        # All candidates exceed max_width: fall back to the narrowest.
        return parsed_sources[-1][0]

    def preprocess_html(self, soup):
        # Replace responsive srcset images with a single reasonably-sized src
        # so the e-book embeds one concrete image per enclosure.
        for img in soup.select("div.o-mediaEnclosure img"):
            if not img.get("srcset"):
                continue
            img["src"] = self.extract_from_img_srcset(img["srcset"], max_width=1000)
        return soup

    def parse_index(self):
        """Build the feed structure for the issue.

        Returns dict_items of {tab_title: [article dicts]} preserving the
        on-page tab order.
        """
        if _issue_url:
            soup = self.index_to_soup(_issue_url)
        else:
            # Find the latest issue from the magazine landing page.
            soup = self.index_to_soup("https://www.poetryfoundation.org/poetrymagazine")
            current_issue = soup.select("div.c-cover-media a")
            if not current_issue:
                self.abort_recipe_processing("Unable to find latest issue")
            current_issue = current_issue[0]
            soup = self.index_to_soup(current_issue["href"])

        issue_edition = self.tag_to_string(soup.find("h1"))
        self.timefmt = f" [{issue_edition}]"
        cover_image = soup.select("div.c-issueBillboard-cover-media img")[0]
        # Take the last (largest) srcset entry and drop its query string.
        parsed_cover_url = urlparse(
            cover_image["srcset"].split(",")[-1].strip().split(" ")[0]
        )
        self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"

        sectioned_feeds = OrderedDict()

        tabs = soup.find_all("div", attrs={"class": "c-tier_tabbed"})
        for tab in tabs:
            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
            if not (tab_title and tab_content):
                continue
            tab_title = self.tag_to_string(tab_title)
            sectioned_feeds[tab_title] = []
            for li in tab_content.select("ul.o-blocks > li"):
                author = self.tag_to_string(
                    li.find("span", attrs={"class": "c-txt_attribution"})
                )
                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
                    self.log("Found article:", self.tag_to_string(link))
                    sectioned_feeds[tab_title].append(
                        {
                            "title": self.tag_to_string(link),
                            "url": link["href"],
                            "author": author,
                            "description": author,
                        }
                    )

        return sectioned_feeds.items()
|
Loading…
x
Reference in New Issue
Block a user