from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class KirkusReviews(BasicNewsRecipe):
    """Calibre recipe that downloads the current issue of Kirkus Reviews."""

    # --- Publication metadata ---
    title = "Kirkus Reviews"
    description = (
        "Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus."
        " The magazine is headquartered in New York City. Released twice monthly on the 1st/15th."
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    masthead_url = (
        "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg"
    )

    # --- Download / clean-up options ---
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = True
    compress_news_images_auto_size = 6
    max_articles_per_feed = 99

    # Only elements carrying one of these classes are kept from each page.
    keep_only_tags = [
        dict(
            class_=[
                "article-author",
                "article-author-img-start",
                "article-author-description-start",
                "single-review",
            ]
        )
    ]
    # Elements carrying one of these classes are stripped from each page.
    remove_tags = [
        dict(
            class_=[
                "sidebar-content",
                "article-social-share-desktop-first",
                "article-social-share-desktop-pagination",
                "article-social-share-mobile",
                "share-review-text",
                "like-dislike-article",
                "rate-this-book-text",
                "input-group",
                "user-comments",
                "show-all-response-text",
                "button-row",
                "hide-on-mobile",
                "related-article",
                "breadcrumb-row",
                "shop-now-dropdown",
            ]
        )
    ]
    remove_tags_after = [dict(class_="single-review")]

    # Kept as a single-line stylesheet so the emitted CSS text is unchanged.
    extra_css = (
        " .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }"
        " .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }"
        " .book-review-img .image-container { text-align: center; }"
        " .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; } "
    )

    def preprocess_html(self, soup):
        """Flatten the book-cover list into divs and hoist the title above it.

        The ``ul.book-review-img`` element and each of its ``li`` descendants
        are renamed to ``div``. If an ``article-title`` element exists it is
        moved so that it sits immediately before the cover block.

        Args:
            soup: Parsed article page.

        Returns:
            The same soup object, modified in place.
        """
        h1 = soup.find(class_="article-title")
        book_cover = soup.find("ul", class_="book-review-img")
        if book_cover:
            for li in book_cover.find_all("li"):
                li.name = "div"
            book_cover.name = "div"
            if h1:
                book_cover.insert_before(h1.extract())
        return soup

    def parse_index(self):
        """Collect the articles of the current issue, grouped by section.

        Three parts of the issue page are scanned: featured books, the
        "more posts" list, and the starred reviews of each reviews section.
        Section names are upper-cased and articles that share a section name
        are merged into a single feed.

        Side effects:
            Sets ``self.cover_url`` when a cover image is found and
            ``self.timefmt`` when the issue has an ``h1`` heading.

        Returns:
            A list of ``(section, articles)`` tuples, where ``articles`` is a
            list of dicts with ``url`` and ``title`` keys (plus
            ``description`` for starred reviews).
        """
        issue_url = "https://www.kirkusreviews.com/magazine/current/"
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name="article", class_="issue-container")
        if issue is None:
            # Report a layout change clearly instead of failing later with an
            # AttributeError on None.
            self.abort_recipe_processing(
                f"Unable to find the issue container at {issue_url}"
            )

        cover_img = issue.select(".issue-header .cover-image img")
        if cover_img and cover_img[0].get("src"):
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            self.cover_url = urljoin(issue_url, cover_img[0]["src"])

        h1 = issue.find("h1")
        if h1:
            self.timefmt = f" [{self.tag_to_string(h1)}]"  # edition

        articles = {}

        # Featured books (searched across the whole page, as before).
        for book_ele in soup.find_all(name="div", class_="issue-featured-book"):
            link = book_ele.find("a")
            if not link or not link.get("href"):
                continue
            section = self.tag_to_string(book_ele.find("h3")).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    # Fall back to the link text if there is no title attribute.
                    "title": link.get("title") or self.tag_to_string(link),
                }
            )

        # Additional posts listed in the issue.
        for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"):
            link = post_ele.find("a")
            if not link or not link.get("href"):
                continue
            section = self.tag_to_string(
                post_ele.find(class_="lead-text-type")
            ).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    "title": self.tag_to_string(link),
                }
            )

        # Starred reviews, one group per reviews section.
        for section_ele in issue.select("section.reviews-section"):
            section_articles = []
            for review in section_ele.select("ul li.starred"):
                link = review.select("h4 a")
                if not link or not link[0].get("href"):
                    continue
                description = review.find("p")
                section_articles.append(
                    {
                        "url": urljoin(issue_url, link[0]["href"]),
                        "title": self.tag_to_string(link[0]),
                        "description": ""
                        if not description
                        else self.tag_to_string(description),
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find("h3")).upper()
            articles.setdefault(section, []).extend(section_articles)

        # Return a real list rather than a dict view.
        return list(articles.items())