calibre/recipes/kirkusreviews.recipe
2025-01-24 11:14:14 +01:00

132 lines
4.8 KiB
Python

from urllib.parse import urljoin
from calibre.web.feeds.news import BasicNewsRecipe
class KirkusReviews(BasicNewsRecipe):
    """Recipe that builds an e-book from the current Kirkus Reviews issue.

    ``parse_index`` reads the "current issue" page and groups the featured
    books, the additional posts and the starred reviews by section heading;
    ``preprocess_html`` tidies the markup of each downloaded article.
    """

    title = 'Kirkus Reviews'
    description = ('Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus.'
                   ' The magazine is headquartered in New York City. Released twice monthly on the 1st/15th.')
    language = 'en'
    __author__ = 'ping'
    publication_type = 'magazine'
    masthead_url = (
        'https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg'
    )
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {'url'}
    compress_news_images = True
    compress_news_images_auto_size = 6
    max_articles_per_feed = 99

    # Only elements carrying one of these classes are kept from an article page.
    keep_only_tags = [
        dict(
            class_=[
                'article-author',
                'article-author-img-start',
                'article-author-description-start',
                'single-review',
            ]
        )
    ]
    # Elements carrying one of these classes are stripped from what was kept.
    remove_tags = [
        dict(
            class_=[
                'sidebar-content',
                'article-social-share-desktop-first',
                'article-social-share-desktop-pagination',
                'article-social-share-mobile',
                'share-review-text',
                'like-dislike-article',
                'rate-this-book-text',
                'input-group',
                'user-comments',
                'show-all-response-text',
                'button-row',
                'hide-on-mobile',
                'related-article',
                'breadcrumb-row',
                'shop-now-dropdown',
            ]
        )
    ]
    remove_tags_after = [dict(class_='single-review')]

    # The CSS text is kept flush-left so the string literal itself is unchanged.
    extra_css = '''
.image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }
.photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }
.book-review-img .image-container { text-align: center; }
.book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; }
'''

    def preprocess_html(self, soup):
        """Turn the book-cover list into plain ``div``s and lift the title.

        When a ``ul.book-review-img`` element is present, it and each of its
        ``li`` children are renamed to ``div``; if an ``article-title``
        element also exists it is moved to sit directly before the cover.

        Returns the (modified in place) ``soup``.
        """
        h1 = soup.find(class_='article-title')
        book_cover = soup.find('ul', class_='book-review-img')
        if book_cover:
            for li in book_cover.find_all('li'):
                li.name = 'div'
            book_cover.name = 'div'
            # The title can only be repositioned relative to an existing cover.
            if h1:
                book_cover.insert_before(h1.extract())
        return soup

    def parse_index(self):
        """Build the section -> articles index from the current issue page.

        Side effects: sets ``self.cover_url`` when a cover image is found and
        ``self.timefmt`` to the issue heading when an ``h1`` is present.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples in
        the order the sections were first encountered. Article dicts carry
        ``url`` and ``title``; starred reviews also carry ``description``.

        Aborts recipe processing with a readable message when the issue
        container cannot be located on the page.
        """
        issue_url = 'https://www.kirkusreviews.com/magazine/current/'
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name='article', class_='issue-container')
        if issue is None:
            # Without the container nothing below can work; fail with a clear
            # message instead of an AttributeError on None.
            self.abort_recipe_processing(
                f'Could not find the issue container on {issue_url}'
            )

        cover_img = issue.select('.issue-header .cover-image img')
        if cover_img:
            self.cover_url = cover_img[0]['src']

        h1 = issue.find('h1')
        if h1:
            self.timefmt = f' [{self.tag_to_string(h1)}]'  # edition

        articles = {}

        # Featured books: searched on the whole page, as in the original code.
        for book_ele in soup.find_all(name='div', class_='issue-featured-book'):
            link = book_ele.find('a', href=True)
            if not link:
                continue
            section = self.tag_to_string(book_ele.find('h3')).upper()
            articles.setdefault(section, []).append(
                {
                    'url': urljoin(issue_url, link['href']),
                    # Fall back to the anchor text when there is no title attribute.
                    'title': link.get('title') or self.tag_to_string(link),
                }
            )

        # Additional posts listed under the issue.
        for post_ele in issue.select('div.issue-more-posts ul li div.lead-text'):
            link = post_ele.find('a', href=True)
            if not link:
                continue
            section = self.tag_to_string(post_ele.find(class_='lead-text-type')).upper()
            articles.setdefault(section, []).append(
                {
                    'url': urljoin(issue_url, link['href']),
                    'title': self.tag_to_string(link),
                }
            )

        # Review sections: only the entries marked as starred are collected.
        for section_ele in issue.select('section.reviews-section'):
            section_articles = []
            for review in section_ele.select('ul li.starred'):
                link = review.select('h4 a[href]')
                if not link:
                    continue
                description = review.find('p')
                section_articles.append(
                    {
                        'url': urljoin(issue_url, link[0]['href']),
                        'title': self.tag_to_string(link[0]),
                        'description': self.tag_to_string(description)
                        if description
                        else '',
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find('h3')).upper()
            articles.setdefault(section, []).extend(section_articles)

        # Return a real list rather than a dict view.
        return list(articles.items())