calibre/recipes/prospectmaguk_free.recipe
# Copyright (c) 2023 https://github.com/ping/
#
# This software is released under the GNU General Public License v3.0
# https://opensource.org/licenses/GPL-3.0
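#
# Usage sketch (an assumption, not part of the original recipe): with a local
# calibre install, the recipe can be test-built from the command line, e.g.
#   ebook-convert prospectmaguk_free.recipe output.epub --test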
from collections import OrderedDict
from urllib.parse import urljoin
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
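
# Leave _issue_url empty to fetch the latest issue from the issues index page;
# set it to a specific issue's URL to build a back issue instead, e.g.
# _issue_url = 'https://www.prospectmagazine.co.uk/issues/...'  # illustrative placeholder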
_issue_url = ''
class ProspectMagazineUKFree(BasicNewsRecipe):
    title = 'Prospect Magazine (Free)'
    __author__ = 'ping'
    description = (
        "Prospect is Britain's leading current affairs monthly magazine. "
        'It is an independent and eclectic forum for writing and thinking—in '
        'print and online. Published every month with two double issues in '
        'the summer and winter, it spans politics, science, foreign affairs, '
        'economics, the environment, philosophy and the arts.'
    )
    language = 'en_GB'
    category = 'news, UK'
    publication_type = 'magazine'
    masthead_url = 'https://media.prospectmagazine.co.uk/prod/images/gm_grid_thumbnail/358ffc17208c-f4c3cddcdeda-prospect-masthead.png'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    INDEX = 'https://www.prospectmagazine.co.uk/issues'
    keep_only_tags = [dict(class_='prop-book-article-panel_main')]
    remove_tags = [
        dict(
            class_=[
                'prop-book-review-header-wrapper_magazine',
                'prop-mobile-social-share_header',
                'prop-magazine-link-block',
                'pros-article-body__img-credit',
                'pros-article-topics__wrapper',
                'pros-article-author__image-wrapper',
                'prop-book-review-promo_details-buy-mobile',
            ]
        ),
        dict(id=['disqus_thread', 'newsletter_wrapper']),
        prefixed_classes('dfp-slot-'),
    ]
    extra_css = '''
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .prop-book-review-header-wrapper_standfirst { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; }
    .prop-book-review-header-wrapper_details { margin-top: 1rem; margin-bottom: 1rem; }
    .prop-book-review-header-wrapper_details-byline {
        display: inline-block; font-weight: bold; color: #444; margin-right: 0.5rem; }
    .prop-book-review-header-wrapper_details-date { display: inline-block; }
    .gd-picture img { display: block; max-width: 100%; height: auto; }
    .pros-article-body__img-caption {
        font-size: 0.8rem; display: block; margin-top: 0.2rem;
    }
    .pullquote, blockquote { text-align: center; margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem; }
    .prop-book-review-article_author { margin: 1.5rem 0; font-style: italic; }
    .prop-book-review-promo { margin-bottom: 1rem; }
    '''

    def preprocess_html(self, soup):
        # re-position the lede image after the byline/date block
        lede_img = soup.find('img', class_='prop-book-review-header-wrapper_image')
        meta = soup.find('div', class_='prop-book-review-header-wrapper_details')
        if lede_img and meta:
            lede_img = lede_img.extract()
            meta.insert_after(lede_img)
        # copy lazy-loaded image URLs into src so the images get downloaded
        for img in soup.find_all('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
            del img['data-src']
        # unwrap author/byline links so no external links remain in the byline
        for byline_link in soup.find_all('a', attrs={'data-author-name': True}):
            byline_link.unwrap()
        for author_link in soup.find_all('a', class_='pros-article-author'):
            author_link.unwrap()
        return soup

    def parse_index(self):
        if not _issue_url:
            # find the latest issue linked from the issues index page
            issues_soup = self.index_to_soup(self.INDEX)
            curr_issue_a_ele = issues_soup.find(
                'a', class_='pros-collection-landing__item'
            )
            curr_issue_url = urljoin(self.INDEX, curr_issue_a_ele['href'])
        else:
            curr_issue_url = _issue_url
        soup = self.index_to_soup(curr_issue_url)

        # use the issue name as the edition label appended to the title
        issue_name = (
            self.tag_to_string(soup.find(class_='magazine-lhc__issue-name'))
            .replace(' issue', '')
            .strip()
        )
        self.timefmt = f' [{issue_name}]'
        # swap the cover thumbnail for a larger rendition
        self.cover_url = soup.find('img', class_='magazine-lhc__cover-image')[
            'data-src'
        ].replace('portrait_small_fit', 'portrait_large_fit')

        # group articles by magazine section
        articles = OrderedDict()
        sections = soup.find_all('div', class_='pro-magazine-section')
        for section in sections:
            section_name = self.tag_to_string(
                section.find(class_='pro-magazine-section__name')
            )
            for sect_article in section.find_all(
                class_='pro-magazine-section__article'
            ):
                articles.setdefault(section_name, []).append(
                    {
                        'url': urljoin(self.INDEX, sect_article.find('a')['href']),
                        'title': self.tag_to_string(
                            sect_article.find(
                                class_='pro-magazine-section__article-headline'
                            )
                        ),
                    }
                )
        return articles.items()