mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
129 lines
5.1 KiB
Python
129 lines
5.1 KiB
Python
# Copyright (c) 2023 https://github.com/ping/
|
||
#
|
||
# This software is released under the GNU General Public License v3.0
|
||
# https://opensource.org/licenses/GPL-3.0
|
||
|
||
from collections import OrderedDict
|
||
from urllib.parse import urljoin
|
||
|
||
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
|
||
|
||
# Set this to the URL of a specific issue page to download that issue.
# When left empty, parse_index() falls back to the first issue linked from
# the recipe's INDEX page.
_issue_url = ''
|
||
|
||
|
||
class ProspectMagazineUKFree(BasicNewsRecipe):
    """calibre recipe that builds an e-book from one issue of Prospect Magazine.

    The issue is either the one named by the module-level ``_issue_url`` or,
    when that is empty, the first issue linked from ``INDEX``.
    """

    title = 'Prospect Magazine (Free)'
    __author__ = 'ping'
    description = (
        'Prospect is Britain’s leading current affairs monthly magazine. '
        'It is an independent and eclectic forum for writing and thinking—in '
        'print and online. Published every month with two double issues in '
        'the summer and winter, it spans politics, science, foreign affairs, '
        'economics, the environment, philosophy and the arts.'
    )
    language = 'en_GB'
    category = 'news, UK'
    publication_type = 'magazine'
    masthead_url = 'https://media.prospectmagazine.co.uk/prod/images/gm_grid_thumbnail/358ffc17208c-f4c3cddcdeda-prospect-masthead.png'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    # Page listing the available issues; also the base for relative links.
    INDEX = 'https://www.prospectmagazine.co.uk/issues'

    # Only the main article panel is kept from each downloaded page.
    keep_only_tags = [dict(class_='prop-book-article-panel_main')]
    # Elements stripped from the kept article panel.
    remove_tags = [
        dict(
            class_=[
                'prop-book-review-header-wrapper_magazine',
                'prop-mobile-social-share_header',
                'prop-magazine-link-block',
                'pros-article-body__img-credit',
                'pros-article-topics__wrapper',
                'pros-article-author__image-wrapper',
                'prop-book-review-promo_details-buy-mobile',
            ]
        ),
        dict(id=['disqus_thread', 'newsletter_wrapper']),
        prefixed_classes('dfp-slot-'),
    ]

    extra_css = '''
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .prop-book-review-header-wrapper_standfirst { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; }
    .prop-book-review-header-wrapper_details { margin-top: 1rem; margin-bottom: 1rem; }
    .prop-book-review-header-wrapper_details-byline {
        display: inline-block; font-weight: bold; color: #444; margin-right: 0.5rem; }
    .prop-book-review-header-wrapper_details-date { display: inline-block; }
    .gd-picture img { display: block; max-width: 100%; height: auto; }
    .pros-article-body__img-caption {
        font-size: 0.8rem; display: block; margin-top: 0.2rem;
    }
    .pullquote, blockquote { text-align: center; margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem; }
    .prop-book-review-article_author { margin: 1.5rem 0; font-style: italic; }
    .prop-book-review-promo { margin-bottom: 1rem; }
    '''

    def preprocess_html(self, soup):
        """Tidy an article page before conversion and return the same soup.

        * Moves the lede image so that it follows the header details block.
        * Copies each image's ``data-src`` attribute into ``src`` and drops
          ``data-src``.
        * Unwraps byline/author links, keeping their contents.
        """
        # re-position lede image
        lede_img = soup.find('img', class_='prop-book-review-header-wrapper_image')
        meta = soup.find('div', class_='prop-book-review-header-wrapper_details')
        if lede_img and meta:
            lede_img = lede_img.extract()
            meta.insert_after(lede_img)

        for img in soup.find_all('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
            del img['data-src']

        for byline_link in soup.find_all('a', attrs={'data-author-name': True}):
            byline_link.unwrap()
        for author_link in soup.find_all('a', class_='pros-article-author'):
            author_link.unwrap()

        return soup

    def parse_index(self):
        """Build the list of sections and articles for the selected issue.

        Side effects: sets ``self.timefmt`` to the issue name (when one is
        found) and ``self.cover_url`` to the large variant of the issue cover
        (when a cover image is found).

        Returns:
            A list of ``(section_name, articles)`` tuples in page order, where
            ``articles`` is a list of dicts with ``'url'`` and ``'title'``.
        """
        if _issue_url:
            curr_issue_url = _issue_url
        else:
            issues_soup = self.index_to_soup(self.INDEX)
            curr_issue_link = issues_soup.find(
                'a', class_='pros-collection-landing__item', href=True
            )
            if curr_issue_link is None:
                # Fail with a readable message instead of a TypeError from
                # subscripting None when the page layout changes.
                self.abort_recipe_processing(
                    f'Unable to find the current issue link on {self.INDEX}'
                )
            curr_issue_url = urljoin(self.INDEX, curr_issue_link['href'])

        soup = self.index_to_soup(curr_issue_url)

        issue_name = (
            self.tag_to_string(soup.find(class_='magazine-lhc__issue-name'))
            .replace(' issue', '')
            .strip()
        )
        if issue_name:
            # Avoid an empty " []" suffix when the name element is missing.
            self.timefmt = f' [{issue_name}]'

        # The cover is optional: a missing image must not abort the download.
        cover_img = soup.find('img', class_='magazine-lhc__cover-image')
        if cover_img is not None:
            cover_src = cover_img.get('data-src') or cover_img.get('src')
            if cover_src:
                self.cover_url = cover_src.replace(
                    'portrait_small_fit', 'portrait_large_fit'
                )

        articles = OrderedDict()
        for section in soup.find_all('div', class_='pro-magazine-section'):
            section_name = self.tag_to_string(
                section.find(class_='pro-magazine-section__name')
            )
            for sect_article in section.find_all(
                class_='pro-magazine-section__article'
            ):
                article_link = sect_article.find('a', href=True)
                if article_link is None:
                    # Nothing to download for an entry without a link.
                    continue
                articles.setdefault(section_name, []).append(
                    {
                        'url': urljoin(self.INDEX, article_link['href']),
                        'title': self.tag_to_string(
                            sect_article.find(
                                class_='pro-magazine-section__article-headline'
                            )
                        ),
                    }
                )

        return list(articles.items())
|