# calibre/recipes/prospectmaguk_free.recipe

# Copyright (c) 2023 https://github.com/ping/
#
# This software is released under the GNU General Public License v3.0
# https://opensource.org/licenses/GPL-3.0

from collections import OrderedDict
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes

_issue_url = ''


class ProspectMagazineUKFree(BasicNewsRecipe):
    """Recipe that downloads an issue of Prospect Magazine (free articles).

    The issue is either the one pinned through the module-level
    ``_issue_url`` or, when that is empty, the first issue linked from
    ``INDEX``.
    """

    title = 'Prospect Magazine (Free)'
    __author__ = 'ping'
    description = (
        'Prospect is Britain’s leading current affairs monthly magazine. '
        'It is an independent and eclectic forum for writing and thinking—in '
        'print and online. Published every month with two double issues in '
        'the summer and winter, it spans politics, science, foreign affairs, '
        'economics, the environment, philosophy and the arts.'
    )
    language = 'en_GB'
    category = 'news, UK'
    publication_type = 'magazine'
    masthead_url = 'https://media.prospectmagazine.co.uk/prod/images/gm_grid_thumbnail/358ffc17208c-f4c3cddcdeda-prospect-masthead.png'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    # Page listing the magazine issues; also the base for relative links.
    INDEX = 'https://www.prospectmagazine.co.uk/issues'

    # Only the main article panel is kept; the elements listed in
    # remove_tags are then stripped out of it.
    keep_only_tags = [dict(class_='prop-book-article-panel_main')]
    remove_tags = [
        dict(
            class_=[
                'prop-book-review-header-wrapper_magazine',
                'prop-mobile-social-share_header',
                'prop-magazine-link-block',
                'pros-article-body__img-credit',
                'pros-article-topics__wrapper',
                'pros-article-author__image-wrapper',
                'prop-book-review-promo_details-buy-mobile',
            ]
        ),
        dict(id=['disqus_thread', 'newsletter_wrapper']),
        prefixed_classes('dfp-slot-'),
    ]

    extra_css = '''
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .prop-book-review-header-wrapper_standfirst { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; }
    .prop-book-review-header-wrapper_details {  margin-top: 1rem; margin-bottom: 1rem; }
    .prop-book-review-header-wrapper_details-byline {
        display: inline-block; font-weight: bold; color: #444; margin-right: 0.5rem; }
    .prop-book-review-header-wrapper_details-date { display: inline-block; }
    .gd-picture img { display: block; max-width: 100%; height: auto; }
    .pros-article-body__img-caption {
        font-size: 0.8rem; display: block; margin-top: 0.2rem;
    }
    .pullquote, blockquote { text-align: center; margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem; }
    .prop-book-review-article_author { margin: 1.5rem 0; font-style: italic; }
    .prop-book-review-promo { margin-bottom: 1rem; }
    '''

    def preprocess_html(self, soup):
        """Tidy up an article page before conversion.

        Moves the lede image to just after the byline/date block, copies
        each ``data-src`` attribute into ``src``, and unwraps author links
        so that only their contents remain.

        Args:
            soup: Parsed article document.

        Returns:
            The same ``soup`` object, modified in place.
        """
        # re-position lede image
        lede_img = soup.find('img', class_='prop-book-review-header-wrapper_image')
        meta = soup.find('div', class_='prop-book-review-header-wrapper_details')
        if lede_img and meta:
            lede_img = lede_img.extract()
            meta.insert_after(lede_img)

        # Images carry their address in data-src; promote it to src.
        for img in soup.find_all('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
            del img['data-src']

        # Keep author names but drop the links around them.
        for byline_link in soup.find_all('a', attrs={'data-author-name': True}):
            byline_link.unwrap()
        for author_link in soup.find_all('a', class_='pros-article-author'):
            author_link.unwrap()

        return soup

    def parse_index(self):
        """Build the list of sections and articles for one issue.

        The issue page is ``_issue_url`` when set, otherwise the first
        issue link found on ``INDEX``. As side effects, ``self.timefmt`` is
        set from the issue name and ``self.cover_url`` from the cover
        image, each only when the corresponding element is present.

        Returns:
            A list of ``(section_name, articles)`` tuples in page order,
            where ``articles`` is a list of dicts with ``url`` and
            ``title`` keys.

        Raises:
            ValueError: If no issue link can be found on ``INDEX``.
        """
        if _issue_url:
            curr_issue_url = _issue_url
        else:
            issues_soup = self.index_to_soup(self.INDEX)
            curr_issue_a_ele = issues_soup.find(
                'a', class_='pros-collection-landing__item'
            )
            # Fail with a clear message rather than an opaque TypeError
            # when the listing markup no longer matches.
            if curr_issue_a_ele is None or not curr_issue_a_ele.get('href'):
                raise ValueError(
                    f'Unable to find the current issue link on {self.INDEX}'
                )
            curr_issue_url = urljoin(self.INDEX, curr_issue_a_ele['href'])

        soup = self.index_to_soup(curr_issue_url)

        issue_name_ele = soup.find(class_='magazine-lhc__issue-name')
        if issue_name_ele is not None:
            issue_name = (
                self.tag_to_string(issue_name_ele).replace(' issue', '').strip()
            )
            if issue_name:
                self.timefmt = f' [{issue_name}]'

        # The cover is optional: a missing image must not abort the download.
        cover_img = soup.find('img', class_='magazine-lhc__cover-image')
        if cover_img is not None:
            # Fall back to src in case the image is not lazy-loaded.
            cover_src = cover_img.get('data-src') or cover_img.get('src')
            if cover_src:
                self.cover_url = cover_src.replace(
                    'portrait_small_fit', 'portrait_large_fit'
                )

        articles = OrderedDict()
        for section in soup.find_all('div', class_='pro-magazine-section'):
            section_name = self.tag_to_string(
                section.find(class_='pro-magazine-section__name')
            )
            for sect_article in section.find_all(
                class_='pro-magazine-section__article'
            ):
                link = sect_article.find('a', href=True)
                if link is None:
                    # Entry without a usable link: nothing to download.
                    continue
                headline = sect_article.find(
                    class_='pro-magazine-section__article-headline'
                )
                # Use the link text when the headline element is absent.
                title_ele = headline if headline is not None else link
                articles.setdefault(section_name, []).append(
                    {
                        'url': urljoin(self.INDEX, link['href']),
                        'title': self.tag_to_string(title_ele),
                    }
                )

        # Return a real list of (section, articles) tuples, not a dict view.
        return list(articles.items())