# Copyright (c) 2023 https://github.com/ping/
#
# This software is released under the GNU General Public License v3.0
# https://opensource.org/licenses/GPL-3.0
#
# Provenance: recipes/prospectmaguk_free.recipe, added by patch 41e23a8b0c3d
# ("Add Prospect Magazine UK (Free) recipe", ping, 2023-06-04).

from collections import OrderedDict
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes

# Set to the URL of a specific issue page to download that issue; when left
# empty, the first issue linked from the issues index is used.
_issue_url = ""


class ProspectMagazineUKFree(BasicNewsRecipe):
    """calibre recipe for the articles listed on a Prospect Magazine issue page."""

    title = "Prospect Magazine (Free)"
    __author__ = "ping"
    description = (
        "Prospect is Britain’s leading current affairs monthly magazine. "
        "It is an independent and eclectic forum for writing and thinking—in "
        "print and online. Published every month with two double issues in "
        "the summer and winter, it spans politics, science, foreign affairs, "
        "economics, the environment, philosophy and the arts."
    )
    language = "en_GB"
    category = "news, UK"
    publication_type = "magazine"
    masthead_url = "https://media.prospectmagazine.co.uk/prod/images/gm_grid_thumbnail/358ffc17208c-f4c3cddcdeda-prospect-masthead.png"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {"url"}
    # Listing page from which the current issue link is taken.
    INDEX = "https://www.prospectmagazine.co.uk/issues"

    # Keep only the main article panel ...
    keep_only_tags = [dict(class_="prop-book-article-panel_main")]
    # ... and strip these elements (matched by class, id, or class prefix).
    remove_tags = [
        dict(
            class_=[
                "prop-book-review-header-wrapper_magazine",
                "prop-mobile-social-share_header",
                "prop-magazine-link-block",
                "pros-article-body__img-credit",
                "pros-article-topics__wrapper",
                "pros-article-author__image-wrapper",
                "prop-book-review-promo_details-buy-mobile",
            ]
        ),
        dict(id=["disqus_thread", "newsletter_wrapper"]),
        prefixed_classes("dfp-slot-"),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .prop-book-review-header-wrapper_standfirst { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; }
    .prop-book-review-header-wrapper_details { margin-top: 1rem; margin-bottom: 1rem; }
    .prop-book-review-header-wrapper_details-byline {
        display: inline-block; font-weight: bold; color: #444; margin-right: 0.5rem; }
    .prop-book-review-header-wrapper_details-date { display: inline-block; }
    .gd-picture img { display: block; max-width: 100%; height: auto; }
    .pros-article-body__img-caption {
        font-size: 0.8rem; display: block; margin-top: 0.2rem;
    }
    .pullquote, blockquote { text-align: center; margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem; }
    .prop-book-review-article_author { margin: 1.5rem 0; font-style: italic; }
    .prop-book-review-promo { margin-bottom: 1rem; }
    """

    def preprocess_html(self, soup):
        """Tidy an article page before conversion and return the same soup.

        Moves the lede image to just after the byline/date block, copies each
        image's ``data-src`` into ``src``, and unwraps author links so only
        their contents remain.
        """
        # re-position lede image
        lede_img = soup.find("img", class_="prop-book-review-header-wrapper_image")
        meta = soup.find("div", class_="prop-book-review-header-wrapper_details")
        if lede_img and meta:
            lede_img = lede_img.extract()
            meta.insert_after(lede_img)

        # The real image URL is carried in data-src; promote it to src.
        for img in soup.find_all("img", attrs={"data-src": True}):
            img["src"] = img["data-src"]
            del img["data-src"]

        # Replace author links with their contents.
        for byline_link in soup.find_all("a", attrs={"data-author-name": True}):
            byline_link.unwrap()
        for author_link in soup.find_all("a", class_="pros-article-author"):
            author_link.unwrap()

        return soup

    def parse_index(self):
        """Build the list of sections and articles for one issue.

        The issue page is ``_issue_url`` when that is set; otherwise it is the
        first issue link found on ``INDEX``. As side effects, ``timefmt`` is
        set from the issue name and ``cover_url`` from the cover image, each
        only when the corresponding element is present on the page.

        Returns:
            A list of ``(section_name, articles)`` tuples in page order, where
            ``articles`` is a list of dicts with ``url`` and ``title`` keys.

        Raises:
            ValueError: if no issue link can be found on ``INDEX``.
        """
        if _issue_url:
            curr_issue_url = _issue_url
        else:
            issues_soup = self.index_to_soup(self.INDEX)
            curr_issue_link = issues_soup.find(
                "a", class_="pros-collection-landing__item"
            )
            if curr_issue_link is None or not curr_issue_link.get("href"):
                # Fail with a message that names the page instead of an
                # opaque "'NoneType' object is not subscriptable".
                raise ValueError(
                    f"Could not find the current issue link on {self.INDEX}"
                )
            curr_issue_url = urljoin(self.INDEX, curr_issue_link["href"])

        soup = self.index_to_soup(curr_issue_url)

        issue_name = (
            self.tag_to_string(soup.find(class_="magazine-lhc__issue-name"))
            .replace(" issue", "")
            .strip()
        )
        if issue_name:
            # Avoid an empty " []" suffix when the name element is missing.
            self.timefmt = f" [{issue_name}]"

        # The cover is optional: a missing image must not abort the download.
        cover_img = soup.find("img", class_="magazine-lhc__cover-image")
        cover_src = cover_img.get("data-src") if cover_img is not None else None
        if cover_src:
            self.cover_url = cover_src.replace(
                "portrait_small_fit", "portrait_large_fit"
            )

        articles = OrderedDict()
        for section in soup.find_all("div", class_="pro-magazine-section"):
            section_name = self.tag_to_string(
                section.find(class_="pro-magazine-section__name")
            )
            for sect_article in section.find_all(
                class_="pro-magazine-section__article"
            ):
                link = sect_article.find("a")
                if link is None or not link.get("href"):
                    # Nothing to download for an entry without a link.
                    continue
                articles.setdefault(section_name, []).append(
                    {
                        "url": urljoin(self.INDEX, link["href"]),
                        "title": self.tag_to_string(
                            sect_article.find(
                                class_="pro-magazine-section__article-headline"
                            )
                        ),
                    }
                )

        # Return a real list of (title, articles) tuples rather than a
        # dict view.
        return list(articles.items())