calibre/recipes/harpers_full.recipe

# -*- mode: python -*-
# -*- coding: utf-8 -*-
# vi: set fenc=utf-8 ft=python :
# kate: encoding utf-8; syntax python;

__license__ = "GPL v3"
__copyright__ = "2008-2019, Darko Miletic <darko.miletic at gmail.com>"
"""
harpers.org - printed issue articles
This recipe only gets articles published in text format;
images and PDFs are ignored.
"""

from urllib.parse import urljoin

from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe

# Optional override: set this to the URL of a specific issue,
# e.g. https://harpers.org/archive/2023/01/, to download that issue.
# When left empty, parse_index() picks the first issue card listed on
# https://harpers.org/issues/ instead.
_issue_url = ""


class Harpers_full(BasicNewsRecipe):
    """Recipe that downloads the articles of one Harper's Magazine issue.

    The issue is either the module-level ``_issue_url`` (when set) or the
    first issue card found on https://harpers.org/issues/. Articles are
    collected from the "features", "readings" and "articles" sections of
    the issue page.
    """

    title = "Harper's Magazine - articles from printed edition"
    __author__ = "Darko Miletic, updated by ping"
    description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."  # noqa
    publisher = "Harper's"
    category = "news, politics, USA"
    oldest_article = 31
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = "en"
    encoding = "utf8"
    publication_type = "magazine"
    requires_version = (5, 0, 0)  # py3
    ignore_duplicate_articles = {"url"}
    base_url = "https://harpers.org"

    keep_only_tags = [
        dict(
            class_=[
                "article-content",
                "template-index-archive",  # harper's index
            ]
        )
    ]
    remove_tags = [
        dict(
            class_=[
                "component-newsletter-signup",
                "sidebar",
                "header-meta",
                "component-from-author",
                "from-issue",
                "d-none",
                "COA_roles_fix_space",
                "section-tags",
                # the next four also cover the harper's index page
                "aria-font-adjusts",
                "component-share-buttons",
                "index-footer",
                "index-prev-link",
                "comma",
            ]
        ),
    ]
    remove_attributes = ["style", "width", "height"]

    extra_css = """
    h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
    .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
    .byline { margin-bottom: 1rem }
    .article-hero-img img, .flex-section-image img, .wp-caption img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    .wp-caption-text { font-size: small; margin-top: 0.3rem; }

    .author-bio { margin-top: 2.5rem; font-style: italic; }
    .author-bio em { font-weight: bold; }

    .index-item { font-size: large; margin: 1rem 0; }
    .index-statement > p { display: inline-block; margin: 0.5rem 0; }
    .index-statement > span { display: inline-block; }
    .index-statement .index-tooltip { font-size: small; }
    """

    # Send cookie-less requests to get full article.
    # The recipe object stands in for the browser: get_browser() and
    # clone_browser() hand back the recipe itself, and its open()/open_novisit()
    # create a brand-new browser() for every single request.
    def get_browser(self, *args, **kwargs):
        """Return the recipe itself, which exposes open()/open_novisit()."""
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same stand-in object as get_browser()."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Open a URL with a freshly created browser, forwarding all arguments."""
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit

    def preprocess_html(self, soup):
        """Tidy up an article page before conversion and return the soup.

        Moves the subheading in front of the byline, flattens the author
        bio (drops ``<br>``, unwraps ``<p>``) and removes ``<hr>`` elements
        found under ``.after-post-content``.
        """
        # General UI tweaks
        # move subheading to before byline (instead of where it is now, after)
        subheading_ele = soup.find(class_="subheading")
        byline_ele = soup.find(class_="byline")
        if byline_ele and subheading_ele:
            byline_ele.insert_before(subheading_ele.extract())

        # strip extraneous stuff from author bio
        for bio in soup.find_all(class_="author-bio"):
            for dec_ele in bio.find_all("br"):
                dec_ele.decompose()
            for unwrap_ele in bio.find_all("p"):
                unwrap_ele.unwrap()

        # remove extraneous hr
        for hr in soup.select(".after-post-content hr"):
            hr.decompose()
        return soup

    def _article_from_card(self, card, page_url):
        """Build the article dict for one ``article-card`` element.

        Returns ``None`` when the card has no title element or no link, so
        the caller can skip it instead of failing the whole download.
        """
        title_ele = card.find(class_="ac-title")
        link_ele = card.find("a", href=True)
        if title_ele is None or link_ele is None:
            return None

        # tag_to_string() is given the raw find() result, which may be None
        # when a card lacks that element; empty pieces are dropped below so
        # the description never starts or ends with a stray space.
        description_parts = [
            self.tag_to_string(card.find(class_="ac-tax")).strip(),
            self.tag_to_string(card.find(class_="ac-subtitle")).strip(),
            self.tag_to_string(card.find(class_="byline")).strip().strip(",").strip(),
        ]
        return {
            # resolve relative links against the page they were found on
            "url": urljoin(page_url, link_ele["href"]),
            "title": self.tag_to_string(title_ele),
            "description": " ".join(part for part in description_parts if part),
        }

    def parse_index(self):
        """Return the issue contents as a list of ``(section, articles)`` tuples.

        Also sets ``self.timefmt`` from the issue heading and
        ``self.cover_url`` from the cover image when those are present on
        the issue page. Aborts recipe processing with a readable message
        when the current issue cannot be located.
        """
        if _issue_url:
            curr_issue_url = _issue_url
        else:
            issues_soup = self.index_to_soup("https://harpers.org/issues/")
            curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
            if curr_issue_a_ele is None or not curr_issue_a_ele.get("href"):
                self.abort_recipe_processing(
                    "Unable to find the current issue on https://harpers.org/issues/"
                )
            curr_issue_url = urljoin(self.base_url, curr_issue_a_ele["href"])

        soup = self.index_to_soup(curr_issue_url)

        # Only override the default timefmt when the page has a heading.
        issue_heading = self.tag_to_string(
            soup.find("h1", class_="issue-heading")
        ).strip()
        if issue_heading:
            self.timefmt = f" [{issue_heading}]"

        # A missing cover must not abort the download; leave cover_url unset.
        cover_img = soup.find("img", class_="cover-img")
        cover_src = cover_img.get("src") if cover_img is not None else None
        if cover_src:
            self.cover_url = urljoin(curr_issue_url, cover_src)

        articles = {}
        for section_name in ("features", "readings", "articles"):
            section = soup.find("section", class_=f"issue-{section_name}")
            if section is None:
                continue
            for card in section.find_all("div", class_="article-card"):
                article = self._article_from_card(card, curr_issue_url)
                if article is not None:
                    articles.setdefault(section_name.title(), []).append(article)
        return list(articles.items())