Update Harpers

2025-08-30 23:00:21 -04:00 · 2024-06-26 21:09:21 +05:30 · 2024-06-26 21:09:21 +05:30 · eb7377b144
commit eb7377b144
parent 0f49b15edf
4 changed files with 70 additions and 237 deletions
--- a/recipes/harpers.recipe
+++ b/recipes/harpers.recipe
@ -1,88 +1,91 @@
-__license__ = 'GPL v3'
-__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 harpers.org
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre import browser


 class Harpers(BasicNewsRecipe):
-    title = u"Harper's Magazine"
-    __author__ = u'Darko Miletic'
-    language = 'en'
-    description = u"Harper's Magazine: Founded June 1850."
+    title = 'Harper’s Magazine'
+    __author__ = 'unkn0wn'
+    language = 'en_US'
+    description = (
+        'Harper’s Magazine, the oldest general-interest monthly in America, explores the issues that drive our '
+        'national conversation, through long-form narrative journalism and essays, and such celebrated '
+        'features as the iconic Harper’s Index. With its emphasis on fine writing and original thought '
+        'Harper’s provides readers with a unique perspective on politics, society, the environment, and culture.'
+    )
    publisher = "Harper's Magazine "
    category = 'news, politics, USA'
-    oldest_article = 30
-    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
+    masthead_url = 'https://harpers.org/wp-content/themes/timber/assets/img/logo.svg'
+    ignore_duplicate_articles = {'url'}
+    encoding = 'utf-8'
+    remove_attributes = ['style', 'height', 'width']

    keep_only_tags = [
-        dict(
-            class_=[
-                "article-content",
-                "template-index-archive",  # harper's index
-            ]
-        )
+        dict(attrs={'class':lambda x: x and (
+            'title-header desktop ' in x or 'col-md-8 col-xl-9' in x
+        )}),
+        classes('article-hero-img entry-content pdf-only')
    ]
    remove_tags = [
-        dict(
-            class_=[
-                "component-newsletter-signup",
-                "sidebar",
-                "header-meta",
-                "component-from-author",
-                "from-issue",
-                "d-none",
-                "COA_roles_fix_space",
-                "section-tags",
-                "aria-font-adjusts",
-                "component-share-buttons",
-                "index-footer",
-                "index-prev-link",
-                "comma",
-            ]
-        ),
-        # for harper's index
-        dict(
-            class_=[
-                "aria-font-adjusts",
-                "component-share-buttons",
-                "index-footer",
-                "index-prev-link",
-            ]
-        ),
+        classes('header-controls')
    ]
    remove_attributes = ["style", "width", "height"]

-    extra_css = """
-    h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
-    .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
-    .byline { margin-bottom: 1rem }
-    .article-hero-img img, .flex-section-image img, .wp-caption img {
-        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
-        box-sizing: border-box;
-    }
-    .wp-caption-text { font-size: small; margin-top: 0.3rem; }
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .category, .from-issue { font-size:small; color:#404040; }
+        .wp-caption-text { font-size:small; text-align:center; }
+        .subheading { font-style:italic; color:#202020; }
+        .byline { font-size:small; }
+        em, blockquote { color:#202020; }
+    '''

-    .author-bio { margin-top: 2.5rem; font-style: italic; }
-    .author-bio em { font-weight: bold; }
+    def preprocess_html(self, soup):
+        sub = soup.find(attrs={'class':'subheading'})
+        if sub:
+            sub.name = 'p'
+        for img in soup.findAll('img', attrs={'srcset':True}):
+            for src in img['srcset'].split(','):
+                if '768w' in src:
+                    img['src'] = img['src'].split()[0]
+        return soup

-    .index-item { font-size: large; margin: 1rem 0; }
-    .index-statement > p { display: inline-block; margin: 0.5rem 0; }
-    .index-statement > span { display: inline-block; }
-    .index-statement .index-tooltip { font-size: small; }
-    """
-
-    def get_cover_url(self):
+    def parse_index(self):
        issues_soup = self.index_to_soup("https://harpers.org/issues/")
-        curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
-        if curr_issue_a_ele.find("img"):
-            return curr_issue_a_ele.img["src"]
+        a_ele = issues_soup.select_one("div.issue-card a")
+        self.timefmt = ' [' + self.tag_to_string(a_ele.find(attrs={'class':'issue-title'})) + ']'
+        url = a_ele['href']
+        soup = self.index_to_soup(url)
+        cov_div = soup.find('div', attrs={'class':'issue-cover'})
+        if cov_div:
+            self.cover_url = cov_div.find('img', attrs={'class':'cover-img'})['src']
+        ans = []
+        for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(url + '/')}):
+            if not a.find('img') and a.find(['h1', 'h2', 'h3', 'h4']):
+                url = a['href']
+                title = self.tag_to_string(a)
+                desc = ''
+                div = a.findParent('div').find('div', attrs={'class':'byline'})
+                if div:
+                    desc = self.tag_to_string(div)
+                self.log('\t', title, '\n\t', desc, '\n\t', url)
+                ans.append({'title': title, 'description': desc, 'url': url})
+        return [('Articles', ans)]

-    feeds = [(u"Harper's Magazine", u'https://harpers.org/feed/')]
+    # Harpers changes the content it delivers based on cookies, so the
+    # following ensures that we send no cookies
+    def get_browser(self, *args, **kwargs):
+        return self
+
+    def clone_browser(self, *args, **kwargs):
+        return self.get_browser()
+
+    def open_novisit(self, *args, **kwargs):
+        br = browser()
+        return br.open_novisit(*args, **kwargs)
+
+    open = open_novisit
--- a/recipes/harpers_full.recipe
+++ b/recipes/harpers_full.recipe
@ -1,170 +0,0 @@
-# -*- mode: python -*-
-# -*- coding: utf-8 -*-
-# vi: set fenc=utf-8 ft=python :
-# kate: encoding utf-8; syntax python;
-
-__license__ = "GPL v3"
-__copyright__ = "2008-2019, Darko Miletic <darko.miletic at gmail.com>"
-"""
-harpers.org - printed issue articles
-This recipe only get's article's published in text format
-images and pdf's are ignored
-"""
-
-from calibre import browser
-from calibre.web.feeds.news import BasicNewsRecipe
-
-# overwrite this with a custom issue url, e.g. https://harpers.org/archive/2023/01/
-_issue_url = ""
-
-
-class Harpers_full(BasicNewsRecipe):
-    title = "Harper's Magazine - articles from printed edition"
-    __author__ = "Darko Miletic, updated by ping"
-    description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."  # noqa
-    publisher = "Harpers's"
-    category = "news, politics, USA"
-    oldest_article = 31
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    use_embedded_content = False
-    language = "en"
-    encoding = "utf8"
-    publication_type = "magazine"
-    requires_version = (5, 0, 0)  # py3
-    ignore_duplicate_articles = {"url"}
-    base_url = "https://harpers.org"
-
-    keep_only_tags = [
-        dict(
-            class_=[
-                "article-content",
-                "template-index-archive",  # harper's index
-            ]
-        )
-    ]
-    remove_tags = [
-        dict(
-            class_=[
-                "component-newsletter-signup",
-                "sidebar",
-                "header-meta",
-                "component-from-author",
-                "from-issue",
-                "d-none",
-                "COA_roles_fix_space",
-                "section-tags",
-                "aria-font-adjusts",
-                "component-share-buttons",
-                "index-footer",
-                "index-prev-link",
-                "comma",
-            ]
-        ),
-        # for harper's index
-        dict(
-            class_=[
-                "aria-font-adjusts",
-                "component-share-buttons",
-                "index-footer",
-                "index-prev-link",
-            ]
-        ),
-    ]
-    remove_attributes = ["style", "width", "height"]
-
-    extra_css = """
-    h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
-    .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
-    .byline { margin-bottom: 1rem }
-    .article-hero-img img, .flex-section-image img, .wp-caption img {
-        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
-        box-sizing: border-box;
-    }
-    .wp-caption-text { font-size: small; margin-top: 0.3rem; }
-
-    .author-bio { margin-top: 2.5rem; font-style: italic; }
-    .author-bio em { font-weight: bold; }
-
-    .index-item { font-size: large; margin: 1rem 0; }
-    .index-statement > p { display: inline-block; margin: 0.5rem 0; }
-    .index-statement > span { display: inline-block; }
-    .index-statement .index-tooltip { font-size: small; }
-    """
-
-    # Send cookie-less requests to get full article
-    def get_browser(self, *args, **kwargs):
-        return self
-
-    def clone_browser(self, *args, **kwargs):
-        return self.get_browser()
-
-    def open_novisit(self, *args, **kwargs):
-        br = browser()
-        return br.open_novisit(*args, **kwargs)
-
-    open = open_novisit
-
-    def preprocess_html(self, soup):
-        # General UI tweaks
-        # move subheading to before byline (instead of where it is now, after)
-        subheading_ele = soup.find(class_="subheading")
-        byline_ele = soup.find(class_="byline")
-        if byline_ele and subheading_ele:
-            byline_ele.insert_before(subheading_ele.extract())
-
-        # strip extraneous stuff from author bio
-        for bio in soup.find_all(class_="author-bio"):
-            for dec_ele in bio.find_all("br"):
-                dec_ele.decompose()
-            for unwrap_ele in bio.find_all("p"):
-                unwrap_ele.unwrap()
-
-        # remove extraneous hr
-        for hr in soup.select(".after-post-content hr"):
-            hr.decompose()
-        return soup
-
-    def parse_index(self):
-        if not _issue_url:
-            issues_soup = self.index_to_soup("https://harpers.org/issues/")
-            curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
-            if curr_issue_a_ele.find("img"):
-                self.cover_url = curr_issue_a_ele.img["src"]
-        else:
-            curr_issue_url = _issue_url
-
-        soup = self.index_to_soup(curr_issue_url)
-        self.timefmt = (
-            f' [{self.tag_to_string(soup.find("h1", class_="issue-heading")).strip()}]'
-        )
-        self.cover_url = soup.find("img", class_="cover-img")["src"]
-
-        articles = {}
-        for section_name in ("features", "readings", "articles"):
-            section = soup.find("section", class_=f"issue-{section_name}")
-            if not section:
-                continue
-            for card in section.find_all("div", class_="article-card"):
-                title_ele = card.find(class_="ac-title")
-                if not title_ele:
-                    continue
-                article_url = card.find("a")["href"]
-                article_title = self.tag_to_string(title_ele)
-                article_description = (
-                    f'{self.tag_to_string(card.find(class_="ac-tax"))} '
-                    f'{self.tag_to_string(card.find(class_="ac-subtitle"))}'
-                ).strip()
-                byline = card.find(class_="byline")
-                if byline:
-                    article_description += (
-                        f' {self.tag_to_string(byline).strip().strip(",")}'
-                    )
-                articles.setdefault(section_name.title(), []).append(
-                    {
-                        "url": article_url,
-                        "title": article_title,
-                        "description": article_description,
-                    }
-                )
-        return articles.items()
--- a/recipes/icons/harpers.png
+++ b/recipes/icons/harpers.png
--- a/recipes/icons/harpers_full.png
+++ b/recipes/icons/harpers_full.png