Merge branch 'patch-harpers' of https://github.com/ping/calibre

This commit is contained in:
Kovid Goyal 2023-06-10 09:24:52 +05:30
commit 68f4b773f1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -3,107 +3,169 @@
# vi: set fenc=utf-8 ft=python : # vi: set fenc=utf-8 ft=python :
# kate: encoding utf-8; syntax python; # kate: encoding utf-8; syntax python;
__license__ = 'GPL v3' __license__ = "GPL v3"
__copyright__ = '2008-2019, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = "2008-2019, Darko Miletic <darko.miletic at gmail.com>"
''' """
harpers.org - paid subscription/ printed issue articles harpers.org - printed issue articles
This recipe only gets articles published in text format This recipe only gets articles published in text format
images and PDFs are ignored images and PDFs are ignored
If you have institutional subscription based on access IP you do not need to enter """
anything in username/password fields
'''
import time from urllib.parse import urljoin
try:
from urllib.parse import urlencode from calibre import browser
except ImportError:
from urllib import urlencode
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
# overwrite this with a custom issue url, e.g. https://harpers.org/archive/2023/01/
def classes(classes): _issue_url = ""
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
class Harpers_full(BasicNewsRecipe): class Harpers_full(BasicNewsRecipe):
title = "Harper's Magazine - articles from printed edition" title = "Harper's Magazine - articles from printed edition"
__author__ = 'Darko Miletic' __author__ = "Darko Miletic, updated by ping"
description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." # noqa description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." # noqa
publisher = "Harpers's" publisher = "Harpers's"
category = 'news, politics, USA' category = "news, politics, USA"
oldest_article = 30 oldest_article = 31
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
delay = 1 language = "en"
language = 'en' encoding = "utf8"
encoding = 'utf8' publication_type = "magazine"
needs_subscription = 'optional' requires_version = (5, 0, 0) # py3
publication_type = 'magazine' ignore_duplicate_articles = {"url"}
LOGIN = 'https://harpers.org/wp-admin/admin-ajax.php' base_url = "https://harpers.org"
keep_only_tags = [ keep_only_tags = [
classes('article-header-text entry-content'), dict(
class_=[
"article-content",
"template-index-archive", # harper's index
]
)
] ]
remove_tags = [ remove_tags = [
classes('related-issue-tout section-tags component-from-author component-share-buttons') dict(
class_=[
"component-newsletter-signup",
"sidebar",
"header-meta",
"component-from-author",
"from-issue",
"d-none",
"COA_roles_fix_space",
"section-tags",
"aria-font-adjusts",
"component-share-buttons",
"index-footer",
"index-prev-link",
"comma",
] ]
),
# for harper's index
dict(
class_=[
"aria-font-adjusts",
"component-share-buttons",
"index-footer",
"index-prev-link",
]
),
]
remove_attributes = ["style", "width", "height"]
def get_browser(self): extra_css = """
br = BasicNewsRecipe.get_browser(self) h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
br.open('https://harpers.org/') .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
if self.username is not None and self.password is not None: .byline { margin-bottom: 1rem }
tt = time.localtime() * 1000 .article-hero-img img, .flex-section-image img, .wp-caption img {
data = urlencode({'action': 'cds_auth_user', 'm': self.username, 'p': self.password, 'rt': 'https://harpers.org/', 'tt': tt display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
}) box-sizing: border-box;
br.open(self.LOGIN, data) }
return br .wp-caption-text { font-size: small; margin-top: 0.3rem; }
.author-bio { margin-top: 2.5rem; font-style: italic; }
.author-bio em { font-weight: bold; }
.index-item { font-size: large; margin: 1rem 0; }
.index-statement > p { display: inline-block; margin: 0.5rem 0; }
.index-statement > span { display: inline-block; }
.index-statement .index-tooltip { font-size: small; }
"""
# Send cookie-less requests to get full article
def get_browser(self, *args, **kwargs):
    """Return the recipe itself to act as the browser object.

    All positional and keyword arguments are accepted and ignored.
    The recipe defines its own ``open``/``open_novisit``, which create a
    fresh browser for every request, so no cookies carry over between
    requests.
    """
    # NOTE(review): presumably calibre only calls open()/open_novisit()
    # on the returned object -- confirm against BasicNewsRecipe.
    return self
def clone_browser(self, *args, **kwargs):
    """Return the object produced by ``self.get_browser()``.

    The arguments are accepted and ignored; nothing is copied here, the
    call is simply forwarded to ``get_browser`` with no arguments.
    """
    return self.get_browser()
def open_novisit(self, *args, **kwargs):
    """Open a URL with a brand-new browser instance.

    A new ``browser()`` is created on every call and all arguments are
    passed straight through to its ``open_novisit``; the response it
    returns is handed back unchanged. Using a fresh browser each time
    means no state from earlier requests is reused.
    """
    br = browser()
    return br.open_novisit(*args, **kwargs)


# ``open`` is the same callable as ``open_novisit``.
open = open_novisit
def preprocess_html(self, soup):
    """Tidy a parsed article page and return the same soup object.

    Three in-place adjustments are made:

    * the first ``subheading`` element is moved to just before the first
      ``byline`` element (only when both are present);
    * inside every ``author-bio`` element, ``<br>`` tags are removed and
      ``<p>`` tags are unwrapped (their contents are kept);
    * every ``<hr>`` under ``.after-post-content`` is removed.

    :param soup: parsed document; assumed to be a BeautifulSoup tree
        (uses ``find``/``find_all``/``select``) -- TODO confirm.
    :return: the same ``soup`` object after modification.
    """
    # General UI tweaks
    # move subheading to before byline (instead of where it is now, after)
    subheading_ele = soup.find(class_="subheading")
    byline_ele = soup.find(class_="byline")
    if byline_ele and subheading_ele:
        # extract() detaches the subheading so it can be re-inserted
        byline_ele.insert_before(subheading_ele.extract())

    # strip extraneous stuff from author bio
    for bio in soup.find_all(class_="author-bio"):
        for dec_ele in bio.find_all("br"):
            dec_ele.decompose()
        for unwrap_ele in bio.find_all("p"):
            unwrap_ele.unwrap()

    # remove extraneous hr
    for hr in soup.select(".after-post-content hr"):
        hr.decompose()
    return soup
def parse_index(self): def parse_index(self):
# find current issue if not _issue_url:
soup = self.index_to_soup('https://harpers.org/') issues_soup = self.index_to_soup("https://harpers.org/issues/")
currentIssue_url = soup.find(attrs={'data-current-issue-url': True})['data-current-issue-url'] curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
self.log('Found issue at:', currentIssue_url) curr_issue_url = urljoin(self.base_url, curr_issue_a_ele["href"])
else:
curr_issue_url = _issue_url
# go to the current issue soup = self.index_to_soup(curr_issue_url)
soup = self.index_to_soup(currentIssue_url) self.timefmt = (
self.timefmt = u' [%s]' % self.tag_to_string(soup.find('a', href=currentIssue_url)) f' [{self.tag_to_string(soup.find("h1", class_="issue-heading")).strip()}]'
)
self.cover_url = soup.find("img", class_="cover-img")["src"]
# get cover articles = {}
self.cover_url = soup.find(**classes('past-issue')).find('img')['src'] for section_name in ("features", "readings", "articles"):
self.log('Found cover at:', self.cover_url) section = soup.find("section", class_=f"issue-{section_name}")
features = [] if not section:
continue
self.log('Features') for card in section.find_all("div", class_="article-card"):
for item in soup.find(**classes('issue-features')).findAll(**classes('article-card')): title_ele = card.find(class_="ac-title")
h = item.find(**classes('ac-title')) if not title_ele:
a = h.parent continue
url = a['href'] article_url = card.find("a")["href"]
title = self.tag_to_string(h).strip() article_title = self.tag_to_string(title_ele)
h = item.find(**classes('ac-subtitle')) article_description = (
if h is not None: f'{self.tag_to_string(card.find(class_="ac-tax"))} '
st = self.tag_to_string(h).strip() f'{self.tag_to_string(card.find(class_="ac-subtitle"))}'
if st: ).strip()
title += ': ' + st byline = card.find(class_="byline")
desc = '' if byline:
p = item.find(**classes('byline')) article_description += (
if p is not None: f' {self.tag_to_string(byline).strip().strip(",")}'
desc += self.tag_to_string(p) )
self.log(' ', title, 'at', url) articles.setdefault(section_name.title(), []).append(
features.append({'title': title, 'url': url, 'description': desc}) {
"url": article_url,
readings = [] "title": article_title,
self.log('Readings') "description": article_description,
for item in soup.find(**classes('issue-readings')).findAll(**classes('reading-item')): }
a = item.find('a', **classes('ac-title')) )
title = self.tag_to_string(a).strip() return articles.items()
url = a['href']
desc = ''
a = item.find(**classes('ac-author'))
if a is not None:
desc = self.tag_to_string(a)
self.log(' ', title, 'at', url)
readings.append({'title': title, 'url': url, 'description': desc})
return [('Features', features), ('Readings', readings)]