mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Harpers
This commit is contained in:
parent
0f49b15edf
commit
eb7377b144
@ -1,88 +1,91 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
|
||||
'''
|
||||
harpers.org
|
||||
'''
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
from calibre import browser
|
||||
|
||||
|
||||
class Harpers(BasicNewsRecipe):
|
||||
title = u"Harper's Magazine"
|
||||
__author__ = u'Darko Miletic'
|
||||
language = 'en'
|
||||
description = u"Harper's Magazine: Founded June 1850."
|
||||
title = 'Harper’s Magazine'
|
||||
__author__ = 'unkn0wn'
|
||||
language = 'en_US'
|
||||
description = (
|
||||
'Harper’s Magazine, the oldest general-interest monthly in America, explores the issues that drive our '
|
||||
'national conversation, through long-form narrative journalism and essays, and such celebrated '
|
||||
'features as the iconic Harper’s Index. With its emphasis on fine writing and original thought '
|
||||
'Harper’s provides readers with a unique perspective on politics, society, the environment, and culture.'
|
||||
)
|
||||
publisher = "Harper's Magazine "
|
||||
category = 'news, politics, USA'
|
||||
oldest_article = 30
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
|
||||
conversion_options = {
|
||||
'comment': description, 'tags': category, 'publisher': publisher, 'language': language
|
||||
}
|
||||
masthead_url = 'https://harpers.org/wp-content/themes/timber/assets/img/logo.svg'
|
||||
ignore_duplicate_articles = {'url'}
|
||||
encoding = 'utf-8'
|
||||
remove_attributes = ['style', 'height', 'width']
|
||||
|
||||
keep_only_tags = [
|
||||
dict(
|
||||
class_=[
|
||||
"article-content",
|
||||
"template-index-archive", # harper's index
|
||||
]
|
||||
)
|
||||
dict(attrs={'class':lambda x: x and (
|
||||
'title-header desktop ' in x or 'col-md-8 col-xl-9' in x
|
||||
)}),
|
||||
classes('article-hero-img entry-content pdf-only')
|
||||
]
|
||||
remove_tags = [
|
||||
dict(
|
||||
class_=[
|
||||
"component-newsletter-signup",
|
||||
"sidebar",
|
||||
"header-meta",
|
||||
"component-from-author",
|
||||
"from-issue",
|
||||
"d-none",
|
||||
"COA_roles_fix_space",
|
||||
"section-tags",
|
||||
"aria-font-adjusts",
|
||||
"component-share-buttons",
|
||||
"index-footer",
|
||||
"index-prev-link",
|
||||
"comma",
|
||||
]
|
||||
),
|
||||
# for harper's index
|
||||
dict(
|
||||
class_=[
|
||||
"aria-font-adjusts",
|
||||
"component-share-buttons",
|
||||
"index-footer",
|
||||
"index-prev-link",
|
||||
]
|
||||
),
|
||||
classes('header-controls')
|
||||
]
|
||||
remove_attributes = ["style", "width", "height"]
|
||||
|
||||
extra_css = """
|
||||
h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
|
||||
.subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
|
||||
.byline { margin-bottom: 1rem }
|
||||
.article-hero-img img, .flex-section-image img, .wp-caption img {
|
||||
display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.wp-caption-text { font-size: small; margin-top: 0.3rem; }
|
||||
extra_css = '''
|
||||
img {display:block; margin:0 auto;}
|
||||
.category, .from-issue { font-size:small; color:#404040; }
|
||||
.wp-caption-text { font-size:small; text-align:center; }
|
||||
.subheading { font-style:italic; color:#202020; }
|
||||
.byline { font-size:small; }
|
||||
em, blockquote { color:#202020; }
|
||||
'''
|
||||
|
||||
.author-bio { margin-top: 2.5rem; font-style: italic; }
|
||||
.author-bio em { font-weight: bold; }
|
||||
def preprocess_html(self, soup):
|
||||
sub = soup.find(attrs={'class':'subheading'})
|
||||
if sub:
|
||||
sub.name = 'p'
|
||||
for img in soup.findAll('img', attrs={'srcset':True}):
|
||||
for src in img['srcset'].split(','):
|
||||
if '768w' in src:
|
||||
img['src'] = img['src'].split()[0]
|
||||
return soup
|
||||
|
||||
.index-item { font-size: large; margin: 1rem 0; }
|
||||
.index-statement > p { display: inline-block; margin: 0.5rem 0; }
|
||||
.index-statement > span { display: inline-block; }
|
||||
.index-statement .index-tooltip { font-size: small; }
|
||||
"""
|
||||
|
||||
def get_cover_url(self):
|
||||
def parse_index(self):
|
||||
issues_soup = self.index_to_soup("https://harpers.org/issues/")
|
||||
curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
|
||||
if curr_issue_a_ele.find("img"):
|
||||
return curr_issue_a_ele.img["src"]
|
||||
a_ele = issues_soup.select_one("div.issue-card a")
|
||||
self.timefmt = ' [' + self.tag_to_string(a_ele.find(attrs={'class':'issue-title'})) + ']'
|
||||
url = a_ele['href']
|
||||
soup = self.index_to_soup(url)
|
||||
cov_div = soup.find('div', attrs={'class':'issue-cover'})
|
||||
if cov_div:
|
||||
self.cover_url = cov_div.find('img', attrs={'class':'cover-img'})['src']
|
||||
ans = []
|
||||
for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(url + '/')}):
|
||||
if not a.find('img') and a.find(['h1', 'h2', 'h3', 'h4']):
|
||||
url = a['href']
|
||||
title = self.tag_to_string(a)
|
||||
desc = ''
|
||||
div = a.findParent('div').find('div', attrs={'class':'byline'})
|
||||
if div:
|
||||
desc = self.tag_to_string(div)
|
||||
self.log('\t', title, '\n\t', desc, '\n\t', url)
|
||||
ans.append({'title': title, 'description': desc, 'url': url})
|
||||
return [('Articles', ans)]
|
||||
|
||||
feeds = [(u"Harper's Magazine", u'https://harpers.org/feed/')]
|
||||
# Harpers changes the content it delivers based on cookies, so the
|
||||
# following ensures that we send no cookies
|
||||
def get_browser(self, *args, **kwargs):
|
||||
return self
|
||||
|
||||
def clone_browser(self, *args, **kwargs):
|
||||
return self.get_browser()
|
||||
|
||||
def open_novisit(self, *args, **kwargs):
|
||||
br = browser()
|
||||
return br.open_novisit(*args, **kwargs)
|
||||
|
||||
open = open_novisit
|
||||
|
@ -1,170 +0,0 @@
|
||||
# -*- mode: python -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi: set fenc=utf-8 ft=python :
|
||||
# kate: encoding utf-8; syntax python;
|
||||
|
||||
__license__ = "GPL v3"
|
||||
__copyright__ = "2008-2019, Darko Miletic <darko.miletic at gmail.com>"
|
||||
"""
|
||||
harpers.org - printed issue articles
|
||||
This recipe only get's article's published in text format
|
||||
images and pdf's are ignored
|
||||
"""
|
||||
|
||||
from calibre import browser
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# overwrite this with a custom issue url, e.g. https://harpers.org/archive/2023/01/
|
||||
_issue_url = ""
|
||||
|
||||
|
||||
class Harpers_full(BasicNewsRecipe):
|
||||
title = "Harper's Magazine - articles from printed edition"
|
||||
__author__ = "Darko Miletic, updated by ping"
|
||||
description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." # noqa
|
||||
publisher = "Harpers's"
|
||||
category = "news, politics, USA"
|
||||
oldest_article = 31
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = "en"
|
||||
encoding = "utf8"
|
||||
publication_type = "magazine"
|
||||
requires_version = (5, 0, 0) # py3
|
||||
ignore_duplicate_articles = {"url"}
|
||||
base_url = "https://harpers.org"
|
||||
|
||||
keep_only_tags = [
|
||||
dict(
|
||||
class_=[
|
||||
"article-content",
|
||||
"template-index-archive", # harper's index
|
||||
]
|
||||
)
|
||||
]
|
||||
remove_tags = [
|
||||
dict(
|
||||
class_=[
|
||||
"component-newsletter-signup",
|
||||
"sidebar",
|
||||
"header-meta",
|
||||
"component-from-author",
|
||||
"from-issue",
|
||||
"d-none",
|
||||
"COA_roles_fix_space",
|
||||
"section-tags",
|
||||
"aria-font-adjusts",
|
||||
"component-share-buttons",
|
||||
"index-footer",
|
||||
"index-prev-link",
|
||||
"comma",
|
||||
]
|
||||
),
|
||||
# for harper's index
|
||||
dict(
|
||||
class_=[
|
||||
"aria-font-adjusts",
|
||||
"component-share-buttons",
|
||||
"index-footer",
|
||||
"index-prev-link",
|
||||
]
|
||||
),
|
||||
]
|
||||
remove_attributes = ["style", "width", "height"]
|
||||
|
||||
extra_css = """
|
||||
h1.article-title { font-size: x-large; margin-bottom: 0.4rem; }
|
||||
.subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; }
|
||||
.byline { margin-bottom: 1rem }
|
||||
.article-hero-img img, .flex-section-image img, .wp-caption img {
|
||||
display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.wp-caption-text { font-size: small; margin-top: 0.3rem; }
|
||||
|
||||
.author-bio { margin-top: 2.5rem; font-style: italic; }
|
||||
.author-bio em { font-weight: bold; }
|
||||
|
||||
.index-item { font-size: large; margin: 1rem 0; }
|
||||
.index-statement > p { display: inline-block; margin: 0.5rem 0; }
|
||||
.index-statement > span { display: inline-block; }
|
||||
.index-statement .index-tooltip { font-size: small; }
|
||||
"""
|
||||
|
||||
# Send cookie-less requests to get full article
|
||||
def get_browser(self, *args, **kwargs):
|
||||
return self
|
||||
|
||||
def clone_browser(self, *args, **kwargs):
|
||||
return self.get_browser()
|
||||
|
||||
def open_novisit(self, *args, **kwargs):
|
||||
br = browser()
|
||||
return br.open_novisit(*args, **kwargs)
|
||||
|
||||
open = open_novisit
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
# General UI tweaks
|
||||
# move subheading to before byline (instead of where it is now, after)
|
||||
subheading_ele = soup.find(class_="subheading")
|
||||
byline_ele = soup.find(class_="byline")
|
||||
if byline_ele and subheading_ele:
|
||||
byline_ele.insert_before(subheading_ele.extract())
|
||||
|
||||
# strip extraneous stuff from author bio
|
||||
for bio in soup.find_all(class_="author-bio"):
|
||||
for dec_ele in bio.find_all("br"):
|
||||
dec_ele.decompose()
|
||||
for unwrap_ele in bio.find_all("p"):
|
||||
unwrap_ele.unwrap()
|
||||
|
||||
# remove extraneous hr
|
||||
for hr in soup.select(".after-post-content hr"):
|
||||
hr.decompose()
|
||||
return soup
|
||||
|
||||
def parse_index(self):
|
||||
if not _issue_url:
|
||||
issues_soup = self.index_to_soup("https://harpers.org/issues/")
|
||||
curr_issue_a_ele = issues_soup.select_one("div.issue-card a")
|
||||
if curr_issue_a_ele.find("img"):
|
||||
self.cover_url = curr_issue_a_ele.img["src"]
|
||||
else:
|
||||
curr_issue_url = _issue_url
|
||||
|
||||
soup = self.index_to_soup(curr_issue_url)
|
||||
self.timefmt = (
|
||||
f' [{self.tag_to_string(soup.find("h1", class_="issue-heading")).strip()}]'
|
||||
)
|
||||
self.cover_url = soup.find("img", class_="cover-img")["src"]
|
||||
|
||||
articles = {}
|
||||
for section_name in ("features", "readings", "articles"):
|
||||
section = soup.find("section", class_=f"issue-{section_name}")
|
||||
if not section:
|
||||
continue
|
||||
for card in section.find_all("div", class_="article-card"):
|
||||
title_ele = card.find(class_="ac-title")
|
||||
if not title_ele:
|
||||
continue
|
||||
article_url = card.find("a")["href"]
|
||||
article_title = self.tag_to_string(title_ele)
|
||||
article_description = (
|
||||
f'{self.tag_to_string(card.find(class_="ac-tax"))} '
|
||||
f'{self.tag_to_string(card.find(class_="ac-subtitle"))}'
|
||||
).strip()
|
||||
byline = card.find(class_="byline")
|
||||
if byline:
|
||||
article_description += (
|
||||
f' {self.tag_to_string(byline).strip().strip(",")}'
|
||||
)
|
||||
articles.setdefault(section_name.title(), []).append(
|
||||
{
|
||||
"url": article_url,
|
||||
"title": article_title,
|
||||
"description": article_description,
|
||||
}
|
||||
)
|
||||
return articles.items()
|
Binary file not shown.
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 356 B |
Binary file not shown.
Before Width: | Height: | Size: 1.3 KiB |
Loading…
x
Reference in New Issue
Block a user