mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
105 lines
3.9 KiB
Python
105 lines
3.9 KiB
Python
'''
harpers.org
'''
|
||
from calibre import browser
|
||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||
|
||
|
||
class Harpers(BasicNewsRecipe):
    '''
    Recipe that downloads one issue of Harper's Magazine from harpers.org.

    By default the most recent issue listed on https://harpers.org/issues/ is
    fetched; the recipe-specific ``date`` option (YYYY/MM) selects an archived
    issue instead.
    '''

    title = 'Harper’s Magazine'
    __author__ = 'unkn0wn'
    language = 'en_US'
    description = (
        'Harper’s Magazine, the oldest general-interest monthly in America, explores the issues that drive our '
        'national conversation, through long-form narrative journalism and essays, and such celebrated '
        'features as the iconic Harper’s Index. With its emphasis on fine writing and original thought '
        'Harper’s provides readers with a unique perspective on politics, society, the environment, and culture.'
    )
    publisher = "Harper's Magazine "
    category = 'news, politics, USA'
    no_stylesheets = True
    use_embedded_content = False
    masthead_url = 'https://harpers.org/wp-content/themes/timber/assets/img/logo.svg'
    ignore_duplicate_articles = {'url'}
    encoding = 'utf-8'
    # Defined once; the original assigned this attribute twice with the same
    # three values.
    remove_attributes = ['style', 'height', 'width']

    keep_only_tags = [
        dict(attrs={'class': lambda x: x and (
            'title-header desktop ' in x or 'col-md-8 col-xl-9' in x
        )}),
        classes('article-hero-img entry-content pdf-only')
    ]
    remove_tags = [
        classes('header-controls')
    ]

    extra_css = '''
        img {display:block; margin:0 auto;}
        .category, .from-issue { font-size:small; color:#404040; }
        .wp-caption-text { font-size:small; text-align:center; }
        .subheading { font-style:italic; color:#202020; }
        .byline { font-size:small; }
        em, blockquote { color:#202020; }
    '''

    def preprocess_html(self, soup):
        '''
        Adjust an article's parsed HTML before conversion.

        The first element with class ``subheading`` is renamed to ``<p>``, and
        every ``<img>`` carrying a ``srcset`` gets its ``src`` pointed at the
        candidate whose width descriptor is exactly ``768w`` (when present).

        :param soup: parsed article document
        :return: the same ``soup`` object, modified in place
        '''
        sub = soup.find(attrs={'class': 'subheading'})
        if sub:
            sub.name = 'p'
        for img in soup.findAll('img', attrs={'srcset': True}):
            for candidate in img['srcset'].split(','):
                parts = candidate.split()
                # Compare the descriptor itself rather than searching the whole
                # candidate, so '1768w' or a URL containing '768w' cannot match.
                if len(parts) >= 2 and parts[-1] == '768w':
                    img['src'] = parts[0]
                    break
        return soup

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY/MM format)',
            'long': 'For example, 2023/08',
        }
    }

    def parse_index(self):
        '''
        Build the article list for the requested issue, or the latest one.

        Sets ``self.timefmt`` to the issue label and, when the issue page
        exposes a cover image, ``self.cover_url``.

        :return: ``[('Articles', articles)]`` where every article is a dict
                 with the keys ``title``, ``description`` and ``url``
        :raises ValueError: if no edition was requested and the latest issue
                            link cannot be located on the issues page
        '''
        edition = self.recipe_specific_options.get('date')
        # Only a plain string counts as a requested edition; the class-level
        # entry for 'date' is the option's description dict, not a value.
        if edition and isinstance(edition, str):
            issue_url = 'https://harpers.org/archive/' + edition
            self.timefmt = ' [' + edition + ']'
        else:
            # The issues index is needed only to discover the latest issue, so
            # it is not fetched when a specific edition was requested.
            issues_soup = self.index_to_soup("https://harpers.org/issues/")
            a_ele = issues_soup.select_one("div.issue-card a")
            if a_ele is None:
                raise ValueError('Could not find the latest issue on https://harpers.org/issues/')
            self.timefmt = ' [' + self.tag_to_string(a_ele.find(attrs={'class': 'issue-title'})) + ']'
            issue_url = a_ele['href']

        soup = self.index_to_soup(issue_url)
        cov_div = soup.find('div', attrs={'class': 'issue-cover'})
        if cov_div:
            cover_img = cov_div.find('img', attrs={'class': 'cover-img'})
            # Leave the cover unset instead of failing when the image or its
            # src attribute is missing.
            if cover_img is not None and cover_img.get('src'):
                self.cover_url = cover_img['src']

        # Article links live below the issue URL. Normalising the trailing
        # slash keeps the prefix from becoming '...//' and matching nothing.
        prefix = issue_url.rstrip('/') + '/'
        ans = []
        for a in soup.findAll('a', attrs={'href': lambda x: x and x.startswith(prefix)}):
            # Keep only heading links; skip thumbnail links to the same article.
            if a.find('img') or not a.find(['h1', 'h2', 'h3', 'h4']):
                continue
            article_url = a['href']
            title = self.tag_to_string(a).strip()
            desc = ''
            parent = a.findParent('div')
            div = parent.find('div', attrs={'class': 'byline'}) if parent is not None else None
            if div:
                desc = self.tag_to_string(div).strip()
            # NOTE(review): the log drops the last character of the byline;
            # kept as in the original - confirm whether that is intentional.
            self.log(' ', title, '\n\t', desc[:-1], '\n\t', article_url)
            ans.append({'title': title, 'description': desc, 'url': article_url})
        return [('Articles', ans)]

    # Harpers changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies
    def get_browser(self, *args, **kwargs):
        '''Return the recipe itself, which stands in for the browser object.'''
        return self

    def clone_browser(self, *args, **kwargs):
        '''Return the same stand-in object that ``get_browser`` returns.'''
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        '''
        Open a URL with a newly created browser.

        A new ``browser()`` is created for every call and all arguments are
        forwarded to its ``open_novisit``.
        '''
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit
|