# calibre/recipes/jacobinmag.recipe
# 2017-04-12 20:55:57 +05:30
#
# 122 lines
# 4.2 KiB
# Python

# -*- mode: python -*-
# -*- coding: utf-8 -*-
# vi: set ft=python :
__license__ = 'GPL v3'
__copyright__ = '2017, Darko Miletic <darko.miletic at gmail.com>'
'''
www.jacobinmag.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Jacobinmag(BasicNewsRecipe):
    """calibre news recipe that downloads the current issue of Jacobin.

    The issue URL is read from the ``li.magazine`` link on the site's front
    page (see ``get_issue``); the articles are scraped from the
    ``section-articles`` blocks of that issue page (see ``parse_index``).
    """

    title = 'Jacobin'
    __author__ = 'Darko Miletic'
    description = 'Jacobin is a leading voice of the American left, offering socialist perspectives on politics, economics, and culture.'
    publisher = 'Jacobin'
    category = 'news, politics, USA'
    oldest_article = 65
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'magazine'
    # Login is optional: get_browser() only submits the login form when both
    # a username and a password are set.
    needs_subscription = 'optional'
    auto_cleanup = False
    issue_url = None
    PREFIX = 'https://www.jacobinmag.com'
    LOGIN = 'https://auth.jacobinmag.com/mini_profile?redirect=https%3A%2F%2Fwww.jacobinmag.com%2F'
    # Links to this URL are skipped when scraping the issue page.
    SUBSCRIBE_URL = PREFIX + '/subscribe/'
    masthead_url = 'https://www.jacobinmag.com/wp-content/themes/boukman/images/banner/type.svg'
    extra_css = (
        "\n"
        "body{font-family: Antwerp, 'Times New Roman', Times, serif}\n"
        "img{margin-top:1em; margin-bottom: 1em; display:block}\n"
        ".entry-dek,.entry-author{font-family: Hurme-No3, Futura, sans-serif}\n"
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    remove_tags = [
        dict(name=['meta', 'link']),
        dict(name='div', attrs={'class': 'entry-bottom'}),
        dict(name='div', attrs={'data-app': 'share_buttons'}),
    ]
    keep_only_tags = [dict(attrs={'class': ['entry-header', 'entry-content']})]

    def parse_index(self):
        """Build the feed list for the current issue.

        Returns a list holding a single ``(feed_title, articles)`` tuple,
        where every article is a dict with ``title``, ``url`` and
        ``description`` keys.  An empty list is returned when
        ``get_issue`` finds no issue link.

        Side effects: ``cover_url`` is set when the issue page contains an
        ``img`` with id ``front-cover``, and the ``series`` conversion
        option is set to ``'Jacobin'``.
        """
        issue_link = self.get_issue()
        if not issue_link:
            return []

        soup = self.index_to_soup(issue_link)

        # Find cover url
        cover = soup.find('img', attrs={'id': 'front-cover'})
        if cover:
            self.cover_url = self.image_url_processor(None, cover['src'])

        # Configure series
        self.conversion_options.update({'series': 'Jacobin'})

        # Use the issue title as feed title, falling back to a generic one
        feedtitle = 'Articles'
        title_tag = soup.find('div', attrs={'id': 'iss-title-name'})
        if title_tag:
            feedtitle = self.tag_to_string(title_tag)

        # Scrape article links
        articles = []
        for section in soup.findAll('div', attrs={'class': 'section-articles'}):
            for art in section.findAll('article'):
                article = self._parse_article(art)
                if article is not None:
                    articles.append(article)
        return [(feedtitle, articles)]

    def _parse_article(self, art):
        """Convert one ``<article>`` tag of the issue page to an article dict.

        Returns ``None`` for entries that must be skipped: those without an
        ``h3`` of class ``iss-hed``, those whose heading carries no usable
        link, and those linking to the subscribe page.
        """
        heading = art.find('h3', attrs={'class': 'iss-hed'})
        # A heading may lack an <a>, or the <a> may lack an href; skip such
        # entries instead of failing on a None subscript / missing key.
        link = heading.a if heading else None
        url = link.get('href') if link else None
        if not url or url == self.SUBSCRIBE_URL:
            return None

        desc = ''
        dek = art.find('p', attrs={'class': 'iss-dek'})
        if dek:
            desc = self.tag_to_string(dek)
        return {
            'title': self.tag_to_string(heading),
            'url': url,
            'description': desc
        }

    def get_browser(self):
        """Return the browser used for downloads, logged in if possible.

        The site front page is always opened first.  When both ``username``
        and ``password`` are set, the first form of the ``LOGIN`` page is
        filled in and submitted; if the response contains a ``div`` with id
        ``redirect-target``, the URL in its ``data-redirect`` attribute is
        opened as well.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.PREFIX)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['login.email'] = self.username
            br['login.password'] = self.password
            br.submit()
            soup = self.index_to_soup(br.response().read())
            target = soup.find('div', attrs={'id': 'redirect-target'})
            if target:
                br.open(target['data-redirect'])
        return br

    def get_issue(self):
        """Return the URL of the current issue, or ``None`` if not found.

        The URL is the ``href`` of the first link inside the ``li`` of
        class ``magazine`` on the site front page.
        """
        soup = self.index_to_soup(self.PREFIX)
        mag = soup.find('li', attrs={'class': 'magazine'})
        # Guard against an <li> without a link rather than subscripting None.
        link = mag.a if mag else None
        if link:
            return link.get('href')
        return None