# calibre/recipes/harpers_full.recipe

# -*- mode: python -*-
# -*- coding: utf-8 -*-
# vi: set fenc=utf-8 ft=python :
# kate: encoding utf-8; syntax python;

__license__ = 'GPL v3'
__copyright__ = '2008-2019, Darko Miletic <darko.miletic at gmail.com>'
'''
harpers.org - paid subscription / printed issue articles
This recipe only gets articles published in text format;
images and PDFs are ignored.
If you have an institutional subscription based on IP access, you do not need
to enter anything in the username/password fields.
'''

import time
try:
    from urllib.parse import urlencode
except ImportError:
    from urllib import urlencode
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build keyword arguments that match tags by CSS class name.

    ``classes`` is a space-separated string of class names. The result is a
    dict of the form ``{'attrs': {'class': predicate}}`` where ``predicate``
    takes a tag's ``class`` attribute value and returns a truthy value when
    the tag carries at least one of the requested names.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # A missing or empty class attribute is handed back unchanged
        # (falsy), mirroring short-circuit ``and`` semantics.
        if not value:
            return value
        return frozenset(value.split()) & wanted

    return {'attrs': {'class': has_wanted_class}}


class Harpers_full(BasicNewsRecipe):
    """Recipe that fetches the current issue of Harper's Magazine.

    The issue index is discovered from the harpers.org front page and split
    into two feeds, "Features" and "Readings". Login is optional: credentials
    are only submitted when both a username and a password are configured.
    """

    title = "Harper's Magazine - articles from printed edition"
    __author__ = 'Darko Miletic'
    description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."  # noqa
    publisher = "Harpers's"
    category = 'news, politics, USA'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    language = 'en'
    encoding = 'utf8'
    needs_subscription = 'optional'
    publication_type = 'magazine'
    # Endpoint that receives the form-encoded login request in get_browser().
    LOGIN = 'https://harpers.org/wp-admin/admin-ajax.php'
    # Only elements carrying one of these classes are kept from article pages.
    keep_only_tags = [
        classes('article-header-text entry-content'),
    ]
    # Elements carrying any of these classes are stripped from article pages.
    remove_tags = [
        classes('related-issue-tout section-tags component-from-author component-share-buttons')
    ]

    def get_browser(self):
        """Return a browser, logged in to harpers.org when credentials are set.

        The site's front page is always opened first (presumably to establish
        session cookies -- TODO confirm). When both ``self.username`` and
        ``self.password`` are set, a ``cds_auth_user`` request is posted to
        ``LOGIN``.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('https://harpers.org/')
        if self.username is not None and self.password is not None:
            # Milliseconds since the epoch. time.localtime() returns a
            # struct_time, so scaling *that* by 1000 never yields a numeric
            # timestamp; time.time() is the numeric clock.
            tt = int(time.time() * 1000)
            data = urlencode({
                'action': 'cds_auth_user',
                'm': self.username,
                'p': self.password,
                'rt': 'https://harpers.org/',
                'tt': tt,
            })
            br.open(self.LOGIN, data)
        return br

    def parse_index(self):
        """Build the article index for the current issue.

        Side effects: sets ``self.timefmt`` from the text of the link that
        points at the current issue, and ``self.cover_url`` from the first
        image inside the ``past-issue`` element.

        Returns a list of ``(feed_title, articles)`` tuples, where each
        article is a dict with ``title``, ``url`` and ``description`` keys.
        """
        # find current issue
        soup = self.index_to_soup('https://harpers.org/')
        currentIssue_url = soup.find(attrs={'data-current-issue-url': True})['data-current-issue-url']
        self.log('Found issue at:', currentIssue_url)

        # go to the current issue
        soup = self.index_to_soup(currentIssue_url)
        self.timefmt = u' [%s]' % self.tag_to_string(soup.find('a', href=currentIssue_url))

        # get cover
        self.cover_url = soup.find(**classes('past-issue')).find('img')['src']
        self.log('Found cover at:', self.cover_url)
        features = []

        self.log('Features')
        for item in soup.find(**classes('issue-features')).findAll(**classes('article-card')):
            h = item.find(**classes('ac-title'))
            # The link is taken from the title element's parent tag.
            a = h.parent
            url = a['href']
            title = self.tag_to_string(h).strip()
            # Append the subtitle, when present and non-empty, to the title.
            h = item.find(**classes('ac-subtitle'))
            if h is not None:
                st = self.tag_to_string(h).strip()
                if st:
                    title += ': ' + st
            # The byline, when present, serves as the article description.
            desc = ''
            p = item.find(**classes('byline'))
            if p is not None:
                desc += self.tag_to_string(p)
            self.log(' ', title, 'at', url)
            features.append({'title': title, 'url': url, 'description': desc})

        readings = []
        self.log('Readings')
        for item in soup.find(**classes('issue-readings')).findAll(**classes('reading-item')):
            # Here the title element is itself the link.
            a = item.find('a', **classes('ac-title'))
            title = self.tag_to_string(a).strip()
            url = a['href']
            # The author element, when present, serves as the description.
            desc = ''
            a = item.find(**classes('ac-author'))
            if a is not None:
                desc = self.tag_to_string(a)
            self.log(' ', title, 'at', url)
            readings.append({'title': title, 'url': url, 'description': desc})

        return [('Features', features), ('Readings', readings)]