calibre/recipes/philosophy_now.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
from collections import OrderedDict

from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe, classes


class PhilosophyNow(BasicNewsRecipe):
    title = 'Philosophy Now'
    __author__ = 'unkn0wn'
    description = '''Philosophy Now is a lively magazine for everyone
    interested in ideas. It isn't afraid to tackle all the major questions of
    life, the universe and everything. Published every two months, it tries to
    corrupt innocent citizens by convincing them that philosophy can be
    exciting, worthwhile and comprehensible, and also to provide some enjoyable
    reading matter for those already ensnared by the muse, such as philosophy
    students and academics.'''
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://philosophynow.org/media/images/regulars/logoStructuredData.png'

    keep_only_tags = [classes('article_page')]
    remove_tags = [dict(name='div', attrs={'id': 'welcome_box'})]

    extra_css = '''
        img {display:block; margin:0 auto;}
        .articleImageCaption { font-size:small; text-align:center; }
        em, blockquote { color:#202020; }
    '''
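
    # Optional user setting: download a specific back issue by its number
    # (for example, 136) instead of the latest issue.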
    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download ',
            'long': 'For example, 136'
        }
    }
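
    # Build the issue's table of contents: locate the cover, the issue
    # title, and the article list grouped by section headings.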
    def parse_index(self):
        soup = self.index_to_soup('https://philosophynow.org/')
        # The front page links to the current issue's cover in the sidebar.
        div = soup.find('div', attrs={'id': 'aside_issue_cover'})
        url = 'https://philosophynow.org' + div.find('a', href=True)['href']

        # If the user asked for a specific back issue, fetch that instead.
        d = self.recipe_specific_options.get('issue')
        if d and isinstance(d, str):
            url = 'https://philosophynow.org/issues/' + d

        soup = self.index_to_soup(url)
        div = soup.find('div', attrs={'id': 'issue_contents_cover_div'})
        cov_url = div.find('img', src=True)['src']
        self.cover_url = 'https://philosophynow.org' + cov_url
        self.timefmt = ' [' + self.tag_to_string(soup.find('h1')) + ']'

        # Group articles into sections using the h3 heading that precedes
        # each article title.
        feeds = OrderedDict()

        for h2 in soup.findAll('h2', attrs={'class': 'article_list_title'}):
            articles = []
            a = h2.find('a', href=True)
            url = 'https://philosophynow.org' + a['href']
            title = self.tag_to_string(a)
            desc = ''
            des = h2.find_next_sibling('p')
            if des:
                desc = self.tag_to_string(des)
            h3 = h2.find_previous_sibling('h3')
            section_title = self.tag_to_string(h3).title()
            self.log('\t', title)
            self.log('\t', desc)
            self.log('\t\t', url)
            articles.append({
                'title': title,
                'url': url,
                'description': desc})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles
        return list(feeds.items())
    # PN changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit
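

# A minimal way to try this recipe from the command line (assuming the
# calibre command line tools are installed) is to compile it directly:
#   ebook-convert philosophy_now.recipe .epub --test
# The 'issue' option defined above can also be supplied when fetching;
# check ebook-convert --help for the exact flag in your calibre version.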