calibre/recipes/first_things.recipe

#!/usr/bin/env  python2
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2017, John Hutson <jfhutson at gmail.com>'
'''
firstthings.com
'''
import html5lib
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe


class FirstThings(BasicNewsRecipe):
    '''
    Recipe for the current edition of First Things (firstthings.com).

    The table of contents is scraped from INDEX: every ``h3`` heading starts
    a new section and every ``h4`` heading that wraps a link is an article
    in the section currently being collected.
    '''

    title = 'First Things'
    __author__ = 'John Hutson'
    description = 'America\'s Most Influential Journal of Religion and Public Life'
    # Page listing the contents of the current edition.
    INDEX = 'https://www.firstthings.com/current-edition'
    language = 'en'
    encoding = 'utf-8'

    no_stylesheets = True

    # Parts of an article page to keep: headline, author byline and body.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'itemprop': ['author',]}),
        dict(attrs={'itemprop': 'articleBody'}),
        ]

    extra_css = '''
        .small-caps { font-variant: small-caps }
        .drop-cap { float: left; font-size: 75px; line-height: 60px; padding-top: 4px; padding-right: 8px; padding-left: 3px;}
        '''

    def preprocess_raw_html(self, raw, url):
        '''
        Re-parse the downloaded page with html5lib and serialise it again.

        :param raw: markup of the downloaded page
        :param url: address the page was fetched from (unused)
        :return: the page serialised as a unicode HTML string
        '''
        return html.tostring(html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False), method='html', encoding='unicode')

    @staticmethod
    def _is_author_link(tag):
        '''Return True when ``tag`` carries ``author`` in its ``rel`` attribute.'''
        rel = tag.get('rel') or ''
        # Depending on the BeautifulSoup version ``rel`` is either a plain
        # string or a list of tokens; accept both forms.
        if isinstance(rel, (list, tuple)):
            return 'author' in rel
        return 'author' in rel.split()

    def parse_index(self):
        '''
        Build the list of sections and articles from the INDEX page.

        As a side effect ``self.cover_url`` is set to the ``href`` of the
        ``a.cover-link`` element when the page has one.

        :return: list of ``(section title, [article dict, ...])`` tuples,
                 each article dict having the keys ``title``, ``url`` and
                 ``description``
        '''
        soup = self.index_to_soup(self.INDEX)

        cover = soup.find('a', 'cover-link')
        if cover is not None:
            cover_href = cover.get('href')
            if cover_href:
                self.cover_url = cover_href

        # Articles found before the first h3 heading land in this section.
        current_section, current_articles = 'Cover Story', []
        feeds = []
        for tag in soup.findAll(['h3', 'h4', 'a']):
            if tag.name == 'h3':
                # A new heading closes the section collected so far.
                if current_articles:
                    feeds.append((current_section, current_articles))
                current_articles = []
                current_section = self.tag_to_string(tag)
                self.log('\nFound section:', current_section)
            elif tag.name == 'h4':
                link = tag.find('a')
                # A heading without a usable link is not an article; skip it
                # rather than aborting the whole download.
                if link is None or not link.get('href'):
                    continue
                url = link['href']
                if url.startswith('/'):
                    # url already begins with '/', so the host must not end
                    # with one or the result contains a double slash.
                    url = 'https://www.firstthings.com' + url
                current_articles.append(
                    {'title': self.tag_to_string(link), 'url': url, 'description': ''})
            elif current_articles and self._is_author_link(tag):
                # NOTE(review): assumes the author link follows the h4 of
                # the article it belongs to - confirm against the live page.
                latest = current_articles[-1]
                if not latest['description']:
                    latest['description'] = self.tag_to_string(tag)

        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds