#!/usr/bin/env python2
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2017, John Hutson '
'''
firstthings.com
'''
import html5lib
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe


class FirstThings(BasicNewsRecipe):
    """Recipe that builds an issue from the First Things current-edition page."""

    title = 'First Things'
    __author__ = 'John Hutson'
    description = 'America\'s Most Influential Journal of Religion and Public Life'
    # Site root, without a trailing slash, used to resolve root-relative links.
    BASE_URL = 'https://www.firstthings.com'
    INDEX = 'https://www.firstthings.com/current-edition'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'itemprop': ['author',]}),
        dict(attrs={'itemprop': 'articleBody'}),
    ]

    extra_css = '''
    .small-caps { font-variant: small-caps }
    .drop-cap { float: left; font-size: 75px; line-height: 60px; padding-top: 4px; padding-right: 8px; padding-left: 3px;}
    '''

    def preprocess_raw_html(self, raw, url):
        """Re-parse the downloaded page with html5lib and return it as text.

        :param raw: the raw HTML of the downloaded page.
        :param url: the URL the page came from (unused here).
        :return: the document serialised as HTML by lxml, as a unicode string.
        """
        root = html5lib.parse(
            raw, treebuilder='lxml', namespaceHTMLElements=False)
        return html.tostring(root, method='html', encoding='unicode')

    def _absolute_url(self, url):
        """Prefix root-relative URLs (starting with '/') with the site root."""
        if url.startswith('/'):
            # BASE_URL has no trailing slash, so this does not yield '//'.
            return self.BASE_URL + url
        return url

    @staticmethod
    def _is_author_link(tag):
        """Return True if ``tag`` is an <a> whose ``rel`` attribute has 'author'.

        The attribute must be read with ``get``: ``tag.rel`` would search for
        a child element named ``rel`` instead of reading the HTML attribute.
        The value is handled both as a plain string and as a list of tokens.
        """
        if tag.name != 'a':
            return False
        rel = tag.get('rel')
        if not rel:
            return False
        if isinstance(rel, (list, tuple)):
            return 'author' in rel
        return 'author' in rel.split()

    def parse_index(self):
        """Build the list of feeds from the current-edition index page.

        Walks every <h3>, <h4> and <a> in document order: an <h3> starts a
        new section, an <h4> supplies the title and link of an article, and
        a following author link supplies the description and triggers the
        article being added to the current section.

        As a side effect, sets ``self.cover_url`` when the page has an
        ``a.cover-link`` element carrying an ``href``.

        :return: a list of ``(section_title, articles)`` tuples, where each
            article is a dict with 'title', 'url' and 'description' keys.
        """
        soup = self.index_to_soup(self.INDEX)

        cover = soup.find('a', 'cover-link')
        if cover is not None:
            cover_href = cover.get('href')
            if cover_href:
                self.cover_url = self._absolute_url(cover_href)

        current_section, current_articles = 'Cover Story', []
        feeds = []
        # Title/URL of the most recent <h4>; None until one has been seen.
        title = url = None

        for tag in soup.findAll(['h3', 'h4', 'a']):
            if tag.name == 'h3':
                # A new section heading closes the section collected so far.
                if current_articles:
                    feeds.append((current_section, current_articles))
                current_articles = []
                current_section = self.tag_to_string(tag)
                self.log('\nFound section:', current_section)
            elif tag.name == 'h4':
                link = tag.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # Heading without a usable link: nothing to download, and
                    # a later author link must not reuse the previous article.
                    title = url = None
                    continue
                title = self.tag_to_string(link)
                url = self._absolute_url(href)
            elif self._is_author_link(tag):
                if url is None:
                    # Author link with no preceding article heading.
                    continue
                current_articles.append({
                    'title': title,
                    'url': url,
                    'description': self.tag_to_string(tag),
                })

        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds