#!/usr/bin/env python2
# Provenance (from the original git patch this recipe was submitted as):
#   Commit:  488ae08a413b8f052b3ce6690b7d43b9dee03b94
#   From:    jfhutson
#   Date:    Sat, 15 Apr 2017 14:34:39 -0500
#   Subject: [PATCH] Create first_things
#            Recipe for First Things magazine.
#   Target:  recipes/first_things (new file)
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2017, John Hutson '
'''
firstthings.com
'''
import html5lib
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe


class FirstThings(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from the First Things current-edition page.

    The index page at ``INDEX`` is scanned for ``<h3>`` section headings,
    ``<h4>`` article headings and ``rel="author"`` links; see
    ``parse_index`` for the exact rules.
    '''

    title = 'First Things'
    __author__ = 'John Hutson'
    description = 'America\'s Most Influential Journal of Religion and Public Life'
    INDEX = 'https://www.firstthings.com/current-edition'
    # Site root WITHOUT a trailing slash, so it can be prefixed directly to
    # site-relative paths (which already start with '/').
    BASE_URL = 'https://www.firstthings.com'
    language = 'en'
    encoding = 'utf-8'

    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'itemprop': ['author', ]}),
        dict(attrs={'itemprop': 'articleBody'}),
    ]

    extra_css = '''
        .small-caps { font-variant: small-caps }
        .drop-cap { float: left; font-size: 75px; line-height: 60px; padding-top: 4px; padding-right: 8px; padding-left: 3px;}
    '''

    def preprocess_raw_html(self, raw, url):
        '''
        Re-parse the downloaded page with html5lib and re-serialize it.

        :param raw: the raw HTML of the downloaded page
        :param url: the URL the page came from (unused here)
        :return: the page serialized back to HTML text by lxml
        '''
        tree = html5lib.parse(
            raw, treebuilder='lxml', namespaceHTMLElements=False)
        # ``unicode`` is the Python 2 builtin; passing it as the encoding
        # asks lxml for a text (not byte) string.
        return html.tostring(tree, method='html', encoding=unicode)

    def _absolute_url(self, url):
        '''Return ``url`` with the site root prefixed if it is site-relative.'''
        if url.startswith('/'):
            return self.BASE_URL + url
        return url

    @staticmethod
    def _is_author_link(tag):
        '''
        Return True if ``tag`` carries ``author`` in its ``rel`` attribute.

        The attribute must be read with ``get``: ``tag.rel`` is tag-name
        navigation (it looks for a child ``<rel>`` element), not an attribute
        lookup. Depending on the Beautiful Soup version the value is either a
        string or a list of tokens, so both shapes are accepted.
        '''
        rel = tag.get('rel')
        if not rel:
            return False
        if isinstance(rel, (list, tuple)):
            return 'author' in rel
        return 'author' in rel.split()

    def parse_index(self):
        '''
        Build the list of feeds from the current-edition index page.

        Rules applied while walking every ``<h3>``, ``<h4>`` and ``<a>`` in
        document order:

        * ``<h3>`` closes the current section (if it has articles) and opens
          a new one named after the heading text.
        * ``<h4>`` containing a link starts a new candidate article.
        * an ``<a rel="author">`` following a candidate article supplies its
          description; the article is added to the section when its first
          author link is seen. Further author links for the same article are
          joined into the description instead of adding the article again.

        Articles are placed in the section ``'Cover Story'`` until the first
        ``<h3>`` is encountered. As a side effect ``self.cover_url`` is set
        from the ``a.cover-link`` element when one with an ``href`` exists.

        :return: list of ``(section_title, articles)`` tuples, where each
                 article is a dict with ``title``, ``url`` and
                 ``description`` keys
        '''
        soup = self.index_to_soup(self.INDEX)

        cover = soup.find('a', 'cover-link')
        if cover is not None:
            cover_href = cover.get('href')
            if cover_href:
                self.cover_url = self._absolute_url(cover_href)

        current_section, current_articles = 'Cover Story', []
        feeds = []
        # Article announced by the latest <h4>; ``None`` when there is none
        # (start of page, new section, or an <h4> without a usable link).
        pending = None
        # True once ``pending`` has been added to ``current_articles``.
        pending_added = False

        for tag in soup.findAll(['h3', 'h4', 'a']):
            if tag.name == 'h3':
                if current_articles:
                    feeds.append((current_section, current_articles))
                current_articles = []
                # Never attach an author link to an article that belongs to
                # the previous section.
                pending = None
                current_section = self.tag_to_string(tag)
                self.log('\nFound section:', current_section)
            elif tag.name == 'h4':
                link = tag.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # Heading without a link: not an article.
                    pending = None
                    continue
                pending = {
                    'title': self.tag_to_string(link),
                    'url': self._absolute_url(href),
                    'description': '',
                }
                pending_added = False
            elif pending is not None and self._is_author_link(tag):
                author = self.tag_to_string(tag)
                if pending_added:
                    # Additional author for an article already listed.
                    if author:
                        if pending['description']:
                            pending['description'] += ', ' + author
                        else:
                            pending['description'] = author
                else:
                    pending['description'] = author
                    current_articles.append(pending)
                    pending_added = True

        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds