calibre/recipes/first_things.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

69 lines
2.3 KiB
Python

#!/usr/bin/env python
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2017, John Hutson <jfhutson at gmail.com>'
'''
firstthings.com
'''
import html5lib
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
class FirstThings(BasicNewsRecipe):
    '''
    Recipe for the current edition of First Things (firstthings.com).

    The table of contents is scraped from ``INDEX``: every ``h3`` heading
    starts a new section, an ``h4`` wrapping a link names an article, and a
    following ``<a rel="author">`` link supplies the byline that is used as
    the article description.
    '''

    title = 'First Things'
    __author__ = 'John Hutson'
    description = 'America\'s Most Influential Journal of Religion and Public Life'
    # Site root, used to turn root-relative links from the index into full URLs.
    BASE_URL = 'https://www.firstthings.com'
    INDEX = 'https://www.firstthings.com/current-edition'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True

    # Tag filters for article pages: headline, author byline and article body.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'itemprop': ['author',]}),
        dict(attrs={'itemprop': 'articleBody'}),
    ]

    # Replacement rules for the two site classes styled here (no_stylesheets
    # is set above).
    extra_css = '''
    .small-caps { font-variant: small-caps }
    .drop-cap { float: left; font-size: 75px; line-height: 60px; padding-top: 4px; padding-right: 8px; padding-left: 3px;}
    '''

    def preprocess_raw_html(self, raw, url):
        '''
        Re-parse the downloaded page with html5lib and serialize it back.

        :param raw: the page markup as downloaded
        :param url: the page URL (unused)
        :return: the page as a unicode HTML string
        '''
        root = html5lib.parse(
            raw, treebuilder='lxml', namespaceHTMLElements=False)
        return html.tostring(root, method='html', encoding='unicode')

    def _absolute_url(self, url):
        '''Return ``url`` resolved against the site root when it is relative.'''
        if url.startswith('//'):
            # Protocol-relative link: only the scheme is missing.
            return 'https:' + url
        if url.startswith('/'):
            # BASE_URL has no trailing slash, so no '//' ends up in the path.
            return self.BASE_URL + url
        return url

    @staticmethod
    def _is_author_link(tag):
        '''
        Tell whether ``tag`` carries ``rel="author"``.

        The attribute has to be read with ``get()``: ``tag.rel`` would search
        for a child element named ``rel`` instead. Depending on the parser
        the value is either a list of tokens or a plain string, so both
        forms are accepted.
        '''
        rel = tag.get('rel') or ()
        if not isinstance(rel, (list, tuple)):
            rel = rel.split()
        return 'author' in rel

    def parse_index(self):
        '''
        Build the feed list from the current-edition page.

        Sets ``self.cover_url`` when the page contains a cover link with an
        ``href``.

        :return: a list of ``(section_title, articles)`` tuples; each article
                 is a dict with the keys ``title``, ``url`` and
                 ``description``. Sections without articles are omitted.
        '''
        soup = self.index_to_soup(self.INDEX)

        cover = soup.find('a', 'cover-link')
        if cover is not None:
            # get() so that a cover link without href is skipped, not fatal.
            cover_href = cover.get('href')
            if cover_href:
                self.cover_url = self._absolute_url(cover_href)

        current_section, current_articles = 'Cover Story', []
        feeds = []
        article = None   # article named by the most recent <h4>, if any
        listed = False   # True once `article` is in current_articles
        for tag in soup.findAll(['h3', 'h4', 'a']):
            if tag.name == 'h3':
                # A heading closes the running section and opens a new one.
                if current_articles:
                    feeds.append((current_section, current_articles))
                current_articles = []
                current_section = self.tag_to_string(tag)
                article = None
                self.log('\nFound section:', current_section)
            elif tag.name == 'h4':
                link = tag.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    # Heading without a usable link: nothing to download, and
                    # a later byline must not attach to the previous article.
                    article = None
                    continue
                article = {
                    'title': self.tag_to_string(link),
                    'url': self._absolute_url(href),
                    'description': '',
                }
                listed = False
            elif tag.name == 'a' and article is not None \
                    and self._is_author_link(tag):
                author = self.tag_to_string(tag)
                if listed:
                    # Further author of an article already in the list:
                    # extend its byline instead of adding a duplicate entry.
                    article['description'] += ', ' + author
                else:
                    article['description'] = author
                    current_articles.append(article)
                    listed = True
        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds