mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
#!/usr/bin/env python2
|
|
from __future__ import unicode_literals
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2017, John Hutson <jfhutson at gmail.com>'
|
|
'''
|
|
firstthings.com
|
|
'''
|
|
import html5lib
|
|
from lxml import html
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class FirstThings(BasicNewsRecipe):
    '''
    Recipe for the current edition of First Things (firstthings.com).

    The table of contents is scraped from ``INDEX`` by ``parse_index``;
    each downloaded page is re-serialised through html5lib by
    ``preprocess_raw_html`` and then trimmed to ``keep_only_tags``.
    '''

    title = 'First Things'
    __author__ = 'John Hutson'
    description = 'America\'s Most Influential Journal of Religion and Public Life'
    INDEX = 'https://www.firstthings.com/current-edition'
    language = 'en'
    encoding = 'utf-8'

    no_stylesheets = True

    # Only the headline, the author markup and the article body are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'itemprop': ['author',]}),
        dict(attrs={'itemprop': 'articleBody'}),
    ]

    extra_css = '''
    .small-caps { font-variant: small-caps }
    .drop-cap { float: left; font-size: 75px; line-height: 60px; padding-top: 4px; padding-right: 8px; padding-left: 3px;}
    '''

    def preprocess_raw_html(self, raw, url):
        '''
        Re-parse ``raw`` with html5lib and return it serialised as HTML text.

        :param raw: markup of the downloaded page
        :param url: address the page was fetched from (unused here)
        :return: the document serialised by lxml as a unicode string
        '''
        return html.tostring(html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False), method='html', encoding='unicode')

    def _absolute_url(self, url):
        '''Return ``url`` with the site origin prepended when it is root-relative.'''
        if url.startswith('/'):
            # The origin carries no trailing slash, so the result does not
            # contain a doubled '//' before the path.
            return 'https://www.firstthings.com' + url
        return url

    def _is_author_link(self, tag):
        '''
        Tell whether ``tag`` carries ``rel="author"``.

        The HTML attribute has to be read with ``get``: plain attribute
        access on a soup tag (``tag.rel``) searches for a *child element*
        named ``rel`` and therefore never matches.
        '''
        rel = tag.get('rel') or []
        if not isinstance(rel, (list, tuple)):
            # Some parser versions hand back the raw attribute string,
            # others a list of the space separated tokens.
            rel = rel.split()
        return 'author' in rel

    def parse_index(self):
        '''
        Build the list of feeds from the current-edition page.

        ``<h3>`` elements start a new section, ``<h4>`` elements hold the
        link to an article and the following ``rel="author"`` link supplies
        its description. An article is only emitted once its author link
        has been seen.

        :return: list of ``(section title, [article dict, ...])`` tuples,
                 each article dict having the keys ``title``, ``url`` and
                 ``description``
        '''
        soup = self.index_to_soup(self.INDEX)

        cover = soup.find('a', 'cover-link')
        if cover is not None and cover.get('href'):
            self.cover_url = self._absolute_url(cover['href'])

        current_section, current_articles = 'Cover Story', []
        feeds = []
        # Article announced by the latest <h4> that is still waiting for
        # its author link; None when there is nothing to complete.
        pending = None

        for tag in soup.findAll(['h3', 'h4', 'a']):
            if tag.name == 'h3':
                if current_articles:
                    feeds.append((current_section, current_articles))
                current_articles = []
                current_section = self.tag_to_string(tag)
                # Never carry a half-built article over into the next section.
                pending = None
                self.log('\nFound section:', current_section)
            elif tag.name == 'h4':
                link = tag.find('a')
                href = link.get('href') if link is not None else None
                if href:
                    pending = {
                        'title': self.tag_to_string(link),
                        'url': self._absolute_url(href),
                        'description': '',
                    }
                else:
                    # Heading without a usable link: nothing to download.
                    pending = None
            elif tag.name == 'a' and pending is not None and self._is_author_link(tag):
                pending['description'] = self.tag_to_string(tag)
                current_articles.append(pending)
                # Further author links of the same article must not add it again.
                pending = None

        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds
|