calibre/recipes/foreignaffairs.recipe

#!/usr/bin/env python2
import json
import re

import html5lib
import mechanize
from lxml import html

from calibre.web.feeds.news import BasicNewsRecipe
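

# Build a BeautifulSoup attrs matcher: the returned dict matches any tag
# whose class attribute shares at least one name with the space-separated
# list passed in (used for keep_only_tags below).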
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
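

# Normalise one search hit into the {url, title, description} dict that
# calibre's parse_index() machinery expects for a feed article.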
def as_article(source, log):
    url = source['url']
    title = source['title']
    desc = ''
    if source.get('field_subtitle'):
        desc += source['field_subtitle']
    if source.get('field_display_authors'):
        desc += ' by ' + source['field_display_authors']
    log(title, url)
    return {'url': url, 'title': title, 'description': desc}
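

# The magazine is backed by an Elasticsearch-style endpoint at /node/_search;
# three POSTed JSON queries are enough to build the issue: one for the issue
# node itself, one for the lead section's articles, and one for the rest.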
def get_issue_data(br, log, node_id='1124670'):
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://www.foreignaffairs.com',
        'Referer': 'https://www.foreignaffairs.com',
    }
    data = {
        "_source": {
            "includes": [
                "normalized_date", "field_issue_volume_number",
                "field_issue_volume", "url", "fa_path", "title",
                "fa_node_issue_cover_url", "nid",
                "field_issue_ssection_header",
                "field_issue_ssection_articles:nid"
            ]
        },
        "query": {
            "match": {
                "id": {
                    "query": node_id
                }
            }
        },
        "size": 1
    }
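
    # POST one query to the search endpoint and unwrap the list of hits
    # from the Elasticsearch-style response.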
    def get_data(data):
        search_url = 'https://www.foreignaffairs.com/node/_search'
        req = mechanize.Request(url=search_url,
                                data=json.dumps(data),
                                headers=headers,
                                method='POST')
        res = br.open(req)
        return json.loads(res.read())['hits']['hits']
    issue_data = get_data(data)
    source = issue_data[0]['_source']
    nids = source['field_issue_ssection_articles:nid']
    section_title = source['field_issue_ssection_header']
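
    # Second query: per-article display metadata for every nid listed in
    # the issue's lead section.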
    data = {
        '_source': {
            'includes': [
                'field_tags:name', 'field_topics:name', 'field_regions:name',
                'url', 'title', 'field_subtitle', 'field_display_authors',
                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
                'field_capsule_review_category:name',
                'fa_node_type_or_subtype', 'type'
            ]
        },
        'query': {
            'terms': {
                'id': nids
            }
        },
        'size': 30
    }
    sections_data = get_data(data)
    log('Found main section:', section_title)
    main_articles = []
    for article in sections_data:
        main_articles.append(as_article(article['_source'], log))
    feed = {}
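
    # Third query: every remaining article type in this issue, minus the
    # nids already collected for the lead section.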
    data['size'] = 100
    data['query'] = {
        'bool': {
            'must': [{
                'terms': {
                    'fa_node_type_or_subtype': [
                        'Comment', 'Essay', 'Interview', 'Review Essay',
                        'Letter From', 'Letter', 'Response', 'Capsule Review'
                    ]
                }
            }, {
                'term': {
                    'field_issue:nid': {
                        'term': node_id
                    }
                }
            }],
            'must_not': [{
                'terms': {
                    'id': nids
                }
            }]
        }
    }
    article_data = get_data(data)
    for article in article_data:
        article = article['_source']
        section = article['fa_node_type_or_subtype']
        if section not in feed:
            feed[section] = []
        feed[section].append(as_article(article, log))

    ans = []
    for sec in sorted(feed):
        ans.append((sec, feed[sec]))
    return [(section_title, main_articles)] + ans
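

# get_issue_data() already produces the [(section_title, [article, ...]), ...]
# list that BasicNewsRecipe.parse_index() is expected to return.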
class ForeignAffairsRecipe(BasicNewsRecipe):
    '''There are three modifications:
    1) fetch the issue cover
    2) toggle ignoring premium articles
    3) extract proper section names, i.e. "Comments", "Essay"
    by Chen Wei, 2012-02-05
    Additional modifications to support the rebranded website
    by anisotrope, 27 June 2015
    '''
    __license__ = 'GPL v3'
    __author__ = 'Rick Shang, kwetal, anisotrope'
    language = 'en'
    version = 1.02
    title = u'Foreign Affairs (Subscription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'
    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    INDEX = 'https://www.foreignaffairs.com'
    FRONTPAGE = INDEX + '/magazine'

    keep_only_tags = [
        classes('article-header article-body'),
    ]

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}
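
    # The current issue's node id is embedded in the magazine page's <body>
    # class as "page-node-<nid>"; pull it out and hand it to get_issue_data().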
    def parse_index(self):
        soup = self.index_to_soup(self.FRONTPAGE)
        # get dates
        date = re.split(r'\s\|\s', self.tag_to_string(
            soup.head.title.string))[0]
        self.title = "Foreign Affairs ({})".format(date)
        self.timefmt = u' [%s]' % date
        cls = soup.find('body')['class']
        if isinstance(cls, (list, tuple)):
            cls = ' '.join(cls)
        node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
        br = self.cloned_browser
        return get_issue_data(br, self.log, node_id)
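
    # Strip inline <svg> and <meta> elements anywhere in the tree; the {*}
    # wildcard matches the tag in any XML namespace.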
    def clean_fa_html(self, root):
        for svg in tuple(root.iter('{*}svg')):
            svg.getparent().remove(svg)
        for meta in tuple(root.iter('{*}meta')):
            meta.getparent().remove(meta)
        return root
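
    # Parse the raw page with html5lib, which copes with real-world,
    # not-necessarily-valid markup, then clean and re-serialise it for the
    # rest of the conversion pipeline.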
    def preprocess_raw_html(self, raw_html, url):
        root = html5lib.parse(raw_html, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()
        self.clean_fa_html(root)
        return html.tostring(root, encoding='unicode')
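
    # The site lazy-loads images: copy the real image URL out of the
    # ng-src / data-blazy attributes into src so the downloader fetches it.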
    def preprocess_html(self, soup):
        for attr in ('ng-src', 'data-blazy'):
            for img in soup.findAll('img', attrs={attr: True}):
                img['src'] = img[attr]
        return soup
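
    # Log in through the site's "user-login" form so that subscriber-only
    # articles are fetched with an authenticated session.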
    def get_browser(self):
        def select_form(form):
            return form.attrs.get('id', None) == 'user-login'

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(
                'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            br.select_form(predicate=select_form)
            br.form['name'] = self.username
            br.form['pass'] = self.password
            br.submit()
        return br