Update Foreign Affairs

Kovid Goyal 2020-08-30 20:51:44 +05:30
parent f46f80dfb0
commit b1acfce54b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -27,162 +27,139 @@ def as_article(source, log):
     return {'url': url, 'title': title, 'description': desc}
 
 
-def get_issue_data(br, log, node_id='1124670'):
+def get_issue_data(br, log, node_id='1126213', year='2020', volnum='99', issue_vol='5'):
     headers = {
         'Accept': 'application/json, text/plain, */*',
         'Content-Type': 'application/json;charset=UTF-8',
         'Origin': 'https://www.foreignaffairs.com',
         'Referer': 'https://www.foreignaffairs.com',
     }
-    data = {
-        "_source": {
-            "includes": [
-                "normalized_date", "field_issue_volume_number", "nid",
-                "field_issue_volume", "url", "fa_path", "title",
-                "fa_node_issue_cover_url",
-                "field_issue_ssection_header",
-                "field_issue_ssection_articles:nid"
-            ]
-        },
-        "query": {
-            "match": {
-                "id": {
-                    "query": node_id
-                }
-            }
-        },
-        "size": 1
-    }
+
+    def make_query(**kwds):
+        size = kwds.pop('size', 1)
+        is_filter = kwds.pop('filter', None)
+        if is_filter:
+            q = {'filter': [{'terms': {k:v}} for k, v in kwds.items()]}
+        else:
+            q = {'must': [{'term': {k:v}} for k, v in kwds.items()]}
+        return {
+            'from': 0,
+            'post_filter': {'bool': q},
+            "_source": {
+                "includes": [
+                    'path', 'title', 'field_subtitle', 'field_display_authors',
+                    'fa_node_type_or_subtype',
+                    "nid",
+                    'field_issue_sspecial_articles__nid',
+                    'field_issue_sspecial_header'
+                ]
+            },
+            "query": {
+                "match_all": {}
+            },
+            'sort': [{'field_sequence': "asc"}, {'fa_normalized_date': "desc"}],
+            "size": size,
+        }
 
     def get_data(data):
-        search_url = 'https://www.foreignaffairs.com/node/_search'
+        search_url = 'https://www.foreignaffairs.com/fa-search.php'
         req = mechanize.Request(url=search_url,
                                 data=json.dumps(data),
                                 headers=headers,
                                 method='POST')
         res = br.open(req)
-        return json.loads(res.read())['hits']['hits']
+        data = json.loads(res.read())
+        return data['hits']['hits']
 
-    issue_data = get_data(data)
-    source = issue_data[0]['_source']
-    nids = source['field_issue_ssection_articles:nid']
-    section_title = source['field_issue_ssection_header']
-
-    data = {
-        '_source': {
-            'includes': [
-                'field_tags:name', 'field_topics:name', 'field_regions:name',
-                'url', 'title', 'field_subtitle', 'field_display_authors',
-                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
-                'field_capsule_review_category:name',
-                'fa_node_type_or_subtype', 'type'
-            ]
-        },
-        'query': {
-            'terms': {
-                'id': nids
-            }
-        },
-        'size': 30
-    }
-
-    sections_data = get_data(data)
-    log('Found main section:', section_title)
-    main_articles = []
-    for article in sections_data:
-        main_articles.append(as_article(article['_source'], log))
-    feed = {}
-
-    data['size'] = 100
-    data['query'] = {
-        'bool': {
-            'must': [{
-                'terms': {
-                    'fa_node_type_or_subtype': [
-                        'Comment', 'Essay', 'Interview', 'Review Essay',
-                        'Letter From', 'Letter', 'Response', 'Capsule Review'
-                    ]
-                }
-            }, {
-                'term': {
-                    'field_issue:nid': {
-                        'term': '1124670'
-                    }
-                }
-            }],
-            'must_not': [{
-                'terms': {
-                    'id': nids
-                }
-            }]
-        }
-    }
-
-    article_data = get_data(data)
-    for article in article_data:
-        article = article['_source']
-        section = article['fa_node_type_or_subtype']
-        if section not in feed:
-            feed[section] = []
-        feed[section].append(as_article(article, log))
-
-    ans = []
-    for sec in sorted(feed):
-        ans.append((sec, feed[sec]))
-    return [(section_title, main_articles)] + ans
+    feeds = []
+    issue_data = get_data(make_query(
+        fa_node_type_or_subtype='Issue',
+        field_issue_volume=issue_vol, field_issue_year=year,
+        field_issue_volume_number=volnum
+    ))[0]['_source']
+    main_sec_title = issue_data['field_issue_sspecial_header'][0]
+    main_sec_nids = issue_data['field_issue_sspecial_articles__nid']
+    articles_data = get_data(make_query(nid=main_sec_nids, filter=True, size=len(main_sec_nids)))
+    articles = []
+
+    def as_article(source):
+        title = source['title'][0]
+        desc = ''
+        fs = source.get('field_subtitle')
+        if fs:
+            desc = fs[0]
+        aus = source.get('field_display_authors')
+        if aus:
+            desc += ' By ' + aus[0]
+        url = 'https://www.foreignaffairs.com' + source['path'][0]
+        return {'title': title, 'description': desc, 'url': url}
+
+    log(main_sec_title)
+    for entry in articles_data:
+        source = entry['_source']
+        articles.append(as_article(source))
+        log('\t', articles[-1]['title'], articles[-1]['url'])
+    feeds.append((main_sec_title, articles))
+
+    articles_data = get_data(make_query(field_issue__nid=node_id, size=50))
+    ans = {}
+    for entry in articles_data:
+        source = entry['_source']
+        section = source['fa_node_type_or_subtype'][0]
+        ans.setdefault(section, []).append(as_article(source))
+    for sectitle in sorted(ans):
+        articles = ans[sectitle]
+        log(sectitle)
+        if articles:
+            for art in articles:
+                log('\t', art['title'], art['url'])
+            feeds.append((sectitle, articles))
+
+    return feeds
 
 
 class ForeignAffairsRecipe(BasicNewsRecipe):
 
-    ''' there are three modifications:
-    1) fetch issue cover
-    2) toggle ignore premium articles
-    3) extract proper section names, ie. "Comments", "Essay"
-
-    by Chen Wei, 2012-02-05
-
-    Additional modifications to support rebranded website
-    by anisotrope, 27 June 2015
-    '''
-
-    __license__ = 'GPL v3'
-    __author__ = 'Rick Shang, kwetal, anisotrope'
+    title = u'Foreign Affairs'
+    __author__ = 'Kovid Goyal'
     language = 'en'
-    version = 1.02
 
-    title = u'Foreign Affairs (Subcription)'
     publisher = u'Council on Foreign Relations'
     category = u'USA, Foreign Affairs'
     description = u'The leading forum for serious discussion of American foreign policy and international affairs.'
 
     no_stylesheets = True
     remove_javascript = True
-    needs_subscription = True
+    needs_subscription = 'optional'
 
-    INDEX = 'https://www.foreignaffairs.com'
-    FRONTPAGE = INDEX + '/magazine'
+    INDEX = 'https://www.foreignaffairs.com/magazine'
 
     keep_only_tags = [
-        classes('article-header article-body'),
+        classes('article-header article-body article-lead-image article-body-text'),
+    ]
+    remove_tags = [
+        classes('print-hidden loading-indicator paywall article-footer')
     ]
 
     conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                           'publisher': publisher}
 
     def parse_index(self):
-        soup = self.index_to_soup(self.FRONTPAGE)
+        soup = self.index_to_soup(self.INDEX)
         # get dates
         date = re.split(r'\s\|\s', self.tag_to_string(
             soup.head.title.string))[0]
         self.title = "Foreign Affairs ({})".format(date)
         self.timefmt = u' [%s]' % date
+        link = soup.find('link', rel='revision', href=True)['href']
+        year, volnum, issue_vol = link.split('/')[-3:]
+        self.cover_url = soup.find('meta', property="og:image:secure_url")['content']
 
         cls = soup.find('body')['class']
         if isinstance(cls, (list, tuple)):
             cls = ' '.join(cls)
         node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
         br = self.cloned_browser
-        return get_issue_data(br, self.log, node_id)
+        feeds = get_issue_data(br, self.log, node_id, year, volnum, issue_vol)
+        return feeds
 
     def clean_fa_html(self, root):
         for svg in tuple(root.iter('{*}svg')):
@@ -198,7 +175,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         return html.tostring(root, encoding='unicode')
 
     def preprocess_html(self, soup):
-        for attr in ('ng-src', 'data-blazy'):
+        for attr in ('ng-src', 'data-blazy', 'data-src'):
            for img in soup.findAll('img', attrs={attr: True}):
                img['src'] = img[attr]
        return soup
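
For reference, a minimal standalone sketch of the JSON body that the new make_query() builds for the initial issue lookup, written out literally; the field names, sort keys, and post_filter shape are taken from the diff above, and the volume/year values are simply the new defaults ('5', '2020', '99'). This is the payload get_data() POSTs to fa-search.php:

# Sketch: what make_query(fa_node_type_or_subtype='Issue',
# field_issue_volume='5', field_issue_year='2020',
# field_issue_volume_number='99') evaluates to (no filter kwarg,
# so the terms land in the 'must' clause and size defaults to 1).
import json

payload = {
    'from': 0,
    'post_filter': {'bool': {'must': [
        {'term': {'fa_node_type_or_subtype': 'Issue'}},
        {'term': {'field_issue_volume': '5'}},
        {'term': {'field_issue_year': '2020'}},
        {'term': {'field_issue_volume_number': '99'}},
    ]}},
    '_source': {'includes': [
        'path', 'title', 'field_subtitle', 'field_display_authors',
        'fa_node_type_or_subtype', 'nid',
        'field_issue_sspecial_articles__nid', 'field_issue_sspecial_header',
    ]},
    'query': {'match_all': {}},
    'sort': [{'field_sequence': 'asc'}, {'fa_normalized_date': 'desc'}],
    'size': 1,
}

print(json.dumps(payload, indent=2))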
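Likewise, a small illustration (placeholder titles and URLs, not real articles) of the value the rewritten get_issue_data() returns, which is the list-of-(section title, article list) structure calibre's parse_index() expects:

# Placeholder data only: the return shape of the new get_issue_data().
# The field_issue_sspecial_header section comes first, then one feed per
# fa_node_type_or_subtype in sorted order; empty sections are skipped.
feeds = [
    ('Placeholder Special Section', [
        {'title': 'Placeholder Title',
         'description': 'Placeholder subtitle By Placeholder Author',
         'url': 'https://www.foreignaffairs.com/articles/placeholder'},
    ]),
    ('Essay', [
        {'title': 'Another Placeholder',
         'description': '',
         'url': 'https://www.foreignaffairs.com/articles/placeholder-2'},
    ]),
]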