Get parse_index() working for foreign affairs AJAX backend

This commit is contained in:
Kovid Goyal 2019-08-18 17:52:49 +05:30
parent d7458841e1
commit 099cbca59c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,12 +1,12 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
from calibre.web.feeds.news import BasicNewsRecipe import json
import re import re
import html5lib import html5lib
import mechanize
from lxml import html from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
def select_form(form):
    # Predicate for mechanize's br.select_form(): pick the site login form,
    # identified by its HTML id attribute.
    form_id = form.attrs.get('id')
    return form_id == 'user-login'
def classes(classes): def classes(classes):
@ -15,6 +15,123 @@ def classes(classes):
'class': lambda x: x and frozenset(x.split()).intersection(q)}) 'class': lambda x: x and frozenset(x.split()).intersection(q)})
def as_article(source, log):
    # Convert one Elasticsearch hit's ``_source`` dict into the article dict
    # shape calibre's parse_index() expects, logging title/url as we go.
    # The description is "<subtitle> by <authors>", either part optional.
    pieces = []
    subtitle = source.get('field_subtitle')
    if subtitle:
        pieces.append(subtitle)
    authors = source.get('field_display_authors')
    if authors:
        pieces.append(' by ' + authors)
    log(source['title'], source['url'])
    return {
        'url': source['url'],
        'title': source['title'],
        'description': ''.join(pieces),
    }
def get_issue_data(br, log, node_id='1124670'):
    # Fetch the table of contents for one issue from the foreignaffairs.com
    # Elasticsearch AJAX backend (the HTML page loads articles via JS, so
    # scraping the page itself does not work).
    #
    # br: a mechanize browser (cloned from the recipe browser so that any
    #     login cookies are sent with the search requests)
    # log: progress-logging callable, invoked as log(*args)
    # node_id: Drupal node id (string) of the issue to fetch
    # returns: list of (section_title, [article dicts]) tuples suitable as
    #     the return value of BasicNewsRecipe.parse_index()
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://www.foreignaffairs.com',
        'Referer': 'https://www.foreignaffairs.com',
    }

    def get_data(data):
        # POST one JSON Elasticsearch query, return the list of hits.
        search_url = 'https://www.foreignaffairs.com/node/_search'
        req = mechanize.Request(url=search_url,
                                data=json.dumps(data),
                                headers=headers,
                                method='POST')
        res = br.open(req)
        return json.loads(res.read())['hits']['hits']

    # Step 1: look up the issue node itself to learn the featured ("special
    # section") header and the node ids of its articles.
    issue_query = {
        "_source": {
            "includes": [
                "normalized_date", "field_issue_volume_number",
                "field_issue_volume", "url", "fa_path", "title",
                "fa_node_issue_cover_url", "nid",
                "field_issue_ssection_header",
                "field_issue_ssection_articles:nid"
            ]
        },
        "query": {
            "match": {
                "id": {
                    "query": node_id
                }
            }
        },
        "size": 1
    }
    issue_data = get_data(issue_query)
    source = issue_data[0]['_source']
    nids = source['field_issue_ssection_articles:nid']
    section_title = source['field_issue_ssection_header']

    # Step 2: fetch the featured section's articles by their node ids.
    article_query = {
        '_source': {
            'includes': [
                'field_tags:name', 'field_topics:name', 'field_regions:name',
                'url', 'title', 'field_subtitle', 'field_display_authors',
                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
                'field_capsule_review_category:name',
                'fa_node_type_or_subtype', 'type'
            ]
        },
        'query': {
            'terms': {
                'id': nids
            }
        },
        'size': 30
    }
    sections_data = get_data(article_query)
    log('Found main section:', section_title)
    main_articles = [as_article(hit['_source'], log) for hit in sections_data]

    # Step 3: fetch every remaining article of this issue (by content type,
    # excluding the featured-section ids already fetched above).
    article_query['size'] = 100
    article_query['query'] = {
        'bool': {
            'must': [{
                'terms': {
                    'fa_node_type_or_subtype': [
                        'Comment', 'Essay', 'Interview', 'Review Essay',
                        'Letter From', 'Letter', 'Response', 'Capsule Review'
                    ]
                }
            }, {
                'term': {
                    'field_issue:nid': {
                        # Fix: was the hard-coded id '1124670', which made
                        # every issue other than the default return the wrong
                        # article list; restrict to the requested issue.
                        'term': node_id
                    }
                }
            }],
            'must_not': [{
                'terms': {
                    'id': nids
                }
            }]
        }
    }
    article_data = get_data(article_query)

    # Group the remaining articles into feeds by their content type.
    feed = {}
    for hit in article_data:
        article = hit['_source']
        section = article['fa_node_type_or_subtype']
        feed.setdefault(section, []).append(as_article(article, log))

    # Featured section first, then the type-based sections alphabetically.
    ans = [(sec, feed[sec]) for sec in sorted(feed)]
    return [(section_title, main_articles)] + ans
class ForeignAffairsRecipe(BasicNewsRecipe): class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications: ''' there are three modifications:
@ -55,43 +172,18 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
'publisher': publisher} 'publisher': publisher}
def parse_index(self): def parse_index(self):
answer = []
soup = self.index_to_soup(self.FRONTPAGE) soup = self.index_to_soup(self.FRONTPAGE)
div = soup.find(
'div', attrs={'class': 'magazine-actions'})
self.cover_url = div.find('img')['ng-src']
# get dates # get dates
date = re.split(r'\s\|\s', self.tag_to_string( date = re.split(r'\s\|\s', self.tag_to_string(
soup.head.title.string))[0] soup.head.title.string))[0]
self.title = "Foreign Affairs ({})".format(date) self.title = "Foreign Affairs ({})".format(date)
self.timefmt = u' [%s]' % date self.timefmt = u' [%s]' % date
cls = soup.find('body')['class']
# Fetching article list does not work as site uses javascript if isinstance(cls, (list, tuple)):
# to load articles dynamically cls = ' '.join(cls)
for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}): node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
articles = [] br = self.cloned_browser
section_title = self.tag_to_string(section.find('h2')) return get_issue_data(br, self.log, node_id)
if 'special_section.title' in section_title:
section_title = 'Special'
self.log('\nSection:', section_title)
for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
a = h3.findParent('a', href=True)
title = self.tag_to_string(h3)
url = a['href']
atr = a.findNextSibling(attrs={'class':'author'})
author = self.tag_to_string(atr) if atr else ''
desc = a.findNextSibling(attrs={'class': 'deck'})
if desc is not None:
description = self.tag_to_string(desc)
else:
description = ''
articles.append({'title': title, 'url': url,
'description': description, 'author': author})
self.log(title)
self.log('\t' + url)
if articles:
answer.append((section_title, articles))
return answer
def clean_fa_html(self, root): def clean_fa_html(self, root):
for svg in tuple(root.iter('{*}svg')): for svg in tuple(root.iter('{*}svg')):
@ -104,7 +196,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
root = html5lib.parse(raw_html, treebuilder='lxml', root = html5lib.parse(raw_html, treebuilder='lxml',
namespaceHTMLElements=False).getroot() namespaceHTMLElements=False).getroot()
self.clean_fa_html(root) self.clean_fa_html(root)
return html.tostring(root) return html.tostring(root, encoding='unicode')
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'ng-src': True}): for img in soup.findAll('img', attrs={'ng-src': True}):
@ -112,16 +204,14 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return soup return soup
def get_browser(self): def get_browser(self):
def select_form(form):
return form.attrs.get('id', None) == 'user-login'
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
# mechanize fails to parse the html correctly, so use html5lib to br.open(
# sanitize the html first
response = br.open(
'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo') 'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
root = html5lib.parse(
response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
response.set_data(html.tostring(root))
br.set_response(response)
br.select_form(predicate=select_form) br.select_form(predicate=select_form)
br.form['name'] = self.username br.form['name'] = self.username
br.form['pass'] = self.password br.form['pass'] = self.password