mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Get parse_index() working for foreign affairs AJAX backend
This commit is contained in:
parent
d7458841e1
commit
099cbca59c
@ -1,12 +1,12 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
|
import mechanize
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
def select_form(form):
|
|
||||||
return form.attrs.get('id', None) == 'user-login'
|
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
@ -15,6 +15,123 @@ def classes(classes):
|
|||||||
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||||
|
|
||||||
|
|
||||||
|
def as_article(source, log):
    """Convert one search hit's ``_source`` mapping into an article dict.

    :param source: mapping with required keys ``'url'`` and ``'title'``
        and optional keys ``'field_subtitle'`` and
        ``'field_display_authors'`` (assumed to be strings, since they are
        concatenated into the description).
    :param log: callable invoked as ``log(title, url)`` for each article.
    :return: dict with keys ``'url'``, ``'title'`` and ``'description'``.
    :raises KeyError: if ``'url'`` or ``'title'`` is missing from *source*.
    """
    url = source['url']
    title = source['title']

    # Build the description from whichever optional pieces are present.
    # Joining the parts avoids a stray leading space when there are
    # authors but no subtitle.
    parts = []
    subtitle = source.get('field_subtitle')
    if subtitle:
        parts.append(subtitle)
    authors = source.get('field_display_authors')
    if authors:
        parts.append('by ' + authors)
    desc = ' '.join(parts)

    log(title, url)
    return {'url': url, 'title': title, 'description': desc}
|
||||||
|
|
||||||
|
|
||||||
|
def get_issue_data(br, log, node_id='1124670'):
    """Fetch the article index for one issue from the site's search backend.

    Three POST requests are made to ``/node/_search``:

    1. look up the issue node itself to get its main-section header and
       the ids of the articles in that section;
    2. fetch those main-section articles;
    3. fetch the remaining articles of the issue, grouped by their
       ``fa_node_type_or_subtype`` value.

    :param br: browser object whose ``open()`` accepts a
        ``mechanize.Request`` and returns a response with ``read()``.
    :param log: callable used for progress messages; also passed to
        ``as_article``.
    :param node_id: id of the issue node to index.
    :return: list of ``(section_title, [article_dict, ...])`` tuples, the
        main section first, followed by the other sections sorted by name.
    :raises IndexError: if the issue lookup returns no hits.
    :raises KeyError: if the response lacks the expected fields.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://www.foreignaffairs.com',
        'Referer': 'https://www.foreignaffairs.com',
    }

    def get_data(data):
        # POST the JSON query and return the list of hits from the reply.
        search_url = 'https://www.foreignaffairs.com/node/_search'
        req = mechanize.Request(url=search_url,
                                data=json.dumps(data),
                                headers=headers,
                                method='POST')
        res = br.open(req)
        return json.loads(res.read())['hits']['hits']

    # Query 1: the issue node itself.
    issue_query = {
        "_source": {
            "includes": [
                "normalized_date", "field_issue_volume_number",
                "field_issue_volume", "url", "fa_path", "title",
                "fa_node_issue_cover_url", "nid",
                "field_issue_ssection_header",
                "field_issue_ssection_articles:nid"
            ]
        },
        "query": {
            "match": {
                "id": {
                    "query": node_id
                }
            }
        },
        "size": 1
    }
    source = get_data(issue_query)[0]['_source']
    nids = source['field_issue_ssection_articles:nid']
    section_title = source['field_issue_ssection_header']

    # Query 2: the articles listed in the issue's main section.
    data = {
        '_source': {
            'includes': [
                'field_tags:name', 'field_topics:name', 'field_regions:name',
                'url', 'title', 'field_subtitle', 'field_display_authors',
                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
                'field_capsule_review_category:name',
                'fa_node_type_or_subtype', 'type'
            ]
        },
        'query': {
            'terms': {
                'id': nids
            }
        },
        'size': 30
    }
    sections_data = get_data(data)
    log('Found main section:', section_title)
    main_articles = [
        as_article(hit['_source'], log) for hit in sections_data]

    # Query 3: every other article belonging to this issue. The same
    # ``_source`` field list is reused; only the query and size change.
    data['size'] = 100
    data['query'] = {
        'bool': {
            'must': [{
                'terms': {
                    'fa_node_type_or_subtype': [
                        'Comment', 'Essay', 'Interview', 'Review Essay',
                        'Letter From', 'Letter', 'Response', 'Capsule Review'
                    ]
                }
            }, {
                'term': {
                    'field_issue:nid': {
                        # Filter on the requested issue; this used to be a
                        # hard-coded id, which ignored ``node_id``.
                        'term': node_id
                    }
                }
            }],
            # Exclude the main-section articles already collected above.
            'must_not': [{
                'terms': {
                    'id': nids
                }
            }]
        }
    }

    feed = {}
    for hit in get_data(data):
        article = hit['_source']
        section = article['fa_node_type_or_subtype']
        feed.setdefault(section, []).append(as_article(article, log))

    other_sections = [(sec, feed[sec]) for sec in sorted(feed)]
    return [(section_title, main_articles)] + other_sections
|
||||||
|
|
||||||
|
|
||||||
class ForeignAffairsRecipe(BasicNewsRecipe):
|
class ForeignAffairsRecipe(BasicNewsRecipe):
|
||||||
|
|
||||||
''' there are three modifications:
|
''' there are three modifications:
|
||||||
@ -55,43 +172,18 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
'publisher': publisher}
|
'publisher': publisher}
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
answer = []
|
|
||||||
soup = self.index_to_soup(self.FRONTPAGE)
|
soup = self.index_to_soup(self.FRONTPAGE)
|
||||||
div = soup.find(
|
|
||||||
'div', attrs={'class': 'magazine-actions'})
|
|
||||||
self.cover_url = div.find('img')['ng-src']
|
|
||||||
# get dates
|
# get dates
|
||||||
date = re.split(r'\s\|\s', self.tag_to_string(
|
date = re.split(r'\s\|\s', self.tag_to_string(
|
||||||
soup.head.title.string))[0]
|
soup.head.title.string))[0]
|
||||||
self.title = "Foreign Affairs ({})".format(date)
|
self.title = "Foreign Affairs ({})".format(date)
|
||||||
self.timefmt = u' [%s]' % date
|
self.timefmt = u' [%s]' % date
|
||||||
|
cls = soup.find('body')['class']
|
||||||
# Fetching article list does not work as site uses javascript
|
if isinstance(cls, (list, tuple)):
|
||||||
# to load articles dynamically
|
cls = ' '.join(cls)
|
||||||
for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
|
node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
|
||||||
articles = []
|
br = self.cloned_browser
|
||||||
section_title = self.tag_to_string(section.find('h2'))
|
return get_issue_data(br, self.log, node_id)
|
||||||
if 'special_section.title' in section_title:
|
|
||||||
section_title = 'Special'
|
|
||||||
self.log('\nSection:', section_title)
|
|
||||||
for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
|
|
||||||
a = h3.findParent('a', href=True)
|
|
||||||
title = self.tag_to_string(h3)
|
|
||||||
url = a['href']
|
|
||||||
atr = a.findNextSibling(attrs={'class':'author'})
|
|
||||||
author = self.tag_to_string(atr) if atr else ''
|
|
||||||
desc = a.findNextSibling(attrs={'class': 'deck'})
|
|
||||||
if desc is not None:
|
|
||||||
description = self.tag_to_string(desc)
|
|
||||||
else:
|
|
||||||
description = ''
|
|
||||||
articles.append({'title': title, 'url': url,
|
|
||||||
'description': description, 'author': author})
|
|
||||||
self.log(title)
|
|
||||||
self.log('\t' + url)
|
|
||||||
if articles:
|
|
||||||
answer.append((section_title, articles))
|
|
||||||
return answer
|
|
||||||
|
|
||||||
def clean_fa_html(self, root):
|
def clean_fa_html(self, root):
|
||||||
for svg in tuple(root.iter('{*}svg')):
|
for svg in tuple(root.iter('{*}svg')):
|
||||||
@ -104,7 +196,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
root = html5lib.parse(raw_html, treebuilder='lxml',
|
root = html5lib.parse(raw_html, treebuilder='lxml',
|
||||||
namespaceHTMLElements=False).getroot()
|
namespaceHTMLElements=False).getroot()
|
||||||
self.clean_fa_html(root)
|
self.clean_fa_html(root)
|
||||||
return html.tostring(root)
|
return html.tostring(root, encoding='unicode')
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for img in soup.findAll('img', attrs={'ng-src': True}):
|
for img in soup.findAll('img', attrs={'ng-src': True}):
|
||||||
@ -112,16 +204,14 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
|
|
||||||
|
def select_form(form):
|
||||||
|
return form.attrs.get('id', None) == 'user-login'
|
||||||
|
|
||||||
br = BasicNewsRecipe.get_browser(self)
|
br = BasicNewsRecipe.get_browser(self)
|
||||||
if self.username is not None and self.password is not None:
|
if self.username is not None and self.password is not None:
|
||||||
# mechanize fails to parse the html correctly, so use html5lib to
|
br.open(
|
||||||
# sanitize the html first
|
|
||||||
response = br.open(
|
|
||||||
'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
|
'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
|
||||||
root = html5lib.parse(
|
|
||||||
response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
|
|
||||||
response.set_data(html.tostring(root))
|
|
||||||
br.set_response(response)
|
|
||||||
br.select_form(predicate=select_form)
|
br.select_form(predicate=select_form)
|
||||||
br.form['name'] = self.username
|
br.form['name'] = self.username
|
||||||
br.form['pass'] = self.password
|
br.form['pass'] = self.password
|
||||||
|
Loading…
x
Reference in New Issue
Block a user