diff --git a/recipes/foreignaffairs.recipe b/recipes/foreignaffairs.recipe
index a3f5436d61..60f095db5c 100644
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@@ -1,12 +1,12 @@
 #!/usr/bin/env python2
-from calibre.web.feeds.news import BasicNewsRecipe
+import json
 import re
+
 import html5lib
+import mechanize
 from lxml import html
 
-
-def select_form(form):
-    return form.attrs.get('id', None) == 'user-login'
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 def classes(classes):
@@ -15,6 +15,133 @@ def classes(classes):
         'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 
+def as_article(source, log):
+    '''Convert the ``_source`` dict of one search hit into an article dict.
+
+    The description is the subtitle followed by " by <authors>"; each part
+    is added only when its field is present and non-empty. The title and
+    URL are passed to ``log``.
+    '''
+    url = source['url']
+    title = source['title']
+    desc = ''
+    if source.get('field_subtitle'):
+        desc += source['field_subtitle']
+    if source.get('field_display_authors'):
+        desc += ' by ' + source['field_display_authors']
+    log(title, url)
+    return {'url': url, 'title': title, 'description': desc}
+
+
+def get_issue_data(br, log, node_id='1124670'):
+    '''Fetch the sections and articles of the issue with id ``node_id``.
+
+    Sends three POST queries to the site's ``/node/_search`` endpoint
+    through the browser ``br``: one for the issue itself, one for the
+    articles of its main section and one for all other articles of the
+    issue, which are grouped by their ``fa_node_type_or_subtype``.
+
+    Returns a list of ``(section_title, articles)`` tuples with the main
+    section first, followed by the remaining sections sorted by name.
+    '''
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Content-Type': 'application/json;charset=UTF-8',
+        'Origin': 'https://www.foreignaffairs.com',
+        'Referer': 'https://www.foreignaffairs.com',
+    }
+    data = {
+        "_source": {
+            "includes": [
+                "normalized_date", "field_issue_volume_number",
+                "field_issue_volume", "url", "fa_path", "title",
+                "fa_node_issue_cover_url", "nid",
+                "field_issue_ssection_header",
+                "field_issue_ssection_articles:nid"
+            ]
+        },
+        "query": {
+            "match": {
+                "id": {
+                    "query": node_id
+                }
+            }
+        },
+        "size": 1
+    }
+
+    def get_data(data):
+        search_url = 'https://www.foreignaffairs.com/node/_search'
+        req = mechanize.Request(url=search_url,
+                                data=json.dumps(data),
+                                headers=headers,
+                                method='POST')
+        res = br.open(req)
+        return json.loads(res.read())['hits']['hits']
+
+    issue_data = get_data(data)
+    source = issue_data[0]['_source']
+    nids = source['field_issue_ssection_articles:nid']
+    section_title = source['field_issue_ssection_header']
+
+    data = {
+        '_source': {
+            'includes': [
+                'field_tags:name', 'field_topics:name', 'field_regions:name',
+                'url', 'title', 'field_subtitle', 'field_display_authors',
+                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
+                'field_capsule_review_category:name',
+                'fa_node_type_or_subtype', 'type'
+            ]
+        },
+        'query': {
+            'terms': {
+                'id': nids
+            }
+        },
+        'size': 30
+    }
+
+    sections_data = get_data(data)
+    log('Found main section:', section_title)
+    main_articles = [as_article(a['_source'], log) for a in sections_data]
+
+    # Remaining articles of the requested issue, excluding the main section
+    data['size'] = 100
+    data['query'] = {
+        'bool': {
+            'must': [{
+                'terms': {
+                    'fa_node_type_or_subtype': [
+                        'Comment', 'Essay', 'Interview', 'Review Essay',
+                        'Letter From', 'Letter', 'Response', 'Capsule Review'
+                    ]
+                }
+            }, {
+                'term': {
+                    'field_issue:nid': {
+                        'term': node_id
+                    }
+                }
+            }],
+            'must_not': [{
+                'terms': {
+                    'id': nids
+                }
+            }]
+        }
+    }
+
+    feed = {}
+    for hit in get_data(data):
+        article = hit['_source']
+        section = article['fa_node_type_or_subtype']
+        feed.setdefault(section, []).append(as_article(article, log))
+    ans = [(sec, feed[sec]) for sec in sorted(feed)]
+
+    return [(section_title, main_articles)] + ans
+
+
 class ForeignAffairsRecipe(BasicNewsRecipe):
 
     ''' there are three modifications:
@@ -55,43 +182,19 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
                           'publisher': publisher}
 
     def parse_index(self):
-        answer = []
         soup = self.index_to_soup(self.FRONTPAGE)
-        div = soup.find(
-            'div', attrs={'class': 'magazine-actions'})
-        self.cover_url = div.find('img')['ng-src']
         # get dates
         date = re.split(r'\s\|\s', self.tag_to_string(
             soup.head.title.string))[0]
         self.title = "Foreign Affairs ({})".format(date)
         self.timefmt = u' [%s]' % date
-
-        # Fetching article list does not work as site uses javascript
-        # to load articles dynamically
-        for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
-            articles = []
-            section_title = self.tag_to_string(section.find('h2'))
-            if 'special_section.title' in section_title:
-                section_title = 'Special'
-            self.log('\nSection:', section_title)
-            for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
-                a = h3.findParent('a', href=True)
-                title = self.tag_to_string(h3)
-                url = a['href']
-                atr = a.findNextSibling(attrs={'class':'author'})
-                author = self.tag_to_string(atr) if atr else ''
-                desc = a.findNextSibling(attrs={'class': 'deck'})
-                if desc is not None:
-                    description = self.tag_to_string(desc)
-                else:
-                    description = ''
-                articles.append({'title': title, 'url': url,
-                                 'description': description, 'author': author})
-                self.log(title)
-                self.log('\t' + url)
-            if articles:
-                answer.append((section_title, articles))
-        return answer
+        # The issue's node id is read from the page-node-<id> class of <body>
+        cls = soup.find('body')['class']
+        if isinstance(cls, (list, tuple)):
+            cls = ' '.join(cls)
+        node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
+        br = self.cloned_browser
+        return get_issue_data(br, self.log, node_id)
 
     def clean_fa_html(self, root):
         for svg in tuple(root.iter('{*}svg')):
@@ -104,7 +207,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         root = html5lib.parse(raw_html, treebuilder='lxml',
                               namespaceHTMLElements=False).getroot()
         self.clean_fa_html(root)
-        return html.tostring(root)
+        return html.tostring(root, encoding='unicode')
 
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'ng-src': True}):
@@ -112,16 +215,14 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         return soup
 
     def get_browser(self):
+
+        def select_form(form):
+            return form.attrs.get('id', None) == 'user-login'
+
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            # mechanize fails to parse the html correctly, so use html5lib to
-            # sanitize the html first
-            response = br.open(
+            br.open(
                 'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
-            root = html5lib.parse(
-                response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
-            response.set_data(html.tostring(root))
-            br.set_response(response)
             br.select_form(predicate=select_form)
             br.form['name'] = self.username
             br.form['pass'] = self.password