#!/usr/bin/env python
import json
import re

import mechanize

from calibre.web.feeds.news import BasicNewsRecipe, classes

# Endpoint and request headers used for every search request.
_SITE = 'https://www.foreignaffairs.com'
_SEARCH_URL = _SITE + '/fa-search.php'
_SEARCH_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': _SITE,
    'Referer': _SITE,
}

# Fields requested back from the search endpoint for every hit.
_SOURCE_FIELDS = [
    'nid',
    'path',
    'title',
    'field_subtitle',
    'field_display_authors',
    'fa_node_type_or_subtype',
    'field_issue_sspecial_articles__nid',
    'field_issue_sspecial_header',
]


def as_article(source, log):
    """Build an article dict from a mapping with plain (non-list) fields.

    Reads ``url`` and ``title`` directly and, when present and truthy,
    ``field_subtitle`` and ``field_display_authors``; logs the title and
    URL via ``log`` and returns ``{'url', 'title', 'description'}``.

    NOTE(review): nothing in this file calls this function;
    ``get_issue_data`` uses ``_hit_as_article`` instead, which reads
    list-valued fields. Kept so the module-level name remains available.
    """
    url = source['url']
    title = source['title']
    desc = ''
    if source.get('field_subtitle'):
        desc += source['field_subtitle']
    if source.get('field_display_authors'):
        desc += ' by ' + source['field_display_authors']
    log(title, url)
    return {'url': url, 'title': title, 'description': desc}


def _make_query(**kwds):
    """Return the JSON-serialisable search body for the given field constraints.

    ``size`` (default 1) and ``filter`` are popped from ``kwds``; every
    remaining keyword becomes one clause. With a truthy ``filter`` the
    clauses are ``terms`` entries under ``filter`` (the value is passed
    through as-is, e.g. a list of node ids); otherwise they are ``term``
    entries under ``must``.
    """
    size = kwds.pop('size', 1)
    is_filter = kwds.pop('filter', None)
    if is_filter:
        q = {'filter': [{'terms': {k: v}} for k, v in kwds.items()]}
    else:
        q = {'must': [{'term': {k: v}} for k, v in kwds.items()]}
    # NOTE(review): the body looks like an Elasticsearch query - confirm
    # against the site's search backend before changing its structure.
    return {
        'from': 0,
        'post_filter': {'bool': q},
        '_source': {
            'includes': list(_SOURCE_FIELDS)
        },
        'query': {
            'match_all': {}
        },
        'sort': [{'field_sequence': 'asc'}, {'fa_normalized_date': 'desc'}],
        'size': size,
    }


def _search(br, query):
    """POST ``query`` as JSON to the search endpoint and return ``hits.hits``."""
    req = mechanize.Request(
        url=_SEARCH_URL, data=json.dumps(query),
        headers=dict(_SEARCH_HEADERS), method='POST')
    res = br.open(req)
    return json.loads(res.read())['hits']['hits']


def _hit_as_article(source):
    """Convert the ``_source`` of one search hit into an article dict.

    Fields are indexed with ``[0]``, i.e. each present field is expected to
    be a non-empty sequence. The description is the subtitle (if any)
    followed by ``' By <authors>'`` (if any).
    """
    title = source['title'][0]
    desc = ''
    fs = source.get('field_subtitle')
    if fs:
        desc = fs[0]
    aus = source.get('field_display_authors')
    if aus:
        desc += ' By ' + aus[0]
    url = _SITE + source['path'][0]
    return {'title': title, 'description': desc, 'url': url}


def get_issue_data(br, log, node_id='1126213', year='2020', volnum='99', issue_vol='5'):
    """Return the feeds of one magazine issue as ``[(section_title, articles)]``.

    :param br: browser object whose ``open()`` accepts a ``mechanize.Request``.
    :param log: callable used to log section titles and article title/URL pairs.
    :param node_id: node id of the issue, used to find the articles in it.
    :param year: issue year, used to look up the issue record.
    :param volnum: volume number, used to look up the issue record.
    :param issue_vol: issue number within the volume.
    :raises ValueError: if the search returns no issue record.

    When the issue record lists special articles, they form the first
    feed, titled with the issue's special header. The remaining feeds
    group the issue's articles by ``fa_node_type_or_subtype``, sorted by
    section title.
    """
    feeds = []

    issue_hits = _search(br, _make_query(
        fa_node_type_or_subtype='Issue',
        field_issue_volume=issue_vol, field_issue_year=year,
        field_issue_volume_number=volnum
    ))
    if not issue_hits:
        # Previously an opaque IndexError from indexing the empty hit list.
        raise ValueError(
            'No issue found for year=%s volume=%s issue=%s' % (year, volnum, issue_vol))
    issue_data = issue_hits[0]['_source']

    if 'field_issue_sspecial_articles__nid' in issue_data:
        main_sec_title = issue_data['field_issue_sspecial_header'][0]
        main_sec_nids = issue_data['field_issue_sspecial_articles__nid']
        articles_data = _search(
            br, _make_query(nid=main_sec_nids, filter=True, size=len(main_sec_nids)))
        articles = []
        log(main_sec_title)
        for entry in articles_data:
            article = _hit_as_article(entry['_source'])
            articles.append(article)
            log('\t', article['title'], article['url'])
        feeds.append((main_sec_title, articles))

    # At most 50 articles are requested for the issue.
    articles_data = _search(br, _make_query(field_issue__nid=node_id, size=50))
    ans = {}
    for entry in articles_data:
        source = entry['_source']
        section = source['fa_node_type_or_subtype'][0]
        ans.setdefault(section, []).append(_hit_as_article(source))

    for sectitle in sorted(ans):
        # Every section in ``ans`` has at least one article by construction.
        articles = ans[sectitle]
        log(sectitle)
        for art in articles:
            log('\t', art['title'], art['url'])
        feeds.append((sectitle, articles))

    return feeds


class ForeignAffairsRecipe(BasicNewsRecipe):
    """calibre recipe that downloads one issue of Foreign Affairs."""

    title = u'Foreign Affairs'
    __author__ = 'Kovid Goyal'
    language = 'en'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    needs_subscription = 'optional'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Foreign_Affairs_Logo.svg/500px-Foreign_Affairs_Logo.svg.png'
    extra_css = '''
        .topper__date, .topper__byline, .figure__caption, .calibre-nuked-tag-figcaption, .font-style-italic { font-size: small; }
        .topper__subtitle { font-style: italic; color: #202020; }
        em, blockquote { color: #202020; }
        img {display:block; margin:0 auto;}
    '''

    # Page of the issue to download; parse_index() overrides it when the
    # 'issue' option is supplied.
    INDEX = 'https://www.foreignaffairs.com/magazine'

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download ',
            'long': 'For example, 2024/103/1'
        }
    }

    keep_only_tags = [
        classes('topper__heading-container topper__image-container paywall-content'),
    ]
    remove_tags = [
        dict(name=['svg', 'meta']),
        classes('topper__issue article-newsletter-signup--container dfp-tag-wrapper')
    ]

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True

    def parse_index(self):
        """Fetch the issue page and return its feeds via ``get_issue_data``.

        Sets ``self.timefmt`` from the page title and, when a cover image
        is found, ``self.cover_url``.

        :raises ValueError: if the page has no canonical link or no
            shortlink carrying a node id.
        """
        d = self.recipe_specific_options.get('issue')
        # The class-level entry is a dict, so this only fires once the entry
        # holds a string. Presumably calibre substitutes the user's input at
        # runtime - confirm against the calibre API.
        if d and isinstance(d, str):
            self.INDEX = 'https://www.foreignaffairs.com/issues/' + d

        soup = self.index_to_soup(self.INDEX)
        # get dates
        date = re.split(r'\s\|\s', self.tag_to_string(
            soup.head.title.string))[0]
        self.timefmt = u' [%s]' % date

        canonical = soup.find('link', rel='canonical', href=True)
        if canonical is None:
            raise ValueError('No canonical link found on ' + self.INDEX)
        # The last three path components are year/volume/issue; strip a
        # trailing slash so it cannot shift them.
        year, volnum, issue_vol = canonical['href'].rstrip('/').split('/')[-3:]

        # The cover is optional: a page without the expected <main> element
        # must not abort the download.
        main = soup.find('main', attrs={'id': 'content'})
        cov = None
        if main is not None:
            cov = main.find('img', attrs={'srcset': lambda x: x and 'Cover.jpg' in x})
        if cov:
            # Take the first srcset candidate and swap the small rendition
            # for the large 2x one.
            self.cover_url = re.sub(
                r'_webp_issue_small_\dx', '_webp_issue_large_2x', cov['srcset'].split()[0]
            )

        shortlink = soup.find('link', attrs={'rel': 'shortlink'})
        href = shortlink.get('href') if shortlink is not None else None
        match = re.search(r'https://www\.foreignaffairs\.com/node/(\d+)', href or '')
        if match is None:
            raise ValueError('No shortlink with a node id found on ' + self.INDEX)
        node_id = match.group(1)

        br = self.cloned_browser
        feeds = get_issue_data(br, self.log, node_id, year, volnum, issue_vol)
        return feeds

    def preprocess_html(self, soup):
        """Retag subtitle/byline elements and point images at a larger rendition."""
        for h2 in soup.findAll(**classes('topper__subtitle')):
            h2.name = 'p'
        for by in soup.findAll(**classes('topper__byline topper__date font-style-italic')):
            by.name = 'div'
        for img in soup.find_all('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            # An empty srcset has no candidate URL; leave such images alone.
            if candidates:
                img['src'] = re.sub(r'_webp_small_\dx', '_webp_large_1x', candidates[0])
        return soup

    def get_browser(self):
        """Return a browser, logged in when both username and password are set."""

        def select_form(form):
            return form.attrs.get('id', None) == 'fa-user-login-form'

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.foreignaffairs.com/user/login')
            br.select_form(predicate=select_form)
            br.form['name'] = self.username
            br.form['pass'] = self.password
            br.submit()
        return br