Get parse_index() working for foreign affairs AJAX backend

Kovid Goyal 2019-08-18 17:52:49 +05:30
parent d7458841e1
commit 099cbca59c


@@ -1,12 +1,12 @@
 #!/usr/bin/env python2
+from calibre.web.feeds.news import BasicNewsRecipe
+import json
 import re
 import html5lib
+import mechanize
 from lxml import html
-def select_form(form):
-    return form.attrs.get('id', None) == 'user-login'
-from calibre.web.feeds.news import BasicNewsRecipe
 def classes(classes):
@@ -15,6 +15,123 @@ def classes(classes):
         'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
+def as_article(source, log):
+    url = source['url']
+    title = source['title']
+    desc = ''
+    if source.get('field_subtitle'):
+        desc += source['field_subtitle']
+    if source.get('field_display_authors'):
+        desc += ' by ' + source['field_display_authors']
+    log(title, url)
+    return {'url': url, 'title': title, 'description': desc}
+
+
+def get_issue_data(br, log, node_id='1124670'):
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Content-Type': 'application/json;charset=UTF-8',
+        'Origin': 'https://www.foreignaffairs.com',
+        'Referer': 'https://www.foreignaffairs.com',
+    }
+    data = {
+        "_source": {
+            "includes": [
+                "normalized_date", "field_issue_volume_number",
+                "field_issue_volume", "url", "fa_path", "title",
+                "fa_node_issue_cover_url", "nid",
+                "field_issue_ssection_header",
+                "field_issue_ssection_articles:nid"
+            ]
+        },
+        "query": {
+            "match": {
+                "id": {
+                    "query": node_id
+                }
+            }
+        },
+        "size": 1
+    }
+
+    def get_data(data):
+        search_url = 'https://www.foreignaffairs.com/node/_search'
+        req = mechanize.Request(url=search_url,
+                                data=json.dumps(data),
+                                headers=headers,
+                                method='POST')
+        res = br.open(req)
+        return json.loads(res.read())['hits']['hits']
+
+    issue_data = get_data(data)
+    source = issue_data[0]['_source']
+    nids = source['field_issue_ssection_articles:nid']
+    section_title = source['field_issue_ssection_header']
+
+    data = {
+        '_source': {
+            'includes': [
+                'field_tags:name', 'field_topics:name', 'field_regions:name',
+                'url', 'title', 'field_subtitle', 'field_display_authors',
+                'nid', 'fa_node_has_audio', 'fa_node_paywall_free',
+                'field_capsule_review_category:name',
+                'fa_node_type_or_subtype', 'type'
+            ]
+        },
+        'query': {
+            'terms': {
+                'id': nids
+            }
+        },
+        'size': 30
+    }
+    sections_data = get_data(data)
+    log('Found main section:', section_title)
+    main_articles = []
+    for article in sections_data:
+        main_articles.append(as_article(article['_source'], log))
+    feed = {}
+
+    data['size'] = 100
+    data['query'] = {
+        'bool': {
+            'must': [{
+                'terms': {
+                    'fa_node_type_or_subtype': [
+                        'Comment', 'Essay', 'Interview', 'Review Essay',
+                        'Letter From', 'Letter', 'Response', 'Capsule Review'
+                    ]
+                }
+            }, {
+                'term': {
+                    'field_issue:nid': {
+                        'term': node_id
+                    }
+                }
+            }],
+            'must_not': [{
+                'terms': {
+                    'id': nids
+                }
+            }]
+        }
+    }
+    article_data = get_data(data)
+    for article in article_data:
+        article = article['_source']
+        section = article['fa_node_type_or_subtype']
+        if section not in feed:
+            feed[section] = []
+        feed[section].append(as_article(article, log))
+
+    ans = []
+    for sec in sorted(feed):
+        ans.append((sec, feed[sec]))
+    return [(section_title, main_articles)] + ans
+
+
 class ForeignAffairsRecipe(BasicNewsRecipe):
     ''' there are three modifications:
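
The article list is now fetched from the site's Elasticsearch-style search endpoint rather than scraped out of JavaScript-rendered HTML. For reference, a minimal standalone sketch of the request pattern get_issue_data() relies on (endpoint, headers and response shape are taken from the diff above; the node id is only an example, and error handling is omitted):

    import json
    import mechanize

    br = mechanize.Browser()
    br.set_handle_robots(False)
    query = {
        'query': {'match': {'id': {'query': '1124670'}}},  # example node id
        'size': 1,
    }
    req = mechanize.Request(
        url='https://www.foreignaffairs.com/node/_search',
        data=json.dumps(query),
        headers={'Content-Type': 'application/json;charset=UTF-8'},
        method='POST')
    # the endpoint answers with the usual Elasticsearch envelope;
    # the requested fields live under each hit's '_source'
    for hit in json.loads(br.open(req).read())['hits']['hits']:
        print(hit['_source'].get('title'))
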
@@ -55,43 +172,18 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
                           'publisher': publisher}
 
     def parse_index(self):
-        answer = []
         soup = self.index_to_soup(self.FRONTPAGE)
-        div = soup.find(
-            'div', attrs={'class': 'magazine-actions'})
-        self.cover_url = div.find('img')['ng-src']
         # get dates
         date = re.split(r'\s\|\s', self.tag_to_string(
             soup.head.title.string))[0]
         self.title = "Foreign Affairs ({})".format(date)
         self.timefmt = u' [%s]' % date
-        # Fetching article list does not work as site uses javascript
-        # to load articles dynamically
-        for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
-            articles = []
-            section_title = self.tag_to_string(section.find('h2'))
-            if 'special_section.title' in section_title:
-                section_title = 'Special'
-            self.log('\nSection:', section_title)
-            for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
-                a = h3.findParent('a', href=True)
-                title = self.tag_to_string(h3)
-                url = a['href']
-                atr = a.findNextSibling(attrs={'class':'author'})
-                author = self.tag_to_string(atr) if atr else ''
-                desc = a.findNextSibling(attrs={'class': 'deck'})
-                if desc is not None:
-                    description = self.tag_to_string(desc)
-                else:
-                    description = ''
-                articles.append({'title': title, 'url': url,
-                                 'description': description, 'author': author})
-                self.log(title)
-                self.log('\t' + url)
-            if articles:
-                answer.append((section_title, articles))
-        return answer
+        cls = soup.find('body')['class']
+        if isinstance(cls, (list, tuple)):
+            cls = ' '.join(cls)
+        node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
+        br = self.cloned_browser
+        return get_issue_data(br, self.log, node_id)
 
     def clean_fa_html(self, root):
         for svg in tuple(root.iter('{*}svg')):
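
parse_index() now reads the issue's node id from a CSS class on the front page's <body> tag and hands it to get_issue_data(). A small worked example of that extraction, using a hypothetical class string:

    import re

    cls = 'html not-front page-node page-node-1124670'  # hypothetical value
    node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
    print(node_id)  # prints: 1124670
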
@@ -104,7 +196,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         root = html5lib.parse(raw_html, treebuilder='lxml',
                               namespaceHTMLElements=False).getroot()
         self.clean_fa_html(root)
-        return html.tostring(root)
+        return html.tostring(root, encoding='unicode')
 
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'ng-src': True}):
@@ -112,16 +204,14 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         return soup
 
     def get_browser(self):
+        def select_form(form):
+            return form.attrs.get('id', None) == 'user-login'
+
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            # mechanize fails to parse the html correctly, so use html5lib to
-            # sanitize the html first
-            response = br.open(
+            br.open(
                 'https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
-            root = html5lib.parse(
-                response.get_data(), treebuilder='lxml', namespaceHTMLElements=False)
-            response.set_data(html.tostring(root))
-            br.set_response(response)
             br.select_form(predicate=select_form)
             br.form['name'] = self.username
             br.form['pass'] = self.password
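
Note that get_issue_data() already returns data in the shape calibre expects from parse_index(): a list of (section title, article list) pairs, where each article is a dict with 'title', 'url' and 'description' keys. Illustrative values only:

    [('Essays', [{'title': 'Some article',
                  'url': 'https://www.foreignaffairs.com/articles/...',
                  'description': 'Some subtitle by Some Author'}])]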