Work on Foreign Affairs

2025-07-09 03:04:10 -04:00 · 2019-02-05 10:07:09 +05:30 · 2019-02-05 10:07:09 +05:30 · 848934643e
commit 848934643e
parent 1f203ba7b6
1 changed files with 35 additions and 28 deletions
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@@ -9,6 +9,12 @@ def select_form(form):
    return form.attrs.get('id', None) == 'user-login'
 def classes(classes):
    '''Build an ``attrs`` matcher for tags carrying any of the given classes.

    classes is a string of class names separated by single spaces. The
    returned dict has one key, ``attrs``, mapping ``class`` to a predicate
    that is truthy when a tag's class string shares at least one name
    with that list.
    NOTE(review): presumably meant to be unpacked into soup.find()/findAll()
    calls -- no call site is visible in this hunk, confirm against the recipe.
    '''
    # The parameter shadows the function name; here it is the input string.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        # A missing/empty class value short-circuits and is returned as-is
        # (falsy); otherwise the result is the set of names in common.
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 class ForeignAffairsRecipe(BasicNewsRecipe):
    ''' there are three modifications:
@@ -37,7 +43,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    remove_javascript = True
    needs_subscription = True
-    INDEX = 'http://www.foreignaffairs.com'
+    INDEX = 'https://www.foreignaffairs.com'
    FRONTPAGE = INDEX + '/magazine'
    keep_only_tags = [
@@ -50,36 +56,39 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    def parse_index(self):
        answer = []
-        soup = self.index_to_soup(html.tostring(self.clean_fa_html(
+        soup = self.index_to_soup(self.FRONTPAGE)
            self.index_to_soup(self.FRONTPAGE, as_tree=True))))
        div = soup.find(
-            'div', attrs={'class': 'magazine-hero__image image_auto_width'})
+            'div', attrs={'class': 'magazine-actions'})
-        self.cover_url = div.find('img')['src']
+        self.cover_url = div.find('img')['ng-src']
        # get dates
-        date = re.split('\s\|\s', self.tag_to_string(
+        date = re.split(r'\s\|\s', self.tag_to_string(
            soup.head.title.string))[0]
        self.title = "Foreign Affairs ({})".format(date)
        self.timefmt = u' [%s]' % date
-        for section in soup.findAll(attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
+        # Fetching article list does not work as site uses javascript
        # to load articles dynamically
        for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
            articles = []
-            section_title = self.tag_to_string(section.find('h1'))
+            section_title = self.tag_to_string(section.find('h2'))
-            for h2 in section.findAll('h2'):
+            if 'special_section.title' in section_title:
-                a = h2.parent
+                section_title = 'Special'
-                if a.get('href'):
+            self.log('\nSection:', section_title)
-                    title = self.tag_to_string(h2)
+            for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
-                    url = a['href']
+                a = h3.findParent('a', href=True)
-                    atr = a.findNextSibling(attrs={'class':'author'})
+                title = self.tag_to_string(h3)
-                    author = self.tag_to_string(atr) if atr else ''
+                url = a['href']
-                    desc = a.findNextSibling(attrs={'class': 'deck'})
+                atr = a.findNextSibling(attrs={'class':'author'})
-                    if desc is not None:
+                author = self.tag_to_string(atr) if atr else ''
-                        description = self.tag_to_string(desc)
+                desc = a.findNextSibling(attrs={'class': 'deck'})
-                    else:
+                if desc is not None:
-                        description = ''
+                    description = self.tag_to_string(desc)
-                    articles.append({'title': title, 'url': url,
+                else:
-                                     'description': description, 'author': author})
+                    description = ''
-                    self.log(title)
+                articles.append({'title': title, 'url': url,
-                    self.log('\t' + url)
+                                    'description': description, 'author': author})
                self.log(title)
                self.log('\t' + url)
            if articles:
                answer.append((section_title, articles))
        return answer
@@ -98,10 +107,8 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
        return html.tostring(root)
    def preprocess_html(self, soup):
-        for img in soup.findAll('img', attrs={'src': True}):
+        for img in soup.findAll('img', attrs={'ng-src': True}):
-            if not img['src'].startswith('http'):
+            img['src'] = img['ng-src']
                img['src'] = self.INDEX + img['src']
        return soup
    def get_browser(self):