Improved Foreign Affairs

2025-08-30 23:00:21 -04:00 · 2012-02-05 14:17:45 +05:30 · 2012-02-05 14:17:45 +05:30 · ca647fe34b
commit ca647fe34b
parent 8677789312
1 changed files with 52 additions and 46 deletions
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@ -3,10 +3,17 @@ import re
 from calibre.ptempfile import PersistentTemporaryFile

 class ForeignAffairsRecipe(BasicNewsRecipe):
+    ''' This version makes three modifications:
+    1) fetch the issue cover
+    2) toggle whether premium articles are ignored (see INCLUDE_PREMIUM)
+    3) extract proper section names, e.g. "Comments", "Essay"
+
+    by Chen Wei weichen302@gmx.com, 2012-02-05'''
+
    __license__  = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
-    version = 1
+    version = 1.01

    title = u'Foreign Affairs (Subcription or (free) Registration)'
    publisher = u'Council on Foreign Relations'
@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    remove_javascript = True

    INDEX = 'http://www.foreignaffairs.com'
+    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
+    INCLUDE_PREMIUM = False
+

    remove_tags = []
    remove_tags.append(dict(name = 'base'))
@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    temp_files = []
    articles_are_obfuscated = True

+    def get_cover_url(self):  # builds the cover image URL from the FRONTPAGE markup
+        soup = self.index_to_soup(self.FRONTPAGE)  # parse the magazine front page
+        div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})  # NOTE(review): presumably None when this div is absent; the next line would then raise -- confirm
+        img_url =  div.find('img')['src']  # presumably a site-relative path, since INDEX is prepended below -- TODO confirm
+        return self.INDEX + img_url  # INDEX + src of the first <img> inside the cover div
+
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        br.open(url)
@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):

        return self.temp_files[-1].name

+
    def parse_index(self):
-        soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
-        articles = []
        answer = []
-        content = soup.find('div', attrs = {'class': 'center-wrapper'})
+        soup = self.index_to_soup(self.FRONTPAGE)  # parse the magazine front page
+        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})  # the node following each separator div is treated as one section
+        for sec in sec_start:
+            content = sec.nextSibling  # NOTE(review): nextSibling may be a text node rather than a tag -- confirm against the page markup
            if content:
-            for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
-                tag = div.find('div', attrs = {'class': 'views-field-title'})
-                if tag:
-                    a = tag.find('a')
-                    if a:
-                        title = self.tag_to_string(a)
-                        url = self.INDEX + a['href']
-
-                        author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
-                        tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
-                        # If they ever fix their markup, this will break :-(
-                        summary = self.tag_to_string(tag.findNextSibling('p'))
-                        description = author  + '<br/>' + summary
-
-                        articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                    else:
-                        continue
-                else:
-                    continue
-
-            answer.append(('Magazine', articles))
-
-            ul = content.find('ul')
-            if ul:
+                section = self.tag_to_string(content.find('h2'))  # section name is the text of the first <h2> in the content
                articles = []
-                for li in ul.findAll('li'):
-                    tag = li.find('div', attrs = {'class': 'views-field-title'})
+
+                tags = []  # candidate article containers: matching row divs first, then every <li>
+                for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):  # NOTE(review): [odd|even] is a regex character class, not an alternation
+                    tags.append(div)
+                ul = content.find('ul')  # NOTE(review): ul is not used by the added lines; the loop below searches content directly
+                for li in content.findAll('li'):
+                    tags.append(li)
+
+                for div in tags:
+                    title = url = description = author = None  # reset per candidate so values never carry over
+
+                    if self.INCLUDE_PREMIUM:  # when premium articles are wanted, the premium-icon lookup is skipped
+                        found_premium = False
+                    else:
+                        found_premium = div.findAll('span', attrs={'class':
+                                                               'premium-icon'})
+                    if not found_premium:  # an empty findAll result is falsy, so unmarked items pass
+                        tag = div.find('div', attrs={'class': 'views-field-title'})
+
                        if tag:
                            a = tag.find('a')
                            if a:
                                title = self.tag_to_string(a)
                                url = self.INDEX + a['href']
-                            description = ''
-                            tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
-                            if tag:
-                                description = self.tag_to_string(tag)
-
-                            articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                        else:
-                            continue
-                    else:
-                        continue
-
-                answer.append(('Letters to the Editor', articles))
-
+                            author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
+                            tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                            description = self.tag_to_string(tag_summary)
+                            articles.append({'title':title, 'date':None, 'url':url,  # NOTE(review): appears to run even when no <a> was found, leaving title and url as None -- confirm indentation
+                                     'description':description, 'author':author})
+                if articles:  # sections that collected no articles are left out of the result
+                    answer.append((section, articles))
        return answer

    def preprocess_html(self, soup):