Improved Foreign Affairs

2025-08-30 23:00:21 -04:00 · 2012-02-05 14:17:45 +05:30 · 2012-02-05 14:17:45 +05:30 · ca647fe34b
commit ca647fe34b
parent 8677789312
1 changed file with 52 additions and 46 deletions
--- a/recipes/foreignaffairs.recipe
+++ b/recipes/foreignaffairs.recipe
@@ -3,10 +3,17 @@ import re
 from calibre.ptempfile import PersistentTemporaryFile
 class ForeignAffairsRecipe(BasicNewsRecipe):
    ''' there are three modifications:
    1) fetch issue cover
    2) toggle ignore premium articles
    3) extract proper section names, ie. "Comments", "Essay"
    by Chen Wei weichen302@gmx.com, 2012-02-05'''
    __license__  = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
-    version = 1
+    version = 1.01
    title = u'Foreign Affairs (Subcription or (free) Registration)'
    publisher = u'Council on Foreign Relations'
@@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    remove_javascript = True
    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
    INCLUDE_PREMIUM = False
    remove_tags = []
    remove_tags.append(dict(name = 'base'))
@@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
    temp_files = []
    articles_are_obfuscated = True
    def get_cover_url(self):
        '''Return the absolute URL of the current issue's cover image.

        Loads the page at ``self.FRONTPAGE`` and takes the ``src`` of the
        first ``<img>`` inside the div with class
        ``inthemag-issuebuy-cover``, prefixed with ``self.INDEX``.

        Returns None (instead of raising) when the div, the image or its
        ``src`` attribute cannot be found, so a change in the site markup
        only costs the cover rather than aborting the whole download.
        '''
        soup = self.index_to_soup(self.FRONTPAGE)
        div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
        if div is None:
            return None
        img = div.find('img')
        if img is None:
            return None
        img_url = img.get('src')
        if not img_url:
            return None
        # The src is concatenated as-is, exactly like the article links
        # built elsewhere in this recipe (INDEX + href).
        return self.INDEX + img_url
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        br.open(url)
@@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
        return self.temp_files[-1].name
    def parse_index(self):
        soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
        articles = []
        answer = []
-        content = soup.find('div', attrs = {'class': 'center-wrapper'})
+        soup = self.index_to_soup(self.FRONTPAGE)
-        if content:
+        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
-            for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
+        for sec in sec_start:
-                tag = div.find('div', attrs = {'class': 'views-field-title'})
+            content = sec.nextSibling
-                if tag:
+            if content:
-                    a = tag.find('a')
+                section = self.tag_to_string(content.find('h2'))
                    if a:
                        title = self.tag_to_string(a)
                        url = self.INDEX + a['href']
                        author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
                        tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
                        # If they ever fix their markup, this will break :-(
                        summary = self.tag_to_string(tag.findNextSibling('p'))
                        description = author  + '<br/>' + summary
                        articles.append({'title': title, 'date': None, 'url': url, 'description': description})
                    else:
                        continue
                else:
                    continue
            answer.append(('Magazine', articles))
            ul = content.find('ul')
            if ul:
                articles = []
                for li in ul.findAll('li'):
                    tag = li.find('div', attrs = {'class': 'views-field-title'})
                    if tag:
                        a = tag.find('a')
                        if a:
                            title = self.tag_to_string(a)
                            url = self.INDEX + a['href']
                            description = ''
                            tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
                            if tag:
                                description = self.tag_to_string(tag)
-                            articles.append({'title': title, 'date': None, 'url': url, 'description': description})
+                tags = []
-                        else:
+                for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
-                            continue
+                    tags.append(div)
                ul = content.find('ul')
                for li in content.findAll('li'):
                    tags.append(li)
                for div in tags:
                    title = url = description = author = None
                    if self.INCLUDE_PREMIUM:
                        found_premium = False
                    else:
-                        continue
+                        found_premium = div.findAll('span', attrs={'class':
-
+                                                               'premium-icon'})
-                answer.append(('Letters to the Editor', articles))
+                    if not found_premium:
                        tag = div.find('div', attrs={'class': 'views-field-title'})
                        if tag:
                            a = tag.find('a')
                            if a:
                                title = self.tag_to_string(a)
                                url = self.INDEX + a['href']
                            author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
                            tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
                            description = self.tag_to_string(tag_summary)
                            articles.append({'title':title, 'date':None, 'url':url,
                                     'description':description, 'author':author})
                if articles:
                    answer.append((section, articles))
        return answer
    def preprocess_html(self, soup):