Improved Foreign Affairs

This commit is contained in:
Kovid Goyal 2012-02-05 14:17:45 +05:30
parent 8677789312
commit ca647fe34b

View File

@ -3,10 +3,17 @@ import re
from calibre.ptempfile import PersistentTemporaryFile
class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications:
1) fetch issue cover
2) toggle ignore premium articles
3) extract proper section names, ie. "Comments", "Essay"
by Chen Wei weichen302@gmx.com, 2012-02-05'''
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'en'
version = 1
version = 1.01
title = u'Foreign Affairs (Subcription or (free) Registration)'
publisher = u'Council on Foreign Relations'
@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True
INDEX = 'http://www.foreignaffairs.com'
FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
INCLUDE_PREMIUM = False
remove_tags = []
remove_tags.append(dict(name = 'base'))
@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
temp_files = []
articles_are_obfuscated = True
def get_cover_url(self):
    """Return the URL of the current issue's cover image, or None.

    Fetches FRONTPAGE, takes the first <img> inside the
    'inthemag-issuebuy-cover' div and prefixes its 'src' with INDEX.

    Returns None when the div, the image or its 'src' is missing, so a
    change in the site markup does not raise from the cover lookup.
    """
    soup = self.index_to_soup(self.FRONTPAGE)
    div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
    # Compare against None explicitly rather than relying on the
    # truthiness of a parsed tag object.
    img = div.find('img') if div is not None else None
    img_url = img.get('src') if img is not None else None
    if not img_url:
        return None
    # An already-absolute src must not get the site prefix glued on.
    if '://' in img_url:
        return img_url
    return self.INDEX + img_url
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return self.temp_files[-1].name
def parse_index(self):
# Builds the recipe index from the magazine front page and returns
# `answer`, a list of (section title, list of article dicts) tuples.
#
# NOTE(review): this span comes from a diff view whose +/- markers and
# indentation were lost, so superseded and replacement lines appear
# interleaved (the front page is fetched twice below, and the hard-coded
# 'Magazine' / 'Letters to the Editor' sections sit next to the
# per-section `section` title). Presumably only the `sec_start` loop and
# the `tags` loop belong to the current version -- confirm against the
# committed file before relying on statement order.
soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
articles = []
answer = []
content = soup.find('div', attrs = {'class': 'center-wrapper'})
soup = self.index_to_soup(self.FRONTPAGE)
# Each 'panel-separator' div is used as a marker; the node right after it
# (nextSibling) is treated as that section's content.
sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
for sec in sec_start:
content = sec.nextSibling
if content:
# NOTE(review): `[odd|even]` is a character class (one of o, d, |, e, v, n),
# not an alternation, and the pattern begins with `view-row` while the
# rest uses `views-row` -- verify both against the site markup.
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tag = div.find('div', attrs = {'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
url = self.INDEX + a['href']
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
# If they ever fix their markup, this will break :-(
summary = self.tag_to_string(tag.findNextSibling('p'))
description = author + '<br/>' + summary
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
else:
continue
answer.append(('Magazine', articles))
ul = content.find('ul')
if ul:
# The section title is the text of the first <h2> inside the content node.
section = self.tag_to_string(content.find('h2'))
articles = []
for li in ul.findAll('li'):
tag = li.find('div', attrs = {'class': 'views-field-title'})
# Collect candidate article containers: the matching row divs first,
# then every <li> in the content node.
tags = []
for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
tags.append(div)
# NOTE(review): `ul` is not read again in the lines that follow; the <li>
# items are collected from `content` directly.
ul = content.find('ul')
for li in content.findAll('li'):
tags.append(li)
for div in tags:
title = url = description = author = None
# With INCLUDE_PREMIUM off, a container holding any 'premium-icon' span
# is skipped; with it on, nothing is filtered.
if self.INCLUDE_PREMIUM:
found_premium = False
else:
found_premium = div.findAll('span', attrs={'class':
'premium-icon'})
if not found_premium:
tag = div.find('div', attrs={'class': 'views-field-title'})
if tag:
a = tag.find('a')
if a:
title = self.tag_to_string(a)
# hrefs are prefixed with INDEX to form the article URL.
url = self.INDEX + a['href']
description = ''
tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if tag:
description = self.tag_to_string(tag)
articles.append({'title': title, 'date': None, 'url': url, 'description': description})
else:
continue
else:
continue
answer.append(('Letters to the Editor', articles))
author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
description = self.tag_to_string(tag_summary)
articles.append({'title':title, 'date':None, 'url':url,
'description':description, 'author':author})
# Sections for which no article was collected are left out of the index.
if articles:
answer.append((section, articles))
return answer
def preprocess_html(self, soup):