Improved Foreign Affairs

Kovid Goyal 2012-02-05 14:17:45 +05:30
parent 8677789312
commit ca647fe34b


@@ -3,10 +3,17 @@ import re
 from calibre.ptempfile import PersistentTemporaryFile
 
 class ForeignAffairsRecipe(BasicNewsRecipe):
+    ''' there are three modifications:
+    1) fetch issue cover
+    2) toggle ignore premium articles
+    3) extract proper section names, ie. "Comments", "Essay"
+    by Chen Wei weichen302@gmx.com, 2012-02-05'''
+
     __license__ = 'GPL v3'
     __author__ = 'kwetal'
     language = 'en'
-    version = 1
+    version = 1.01
 
     title = u'Foreign Affairs (Subcription or (free) Registration)'
     publisher = u'Council on Foreign Relations'
@@ -17,6 +24,9 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
     remove_javascript = True
 
     INDEX = 'http://www.foreignaffairs.com'
+    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
+    INCLUDE_PREMIUM = False
+
     remove_tags = []
     remove_tags.append(dict(name = 'base'))
@@ -37,6 +47,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
     temp_files = []
     articles_are_obfuscated = True
 
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.FRONTPAGE)
+        div = soup.find('div', attrs={'class':'inthemag-issuebuy-cover'})
+        img_url = div.find('img')['src']
+        return self.INDEX + img_url
+
     def get_obfuscated_article(self, url):
         br = self.get_browser()
         br.open(url)
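The new get_cover_url simply prefixes INDEX to the src of the image found inside the inthemag-issuebuy-cover div of the magazine front page. Below is a minimal standalone sketch of that lookup, run with bs4 against a made-up HTML fragment rather than calibre's index_to_soup; the fragment, the None fallback, and the variable names are illustrative assumptions, not part of the commit.

    from bs4 import BeautifulSoup

    INDEX = 'http://www.foreignaffairs.com'

    # Made-up fragment standing in for the magazine page.
    html = '''<div class="inthemag-issuebuy-cover">
      <a href="/issue"><img src="/files/imagecache/cover.jpg"/></a>
    </div>'''

    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
    img = div.find('img') if div else None
    # The recipe assumes the div and img always exist; falling back to None
    # here only means the cover would be skipped instead of raising.
    cover_url = INDEX + img['src'] if img else None
    print(cover_url)  # http://www.foreignaffairs.com/files/imagecache/cover.jpg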
@@ -50,57 +66,47 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
         return self.temp_files[-1].name
 
     def parse_index(self):
-        soup = self.index_to_soup('http://www.foreignaffairs.com/magazine')
-
-        articles = []
-        answer = []
-
-        content = soup.find('div', attrs = {'class': 'center-wrapper'})
-        if content:
-            for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
-                tag = div.find('div', attrs = {'class': 'views-field-title'})
-                if tag:
-                    a = tag.find('a')
-                    if a:
-                        title = self.tag_to_string(a)
-                        url = self.INDEX + a['href']
-                        author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
-                        tag = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
-                        # If they ever fix their markup, this will break :-(
-                        summary = self.tag_to_string(tag.findNextSibling('p'))
-                        description = author + '<br/>' + summary
-                        articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                    else:
-                        continue
-                else:
-                    continue
-            answer.append(('Magazine', articles))
-
-            ul = content.find('ul')
-            if ul:
-                articles = []
-                for li in ul.findAll('li'):
-                    tag = li.find('div', attrs = {'class': 'views-field-title'})
-                    if tag:
-                        a = tag.find('a')
-                        if a:
-                            title = self.tag_to_string(a)
-                            url = self.INDEX + a['href']
-                            description = ''
-                            tag = li.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
-                            if tag:
-                                description = self.tag_to_string(tag)
-                            articles.append({'title': title, 'date': None, 'url': url, 'description': description})
-                        else:
-                            continue
-                    else:
-                        continue
-                answer.append(('Letters to the Editor', articles))
-
+        answer = []
+
+        soup = self.index_to_soup(self.FRONTPAGE)
+        sec_start = soup.findAll('div', attrs={'class':'panel-separator'})
+        for sec in sec_start:
+            content = sec.nextSibling
+            if content:
+                section = self.tag_to_string(content.find('h2'))
+                articles = []
+
+                tags = []
+                for div in content.findAll('div', attrs = {'class': re.compile(r'view-row\s+views-row-[0-9]+\s+views-row-[odd|even].*')}):
+                    tags.append(div)
+                ul = content.find('ul')
+                for li in content.findAll('li'):
+                    tags.append(li)
+
+                for div in tags:
+                    title = url = description = author = None
+
+                    if self.INCLUDE_PREMIUM:
+                        found_premium = False
+                    else:
+                        found_premium = div.findAll('span', attrs={'class':
+                            'premium-icon'})
+                    if not found_premium:
+                        tag = div.find('div', attrs={'class': 'views-field-title'})
+                        if tag:
+                            a = tag.find('a')
+                            if a:
+                                title = self.tag_to_string(a)
+                                url = self.INDEX + a['href']
+                                author = self.tag_to_string(div.find('div', attrs = {'class': 'views-field-field-article-display-authors-value'}))
+                                tag_summary = div.find('span', attrs = {'class': 'views-field-field-article-summary-value'})
+                                description = self.tag_to_string(tag_summary)
+
+                                articles.append({'title':title, 'date':None, 'url':url,
+                                    'description':description, 'author':author})
+                if articles:
+                    answer.append((section, articles))
+
         return answer
 
     def preprocess_html(self, soup):
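The heart of the rewritten parse_index is the premium check: with INCLUDE_PREMIUM set to False, any row containing a span with class premium-icon is skipped before its title link is read. A standalone sketch of that decision follows, run with bs4 against a made-up two-row fragment; only the class names and the toggle come from the commit, the fragment and the loop around it are illustrative.

    from bs4 import BeautifulSoup

    INCLUDE_PREMIUM = False  # the toggle added to the recipe in this commit

    # Made-up fragment: one free row and one premium row.
    html = '''<div class="views-row">
      <div class="views-field-title"><a href="/articles/x">Free article</a></div>
    </div>
    <div class="views-row">
      <span class="premium-icon"></span>
      <div class="views-field-title"><a href="/articles/y">Premium article</a></div>
    </div>'''

    soup = BeautifulSoup(html, 'html.parser')
    for row in soup.find_all('div', class_='views-row'):
        # Same decision as the recipe: treat the row as premium only when the
        # toggle is off and a premium icon is present.
        found_premium = [] if INCLUDE_PREMIUM else row.find_all('span', class_='premium-icon')
        if not found_premium:
            a = row.find('div', class_='views-field-title').find('a')
            print(a['href'], a.get_text())  # only the free article is printed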