Update Foreign Affairs

This commit is contained in:
Kovid Goyal 2015-07-02 07:50:51 +05:30
parent eb03273848
commit 1ccd546c29

View File

@ -1,18 +1,29 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
import re import re
def select_form(form):
    """Predicate for ``br.select_form``: pick the user login form.

    Returns True only when the form's ``class`` attribute is exactly
    ``'user-login-form'``; a form without a ``class`` attribute yields False.
    """
    return form.attrs.get('class', None) == 'user-login-form'
class ForeignAffairsRecipe(BasicNewsRecipe): class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications: ''' there are three modifications:
1) fetch issue cover 1) fetch issue cover
2) toggle ignore premium articles 2) toggle ignore premium articles
3) extract proper section names, ie. "Comments", "Essay" 3) extract proper section names, ie. "Comments", "Essay"
by Chen Wei weichen302@gmx.com, 2012-02-05''' by Chen Wei, 2012-02-05
Additional modifications to support rebranded website
by anisotrope, 27 June 2015
'''
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Rick Shang, kwetal' __author__ = 'Rick Shang, kwetal, anisotrope'
language = 'en' language = 'en'
version = 1.01 version = 1.02
title = u'Foreign Affairs (Subcription)' title = u'Foreign Affairs (Subcription)'
publisher = u'Council on Foreign Relations' publisher = u'Council on Foreign Relations'
@ -26,13 +37,8 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
INDEX = 'http://www.foreignaffairs.com' INDEX = 'http://www.foreignaffairs.com'
FRONTPAGE = 'http://www.foreignaffairs.com/magazine' FRONTPAGE = 'http://www.foreignaffairs.com/magazine'
remove_tags = [dict(name='svg')]
remove_tags = [] remove_tags_before = dict(name='div', attrs={'class': 'print-content'})
remove_tags.append(dict(name = 'base'))
#remove_tags.append(dict(name = '', attrs = {'': ''}))
remove_tags_before = dict(name = 'h1', attrs = {'class': 'print-title'})
remove_tags_after = dict(name='div', attrs={'class': 'print-footer'}) remove_tags_after = dict(name='div', attrs={'class': 'print-footer'})
extra_css = ''' extra_css = '''
@ -47,9 +53,21 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
def get_cover_url(self):
    """Return the cover image URL of the current issue, or None.

    Fetches ``self.FRONTPAGE`` and reads the ``src`` of the first ``img``
    inside the ``magazine-hero__image image_auto_width`` div. If the div
    or the image is absent, None is returned instead of raising.
    """
    soup = self.index_to_soup(self.FRONTPAGE)
    div = soup.find('div', attrs={'class': 'magazine-hero__image image_auto_width'})
    if div is None:
        return None
    img = div.find('img')
    if img is None:
        return None
    # The url includes the https:// as necessary, so INDEX is not prepended.
    return img['src']
def get_print_url(self, url):
    """Return the print-version link of an article, falling back to *url*.

    The article page at ``url`` (whitespace-stripped) is fetched and searched
    for the first ``<a>`` whose class matches the word ``icon-print``. Its
    ``href`` is returned when found; otherwise the original, unstripped
    ``url`` is returned unchanged.
    """
    article_soup = self.index_to_soup(url.strip())
    if article_soup is None:
        return url
    shortlink = article_soup.find('a', attrs={'class': re.compile(r'\bicon-print\b')})
    if shortlink:
        return shortlink['href']
    return url
def parse_index(self): def parse_index(self):
@ -57,48 +75,26 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
soup = self.index_to_soup(self.FRONTPAGE) soup = self.index_to_soup(self.FRONTPAGE)
# get dates # get dates
date = re.split('\s\|\s',self.tag_to_string(soup.head.title.string))[0] date = re.split('\s\|\s',self.tag_to_string(soup.head.title.string))[0]
self.title = "Foreign Affairs ({})".format(date)
self.timefmt = u' [%s]'%date self.timefmt = u' [%s]'%date
sec_start = soup.findAll('div', attrs= {'class':'panel-pane'}) sec_start = soup.findAll('section', attrs={'class':re.compile(r'\bmagazine-list\b')})
for sec in sec_start: for sec in sec_start:
articles = [] articles = []
section = self.tag_to_string(sec.find('h2')) section = self.tag_to_string(sec.find('h1'))
if 'Books' in section: for article_block in sec.findAll('article'):
reviewsection=sec.find('div', attrs = {'class': 'item-list'}) if article_block.find('a') is not None:
for subsection in reviewsection.findAll('div'): title=self.tag_to_string(article_block.div.a.h2)
subsectiontitle=self.tag_to_string(subsection.span.a) article_url = article_block.div.a['href']
subsectionurl=self.INDEX + subsection.span.a['href'] url = self.get_print_url(article_url)
soup1 = self.index_to_soup(subsectionurl) atr=article_block.findNext('p', attrs={'class': 'author'})
for div in soup1.findAll('div', attrs = {'class': 'views-field-title'}):
if div.find('a') is not None:
originalauthor=self.tag_to_string(div.findNext('div', attrs = {'class':'views-field-field-article-book-nid'}).div.a)
title=subsectiontitle+': '+self.tag_to_string(div.span.a)+' by '+originalauthor
url=self.INDEX+self.index_to_soup(self.INDEX+div.span.a['href']).find('a', attrs={'class':'fa_addthis_print'})['href']
atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if atr is not None: if atr is not None:
author=self.tag_to_string(atr.span) author=self.tag_to_string(atr)
else: else:
author='' author=''
desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'}) desc=article_block.findNext('div', attrs={'class': 'deck'})
if desc is not None: if desc is not None:
description=self.tag_to_string(desc.div.p) description=self.tag_to_string(desc)
else:
description=''
articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author})
subsectiontitle=''
else:
for div in sec.findAll('div', attrs = {'class': 'views-field-title'}):
if div.find('a') is not None:
title=self.tag_to_string(div.span.a)
url=self.INDEX+self.index_to_soup(self.INDEX+div.span.a['href']).find('a', attrs={'class':'fa_addthis_print'})['href']
atr=div.findNext('div', attrs = {'class': 'views-field-field-article-display-authors-value'})
if atr is not None:
author=self.tag_to_string(atr.span)
else:
author=''
desc=div.findNext('span', attrs = {'class': 'views-field-field-article-summary-value'})
if desc is not None:
description=self.tag_to_string(desc.div.p)
else: else:
description='' description=''
articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author}) articles.append({'title':title, 'date':None, 'url':url, 'description':description, 'author':author})
@ -108,22 +104,20 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
def preprocess_html(self, soup):
    """Prefix ``self.INDEX`` to image sources that do not start with 'http'.

    Every ``img`` that has a ``src`` attribute is checked; sources that do
    not start with ``'http'`` get ``self.INDEX`` prepended. The soup is
    modified in place and the same object is returned.
    """
    for img in soup.findAll('img', attrs={'src': True}):
        if not img['src'].startswith('http'):
            img['src'] = self.INDEX + img['src']
    return soup
def get_browser(self):
    """Return a browser, logged in when credentials are configured.

    The login step runs only if both ``self.username`` and ``self.password``
    are set. The login form is located with the module-level ``select_form``
    predicate rather than by position on the page.
    """
    br = BasicNewsRecipe.get_browser(self)
    if self.username is not None and self.password is not None:
        br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
        br.select_form(predicate=select_form)
        br.form['name'] = self.username
        br.form['pass'] = self.password
        br.submit()
    return br
def cleanup(self):
    """Open the site's logout URL with ``self.browser``."""
    self.browser.open('https://www.foreignaffairs.com/user/logout')