Work on Foreign Affairs

This commit is contained in:
Kovid Goyal 2019-02-05 10:07:09 +05:30
parent 1f203ba7b6
commit 848934643e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -9,6 +9,12 @@ def select_form(form):
return form.attrs.get('id', None) == 'user-login' return form.attrs.get('id', None) == 'user-login'
def classes(classes):
    """Build an ``attrs`` matcher that accepts any of the given CSS classes.

    ``classes`` is a whitespace-separated string of class names.  The
    result is a dict of the form ``{'attrs': {'class': predicate}}``, the
    same shape as the ``attrs=`` keyword passed to the soup ``findAll``
    calls elsewhere in this recipe (presumably meant to be used as
    ``soup.find(**classes('a b'))`` -- no call site is visible here).

    The predicate receives the value of a tag's ``class`` attribute and
    returns a truthy value when it shares at least one class name with
    ``classes``.  A missing or empty attribute value is returned as-is
    (falsy), so tags without a class never match.
    """
    # Tokenise on any whitespace, exactly like the attribute value below,
    # so repeated spaces or tabs in the argument cannot create bogus names.
    wanted = frozenset(classes.split())

    def has_wanted_class(value):
        # `value and ...` short-circuits on None / '' before calling split().
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})
class ForeignAffairsRecipe(BasicNewsRecipe): class ForeignAffairsRecipe(BasicNewsRecipe):
''' there are three modifications: ''' there are three modifications:
@ -37,7 +43,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
needs_subscription = True needs_subscription = True
INDEX = 'http://www.foreignaffairs.com' INDEX = 'https://www.foreignaffairs.com'
FRONTPAGE = INDEX + '/magazine' FRONTPAGE = INDEX + '/magazine'
keep_only_tags = [ keep_only_tags = [
@ -50,36 +56,39 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
answer = [] answer = []
soup = self.index_to_soup(html.tostring(self.clean_fa_html( soup = self.index_to_soup(self.FRONTPAGE)
self.index_to_soup(self.FRONTPAGE, as_tree=True))))
div = soup.find( div = soup.find(
'div', attrs={'class': 'magazine-hero__image image_auto_width'}) 'div', attrs={'class': 'magazine-actions'})
self.cover_url = div.find('img')['src'] self.cover_url = div.find('img')['ng-src']
# get dates # get dates
date = re.split('\s\|\s', self.tag_to_string( date = re.split(r'\s\|\s', self.tag_to_string(
soup.head.title.string))[0] soup.head.title.string))[0]
self.title = "Foreign Affairs ({})".format(date) self.title = "Foreign Affairs ({})".format(date)
self.timefmt = u' [%s]' % date self.timefmt = u' [%s]' % date
for section in soup.findAll(attrs={'class':lambda x: x and 'magazine-list' in x.split()}): # Fetching article list does not work as site uses javascript
# to load articles dynamically
for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}):
articles = [] articles = []
section_title = self.tag_to_string(section.find('h1')) section_title = self.tag_to_string(section.find('h2'))
for h2 in section.findAll('h2'): if 'special_section.title' in section_title:
a = h2.parent section_title = 'Special'
if a.get('href'): self.log('\nSection:', section_title)
title = self.tag_to_string(h2) for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}):
url = a['href'] a = h3.findParent('a', href=True)
atr = a.findNextSibling(attrs={'class':'author'}) title = self.tag_to_string(h3)
author = self.tag_to_string(atr) if atr else '' url = a['href']
desc = a.findNextSibling(attrs={'class': 'deck'}) atr = a.findNextSibling(attrs={'class':'author'})
if desc is not None: author = self.tag_to_string(atr) if atr else ''
description = self.tag_to_string(desc) desc = a.findNextSibling(attrs={'class': 'deck'})
else: if desc is not None:
description = '' description = self.tag_to_string(desc)
articles.append({'title': title, 'url': url, else:
'description': description, 'author': author}) description = ''
self.log(title) articles.append({'title': title, 'url': url,
self.log('\t' + url) 'description': description, 'author': author})
self.log(title)
self.log('\t' + url)
if articles: if articles:
answer.append((section_title, articles)) answer.append((section_title, articles))
return answer return answer
@ -98,10 +107,8 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
return html.tostring(root) return html.tostring(root)
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'src': True}): for img in soup.findAll('img', attrs={'ng-src': True}):
if not img['src'].startswith('http'): img['src'] = img['ng-src']
img['src'] = self.INDEX + img['src']
return soup return soup
def get_browser(self): def get_browser(self):