From 848934643ed663631ac859e0b7e8703e14e76fcc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 5 Feb 2019 10:07:09 +0530 Subject: [PATCH] Work on Foreign Affairs --- recipes/foreignaffairs.recipe | 63 +++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/recipes/foreignaffairs.recipe b/recipes/foreignaffairs.recipe index ffacbd8e88..a3f5436d61 100644 --- a/recipes/foreignaffairs.recipe +++ b/recipes/foreignaffairs.recipe @@ -9,6 +9,12 @@ def select_form(form): return form.attrs.get('id', None) == 'user-login' +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + class ForeignAffairsRecipe(BasicNewsRecipe): ''' there are three modifications: @@ -37,7 +43,7 @@ class ForeignAffairsRecipe(BasicNewsRecipe): remove_javascript = True needs_subscription = True - INDEX = 'http://www.foreignaffairs.com' + INDEX = 'https://www.foreignaffairs.com' FRONTPAGE = INDEX + '/magazine' keep_only_tags = [ @@ -50,36 +56,39 @@ class ForeignAffairsRecipe(BasicNewsRecipe): def parse_index(self): answer = [] - soup = self.index_to_soup(html.tostring(self.clean_fa_html( - self.index_to_soup(self.FRONTPAGE, as_tree=True)))) + soup = self.index_to_soup(self.FRONTPAGE) div = soup.find( - 'div', attrs={'class': 'magazine-hero__image image_auto_width'}) - self.cover_url = div.find('img')['src'] + 'div', attrs={'class': 'magazine-actions'}) + self.cover_url = div.find('img')['ng-src'] # get dates - date = re.split('\s\|\s', self.tag_to_string( + date = re.split(r'\s\|\s', self.tag_to_string( soup.head.title.string))[0] self.title = "Foreign Affairs ({})".format(date) self.timefmt = u' [%s]' % date - for section in soup.findAll(attrs={'class':lambda x: x and 'magazine-list' in x.split()}): + # Fetching article list does not work as site uses javascript + # to load articles dynamically + for section in soup.findAll('section', attrs={'class':lambda x: x and 'magazine-list' in x.split()}): articles = [] - section_title = self.tag_to_string(section.find('h1')) - for h2 in section.findAll('h2'): - a = h2.parent - if a.get('href'): - title = self.tag_to_string(h2) - url = a['href'] - atr = a.findNextSibling(attrs={'class':'author'}) - author = self.tag_to_string(atr) if atr else '' - desc = a.findNextSibling(attrs={'class': 'deck'}) - if desc is not None: - description = self.tag_to_string(desc) - else: - description = '' - articles.append({'title': title, 'url': url, - 'description': description, 'author': author}) - self.log(title) - self.log('\t' + url) + section_title = self.tag_to_string(section.find('h2')) + if 'special_section.title' in section_title: + section_title = 'Special' + self.log('\nSection:', section_title) + for h3 in section.findAll(attrs={'class': lambda x: x and 'magazine-title' in x.split()}): + a = h3.findParent('a', href=True) + title = self.tag_to_string(h3) + url = a['href'] + atr = a.findNextSibling(attrs={'class':'author'}) + author = self.tag_to_string(atr) if atr else '' + desc = a.findNextSibling(attrs={'class': 'deck'}) + if desc is not None: + description = self.tag_to_string(desc) + else: + description = '' + articles.append({'title': title, 'url': url, + 'description': description, 'author': author}) + self.log(title) + self.log('\t' + url) if articles: answer.append((section_title, articles)) return answer @@ -98,10 +107,8 @@ class ForeignAffairsRecipe(BasicNewsRecipe): return html.tostring(root) def preprocess_html(self, soup): - for img in soup.findAll('img', attrs={'src': True}): - if not img['src'].startswith('http'): - img['src'] = self.INDEX + img['src'] - + for img in soup.findAll('img', attrs={'ng-src': True}): + img['src'] = img['ng-src'] return soup def get_browser(self):