diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe
index 37d6f008c6..bed4a68963 100644
--- a/recipes/new_yorker.recipe
+++ b/recipes/new_yorker.recipe
@@ -3,7 +3,7 @@
 # License: GPLv3 Copyright: 2016, Kovid Goyal
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-from collections import defaultdict
+from collections import OrderedDict
 from calibre import browser
 from calibre.ebooks.BeautifulSoup import Tag
@@ -31,8 +31,8 @@ class NewYorker(BasicNewsRecipe):
 
-    title = 'New Yorker Magazine'
-    description = 'Content from the New Yorker website'
+    title = "The New Yorker Magazine"
+    description = "Articles of the week's New Yorker magazine"
 
     url_list = []
     language = 'en'
 
 
@@ -69,9 +69,9 @@ class NewYorker(BasicNewsRecipe):
         return soup
 
     def parse_index(self):
-        soup = self.index_to_soup(
-            'https://www.newyorker.com/magazine?intcid=magazine')
-        # soup = self.index_to_soup('file:///t/raw.html')
+
+        # Get cover
+        cover_soup = self.index_to_soup('https://www.newyorker.com/archive')
         cover_img = cover_soup.find(
             attrs={'class': lambda x: x and 'MagazineSection__cover___' in x})
 
@@ -86,36 +86,63 @@ class NewYorker(BasicNewsRecipe):
                 self.cover_url = self.cover_url.replace(old_width, "w_560")
             except Exception:
                 self.log('Failed enlarging cover img, using the original one')
 
-        self.log('Found cover:', self.cover_url)
-        stories = defaultdict(list)
-        last_section = 'Unknown'
-        for story in soup.findAll(
-                attrs={'class': lambda x: x and 'River__riverItemContent___' in x}):
-            try:
-                section = self.tag_to_string(
-                    story.find('a')['title']) or last_section
-            except KeyError:
-                section = last_section
-            last_section = section
-            h4 = story.find('h4')
-            title = self.tag_to_string(h4)
-            a = story.find('h4').parent
-            url = absurl(a['href'])
-            desc = ''
-            body = story.find(attrs={'class': 'River__dek___CayIg'})
-            if body is not None:
-                desc = body.contents[0]
-            self.log('Found article:', title)
-            self.log('\t' + url)
-            self.log('\t' + desc)
-            self.log('')
-            stories[section].append({
-                'title': title,
-                'url': url,
-                'description': desc})
-        return [(k, stories[k]) for k in sorted(stories)]
+        # Get content
+
+        soup = self.index_to_soup(
+            'https://www.newyorker.com/magazine?intcid=magazine')
+        stories = OrderedDict()  # So we can list sections in order
+
+        # Iterate sections of content
+
+        for section_soup in soup.findAll(
+                attrs={'class': lambda x: x and 'MagazinePageSection__section___21cc7' in x}):
+            section = section_soup.find('h2').text
+            self.log("Found section:", section)
+
+            # Iterate stories in section
+
+            is_mail_section = (section == "Mail")
+
+            if is_mail_section:
+                cname = "Link__link___"
+            else:
+                cname = "River__riverItemContent___"
+
+            for story in section_soup.findAll(
+                    attrs={'class': lambda x: x and cname in x}):
+
+                title = ""
+                url = ""
+                desc = ""
+
+                if is_mail_section:
+                    title = story.text
+                    url = absurl(story['href'])
+                else:
+                    h4 = story.find('h4')
+                    title = self.tag_to_string(h4)
+                    a = story.find('h4').parent
+                    url = absurl(a['href'])
+                    # Get description
+                    body = story.find(attrs={'class': 'River__dek___CayIg'})
+                    if body is not None:
+                        desc = body.contents[0]
+
+                self.log('Found article:', title)
+                self.log('\t' + url)
+                self.log('\t' + desc)
+                self.log('')
+
+                if section not in stories:
+                    stories[section] = []
+                stories[section].append({
+                    'title': title,
+                    'url': url,
+                    'description': desc})
+
+        return [(k, stories[k]) for k, v in stories.items()]
 
 # The New Yorker changes the content it delivers based on cookies, so the
 # following ensures that we send no cookies
 