From 408990ffd8b1e76576e915dbc42eb443d7ec4f20 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 13 Jun 2019 16:29:45 +0530 Subject: [PATCH] Update Newsweek --- recipes/newsweek.recipe | 60 +++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe index 09c3955b24..a8dc8d91e6 100644 --- a/recipes/newsweek.recipe +++ b/recipes/newsweek.recipe @@ -49,17 +49,16 @@ class Newsweek(BasicNewsRecipe): a = li.xpath('descendant::a[@href]')[0] url = href_to_url(a, add_piano=True) self.timefmt = self.tag_to_string(a) - img = li.xpath('descendant::a[@href]//img[@src]')[0] - self.cover_url = img.get('src') + img = li.xpath('descendant::a[@href]//img[@data-src]')[0] + self.cover_url = img.get('data-src').partition('?')[0] root = self.index_to_soup(url, as_tree=True) features = [] - href_xpath = 'descendant::*[local-name()="h1" or local-name()="h2" or local-name()="h3" or local-name()="h4"]/a[@href]' try: - div = root.xpath('//div[@id="block-nw-magazine-magazine-features"]')[0] + div = root.xpath('//div[@class="magazine-features"]')[0] except IndexError: pass else: - for a in div.xpath(href_xpath): + for a in div.xpath('descendant::div[@class="h1"]//a[@href]'): title = self.tag_to_string(a) article = a.xpath('ancestor::article')[0] desc = '' @@ -73,34 +72,35 @@ class Newsweek(BasicNewsRecipe): if features: index.append(('Features', features)) sections = defaultdict(list) - for block in ('magazine-magazine-issue-story-list', 'editors-pick'): - div = root.xpath( - '//div[@id="block-nw-{}"]'.format(block)) - if not div: - continue - div = div[0] - for a in div.xpath(href_xpath): - title = self.tag_to_string(a) - article = a.xpath('ancestor::article')[0] - desc = '' - s = article.xpath('descendant::div[@class="summary"]') - if s: - desc = self.tag_to_string(s[0]) - sec = article.xpath('descendant::div[@class="category"]') - if sec: - sec = self.tag_to_string(sec[0]) - else: - sec = 'Articles' - sections[sec].append( - {'title': title, 'url': href_to_url(a), 'description': desc}) - self.log(title, href_to_url(a)) - if desc: - self.log('\t' + desc) - self.log('') + for widget in ('editor-pick',): + self.parse_widget(widget, sections) for k in sorted(sections): index.append((k, sections[k])) return index + def parse_widget(self, widget, sections): + root = self.index_to_soup('https://d.newsweek.com/widget/' + widget, as_tree=True) + div = root.xpath('//div')[0] + href_xpath = 'descendant::*[local-name()="h1" or local-name()="h2" or local-name()="h3" or local-name()="h4"]/a[@href]' + for a in div.xpath(href_xpath): + title = self.tag_to_string(a) + article = a.xpath('ancestor::article')[0] + desc = '' + s = article.xpath('descendant::div[@class="summary"]') + if s: + desc = self.tag_to_string(s[0]) + sec = article.xpath('descendant::div[@class="category"]') + if sec: + sec = self.tag_to_string(sec[0]) + else: + sec = 'Articles' + sections[sec].append( + {'title': title, 'url': href_to_url(a), 'description': desc}) + self.log(title, href_to_url(a)) + if desc: + self.log('\t' + desc) + self.log('') + def print_version(self, url): return url + '?piano_d=1' @@ -114,4 +114,6 @@ class Newsweek(BasicNewsRecipe): s['style'] = 'display: block' s.name = 'img' s['src'] = url + for img in soup.findAll('img', attrs={'data-src': True}): + img['src'] = img['data-src'] return soup