From 6e4ed94a6b525fbc05deb4799481eb2089a12d7a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 9 Mar 2020 22:00:00 +0530 Subject: [PATCH] Update Newsweek Fixes #1866636 [newsweek won't download](https://bugs.launchpad.net/calibre/+bug/1866636) --- recipes/newsweek.recipe | 74 +++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe index a8dc8d91e6..fc55dac112 100644 --- a/recipes/newsweek.recipe +++ b/recipes/newsweek.recipe @@ -1,3 +1,8 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2015, Kovid Goyal + +import json from calibre.web.feeds.news import BasicNewsRecipe from collections import defaultdict @@ -49,28 +54,23 @@ class Newsweek(BasicNewsRecipe): a = li.xpath('descendant::a[@href]')[0] url = href_to_url(a, add_piano=True) self.timefmt = self.tag_to_string(a) - img = li.xpath('descendant::a[@href]//img[@data-src]')[0] - self.cover_url = img.get('data-src').partition('?')[0] + img = li.xpath('descendant::a[@href]//source[@type="image/jpeg"]/@srcset')[0] + self.cover_url = img.partition('?')[0] + self.log('Found cover url:', self.cover_url) root = self.index_to_soup(url, as_tree=True) features = [] - try: - div = root.xpath('//div[@class="magazine-features"]')[0] - except IndexError: - pass - else: - for a in div.xpath('descendant::div[@class="h1"]//a[@href]'): - title = self.tag_to_string(a) - article = a.xpath('ancestor::article')[0] - desc = '' - s = article.xpath('descendant::div[@class="summary"]') - if s: - desc = self.tag_to_string(s[0]) - features.append({'title': title, 'url': href_to_url(a), 'description': desc}) - self.log(title, href_to_url(a)) + for article in root.xpath('//div[@class="magazine-features"]//article'): + a = article.xpath('descendant::a[@class="article-link"]')[0] + title = self.tag_to_string(a) + url = href_to_url(a) + desc = '' + s = article.xpath('descendant::div[@class="summary"]') + if s: + desc = self.tag_to_string(s[0]) + features.append({'title': title, 'url': href_to_url(a), 'description': desc}) + self.log(title, url) - index = [] - if features: - index.append(('Features', features)) + index = [('Features', features)] sections = defaultdict(list) for widget in ('editor-pick',): self.parse_widget(widget, sections) @@ -79,30 +79,18 @@ class Newsweek(BasicNewsRecipe): return index def parse_widget(self, widget, sections): - root = self.index_to_soup('https://d.newsweek.com/widget/' + widget, as_tree=True) - div = root.xpath('//div')[0] - href_xpath = 'descendant::*[local-name()="h1" or local-name()="h2" or local-name()="h3" or local-name()="h4"]/a[@href]' - for a in div.xpath(href_xpath): - title = self.tag_to_string(a) - article = a.xpath('ancestor::article')[0] - desc = '' - s = article.xpath('descendant::div[@class="summary"]') - if s: - desc = self.tag_to_string(s[0]) - sec = article.xpath('descendant::div[@class="category"]') - if sec: - sec = self.tag_to_string(sec[0]) - else: - sec = 'Articles' - sections[sec].append( - {'title': title, 'url': href_to_url(a), 'description': desc}) - self.log(title, href_to_url(a)) - if desc: - self.log('\t' + desc) - self.log('') - - def print_version(self, url): - return url + '?piano_d=1' + raw = self.index_to_soup('https://d.newsweek.com/json/' + widget, raw=True) + data = json.loads(raw)['items'] + for item in data: + title = item['title'] + url = BASE + item['link'] + self.log(title, url) + sections[item['label']].append( + { + 'title': title, + 'url': url, + 'description': item['description'], + }) def preprocess_html(self, soup): # Parallax images in the articles are loaded as background images