From 1b413d27f941b944df6cfa08e545c8c71387f327 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 8 Apr 2016 16:31:27 +0530
Subject: [PATCH] Update Newsweek

---
 recipes/newsweek.recipe | 159 ++++++++++++++++------------------------
 1 file changed, 63 insertions(+), 96 deletions(-)

diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe
index f4de4cb0df..4f95f9cd73 100644
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@@ -1,116 +1,83 @@
-from calibre.web.feeds.jsnews import JavascriptRecipe
-import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import defaultdict
 
 BASE = 'http://www.newsweek.com'
 
 
 def href_to_url(a, add_piano=False):
-    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
+    return BASE + a.get('href') + ('?piano_d=1' if add_piano else '')
 
 
-class Newsweek(JavascriptRecipe):
+def class_sels(*args):
+    q = set(args)
+    return dict(attrs={'class':lambda x: x and set(x.split()).intersection(q)})
+
+
+class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
    __author__ = 'Kovid Goyal'
-    description = 'Weekly news and current affairs in the US. Requires a subscription.'
+    description = 'Weekly news and current affairs in the US'
     language = 'en'
     encoding = 'utf-8'
     no_stylesheets = True
     requires_version = (1, 40, 0)
 
-    keep_only_tags = ['article.content-fullwidth']
+    keep_only_tags = class_sels('article-header', 'article-body', 'header-image')
     remove_tags = [
-        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
-        '.most-popular', '.ibt-media-stories', '.user-btn-group',
-        '#taboola-below-main-column', '.trc_related_container',
-        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
+        dict(name='meta'),
+        class_sels(
+            'block-openadstream', 'block-ibtmedia-social', 'issue-next',
+            'most-popular', 'ibt-media-stories', 'user-btn-group',
+            'trial-link', 'trc_related_container',
+            'block-ibtmedia-top-stories'
+        ),
+        dict(id=['taboola-below-main-column', 'piano-root', 'block-nw-magazine-magazine-more-from-issue']),
     ]
-    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa
+    remove_attributes = ['style']
 
-    needs_subscription = True
 
-    def do_login(self, br, username, password):
-        br.visit(self.LOGIN)
-        form = br.select_form('#pianomedia_login_form')
-        form['login'] = username
-        form['password'] = password
-        br.submit()
-
-    def get_publication_data(self, browser):
-        browser.wait_for_element('nav.main-menu a[href]')
-        root = self.index_to_soup(browser.html)
-        for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
-            if a.text and a.text.strip() == 'This Week\'s Edition':
-                return self.get_newsweek_publication_data(browser, href_to_url(a, True))
-
-    def get_newsweek_publication_data(self, browser, url):
-        root = self.index_to_soup(url)
-        sel = lambda expr: root.xpath(expr)
-        ans = {}
-
-        for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
-            if '_Cover_' in img.get('title', ''):
-                ans['cover'] = browser.get_resource(img.get('src'))
-                break
-        for title in root.xpath('//title'):
-            raw = title.text
-            if raw:
-                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
-
-        sections = []
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
-            url = None
-            for a in div.xpath('descendant::a[@href]'):
-                url = href_to_url(a)
-                break
-            for s in div.xpath('descendant::div[@class="summary"]'):
-                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
-                break
-        features = []
-        for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
-            url = None
-            for a in li.xpath('descendant::a[@class="article-link"]'):
-                url = href_to_url(a)
-                features.append({'title':self.tag_to_string(a), 'url':url})
-                break
-        if features:
-            sections.append(('Features', features))
-
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
-            for d in div.xpath('descendant::div[@class="block-title"]'):
-                section_title = self.tag_to_string(d)
-                articles = []
-                break
+    def parse_index(self):
+        root = self.index_to_soup('http://www.newsweek.com/archive', as_tree=True)
+        li = root.xpath('//ul[contains(@class, "magazine-archive-items")]/li')[0]
+        a = li.xpath('descendant::a[@href]')[0]
+        url = href_to_url(a, add_piano=True)
+        self.timefmt = self.tag_to_string(a)
+        img = li.xpath('descendant::a[@href]/img[@src]')[0]
+        self.cover_url = img.get('src')
+        root = self.index_to_soup(url, as_tree=True)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-cover-story"]')[0]
+        a = div.xpath('descendant::a[@href]')[0]
+        index = [('Cover', [{'title':'Cover story', 'url':href_to_url(a)}])]
+        sections = defaultdict(list)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-issue-story-list"]')[0]
+        for a in div.xpath('descendant::h3/a[@href and contains(@class, "article-link")]'):
+            title = self.tag_to_string(a)
+            li = a.xpath('ancestor::li')[0]
+            desc = ''
+            s = li.xpath('descendant::div[@class="summary"]')
+            if s:
+                desc = self.tag_to_string(s[0])
+            sec = li.xpath('descendant::div[@class="category"]')
+            if sec:
+                sec = self.tag_to_string(sec[0])
             else:
-                continue
-            for li in div.xpath('descendant::li'):
-                desc = ''
-                for d in li.xpath('descendant::div[@class="summary"]'):
-                    desc = self.tag_to_string(d)
-                    break
-                for a in li.xpath('descendant::a[@class="article-link"]'):
-                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
-                    break
-            if articles:
-                sections.append((section_title, articles))
+                sec = 'Articles'
+            sections[sec].append({'title':title, 'url':href_to_url(a), 'description':desc})
+            self.log(title, url)
+            if desc:
+                self.log('\t' + desc)
+            self.log('')
+        for k in sorted(sections):
+            index.append((k, sections[k]))
+        return index
 
-        ans['index'] = sections
-        return ans
+    def print_version(self, url):
+        return url + '?piano_d=1'
 
-    def load_complete(self, browser, url, recursion_level):
-        browser.wait_for_element('div.article-body')
-        return browser.load_completed  # This is needed to allow the parallax images to load
-
-    def preprocess_stage1(self, article, browser, url, recursion_level):
+    def preprocess_html(self, soup):
         # Parallax images in the articles are loaded as background images
         # on <span> tags. Convert them to normal images.
-        for span in browser.css_select('span.parallax-image', all=True):
-            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
-            if bg:
-                url = bg.strip().partition('(')[-1][:-1]
-                span.appendInside('<img src="%s"></img>' % url)
-                span.setAttribute('style', '')
-        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update
-
-    def postprocess_html(self, article, root, url, recursion_level):
-        for x in root.xpath('//*[local-name()="body" and @style]'):
-            del x.attrib['style']  # body has a fixed height, which causes problems with epub viewers
-        for x in root.xpath('//*[@id="piano-root"]'):
-            x.getparent().remove(x)
-        return root
+        for span in soup.findAll('span', attrs={'class':lambda x: x and 'parallax' in x.split()}):
+            s = span.find(style=True)
+            if s is not None:
+                url = s['style'].partition('(')[-1][:-1]
+                s['style'] = 'display: block'
+                s.name = 'img'
+                s['src'] = url
+        return soup
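
Note on the new class_sels() helper (not part of the patch): it builds a
BeautifulSoup attrs matcher that accepts a tag when any one of the requested
class names is present. A minimal standalone sketch of the matching rule,
assuming the class attribute arrives as a space-separated string (the
BeautifulSoup 3 semantics that calibre recipes used at the time):

    # Demonstrates the rule behind the new keep_only_tags/remove_tags entries.
    def class_sels(*args):
        q = set(args)
        return dict(attrs={'class': lambda x: x and set(x.split()).intersection(q)})

    match = class_sels('article-body', 'header-image')['attrs']['class']
    print(bool(match('article-body lead')))  # True: a requested class is present
    print(bool(match('sidebar')))            # False: no overlap with the query set
    print(bool(match(None)))                 # False: tag has no class attribute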
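The rewritten preprocess_html() drops the old browser-based (JavascriptRecipe)
parallax handling: instead of asking a live browser for the computed
background-image style, it reads the URL straight out of the inline style
attribute and turns the styled <span> into an <img>. A rough standalone sketch
of the same transformation; bs4 is used here purely for illustration, and the
sample markup and class names are invented:

    from bs4 import BeautifulSoup

    html = ('<span class="parallax">'
            '<span style="background-image:url(http://www.newsweek.com/x.jpg)">'
            '</span></span>')
    soup = BeautifulSoup(html, 'html.parser')
    for span in soup.find_all('span', class_='parallax'):
        s = span.find(style=True)
        if s is not None:
            # keep what sits inside "url(...)": partition at '(' and drop the ')'
            url = s['style'].partition('(')[-1][:-1]
            s['style'] = 'display: block'
            s.name = 'img'   # rename the tag in place
            s['src'] = url
    print(soup)  # the inner span is now an <img> pointing at the extracted URL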