#!/usr/bin/env python2
"""
Calibre news recipe for Nature (https://www.nature.com).

Builds the feed from the current-issue page: sections are discovered from the
issue's section list, and each article contributes title, URL, author, date
and a labelled description.
"""

from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe

BASE = 'https://www.nature.com'


def absurl(url):
    """Make a site-relative URL absolute, and upgrade http:// to https://."""
    if url.startswith('/'):
        url = BASE + url
    elif url.startswith('http://'):
        # 'http' -> 'https', keep the rest of the URL unchanged
        url = 'https' + url[4:]
    return url


def check_words(words):
    """Return a matcher for BeautifulSoup attribute filters.

    The returned callable is truthy when the attribute value is non-empty and
    shares at least one whitespace-separated word with *words* — i.e. a
    CSS-class-style "contains word" test rather than an exact string match.
    """
    return lambda x: x and frozenset(words.split()).intersection(x.split())


class Nature(BasicNewsRecipe):
    title = 'Nature'
    __author__ = 'Jose Ortiz'
    # NOTE: the original patch contained a mojibake byte inside this string
    # (between "conclusions." and "Nature also"); it is removed here.
    description = ('Nature is a weekly international multidisciplinary scientific journal'
                   ' publishing peer-reviewed research in all fields of science and'
                   ' technology on the basis of its originality, importance,'
                   ' interdisciplinary interest, timeliness, accessibility, elegance and'
                   ' surprising conclusions. Nature also provides rapid, authoritative,'
                   ' insightful and arresting news and interpretation of topical and coming'
                   ' trends affecting science, scientists and the wider public.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True

    # Keep only the main article container.
    keep_only_tags = [
        dict(name='div', attrs={'data-component': check_words('article-container')})
    ]

    # Drop elements the site hides for print.
    remove_tags = [
        dict(attrs={'class': check_words('hide-print')})
    ]

    def parse_index(self):
        """Scrape the current-issue page into calibre's (section, articles) index."""
        soup = self.index_to_soup(BASE + '/nature/current-issue')
        # Cover image src is protocol-relative; prefix the scheme.
        self.cover_url = 'https:' + soup.find(
            'img', attrs={'data-test': 'issue-cover-image'})['src']
        section_tags = soup.find(
            'div', {'data-container-type': check_words('issue-section-list')})
        section_tags = section_tags.findAll(
            'div', {'class': check_words('article-section')})

        # Group articles per section; remember section order separately
        # because defaultdict does not preserve the page's ordering intent.
        sections = defaultdict(list)
        ordered_sec_titles = []
        index = []

        for sec in section_tags:
            sec_title = self.tag_to_string(sec.find('h2'))
            ordered_sec_titles.append(sec_title)
            for article in sec.findAll('article'):
                title = self.tag_to_string(
                    article.find('h3', {'itemprop': check_words('name headline')}))
                date = ' [' + self.tag_to_string(
                    article.find('time', {'itemprop': check_words('datePublished')})) + ']'
                author = self.tag_to_string(
                    article.find('li', {'itemprop': check_words('creator')}))
                url = absurl(article.find('a', {'itemprop': check_words('url')})['href'])
                label = self.tag_to_string(
                    article.find(attrs={'data-test': check_words('article.type')}))
                description = label + ': ' + self.tag_to_string(
                    article.find('div', attrs={'itemprop': check_words('description')}))
                sections[sec_title].append(
                    {'title': title, 'url': url, 'description': description,
                     'date': date, 'author': author})

        for k in ordered_sec_titles:
            index.append((k, sections[k]))
        return index

    def preprocess_html(self, soup):
        """Promote lazy-loaded images: copy data-src into src so they download."""
        for img in soup.findAll('img', {'data-src': True}):
            if img['data-src'].startswith('//'):
                # Protocol-relative URL: add the scheme.
                # (The original patch had a mojibake byte splitting this
                # expression, which made the applied file a syntax error.)
                img['src'] = 'https:' + img['data-src']
            else:
                img['src'] = img['data-src']
        return soup