Update Nature

This commit is contained in:
Kovid Goyal 2019-01-18 06:14:22 +05:30
parent e4b3f75c27
commit 60e3bece0b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -18,34 +18,44 @@ def check_words(words):
return lambda x: x and frozenset(words.split()).intersection(x.split()) return lambda x: x and frozenset(words.split()).intersection(x.split())
def has_all_of(words):
return lambda x: x and frozenset(words.split()).issubset(x.split())
class Nature(BasicNewsRecipe): class Nature(BasicNewsRecipe):
title = 'Nature' title = 'Nature'
__author__ = 'Jose Ortiz' __author__ = 'Jose Ortiz'
description = ('Nature is a weekly international multidisciplinary scientific journal' description = (
'Nature is a weekly international multidisciplinary scientific journal'
' publishing peer-reviewed research in all fields of science and' ' publishing peer-reviewed research in all fields of science and'
' technology on the basis of its originality, importance,' ' technology on the basis of its originality, importance,'
' interdisciplinary interest, timeliness, accessibility, elegance and' ' interdisciplinary interest, timeliness, accessibility, elegance and'
' surprising conclusions. Nature also provides rapid, authoritative,' ' surprising conclusions. Nature also provides rapid, authoritative,'
' insightful and arresting news and interpretation of topical and coming' ' insightful and arresting news and interpretation of topical and coming'
' trends affecting science, scientists and the wider public.') ' trends affecting science, scientists and the wider public.'
)
language = 'en' language = 'en'
encoding = 'UTF-8' encoding = 'UTF-8'
no_javascript = True no_javascript = True
no_stylesheets = True no_stylesheets = True
keep_only_tags = [ keep_only_tags = [
dict(name='div',attrs={'data-component' : check_words('article-container')}) dict(name='div', attrs={'data-component': check_words('article-container')})
] ]
remove_tags = [ remove_tags = [dict(attrs={'class': check_words('hide-print')})]
dict(attrs={'class' : check_words('hide-print')})
]
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(BASE + '/nature/current-issue') soup = self.index_to_soup(BASE + '/nature/current-issue')
self.cover_url = 'https:' + soup.find('img',attrs={'data-test' : 'issue-cover-image'})['src'] self.cover_url = 'https:' + soup.find(
section_tags = soup.find('div', {'data-container-type' : check_words('issue-section-list')}) 'img', attrs={'data-test': check_words('issue-cover-image')}
section_tags = section_tags.findAll('div', {'class' : check_words('article-section')}) )['src']
section_tags = soup.find(
'div', {'data-container-type': check_words('issue-section-list')}
)
section_tags = section_tags.findAll(
'div', {'class': check_words('article-section')}
)
sections = defaultdict(list) sections = defaultdict(list)
ordered_sec_titles = [] ordered_sec_titles = []
@ -55,23 +65,49 @@ class Nature(BasicNewsRecipe):
sec_title = self.tag_to_string(sec.find('h2')) sec_title = self.tag_to_string(sec.find('h2'))
ordered_sec_titles.append(sec_title) ordered_sec_titles.append(sec_title)
for article in sec.findAll('article'): for article in sec.findAll('article'):
title = self.tag_to_string(article.find('h3', {'itemprop' : check_words('name headline')})) try:
date = ' [' + self.tag_to_string(article.find('time', {'itemprop' : check_words('datePublished')})) + ']' url = absurl(
author = self.tag_to_string(article.find('li', {'itemprop' : check_words('creator')})) article.find('a', {'itemprop': check_words('url')})['href']
url = absurl(article.find('a',{'itemprop' : check_words('url')})['href']) )
label = self.tag_to_string(article.find(attrs={'data-test' : check_words('article.type')})) except TypeError:
description = label + ': ' + self.tag_to_string(article.find('div', attrs={'itemprop' : check_words('description')})) continue
sections[sec_title].append( title = self.tag_to_string(
{'title' : title, 'url' : url, 'description' : description, 'date' : date, 'author' : author}) article.find('h3', {'itemprop': has_all_of('name headline')})
)
date = ' [' + self.tag_to_string(
article.find('time', {'itemprop': check_words('datePublished')})
) + ']'
author = self.tag_to_string(
article.find('li', {'itemprop': check_words('creator')})
)
description = self.tag_to_string(
article.find(attrs={'data-test': check_words('article.type')})
) + u' • '
description += self.tag_to_string(
article.find(
'div', attrs={'itemprop': check_words('description')}
)
)
sections[sec_title].append({
'title': title,
'url': url,
'description': description,
'date': date,
'author': author
})
for k in ordered_sec_titles: for k in ordered_sec_titles:
index.append((k, sections[k])) index.append((k, sections[k]))
return index return index
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img',{'data-src' : True}): for img in soup.findAll('img', {'data-src': True}):
if img['data-src'].startswith('//'): if img['data-src'].startswith('//'):
img['src'] = 'https:' + img['data-src'] img['src'] = 'https:' + img['data-src']
else: else:
img['src'] = img['data-src'] img['src'] = img['data-src']
for div in soup.findAll(
'div', {'data-component': check_words('article-container')}
)[1:]:
div.extract()
return soup return soup