mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Nature
This commit is contained in:
parent
e4b3f75c27
commit
60e3bece0b
@ -18,16 +18,22 @@ def check_words(words):
|
|||||||
return lambda x: x and frozenset(words.split()).intersection(x.split())
|
return lambda x: x and frozenset(words.split()).intersection(x.split())
|
||||||
|
|
||||||
|
|
||||||
|
def has_all_of(words):
    """Return a predicate testing that a value contains every word in *words*.

    *words* is a whitespace-separated string of required tokens. The returned
    callable takes a single value ``x``: a falsy ``x`` (``None`` or an empty
    string) is returned unchanged, otherwise the result is ``True`` only when
    every token of *words* appears among the whitespace-separated tokens of
    ``x``.
    """
    # Build the required-token set once instead of on every predicate call.
    required = frozenset(words.split())
    return lambda x: x and required.issubset(x.split())
|
||||||
|
|
||||||
|
|
||||||
class Nature(BasicNewsRecipe):
|
class Nature(BasicNewsRecipe):
|
||||||
title = 'Nature'
|
title = 'Nature'
|
||||||
__author__ = 'Jose Ortiz'
|
__author__ = 'Jose Ortiz'
|
||||||
description = ('Nature is a weekly international multidisciplinary scientific journal'
|
description = (
|
||||||
|
'Nature is a weekly international multidisciplinary scientific journal'
|
||||||
' publishing peer-reviewed research in all fields of science and'
|
' publishing peer-reviewed research in all fields of science and'
|
||||||
' technology on the basis of its originality, importance,'
|
' technology on the basis of its originality, importance,'
|
||||||
' interdisciplinary interest, timeliness, accessibility, elegance and'
|
' interdisciplinary interest, timeliness, accessibility, elegance and'
|
||||||
' surprising conclusions. Nature also provides rapid, authoritative,'
|
' surprising conclusions. Nature also provides rapid, authoritative,'
|
||||||
' insightful and arresting news and interpretation of topical and coming'
|
' insightful and arresting news and interpretation of topical and coming'
|
||||||
' trends affecting science, scientists and the wider public.')
|
' trends affecting science, scientists and the wider public.'
|
||||||
|
)
|
||||||
language = 'en'
|
language = 'en'
|
||||||
encoding = 'UTF-8'
|
encoding = 'UTF-8'
|
||||||
no_javascript = True
|
no_javascript = True
|
||||||
@ -37,15 +43,19 @@ class Nature(BasicNewsRecipe):
|
|||||||
dict(name='div', attrs={'data-component': check_words('article-container')})
|
dict(name='div', attrs={'data-component': check_words('article-container')})
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [dict(attrs={'class': check_words('hide-print')})]
|
||||||
dict(attrs={'class' : check_words('hide-print')})
|
|
||||||
]
|
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup(BASE + '/nature/current-issue')
|
soup = self.index_to_soup(BASE + '/nature/current-issue')
|
||||||
self.cover_url = 'https:' + soup.find('img',attrs={'data-test' : 'issue-cover-image'})['src']
|
self.cover_url = 'https:' + soup.find(
|
||||||
section_tags = soup.find('div', {'data-container-type' : check_words('issue-section-list')})
|
'img', attrs={'data-test': check_words('issue-cover-image')}
|
||||||
section_tags = section_tags.findAll('div', {'class' : check_words('article-section')})
|
)['src']
|
||||||
|
section_tags = soup.find(
|
||||||
|
'div', {'data-container-type': check_words('issue-section-list')}
|
||||||
|
)
|
||||||
|
section_tags = section_tags.findAll(
|
||||||
|
'div', {'class': check_words('article-section')}
|
||||||
|
)
|
||||||
|
|
||||||
sections = defaultdict(list)
|
sections = defaultdict(list)
|
||||||
ordered_sec_titles = []
|
ordered_sec_titles = []
|
||||||
@ -55,14 +65,36 @@ class Nature(BasicNewsRecipe):
|
|||||||
sec_title = self.tag_to_string(sec.find('h2'))
|
sec_title = self.tag_to_string(sec.find('h2'))
|
||||||
ordered_sec_titles.append(sec_title)
|
ordered_sec_titles.append(sec_title)
|
||||||
for article in sec.findAll('article'):
|
for article in sec.findAll('article'):
|
||||||
title = self.tag_to_string(article.find('h3', {'itemprop' : check_words('name headline')}))
|
try:
|
||||||
date = ' [' + self.tag_to_string(article.find('time', {'itemprop' : check_words('datePublished')})) + ']'
|
url = absurl(
|
||||||
author = self.tag_to_string(article.find('li', {'itemprop' : check_words('creator')}))
|
article.find('a', {'itemprop': check_words('url')})['href']
|
||||||
url = absurl(article.find('a',{'itemprop' : check_words('url')})['href'])
|
)
|
||||||
label = self.tag_to_string(article.find(attrs={'data-test' : check_words('article.type')}))
|
except TypeError:
|
||||||
description = label + ': ' + self.tag_to_string(article.find('div', attrs={'itemprop' : check_words('description')}))
|
continue
|
||||||
sections[sec_title].append(
|
title = self.tag_to_string(
|
||||||
{'title' : title, 'url' : url, 'description' : description, 'date' : date, 'author' : author})
|
article.find('h3', {'itemprop': has_all_of('name headline')})
|
||||||
|
)
|
||||||
|
date = ' [' + self.tag_to_string(
|
||||||
|
article.find('time', {'itemprop': check_words('datePublished')})
|
||||||
|
) + ']'
|
||||||
|
author = self.tag_to_string(
|
||||||
|
article.find('li', {'itemprop': check_words('creator')})
|
||||||
|
)
|
||||||
|
description = self.tag_to_string(
|
||||||
|
article.find(attrs={'data-test': check_words('article.type')})
|
||||||
|
) + u' • '
|
||||||
|
description += self.tag_to_string(
|
||||||
|
article.find(
|
||||||
|
'div', attrs={'itemprop': check_words('description')}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sections[sec_title].append({
|
||||||
|
'title': title,
|
||||||
|
'url': url,
|
||||||
|
'description': description,
|
||||||
|
'date': date,
|
||||||
|
'author': author
|
||||||
|
})
|
||||||
|
|
||||||
for k in ordered_sec_titles:
|
for k in ordered_sec_titles:
|
||||||
index.append((k, sections[k]))
|
index.append((k, sections[k]))
|
||||||
@ -74,4 +106,8 @@ class Nature(BasicNewsRecipe):
|
|||||||
img['src'] = 'https:' + img['data-src']
|
img['src'] = 'https:' + img['data-src']
|
||||||
else:
|
else:
|
||||||
img['src'] = img['data-src']
|
img['src'] = img['data-src']
|
||||||
|
for div in soup.findAll(
|
||||||
|
'div', {'data-component': check_words('article-container')}
|
||||||
|
)[1:]:
|
||||||
|
div.extract()
|
||||||
return soup
|
return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user