calibre/recipes/nature.recipe
un-pogaz 730ac72ee8 pep8: fix bare except
ruff 'E722'
2025-07-16 08:55:16 +02:00

124 lines
4.1 KiB
Python

#!/usr/bin/env python
import re
from collections import defaultdict
from calibre.web.feeds.news import BasicNewsRecipe, classes
# Site root; site-relative links are resolved against it.
BASE = 'https://www.nature.com'


def absurl(url):
    """Return *url* as an absolute ``https`` URL.

    * ``//host/path`` (protocol-relative) gets the ``https:`` scheme.
    * ``/path`` (site-relative) is resolved against ``BASE``.
    * ``http://...`` is upgraded to ``https://...``.
    * Anything else is returned unchanged.
    """
    if url.startswith('//'):
        # Must be tested before the single-slash case: the link already names
        # its own host, so prepending BASE would produce a broken URL.
        return 'https:' + url
    if url.startswith('/'):
        return BASE + url
    if url.startswith('http://'):
        return 'https' + url[4:]
    return url
def check_words(words):
    """Build a matcher that accepts values sharing at least one word with *words*.

    *words* is a whitespace-separated string. The returned callable takes an
    attribute value and returns:

    * the value itself when it is falsy (``None`` or ``''``), so a missing
      attribute never matches;
    * otherwise the ``frozenset`` of words common to both strings, which is
      truthy only when at least one word matches.
    """
    # Built once here instead of on every call of the matcher.
    wanted = frozenset(words.split())

    def matcher(value):
        return value and wanted.intersection(value.split())

    return matcher
def has_all_of(words):
    """Build a matcher that accepts values containing every word in *words*.

    *words* is a whitespace-separated string. The returned callable takes an
    attribute value and returns:

    * the value itself when it is falsy (``None`` or ``''``), so a missing
      attribute never matches;
    * otherwise ``True`` if every word of *words* occurs in the value,
      ``False`` if any is missing.
    """
    # Built once here instead of on every call of the matcher.
    required = frozenset(words.split())

    def matcher(value):
        return value and required.issubset(value.split())

    return matcher
class Nature(BasicNewsRecipe):
    """Recipe that builds its article index from Nature's current-issue page."""

    title = 'Nature'
    __author__ = 'Jose Ortiz'
    description = (
        'Nature is a weekly international multidisciplinary scientific journal'
        ' publishing peer-reviewed research in all fields of science and'
        ' technology on the basis of its originality, importance,'
        ' interdisciplinary interest, timeliness, accessibility, elegance and'
        ' surprising conclusions. Nature also provides rapid, authoritative,'
        ' insightful and arresting news and interpretation of topical and coming'
        ' trends affecting science, scientists and the wider public.'
    )
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True
    keep_only_tags = [dict(name='article')]
    # NOTE(review): `classes` is a calibre helper; presumably it matches
    # elements carrying any of the listed CSS classes -- confirm.
    remove_tags = [
        classes(
            'u-hide-print hide-print c-latest-content__item c-context-bar '
            'c-pdf-button__container u-js-hide'
        ),
        dict(name='img', attrs={'class': ['visually-hidden']}),
    ]

    def parse_index(self):
        """Scrape the current-issue page into a list of sections.

        Returns a list of ``(section title, [article dict, ...])`` tuples in
        the order the sections first appear on the page. Each article dict has
        the keys ``title``, ``url``, ``description``, ``date`` and ``author``.
        Also sets ``self.cover_url`` when the page exposes a cover image.
        """
        soup = self.index_to_soup(BASE + '/nature/current-issue')
        self._set_cover_url(soup)

        # Dicts keep insertion order, so this one mapping records both the
        # section order and the grouping. A title that occurs twice on the
        # page is merged into one section instead of being emitted twice.
        sections = defaultdict(list)
        section_tags = soup.find_all(
            'section', attrs={'data-container-type': 'issue-section-list'}
        )
        for sec in section_tags:
            sec_title = self.tag_to_string(sec.find('h2'))
            # Indexing registers the section even if no article is usable.
            articles = sections[sec_title]
            for article in sec.find_all('article'):
                entry = self._parse_article(article)
                if entry is not None:
                    articles.append(entry)
        return list(sections.items())

    def _set_cover_url(self, soup):
        """Set ``self.cover_url`` from the issue cover image, if there is one."""
        cover = soup.find(
            'img', attrs={'data-test': check_words('issue-cover-image')}
        )
        src = cover.get('src') if cover is not None else None
        if not src:
            # A missing cover must not abort the whole download.
            self.log.warn('Cover image not found on the current-issue page')
            return
        # Ask for a larger cover by swapping the width token (e.g. ``w200``)
        # for ``w1000``. If the URL carries no such token it is used as is.
        self.cover_url = re.sub(r'\bw\d+\b', 'w1000', src)

    def _parse_article(self, article):
        """Turn one ``<article>`` teaser into an article dict.

        Returns ``None`` when the teaser has no usable link, so the caller
        can skip it.
        """
        link = article.find('a', {'itemprop': check_words('url')})
        href = link.get('href') if link is not None else None
        if not href:
            return None

        title = self.tag_to_string(
            article.find('h3', {'itemprop': has_all_of('name headline')})
        )
        date = ' [' + self.tag_to_string(
            article.find('time', {'itemprop': check_words('datePublished')})
        ) + ']'
        author = self.tag_to_string(
            article.find('li', {'itemprop': check_words('creator')})
        )
        article_type = self.tag_to_string(
            article.find(attrs={'data-test': check_words('article.type')})
        )
        summary = self.tag_to_string(
            article.find('div', attrs={'itemprop': check_words('description')})
        )
        return {
            'title': title,
            'url': absurl(href),
            # NOTE(review): type and summary are joined with no separator,
            # as before -- confirm whether one was intended.
            'description': article_type + summary,
            'date': date,
            'author': author,
        }

    def preprocess_html(self, soup):
        """Prepare a downloaded article page for conversion.

        Copies each image's ``data-src`` into ``src`` (adding ``https:`` to
        protocol-relative URLs) and removes every ``article-container`` div
        except the first. Returns the modified *soup*.
        """
        for img in soup.find_all('img', {'data-src': True}):
            src = img['data-src']
            img['src'] = 'https:' + src if src.startswith('//') else src
        containers = soup.find_all(
            'div', {'data-component': check_words('article-container')}
        )
        for div in containers[1:]:
            div.extract()
        return soup