calibre/recipes/science_advances.recipe
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    # make site-relative links absolute on science.org
    if url.startswith('/'):
        url = 'https://www.science.org' + url
    return url


class scienceadv(BasicNewsRecipe):
    title = 'Science Advances'
    __author__ = 'unkn0wn'
    description = (
        'Science Advances is the American Association for the Advancement of Science’s (AAAS) open access '
        'multidisciplinary journal, publishing impactful research papers and reviews in any area of science, in '
        'both disciplinary-specific and broad, interdisciplinary areas. The mission of Science Advances is to provide '
        'fair, fast, and expert peer review to authors and a vetted selection of research, freely available to readers.'
    )
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.science.org/pb-assets/images/logos/sciadv-logo-1620488349693.svg'
    language = 'en'
    simultaneous_downloads = 1
    browser_type = 'webengine'

    extra_css = '''
        .news-article__figure__caption, .figc {font-size:small;}
        .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}
        .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
        img {display:block; margin:0 auto;}
        .core-lede {font-style:italic; color:#202020;}
    '''

    ignore_duplicate_articles = {'url'}
    keep_only_tags = [
        classes('meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'),
        dict(name='h1', attrs={'property': 'name'}),
        dict(name='div', **classes('core-lede contributors core-self-citation')),
        dict(attrs={'data-core-wrapper': 'content'})
    ]

    remove_tags = [
        # ads, the in-article scroller and the "version of story" block
        classes('pb-ad news-article__hero__scroller news-article__version-of-story')
    ]
    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
            'long': 'For example, 385/6710',
            'default': 'current'
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices. For a lower file size\nthan the default, use 400 or 300.',
            'default': '600'
        }
    }
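
    # Illustration only (hypothetical image path): with 'res' set to '800',
    # the preprocess_html() hook below rewrites an image src such as
    #   /some/article/figure.jpg
    # into
    #   https://www.science.org/cdn-cgi/image/width=800/some/article/figure.jpg
    # and with 'issue' set to '385/6710', parse_index() fetches the contents page
    #   https://www.science.org/toc/sciadv/385/6710
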
    def preprocess_html(self, soup):
        # turn role="paragraph" divs into real <p> tags
        for p in soup.findAll(attrs={'role': 'paragraph'}):
            p.name = 'p'
            p.attrs = {}
        # route .jpg images through the site's /cdn-cgi/ image resizer at the requested width
        for img in soup.findAll('img', attrs={'src': True}):
            if img['src'].endswith('.jpg'):
                res = '/cdn-cgi/image/width=600'
                w = self.recipe_specific_options.get('res')
                if w and isinstance(w, str):
                    res = '/cdn-cgi/image/width=' + w
                img['src'] = absurl(res + img['src'])
        for figc in soup.findAll('figcaption'):
            figc['class'] = 'figc'
        return soup
    def parse_index(self):
        issue_url = 'https://www.science.org/toc/sciadv/current'
        d = self.recipe_specific_options.get('issue')
        if d and isinstance(d, str):
            issue_url = 'https://www.science.org/toc/sciadv/' + d
        soup = self.index_to_soup(issue_url)
        tme = soup.find(**classes('journal-issue__vol'))
        if tme:
            self.timefmt = ' [%s]' % self.tag_to_string(tme).strip().replace('|', ' | ')
        det = soup.find(attrs={'id': 'journal-issue-details'})
        if det:
            self.description = self.tag_to_string(det).strip()
        cov = soup.find(**classes('cover-image__image'))
        if cov:
            self.cover_url = absurl('/cdn-cgi/image/width=800' + cov.img['src'])

        feeds = []
        # each table-of-contents section becomes its own feed
        for sec in soup.findAll('section', **classes('toc__section')):
            name = sec.find(**classes('sidebar-article-title--decorated'))
            section = self.tag_to_string(name).strip()
            self.log(section)
            articles = []
            for card in sec.findAll(**classes('card-header')):
                ti = card.find(**classes('article-title'))
                url = absurl(ti.a['href'])
                title = self.tag_to_string(ti).strip()
                desc = ''
                meta = card.find(**classes('card-meta'))
                if meta:
                    desc = self.tag_to_string(meta).strip()
                self.log(' ', title, '\n\t', desc, '\n\t\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            feeds.append((section, articles))
        return feeds
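
# A quick way to try this recipe from the command line with calibre's standard
# tooling (adjust the output name as needed):
#   ebook-convert science_advances.recipe .epub --test -vv
# --test limits the download to a couple of articles per feed, so changes to
# parse_index() and preprocess_html() can be checked quickly.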