#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes


def absurl(url):
    """Return *url* made absolute against https://www.science.org.

    Only URLs that start with '/' are prefixed; anything else is returned
    unchanged.
    """
    if url.startswith('/'):
        url = 'https://www.science.org' + url
    return url


class scienceadv(BasicNewsRecipe):
    """Recipe that builds an issue of Science Translational Medicine.

    The table of contents is read from https://www.science.org/toc/stm/<issue>
    and each ``toc__section`` on that page becomes one feed.
    """

    title = 'Science Translational Medicine'
    __author__ = 'unkn0wn'
    description = (
        'Science Translational Medicine is the leading weekly online journal publishing translational research at the intersection of science, '
        'engineering and medicine. The goal of Science Translational Medicine is to promote human health by providing a forum for communicating '
        'the latest research advances from biomedical, translational, and clinical researchers from all established and emerging disciplines relevant '
        'to medicine. In addition to original research, Science Translational Medicine also publishes Reviews, Editorials, Focus articles, and Viewpoints.'
    )
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    masthead_url = (
        'https://www.science.org/pb-assets/images/logos/stm-logo-1620488350153.svg'
    )
    language = 'en'
    simultaneous_downloads = 1
    browser_type = 'webengine'

    # Adjacent literals are concatenated so the stylesheet text stays exactly
    # as it was while remaining readable one rule per line.
    extra_css = (
        ' .news-article__figure__caption, .calibre-nuked-tag-figcaption, .card-related {font-size:small;}'
        ' .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}'
        ' .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}'
        ' img {display:block; margin:0 auto;}'
        ' .core-lede {font-style:italic; color:#202020;}'
        ' '
    )

    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes(
            'meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'
        ),
        dict(name='h1', attrs={'property': 'name'}),
        dict(name='div', **classes('core-lede contributors core-self-citation')),
        dict(attrs={'data-core-wrapper': 'content'}),
    ]

    remove_tags = [
        classes('pb-ad news-article__hero__scroller news-article__version-of-story')
    ]

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
            'long': 'For example, 385/6710',
            'default': 'current',
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600',
        },
    }

    def preprocess_html(self, soup):
        """Route site-relative JPEGs through the image resizer and fix paragraphs.

        Every ``<img>`` whose ``src`` is a site-relative path ending in '.jpg'
        is rewritten to ``/cdn-cgi/image/width=<res>`` + the original path,
        made absolute with ``absurl``. Every ``<div role="paragraph">`` is
        renamed to ``<p>``. Returns the modified *soup*.
        """
        # The width does not depend on the image, so resolve it once.
        res = '/cdn-cgi/image/width=600'
        w = self.recipe_specific_options.get('res')
        # NOTE(review): the isinstance check presumably separates a
        # user-supplied string from the option's definition dict above --
        # confirm against calibre's option handling.
        if w and isinstance(w, str):
            res = '/cdn-cgi/image/width=' + w

        for img in soup.findAll('img', attrs={'src': True}):
            src = img['src']
            # Prefixing is only meaningful for paths starting with '/':
            # gluing the prefix onto an absolute or bare-relative URL would
            # produce an unusable address, so those are left untouched.
            if src.endswith('.jpg') and src.startswith('/'):
                img['src'] = absurl(res + src)

        for div in soup.findAll('div', attrs={'role': 'paragraph'}):
            div.name = 'p'
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip every attribute from ``<body>`` and return *soup*."""
        bd = soup.find('body')
        if bd:
            bd.attrs = {}
        return soup

    def parse_index(self):
        """Build the feed list from the issue's table-of-contents page.

        Also sets ``timefmt``, ``description`` and ``cover_url`` when the
        corresponding elements are present on the page.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with 'title', 'description' and 'url' keys.
        """
        issue_url = 'https://www.science.org/toc/stm/current'
        d = self.recipe_specific_options.get('issue')
        if d and isinstance(d, str):
            issue_url = 'https://www.science.org/toc/stm/' + d

        soup = self.index_to_soup(issue_url)

        tme = soup.find(**classes('journal-issue__vol'))
        if tme:
            self.timefmt = ' [%s]' % self.tag_to_string(tme).strip().replace('|', ' | ')

        det = soup.find(attrs={'id': 'journal-issue-details'})
        if det:
            self.description = self.tag_to_string(det).strip()

        cov = soup.find(**classes('cover-image__image'))
        # A cover container without an <img src=...> is skipped rather than
        # allowed to abort the whole download.
        cover_img = cov.img if cov else None
        if cover_img is not None and cover_img.get('src'):
            self.cover_url = absurl('/cdn-cgi/image/width=800' + cover_img['src'])

        feeds = []
        for sec in soup.findAll('section', **prefixed_classes('toc__section')):
            name = sec.find(**classes('sidebar-article-title--decorated'))
            section = self.tag_to_string(name).strip()
            self.log(section)

            articles = []
            for card in sec.findAll(**classes('card-header')):
                ti = card.find(**classes('article-title'))
                link = ti.a if ti is not None else None
                # Without a title link there is nothing to download; skip the
                # card instead of failing the entire index.
                if link is None or not link.get('href'):
                    continue
                url = absurl(link['href'])
                title = self.tag_to_string(ti).strip()

                desc = ''
                meta = card.find(**classes('card-meta'))
                if meta:
                    desc = self.tag_to_string(meta).strip()

                self.log(' ', title, '\n\t', desc, '\n\t\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            feeds.append((section, articles))
        return feeds