#!/usr/bin/env python
"""calibre news recipe that downloads an issue of Science Robotics."""
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes


def absurl(url):
    """Return *url* with the science.org origin prepended if it starts with '/'.

    Any other value (already absolute, or relative without a leading
    slash) is returned unchanged.
    """
    if url.startswith('/'):
        url = 'https://www.science.org' + url
    return url


class scienceadv(BasicNewsRecipe):
    """Recipe for the Science Robotics table of contents on science.org.

    The class name is kept as-is so that anything referring to it by name
    keeps working.
    """

    title = 'Science Robotics'
    __author__ = 'unkn0wn'
    description = (
        'Science Robotics provides a much-needed forum for the latest technological advances and for the critical social, ethical '
        'and policy issues surrounding robotics. Science Robotics caters to both researchers and general stakeholders. It is multidisciplinary, '
        'covering the traditional disciplines of robotics, as well as emerging trends such as advanced materials and bio-inspired designs; it covers '
        'all scales, from very large systems to micro/nano robots; its scope is broad, addressing both theoretical advances and practical applications.'
    )
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.science.org/pb-assets/images/logos/scirobotics-logo-1620488350107.svg'
    language = 'en'
    simultaneous_downloads = 1
    browser_type = 'webengine'

    extra_css = '''
        .news-article__figure__caption, .calibre-nuked-tag-figcaption, .card-related {font-size:small;}
        .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta {font-size:small;}
        .contributors, .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
        img {display:block; margin:0 auto;}
        .core-lede {font-style:italic; color:#202020;}
    '''

    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes(
            'meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'
        ),
        dict(name='h1', attrs={'property': 'name'}),
        dict(name='div', **classes('core-lede contributors core-self-citation')),
        dict(attrs={'data-core-wrapper': 'content'}),
    ]

    remove_tags = [
        classes('pb-ad news-article__hero__scroller news-article__version-of-story')
    ]

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download\n(Vol/Issue format)',
            'long': 'For example, 385/6710',
            'default': 'current',
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600',
        },
    }

    def preprocess_html(self, soup):
        """Point article JPEGs at the resized variant and normalise paragraphs.

        Root-relative ``.jpg`` sources are rewritten to go through the
        ``/cdn-cgi/image/width=N`` path (N is 600 unless the 'res' option
        supplies a string), and ``<div role="paragraph">`` elements are
        renamed to ``<p>``. Returns the same *soup* object.
        """
        # The prefix does not depend on the image, so compute it once.
        resize_prefix = '/cdn-cgi/image/width=600'
        # NOTE(review): the isinstance check presumably tells a user-supplied
        # value apart from the option's declaration dict above - confirm
        # against calibre's handling of recipe_specific_options.
        width = self.recipe_specific_options.get('res')
        if width and isinstance(width, str):
            resize_prefix = '/cdn-cgi/image/width=' + width

        for img in soup.findAll('img', attrs={'src': True}):
            src = img['src']
            # Only root-relative paths can be prefixed: gluing the prefix onto
            # an absolute URL (or a path without a leading slash) would yield
            # a malformed address, so such images are left untouched.
            if src.endswith('.jpg') and src.startswith('/'):
                img['src'] = absurl(resize_prefix + src)

        for div in soup.findAll('div', attrs={'role': 'paragraph'}):
            div.name = 'p'
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Drop every attribute from ``<body>`` and return *soup*.

        *first_fetch* is accepted for interface compatibility and not used.
        """
        body = soup.find('body')
        if body:
            body.attrs = {}
        return soup

    def parse_index(self):
        """Build the feed list from the issue's table-of-contents page.

        The issue is 'current' unless the 'issue' option supplies a string
        (Vol/Issue, e.g. ``385/6710``). As side effects this sets
        ``self.timefmt``, ``self.description`` and ``self.cover_url`` when
        the corresponding page elements are present.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with 'title', 'description' and 'url' keys.
        """
        issue_url = 'https://www.science.org/toc/scirobotics/current'
        issue = self.recipe_specific_options.get('issue')
        if issue and isinstance(issue, str):
            issue_url = 'https://www.science.org/toc/scirobotics/' + issue
        soup = self.index_to_soup(issue_url)

        volume = soup.find(**classes('journal-issue__vol'))
        if volume:
            self.timefmt = ' [%s]' % self.tag_to_string(volume).strip().replace('|', ' | ')

        details = soup.find(attrs={'id': 'journal-issue-details'})
        if details:
            self.description = self.tag_to_string(details).strip()

        cover = soup.find(**classes('cover-image__image'))
        cover_img = cover.img if cover else None
        # A cover container without a usable <img src> must not abort the run.
        if cover_img is not None and cover_img.get('src'):
            self.cover_url = absurl('/cdn-cgi/image/width=800' + cover_img['src'])

        feeds = []
        for sec in soup.findAll('section', **prefixed_classes('toc__section')):
            heading = sec.find(**classes('sidebar-article-title--decorated'))
            section = self.tag_to_string(heading).strip()
            self.log(section)

            articles = []
            for card in sec.findAll(**classes('card-header')):
                title_tag = card.find(**classes('article-title'))
                link = title_tag.a if title_tag is not None else None
                # Skip cards that carry no linked title instead of crashing
                # the whole download on a single malformed entry.
                if link is None or not link.get('href'):
                    continue
                url = absurl(link['href'])
                title = self.tag_to_string(title_tag).strip()
                desc = ''
                meta = card.find(**classes('card-meta'))
                if meta:
                    desc = self.tag_to_string(meta).strip()
                self.log(' ', title, '\n\t', desc, '\n\t\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            feeds.append((section, articles))
        return feeds