#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url: str) -> str:
    """Return *url* with the science.org origin prepended when it is site-relative.

    URLs that do not start with ``/`` are returned unchanged.
    """
    if url.startswith('/'):
        url = 'https://www.science.org' + url
    return url


class science(BasicNewsRecipe):
    """Recipe that builds an e-book from the current issue's table of contents on science.org."""

    title = 'Science Journal'
    __author__ = 'unkn0wn'
    description = (
        'Science continues to publish the very best in research across the sciences, with articles that '
        'consistently rank among the most cited in the world.'
    )
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    language = 'en'

    extra_css = '''
        .news-article__figure__caption {font-size:small; text-align:center;}
        .contributors, .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta,
        .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
        img {display:block; margin:0 auto;}
        .core-lede {font-style:italic; color:#202020;}
    '''

    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes('meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'),
        dict(name='h1', attrs={'property':'name'}),
        classes('core-lede contributors core-self-citation'),
        dict(attrs={'data-core-wrapper':'content'})
    ]

    remove_tags = [
        classes('pb-ad')
    ]

    browser_type = 'qt'
    # server returns invalid data on HTTP2 connections when multiple requests
    # are queued on the same connection
    simultaneous_downloads = 1

    def preprocess_html(self, soup):
        """Rename every element carrying ``role="paragraph"`` to ``<p>`` and return *soup*."""
        for p in soup.findAll(attrs={'role':'paragraph'}):
            p.name = 'p'
        return soup

    def parse_index(self):
        """Scrape the current-issue table of contents into a list of feeds.

        Side effects: sets ``self.timefmt`` from the ``journal-issue__vol``
        element and ``self.description`` from the ``journal-issue-details``
        element when those are present on the page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each article
            is a dict with the keys ``title``, ``description`` and ``url``.
            The list is empty when the page has no ``toc__body`` container.
        """
        index_url = 'https://www.science.org/toc/science/current'

        soup = self.index_to_soup(index_url)
        tme = soup.find(**classes('journal-issue__vol'))
        if tme:
            self.timefmt = ' [%s]' % self.tag_to_string(tme).strip()
        det = soup.find(attrs={'id':'journal-issue-details'})
        if det:
            self.description = self.tag_to_string(det).strip()

        feeds = []

        div = soup.find('div', attrs={'class':'toc__body'})
        if div is None:
            # Without the TOC container there is nothing to iterate; report it
            # instead of failing with an AttributeError on None.
            self.log.warn('Could not find the table of contents (toc__body) at', index_url)
            return feeds

        for sec in div.findAll('section', **classes('toc__section')):
            name = sec.find(**classes('sidebar-article-title--decorated'))
            section = self.tag_to_string(name).strip()
            self.log(section)

            articles = []

            for card in sec.findAll(**classes('card-header')):
                ti = card.find(**classes('article-title'))
                anchor = ti.a if ti is not None else None
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    # A card without a linked title cannot be downloaded; skip it
                    # rather than aborting the whole issue.
                    continue
                link = absurl(href)
                title = self.tag_to_string(ti).strip()
                desc = ''
                meta = card.find(**classes('card-meta'))
                if meta:
                    desc = self.tag_to_string(meta).strip()
                self.log(' ', title, '\n\t', desc, '\n\t', link)
                articles.append({'title': title, 'description':desc, 'url': link})
            feeds.append((section, articles))
        return feeds