From 44ea182c2b9059f28936632dbc8d2371bfb5d17e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 14 Aug 2024 12:58:16 +0530 Subject: [PATCH] Science Journal by unkn0wn --- recipes/science_journal.recipe | 85 ++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 recipes/science_journal.recipe diff --git a/recipes/science_journal.recipe b/recipes/science_journal.recipe new file mode 100644 index 0000000000..a7b37215f9 --- /dev/null +++ b/recipes/science_journal.recipe @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +from calibre.web.feeds.news import BasicNewsRecipe, classes + + +def absurl(url): + if url.startswith('/'): + url = 'https://www.science.org' + url + return url + + + +class science(BasicNewsRecipe): + title = 'Science Journal' + __author__ = 'unkn0wn' + description = ( + 'Science continues to publish the very best in research across the sciences, with articles that ' + 'consistently rank among the most cited in the world.' + ) + encoding = 'utf-8' + no_javascript = True + no_stylesheets = True + remove_attributes = ['style', 'height', 'width'] + language = 'en' + extra_css = ''' + .news-article__figure__caption {font-size:small; text-align:center;} + .contributors, .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta, + .news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;} + img {display:block; margin:0 auto;} + .core-lede {font-style:italic; color:#202020;} + ''' + + ignore_duplicate_articles = {'url'} + + keep_only_tags = [ + classes('meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'), + dict(name='h1', attrs={'property':'name'}), + classes('core-lede contributors core-self-citation'), + dict(attrs={'data-core-wrapper':'content'}) + ] + + remove_tags = [ + classes('pb-ad') + ] + browser_type = 'qt' + simultaneous_downloads = 1 + + def preprocess_html(self, soup): + for p in soup.findAll(attrs={'role':'paragraph'}): + p.name = 'p' + return soup + + def parse_index(self): + url = 'https://www.science.org/toc/science/current' + + soup = self.index_to_soup(url) + tme = soup.find(**classes('journal-issue__vol')) + if tme: + self.timefmt = ' [%s]' % self.tag_to_string(tme).strip() + det = soup.find(attrs={'id':'journal-issue-details'}) + if det: + self.description = self.tag_to_string(det).strip() + + feeds = [] + + div = soup.find('div', attrs={'class':'toc__body'}) + for sec in div.findAll('section', **classes('toc__section')): + name = sec.find(**classes('sidebar-article-title--decorated')) + section = self.tag_to_string(name).strip() + self.log(section) + + articles = [] + + for card in sec.findAll(**classes('card-header')): + ti = card.find(**classes('article-title')) + url = absurl(ti.a['href']) + title = self.tag_to_string(ti).strip() + desc = '' + meta = card.find(**classes('card-meta')) + if meta: + desc = self.tag_to_string(meta).strip() + self.log(' ', title, '\n\t', desc, '\n\t', url) + articles.append({'title': title, 'description':desc, 'url': url}) + feeds.append((section, articles)) + return feeds