#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS classes.

    ``classes`` is a space-separated string of class names. The returned
    dict has a single ``attrs`` key whose ``class`` matcher is truthy when
    an element's class string shares at least one whitespace-separated
    token with the requested names. It is meant to be unpacked into a
    soup query, e.g. ``soup.find(**classes('a b'))``.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Recipe that collects articles from popsci.com section pages."""

    title = 'Popular Science'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Popular Science'
    publisher = 'Popular Science'
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'url'}
    no_stylesheets = True

    keep_only_tags = [
        classes('Article-header Article-excerpt Article-author Article-thumbnail Article-bodyText article-title article-dek article-paragraph articlebody'),
    ]
    remove_tags = [
        dict(name='section', attrs={'class': ['recurrent-share']})
    ]

    def parse_section_index(self, slug):
        """Yield article dicts for the section page identified by ``slug``.

        Each yielded dict has the keys ``title``, ``url`` and
        ``description``. Nothing is yielded when the section page has no
        recognisable content wrapper. Cards that lack an image link or a
        title element are skipped instead of aborting the whole section.
        """
        section_url = 'https://www.popsci.com/{}/'.format(slug)
        self.log('Section:', section_url)
        soup = self.index_to_soup(section_url)
        main = soup.find(**classes('category-content-wrapper lg:pb-12'))
        if main is None:
            return
        for card in main.findAll(**classes('card-post')):
            link = card.find('a', href=True, **classes('card-post-image-link'))
            title_box = card.find(**classes('card-post-title'))
            if link is None or title_box is None:
                # Without these nodes there is no URL or title to report;
                # dereferencing them would raise and lose every other card.
                self.log('  Skipping card without link or title')
                continue
            article_url = link['href']
            title = self.tag_to_string(title_box.find(**classes('mobile')))
            desc = ''
            dek = card.find(**classes('card-post-excerpt'))
            if dek is not None:
                desc = self.tag_to_string(dek)
            self.log(' ', title, article_url)
            yield {'title': title, 'url': article_url, 'description': desc}

    def parse_index(self):
        """Return a list of ``(section title, articles)`` tuples.

        Sections for which no articles were found are left out.
        """
        # A tuple of pairs (rather than a dict) keeps the section order
        # deterministic on every interpreter, including Python 2.
        section_slugs = (
            ('science', 'Science'),
            ('technology', 'Technology'),
            ('diy', 'DIY'),
            ('reviews', 'Reviews'),
        )
        sections = []
        for slug, title in section_slugs:
            self.log('slug:', slug)
            articles = list(self.parse_section_index(slug))
            if articles:
                sections.append((title, articles))
        return sections

    def preprocess_html(self, soup):
        """Copy each image's ``data-medsrc`` value into ``src``; return soup."""
        for img in soup.findAll('img', attrs={'data-medsrc': True}):
            img['src'] = img['data-medsrc']
        return soup