#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import os
from collections import defaultdict

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

# Prefix -> namespace URI map for the namespaced RSS children read below
# (``dc:creator`` and ``content:encoded``).
_RSS_NAMESPACES = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
}

# Page written to disk for every feed item.
# NOTE(review): as visible here the template holds only newlines and
# placeholders; if it originally carried HTML markup, that markup is not
# present in this copy -- confirm against the upstream recipe.
_ARTICLE_TEMPLATE = '\n\n{title}\n\n{desc}\n\n{author}\n\n{content}\n\n'


def _child_text(item, path):
    """Return the text of the first child of *item* matching *path*.

    Returns an empty string when the child is missing or has no text, so
    callers never have to deal with ``None``.
    """
    node = item.find(path, _RSS_NAMESPACES)
    if node is None or node.text is None:
        return ''
    return node.text


class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Recipe that builds its article index from the Popular Science RSS feed."""

    title = 'Popular Science'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Popular Science'
    publisher = 'Popular Science'
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Build the article index from the RSS feed.

        Every ``item`` element of the feed is rendered through
        ``_ARTICLE_TEMPLATE`` (title, description, author and the
        ``content:encoded`` payload), written to its own temporary file and
        listed under the section named by its ``category`` element.
        Missing or empty elements are treated as empty strings; items
        without a category are listed under the recipe title.

        Returns a list of ``(section, articles)`` tuples sorted by section
        name.  Each article is a dict with the keys ``title``,
        ``description`` and ``url``, where ``url`` is a ``file:///`` URL
        pointing at the temporary file.
        """
        from lxml import etree

        raw = self.index_to_soup('https://www.popsci.com/arcio/rss/', raw=True)
        root = etree.fromstring(raw)
        ans = defaultdict(list)
        for item in root.iterdescendants('item'):
            title = _child_text(item, 'title')
            desc = _child_text(item, 'description')
            author = _child_text(item, 'dc:creator')
            # A None/absent category would break grouping and sorting, so
            # fall back to the recipe title for uncategorised items.
            section = _child_text(item, 'category') or self.title
            content = _child_text(item, 'content:encoded')
            page = _ARTICLE_TEMPLATE.format(
                title=title, author=author, desc=desc, content=content)
            if author:
                desc += ' by ' + author
            with PersistentTemporaryFile('popsci.html') as pt:
                pt.write(page.encode('utf-8'))
            # pt.name remains available after the file has been closed.
            ans[section].append({
                'title': title,
                'description': desc,
                'url': 'file:///' + pt.name.replace(os.sep, '/'),
            })
        return [(sec, ans[sec]) for sec in sorted(ans)]

    def preprocess_html(self, soup):
        """Copy ``data-medsrc`` into ``src`` on every ``img`` that has it.

        The *soup* is modified in place and returned.
        """
        for img in soup.findAll('img', attrs={'data-medsrc': True}):
            img['src'] = img['data-medsrc']
        return soup