diff --git a/recipes/popscience.recipe b/recipes/popscience.recipe index 52a2c9b7cc..817da2f6c3 100644 --- a/recipes/popscience.recipe +++ b/recipes/popscience.recipe @@ -1,4 +1,5 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re class AdvancedUserRecipe1282101454(BasicNewsRecipe): @@ -12,19 +13,30 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe): no_stylesheets = True remove_javascript = True use_embedded_content = False + remove_empty_feeds = True + ignore_duplicate_articles = {'url'} feeds = [ - ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'), ('Cars', 'http://www.popsci.com/full-feed/cars'), ('Science', 'http://www.popsci.com/full-feed/science'), ('Technology', 'http://www.popsci.com/full-feed/technology'), ('DIY', 'http://www.popsci.com/full-feed/diy'), - + ('Animals', 'https://www.popsci.com/rss-animals.xml'), + ('Space', 'https://www.popsci.com/rss-space.xml'), + ('Environment', 'https://www.popsci.com/rss-environment.xml'), + ('Eastern Arsenal', 'https://www.popsci.com/rss-eastern-arsenal.xml'), ] + + pane_node_body = re.compile('pane-node-(?:\w+-){0,9}body') + keep_only_tags = [ - dict(attrs={'class': lambda x: x and { - 'pane-node-header', 'pane-node-body'} & set(x.split())}), + dict(attrs={'class': lambda x: x and frozenset('pane-node-header'.split()).issubset(frozenset(x.split()))}), + dict(attrs={'class': pane_node_body}), + ] + + remove_tags = [ + dict(attrs={'class': lambda x: x and frozenset('ads seperator'.split()).issubset(frozenset(x.split()))}), ] def preprocess_html(self, soup):