diff --git a/recipes/economist.recipe b/recipes/economist.recipe index a545885edb..67f0dd59aa 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -59,23 +59,26 @@ class Economist(BasicNewsRecipe): oldest_article = 7.0 resolve_internal_links = True remove_tags = [ - dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']), - dict( - attrs={ + dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']), + dict(attrs={ 'class': [ 'dblClkTrk', 'ec-article-info', 'share_inline_header', - 'related-items', 'main-content-container', 'ec-topic-widget' + 'related-items', 'main-content-container', 'ec-topic-widget', + 'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label', + 'blog-post__foot-note' ] } ), - { - 'class': lambda x: x and 'share-links-header' in x - }, + dict(attrs={'class': lambda x: x and 'share-links-header' in x.split()}), + dict(attrs={'class': lambda x: x and 'teaser--wrapped' in x.split()}), ] keep_only_tags = [dict(name='article', id=lambda x: not x)] no_stylesheets = True - preprocess_regexps = [(re.compile('.*', re.DOTALL), lambda x: '')] - + preprocess_regexps = [ + (re.compile('.*', re.DOTALL), lambda x: ''), + (re.compile('

',re.DOTALL|re.IGNORECASE), lambda x: '


'), + (re.compile('',re.DOTALL|re.IGNORECASE), lambda x: '

') + ] # economist.com has started throttling after about 60% of the total has # downloaded with connection reset by peer (104) errors. delay = 1