From 450b64a6f85f1a78edbcb7db4f6d8aa2a466d080 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 6 May 2020 07:46:43 +0530 Subject: [PATCH] Update Washington Post --- recipes/wash_post.recipe | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/recipes/wash_post.recipe b/recipes/wash_post.recipe index c89342c001..91e3d1f251 100644 --- a/recipes/wash_post.recipe +++ b/recipes/wash_post.recipe @@ -29,11 +29,13 @@ class TheWashingtonPost(BasicNewsRecipe): publication_type = 'newspaper' keep_only_tags = [ - dict(itemprop=['headline', 'articleBody']), + dict(name=['h1', 'figure']), + classes('byline article-body'), ] remove_tags = [ - dict(name=['meta', 'link']), - classes('inline-video'), + dict(name=['meta', 'link']), + classes('inline-video author-tooltip author-image'), + dict(attrs={'data-qa': 'article-body-ad'}), ] feeds = [ @@ -51,6 +53,6 @@ class TheWashingtonPost(BasicNewsRecipe): ] def preprocess_html(self, soup, *a): - for img in soup.findAll('img', attrs={'data-low-res-src': True}): - img['src'] = img['data-low-res-src'] + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[0] return soup