Update Washington Post

This commit is contained in:
Kovid Goyal 2020-10-24 06:55:15 +05:30
parent 0a2829d41c
commit 7fbd939435
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -26,7 +26,9 @@ class TheWashingtonPost(BasicNewsRecipe):
use_embedded_content = False use_embedded_content = False
language = 'en' language = 'en'
remove_empty_feeds = True remove_empty_feeds = True
ignore_duplicate_articles = {'url'}
publication_type = 'newspaper' publication_type = 'newspaper'
remove_attributes = ['style', 'width', 'height']
keep_only_tags = [ keep_only_tags = [
dict(name=['h1', 'figure']), dict(name=['h1', 'figure']),
@@ -34,8 +36,8 @@ class TheWashingtonPost(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
dict(name=['meta', 'link']), dict(name=['meta', 'link']),
classes('inline-video author-tooltip author-image'), classes('inline-video author-tooltip author-image powa-wrapper'),
dict(attrs={'data-qa': 'article-body-ad'}), dict(attrs={'data-qa': ['article-body-ad', 'subscribe-promo', 'interstitial-link-wrapper']}),
] ]
feeds = [ feeds = [
@@ -53,6 +55,8 @@ class TheWashingtonPost(BasicNewsRecipe):
] ]
def preprocess_html(self, soup, *a): def preprocess_html(self, soup, *a):
for img in soup.findAll('img', srcset=True): for img in soup.findAll('img', src=True):
img['src'] = img['srcset'].split()[0] src = img['src']
if src.endswith('&w=32'):
img['src'] = src[:-2] + '440'
return soup return soup