From 04dfdaf8cf84d6514d9f1e002c5305f9a1b4e721 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 16 Sep 2023 12:53:59 +0530 Subject: [PATCH] Just use the main container for WSJ extraction --- recipes/wsj.recipe | 10 ++++++---- recipes/wsj_free.recipe | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index f9de630b5c..9d03947c04 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -66,14 +66,16 @@ class WSJ(BasicNewsRecipe): ''' keep_only_tags = [ - classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero article-container'), - dict(name='section', attrs={'subscriptions-section':'content'}) + dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}), + dict(name='main'), ] remove_tags = [ - classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'), + classes( + 'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools' + ' podcast--iframe dynamic-inset-overflow-button snippet-logo'), dict(role=["toolbar", "complementary"]), - dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next"]}), + dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}), dict(name='amp-iframe'), # interactive graphics ] diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 0653cb8534..5347075721 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -66,14 +66,16 @@ class WSJ(BasicNewsRecipe): ''' keep_only_tags = [ - classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero article-container'), - dict(name='section', attrs={'subscriptions-section':'content'}) + dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}), + dict(name='main'), ] remove_tags = [ - classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'), + classes( + 'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools' + ' podcast--iframe dynamic-inset-overflow-button snippet-logo'), dict(role=["toolbar", "complementary"]), - dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next"]}), + dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}), dict(name='amp-iframe'), # interactive graphics ]