Just use the main container for WSJ extraction

This commit is contained in:
Kovid Goyal 2023-09-16 12:53:59 +05:30
parent 14a315c4d2
commit 04dfdaf8cf
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 12 additions and 8 deletions

View File

@@ -66,14 +66,16 @@ class WSJ(BasicNewsRecipe):
''' '''
keep_only_tags = [ keep_only_tags = [
classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero article-container'), dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
dict(name='section', attrs={'subscriptions-section':'content'}) dict(name='main'),
] ]
remove_tags = [ remove_tags = [
classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'), classes(
'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
dict(role=["toolbar", "complementary"]), dict(role=["toolbar", "complementary"]),
dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next"]}), dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
dict(name='amp-iframe'), # interactive graphics dict(name='amp-iframe'), # interactive graphics
] ]

View File

@@ -66,14 +66,16 @@ class WSJ(BasicNewsRecipe):
''' '''
keep_only_tags = [ keep_only_tags = [
classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero article-container'), dict(attrs={'class': lambda x: x and 'HeadlineContainer' in ''.join(x)}),
dict(name='section', attrs={'subscriptions-section':'content'}) dict(name='main'),
] ]
remove_tags = [ remove_tags = [
classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'), classes(
'wsj-ad newsletter-inset media-object-video media-object-podcast print-header article-body-tools'
' podcast--iframe dynamic-inset-overflow-button snippet-logo'),
dict(role=["toolbar", "complementary"]), dict(role=["toolbar", "complementary"]),
dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next"]}), dict(attrs={"aria-label": ["Sponsored Offers", "What to Read Next", "breadcrumbs", "Listen To Article"]}),
dict(name='amp-iframe'), # interactive graphics dict(name='amp-iframe'), # interactive graphics
] ]