This commit is contained in:
Kovid Goyal 2023-10-26 07:18:19 +05:30
commit 190bf0f1df
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 28 additions and 16 deletions

View File

@ -59,17 +59,23 @@ class WSJ(BasicNewsRecipe):
dict(name=['h1', 'h2']), dict(name=['h1', 'h2']),
dict(attrs={'aria-describedby':'big-top-caption'}), dict(attrs={'aria-describedby':'big-top-caption'}),
dict(attrs={'id':'big-top-caption'}), dict(attrs={'id':'big-top-caption'}),
dict(name='article') dict(name='article', attrs={'style':lambda x: x and 'article-body' in x})
] ]
remove_tags = [ remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget', 'old-script']),
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar', 'Conversation', 'List of Comments']}),
dict(attrs={'data-type':'inset'}), dict(attrs={'data-type':'inset'}),
dict(attrs={'data-testid':'ad-container'}), dict(attrs={'data-testid':'ad-container'}),
dict(attrs={'data-spotim-app':'conversation'}), dict(attrs={'data-spotim-app':'conversation'}),
dict(attrs={'data-spot-im-class':['message-text', 'messages-list', 'conversation-root']}), dict(name=['button', 'svg', 'old-script', 'video']),
dict(attrs={'id':lambda x: x and x.startswith(('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-', 'article-comments-tool'))}), dict(attrs={'aria-label':[
'Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar',
'Conversation', 'List of Comments', 'Comment', 'JR More Articles'
]}),
dict(attrs={'data-spot-im-class':['message-text', 'messages-list', 'message-view', 'conversation-root']}),
dict(attrs={'id':lambda x: x and x.startswith(
('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-', 'article-comments-tool')
)}),
dict(name='div', attrs={'data-message-depth':True})
] ]
articles_are_obfuscated = True articles_are_obfuscated = True
@ -94,9 +100,6 @@ class WSJ(BasicNewsRecipe):
p.name = 'p' p.name = 'p'
for a in soup.findAll('a', href=True): for a in soup.findAll('a', href=True):
a['href'] = 'http' + a['href'].split('http')[-1] a['href'] = 'http' + a['href'].split('http')[-1]
for fig in soup.findAll('figure'):
if fig.find('video'):
fig.extract()
for figc in soup.findAll('figcaption'): for figc in soup.findAll('figcaption'):
figc['id'] = 'big-top-caption' figc['id'] = 'big-top-caption'
if name:= soup.find('h2', attrs={'itemprop':'name'}): if name:= soup.find('h2', attrs={'itemprop':'name'}):
@ -114,6 +117,9 @@ class WSJ(BasicNewsRecipe):
div.name = 'span' div.name = 'span'
if parent := auth.findParent('div'): if parent := auth.findParent('div'):
parent['class'] = 'auth' parent['class'] = 'auth'
for x in soup.findAll('ufc-follow-author-widget'):
if y := x.findParent('div'):
y.extract()
return soup return soup
# login {{{ # login {{{

View File

@ -59,17 +59,23 @@ class WSJ(BasicNewsRecipe):
dict(name=['h1', 'h2']), dict(name=['h1', 'h2']),
dict(attrs={'aria-describedby':'big-top-caption'}), dict(attrs={'aria-describedby':'big-top-caption'}),
dict(attrs={'id':'big-top-caption'}), dict(attrs={'id':'big-top-caption'}),
dict(name='article') dict(name='article', attrs={'style':lambda x: x and 'article-body' in x})
] ]
remove_tags = [ remove_tags = [
dict(name=['button', 'svg', 'ufc-follow-author-widget', 'old-script']),
dict(attrs={'aria-label':['Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar', 'Conversation', 'List of Comments']}),
dict(attrs={'data-type':'inset'}), dict(attrs={'data-type':'inset'}),
dict(attrs={'data-testid':'ad-container'}), dict(attrs={'data-testid':'ad-container'}),
dict(attrs={'data-spotim-app':'conversation'}), dict(attrs={'data-spotim-app':'conversation'}),
dict(attrs={'data-spot-im-class':['message-text', 'messages-list', 'conversation-root']}), dict(name=['button', 'svg', 'old-script', 'video']),
dict(attrs={'id':lambda x: x and x.startswith(('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-', 'article-comments-tool'))}), dict(attrs={'aria-label':[
'Sponsored Offers', 'Listen To Article', 'What to Read Next', 'Utility Bar',
'Conversation', 'List of Comments', 'Comment', 'JR More Articles'
]}),
dict(attrs={'data-spot-im-class':['message-text', 'messages-list', 'message-view', 'conversation-root']}),
dict(attrs={'id':lambda x: x and x.startswith(
('comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-', 'article-comments-tool')
)}),
dict(name='div', attrs={'data-message-depth':True})
] ]
articles_are_obfuscated = True articles_are_obfuscated = True
@ -94,9 +100,6 @@ class WSJ(BasicNewsRecipe):
p.name = 'p' p.name = 'p'
for a in soup.findAll('a', href=True): for a in soup.findAll('a', href=True):
a['href'] = 'http' + a['href'].split('http')[-1] a['href'] = 'http' + a['href'].split('http')[-1]
for fig in soup.findAll('figure'):
if fig.find('video'):
fig.extract()
for figc in soup.findAll('figcaption'): for figc in soup.findAll('figcaption'):
figc['id'] = 'big-top-caption' figc['id'] = 'big-top-caption'
if name:= soup.find('h2', attrs={'itemprop':'name'}): if name:= soup.find('h2', attrs={'itemprop':'name'}):
@ -114,6 +117,9 @@ class WSJ(BasicNewsRecipe):
div.name = 'span' div.name = 'span'
if parent := auth.findParent('div'): if parent := auth.findParent('div'):
parent['class'] = 'auth' parent['class'] = 'auth'
for x in soup.findAll('ufc-follow-author-widget'):
if y := x.findParent('div'):
y.extract()
return soup return soup
# login {{{ # login {{{