Update WSJ

This commit is contained in:
Kovid Goyal 2022-08-16 07:44:52 +05:30
parent 63f66f0f38
commit 0d4ee405c3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 46 additions and 14 deletions

View File

@ -33,6 +33,19 @@ except ImportError:
needs_subscription = True needs_subscription = True
def substring_classes(classes):
    """Return a tag-selector dict matching class names by substring.

    ``classes`` is a whitespace-separated list of fragments. The result has
    the shape ``{'attrs': {'class': matcher}}`` and is used as an entry in
    ``keep_only_tags``. ``matcher(x)`` returns True when any
    whitespace-separated token of ``x`` contains at least one fragment as a
    substring, and False otherwise (including when ``x`` is empty or None).

    NOTE(review): ``x`` is presumably the element's class attribute string
    as supplied by the HTML parser -- confirm against the caller.
    """
    # split() with no argument discards empty tokens. An empty fragment must
    # never be kept: '' is a substring of every string, so it would make the
    # matcher accept every element that has any class at all.
    fragments = frozenset(classes.split())

    def matcher(x):
        if not x:
            return False
        return any(
            fragment in candidate
            for candidate in frozenset(x.split())
            for fragment in fragments
        )

    return {'attrs': {'class': matcher}}
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
WSJ_ITP = 'https://www.wsj.com/print-edition/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
keep_only_tags = [ keep_only_tags = [
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')), dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
dict(name='span', itemprop='author', rel='author'), dict(name='span', itemprop='author', rel='author'),
dict(name='article', id='article-contents articleBody'.split()), dict(name='article', id='article-contents articleBody'.split()),
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()), dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
classes('nc-exp-artbody errorNotFound'), classes('nc-exp-artbody errorNotFound'),
dict(attrs={'data-module-zone': 'article_snippet'}), dict(attrs={'data-module-zone': 'article_snippet'}),
prefixed_classes( prefixed_classes(
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
return br return br
# }}} # }}}
def abs_wsj_url(self, href): def abs_wsj_url(self, href, modify_query=True):
if not href.startswith('http'): if not href.startswith('http'):
href = 'https://www.wsj.com' + href href = 'https://www.wsj.com' + href
if modify_query:
href = href.split('?')[0] + '?mod=djemalertNEWS'
return href return href
def wsj_find_articles(self, url, ahed=False): def wsj_find_articles(self, url, ahed=False):
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
articles.append({'title': title, 'url': url, articles.append({'title': title, 'url': url,
'description': desc, 'date': ''}) 'description': desc, 'date': ''})
self.log('\tFound article:', title) self.log('\tFound article:', title)
self.log('\t\t', desc) self.log('\t\t', desc + " " + url)
if self.test and len(articles) >= self.test[1]: if self.test and len(articles) >= self.test[1]:
break break
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
'description': desc, 'date': ''}) 'description': desc, 'date': ''})
self.log('\tFound WN article:', title) self.log('\tFound WN article:', title)
self.log('\t\t', desc) self.log('\t\t', desc + " " + url)
return articles return articles
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title: if not title:
continue continue
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'), modify_query=False)
self.log('Found section:', title, 'at', url) self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url) self.wsj_add_feed(feeds, title, url)
if frontpage: if frontpage:
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
def test_wsj_index(self): def test_wsj_index(self):
return [ return [
('Testing', [ ('Testing', [
{'title': 'Article One', {'title': 'Subscriber Article',
'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'}, # noqa 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
]), ]),
] ]

View File

@ -33,6 +33,19 @@ except ImportError:
needs_subscription = False needs_subscription = False
def substring_classes(classes):
    """Return a tag-selector dict matching class names by substring.

    ``classes`` is a whitespace-separated list of fragments. The result has
    the shape ``{'attrs': {'class': matcher}}`` and is used as an entry in
    ``keep_only_tags``. ``matcher(x)`` returns True when any
    whitespace-separated token of ``x`` contains at least one fragment as a
    substring, and False otherwise (including when ``x`` is empty or None).

    NOTE(review): ``x`` is presumably the element's class attribute string
    as supplied by the HTML parser -- confirm against the caller.
    """
    # split() with no argument discards empty tokens. An empty fragment must
    # never be kept: '' is a substring of every string, so it would make the
    # matcher accept every element that has any class at all.
    fragments = frozenset(classes.split())

    def matcher(x):
        if not x:
            return False
        return any(
            fragment in candidate
            for candidate in frozenset(x.split())
            for fragment in fragments
        )

    return {'attrs': {'class': matcher}}
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
@ -54,10 +67,11 @@ class WSJ(BasicNewsRecipe):
WSJ_ITP = 'https://www.wsj.com/print-edition/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
keep_only_tags = [ keep_only_tags = [
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')), dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
dict(name='span', itemprop='author', rel='author'), dict(name='span', itemprop='author', rel='author'),
dict(name='article', id='article-contents articleBody'.split()), dict(name='article', id='article-contents articleBody'.split()),
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()), dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
classes('nc-exp-artbody errorNotFound'), classes('nc-exp-artbody errorNotFound'),
dict(attrs={'data-module-zone': 'article_snippet'}), dict(attrs={'data-module-zone': 'article_snippet'}),
prefixed_classes( prefixed_classes(
@ -199,9 +213,11 @@ class WSJ(BasicNewsRecipe):
return br return br
# }}} # }}}
def abs_wsj_url(self, href): def abs_wsj_url(self, href, modify_query=True):
if not href.startswith('http'): if not href.startswith('http'):
href = 'https://www.wsj.com' + href href = 'https://www.wsj.com' + href
if modify_query:
href = href.split('?')[0] + '?mod=djemalertNEWS'
return href return href
def wsj_find_articles(self, url, ahed=False): def wsj_find_articles(self, url, ahed=False):
@ -224,7 +240,7 @@ class WSJ(BasicNewsRecipe):
articles.append({'title': title, 'url': url, articles.append({'title': title, 'url': url,
'description': desc, 'date': ''}) 'description': desc, 'date': ''})
self.log('\tFound article:', title) self.log('\tFound article:', title)
self.log('\t\t', desc) self.log('\t\t', desc + " " + url)
if self.test and len(articles) >= self.test[1]: if self.test and len(articles) >= self.test[1]:
break break
@ -250,7 +266,7 @@ class WSJ(BasicNewsRecipe):
'description': desc, 'date': ''}) 'description': desc, 'date': ''})
self.log('\tFound WN article:', title) self.log('\tFound WN article:', title)
self.log('\t\t', desc) self.log('\t\t', desc + " " + url)
return articles return articles
@ -290,7 +306,7 @@ class WSJ(BasicNewsRecipe):
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title: if not title:
continue continue
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'), modify_query=False)
self.log('Found section:', title, 'at', url) self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url) self.wsj_add_feed(feeds, title, url)
if frontpage: if frontpage:
@ -304,7 +320,7 @@ class WSJ(BasicNewsRecipe):
def test_wsj_index(self): def test_wsj_index(self):
return [ return [
('Testing', [ ('Testing', [
{'title': 'Article One', {'title': 'Subscriber Article',
'url': 'https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800'}, # noqa 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
]), ]),
] ]