Update The Wall Street Journal

Kovid Goyal 2021-01-20 21:20:49 +05:30
parent b0555e1cfe
commit 310f92b868
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 26 deletions

View File

@@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe):
             ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
         dict(name='span', attrs={
             'data-country-code': True, 'data-ticker-code': True}),
-        dict(name='meta link'.split()),
-        dict(name='button'),
+        dict(name='meta link button'.split()),
     ]

     def preprocess_soup(self, soup):
@@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe):
     def wsj_find_articles(self, url, ahed=False):
         root = self.index_to_soup(url, as_tree=True)
         CSSSelect = Select(root)
-        # from lxml import etree
-        # from calibre.utils.ipython import ipython
-        # open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
-        # ipython({'root': root, 'CSSSelect': CSSSelect})
-        # raise SystemExit(1)
         articles = []
-        for container in CSSSelect('article[class^="WSJTheme--story--"]'):
+        for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
             heading = next(CSSSelect('h2, h3', container))
             a = next(CSSSelect('a', heading))
             title = self.tag_to_string(a)
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
-            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
+            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
                 q = self.tag_to_string(p)
                 if 'Subscriber Content' in q:
                     continue
@@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe):
             articles.append({'title': title, 'url': url,
                              'description': desc, 'date': ''})
-            if self.test and len(articles) >= self.test[1]:
-                break
             self.log('\tFound article:', title)
             self.log('\t\t', desc)
+            if self.test and len(articles) >= self.test[1]:
+                break

         return articles

     def wsj_find_wn_articles(self, feeds, root, CSSSelect):
         articles = []
-        for a in CSSSelect('.style--strap--3DsLojSy'):
+        for a in CSSSelect('.style--strap--ND8Cuaip'):
             if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
                 whats_news = a.getparent()
                 break
@@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe):
                 break

         feeds = []
-        for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
+        for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
+            frontpage = a.get('href').endswith('frontpage')
             title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
             if not title:
                 continue
             url = self.abs_wsj_url(a.get('href'))
             self.log('Found section:', title, 'at', url)
             self.wsj_add_feed(feeds, title, url)
+            if frontpage:
+                self.wsj_find_wn_articles(feeds, root, CSSSelect)
             if self.test and len(feeds) >= self.test[0]:
                 break
         return feeds
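
The recurring pattern in these hunks is the switch from CSS prefix selectors such as article[class^="WSJTheme--story--"] to XPath contains(@class, ...) tests on the stable prefix of WSJ's hashed class names. A minimal standalone sketch of that matching, assuming plain lxml and invented sample markup (the recipe itself works on the tree returned by index_to_soup, via calibre's css_selectors.Select):

# Sketch only, not from the commit: sample HTML and hash suffixes are made up.
from lxml import html

doc = html.fromstring('''
<div class="WSJTheme--list-item--3Ha5Hv9b">
  <h3><a href="/articles/example">Headline</a></h3>
  <p class="WSJTheme--description--1MhUzQrn">Summary text</p>
</div>
''')

# contains(@class, "WSJTheme--list-item-") keeps matching even when WSJ's
# build regenerates the trailing hash in the class name.
for container in doc.xpath('descendant-or-self::div[contains(@class, "WSJTheme--list-item-")]'):
    a = container.xpath('descendant::h3/a')[0]
    print(a.text_content(), '->', a.get('href'))
    for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
        print('   ', p.text_content())

Because contains() is plain substring matching, this only stays safe while the WSJTheme--list-item- prefix is unique among class names; the hashed suffix can then change between WSJ deployments without breaking the recipe.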

View File

@@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe):
             ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
         dict(name='span', attrs={
             'data-country-code': True, 'data-ticker-code': True}),
-        dict(name='meta link'.split()),
-        dict(name='button'),
+        dict(name='meta link button'.split()),
     ]

     def preprocess_soup(self, soup):
@@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe):
     def wsj_find_articles(self, url, ahed=False):
         root = self.index_to_soup(url, as_tree=True)
         CSSSelect = Select(root)
-        # from lxml import etree
-        # from calibre.utils.ipython import ipython
-        # open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
-        # ipython({'root': root, 'CSSSelect': CSSSelect})
-        # raise SystemExit(1)
         articles = []
-        for container in CSSSelect('article[class^="WSJTheme--story--"]'):
+        for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
             heading = next(CSSSelect('h2, h3', container))
             a = next(CSSSelect('a', heading))
             title = self.tag_to_string(a)
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
-            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
+            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
                 q = self.tag_to_string(p)
                 if 'Subscriber Content' in q:
                     continue
@@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe):
             articles.append({'title': title, 'url': url,
                              'description': desc, 'date': ''})
-            if self.test and len(articles) >= self.test[1]:
-                break
             self.log('\tFound article:', title)
             self.log('\t\t', desc)
+            if self.test and len(articles) >= self.test[1]:
+                break

         return articles

     def wsj_find_wn_articles(self, feeds, root, CSSSelect):
         articles = []
-        for a in CSSSelect('.style--strap--3DsLojSy'):
+        for a in CSSSelect('.style--strap--ND8Cuaip'):
             if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
                 whats_news = a.getparent()
                 break
@@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe):
                 break

         feeds = []
-        for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
+        for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
+            frontpage = a.get('href').endswith('frontpage')
             title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
             if not title:
                 continue
             url = self.abs_wsj_url(a.get('href'))
             self.log('Found section:', title, 'at', url)
             self.wsj_add_feed(feeds, title, url)
+            if frontpage:
+                self.wsj_find_wn_articles(feeds, root, CSSSelect)
             if self.test and len(feeds) >= self.test[0]:
                 break
         return feeds
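
The other behavioural change is in the section loop above: each navigation link's href is now tested for a 'frontpage' suffix, and the What's News extraction runs only for that front-page section. A rough sketch of the loop's shape, assuming lxml with the cssselect package and invented nav markup; print() stands in for the recipe's wsj_add_feed and wsj_find_wn_articles helpers:

# Sketch only: the nav markup below is fabricated for illustration.
from lxml import html

nav = html.fromstring('''
<div class="WSJTheme--nav-container--2qF6xQnZ">
  <a class="WSJTheme--section-link--3VznjgTM" href="/print-edition/frontpage">Front Page</a>
  <a class="WSJTheme--section-link--3VznjgTM" href="/news/us">U.S. News</a>
</div>
''')

for a in nav.cssselect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
    frontpage = a.get('href').endswith('frontpage')
    # capitalize() lower-cases everything after the first letter, which is
    # why the recipe follows it with replace() to restore 'U.S.'
    title = a.text_content().capitalize().strip().replace('U.s.', 'U.S.')
    if not title:
        continue
    print('section:', title, '| frontpage:', frontpage)

In the recipe itself the frontpage branch appends the What's News articles to the same feeds list that wsj_add_feed fills, so the front page section carries both its own links and the What's News digest.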