mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update The Wall Street Journal
This commit is contained in:
parent
ebbc4b6a76
commit
0103bbc57f
@ -67,6 +67,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
dict(name='span', attrs={
|
dict(name='span', attrs={
|
||||||
'data-country-code': True, 'data-ticker-code': True}),
|
'data-country-code': True, 'data-ticker-code': True}),
|
||||||
dict(name='meta link'.split()),
|
dict(name='meta link'.split()),
|
||||||
|
dict(name='button'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_soup(self, soup):
|
def preprocess_soup(self, soup):
|
||||||
@ -168,14 +169,19 @@ class WSJ(BasicNewsRecipe):
|
|||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
root = self.index_to_soup(url, as_tree=True)
|
root = self.index_to_soup(url, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
|
# from lxml import etree
|
||||||
|
# from calibre.utils.ipython import ipython
|
||||||
|
# open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
|
||||||
|
# ipython({'root': root, 'CSSSelect': CSSSelect})
|
||||||
|
# raise SystemExit(1)
|
||||||
articles = []
|
articles = []
|
||||||
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
|
for container in CSSSelect('article[class^="WSJTheme--story--"]'):
|
||||||
heading = next(CSSSelect('h2, h3', container))
|
heading = next(CSSSelect('h2, h3', container))
|
||||||
a = next(CSSSelect('a', heading))
|
a = next(CSSSelect('a', heading))
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
|
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
|
||||||
q = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' in q:
|
if 'Subscriber Content' in q:
|
||||||
continue
|
continue
|
||||||
@ -187,6 +193,8 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc)
|
||||||
|
if self.test and len(articles) >= self.test[1]:
|
||||||
|
break
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
@ -238,16 +246,15 @@ class WSJ(BasicNewsRecipe):
|
|||||||
break
|
break
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
|
for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
|
||||||
frontpage = a.get('href').endswith('frontpage')
|
|
||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
self.log('Found section:', title, 'at', url)
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if self.test and len(feeds) >= self.test[0]:
|
||||||
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
break
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def test_wsj_index(self):
|
def test_wsj_index(self):
|
||||||
|
@ -67,6 +67,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
dict(name='span', attrs={
|
dict(name='span', attrs={
|
||||||
'data-country-code': True, 'data-ticker-code': True}),
|
'data-country-code': True, 'data-ticker-code': True}),
|
||||||
dict(name='meta link'.split()),
|
dict(name='meta link'.split()),
|
||||||
|
dict(name='button'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_soup(self, soup):
|
def preprocess_soup(self, soup):
|
||||||
@ -168,14 +169,19 @@ class WSJ(BasicNewsRecipe):
|
|||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
root = self.index_to_soup(url, as_tree=True)
|
root = self.index_to_soup(url, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
|
# from lxml import etree
|
||||||
|
# from calibre.utils.ipython import ipython
|
||||||
|
# open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
|
||||||
|
# ipython({'root': root, 'CSSSelect': CSSSelect})
|
||||||
|
# raise SystemExit(1)
|
||||||
articles = []
|
articles = []
|
||||||
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
|
for container in CSSSelect('article[class^="WSJTheme--story--"]'):
|
||||||
heading = next(CSSSelect('h2, h3', container))
|
heading = next(CSSSelect('h2, h3', container))
|
||||||
a = next(CSSSelect('a', heading))
|
a = next(CSSSelect('a', heading))
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
|
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
|
||||||
q = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' in q:
|
if 'Subscriber Content' in q:
|
||||||
continue
|
continue
|
||||||
@ -187,6 +193,8 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
self.log('\t\t', desc)
|
self.log('\t\t', desc)
|
||||||
|
if self.test and len(articles) >= self.test[1]:
|
||||||
|
break
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
@ -238,16 +246,15 @@ class WSJ(BasicNewsRecipe):
|
|||||||
break
|
break
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
|
for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
|
||||||
frontpage = a.get('href').endswith('frontpage')
|
|
||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
self.log('Found section:', title, 'at', url)
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if self.test and len(feeds) >= self.test[0]:
|
||||||
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
break
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def test_wsj_index(self):
|
def test_wsj_index(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user