diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index 652c305e75..12a28c9f55 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe): ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'), dict(name='span', attrs={ 'data-country-code': True, 'data-ticker-code': True}), - dict(name='meta link'.split()), - dict(name='button'), + dict(name='meta link button'.split()), ] def preprocess_soup(self, soup): @@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe): def wsj_find_articles(self, url, ahed=False): root = self.index_to_soup(url, as_tree=True) CSSSelect = Select(root) - # from lxml import etree - # from calibre.utils.ipython import ipython - # open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode')) - # ipython({'root': root, 'CSSSelect': CSSSelect}) - # raise SystemExit(1) articles = [] - for container in CSSSelect('article[class^="WSJTheme--story--"]'): + for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'): heading = next(CSSSelect('h2, h3', container)) a = next(CSSSelect('a', heading)) title = self.tag_to_string(a) url = self.abs_wsj_url(a.get('href')) desc = '' - for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'): + for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'): q = self.tag_to_string(p) if 'Subscriber Content' in q: continue @@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe): articles.append({'title': title, 'url': url, 'description': desc, 'date': ''}) + if self.test and len(articles) >= self.test[1]: + break self.log('\tFound article:', title) self.log('\t\t', desc) - if self.test and len(articles) >= self.test[1]: - break return articles def wsj_find_wn_articles(self, feeds, root, CSSSelect): articles = [] - for a in CSSSelect('.style--strap--3DsLojSy'): + for a in CSSSelect('.style--strap--ND8Cuaip'): if 'WHAT\'S NEWS' in self.tag_to_string(a).upper(): whats_news = a.getparent() break @@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe): break feeds = [] - for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'): + for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'): + frontpage = a.get('href').endswith('frontpage') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') if not title: continue url = self.abs_wsj_url(a.get('href')) self.log('Found section:', title, 'at', url) self.wsj_add_feed(feeds, title, url) + if frontpage: + self.wsj_find_wn_articles(feeds, root, CSSSelect) if self.test and len(feeds) >= self.test[0]: break return feeds diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 7de3162252..4fb026ef66 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe): ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'), dict(name='span', attrs={ 'data-country-code': True, 'data-ticker-code': True}), - dict(name='meta link'.split()), - dict(name='button'), + dict(name='meta link button'.split()), ] def preprocess_soup(self, soup): @@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe): def wsj_find_articles(self, url, ahed=False): root = self.index_to_soup(url, as_tree=True) CSSSelect = Select(root) - # from lxml import etree - # from calibre.utils.ipython import ipython - # open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode')) - # ipython({'root': root, 'CSSSelect': CSSSelect}) - # raise SystemExit(1) articles = [] - for container in CSSSelect('article[class^="WSJTheme--story--"]'): + for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'): heading = next(CSSSelect('h2, h3', container)) a = next(CSSSelect('a', heading)) title = self.tag_to_string(a) url = self.abs_wsj_url(a.get('href')) desc = '' - for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'): + for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'): q = self.tag_to_string(p) if 'Subscriber Content' in q: continue @@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe): articles.append({'title': title, 'url': url, 'description': desc, 'date': ''}) + if self.test and len(articles) >= self.test[1]: + break self.log('\tFound article:', title) self.log('\t\t', desc) - if self.test and len(articles) >= self.test[1]: - break return articles def wsj_find_wn_articles(self, feeds, root, CSSSelect): articles = [] - for a in CSSSelect('.style--strap--3DsLojSy'): + for a in CSSSelect('.style--strap--ND8Cuaip'): if 'WHAT\'S NEWS' in self.tag_to_string(a).upper(): whats_news = a.getparent() break @@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe): break feeds = [] - for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'): + for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'): + frontpage = a.get('href').endswith('frontpage') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') if not title: continue url = self.abs_wsj_url(a.get('href')) self.log('Found section:', title, 'at', url) self.wsj_add_feed(feeds, title, url) + if frontpage: + self.wsj_find_wn_articles(feeds, root, CSSSelect) if self.test and len(feeds) >= self.test[0]: break return feeds