Update The Wall Street Journal

This commit is contained in:
Kovid Goyal 2021-01-20 21:20:49 +05:30
parent b0555e1cfe
commit 310f92b868
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 26 deletions

View File

@@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe):
' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
dict(name='span', attrs={
'data-country-code': True, 'data-ticker-code': True}),
dict(name='meta link'.split()),
dict(name='button'),
dict(name='meta link button'.split()),
]
def preprocess_soup(self, soup):
@@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe):
def wsj_find_articles(self, url, ahed=False):
root = self.index_to_soup(url, as_tree=True)
CSSSelect = Select(root)
# from lxml import etree
# from calibre.utils.ipython import ipython
# open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
# ipython({'root': root, 'CSSSelect': CSSSelect})
# raise SystemExit(1)
articles = []
for container in CSSSelect('article[class^="WSJTheme--story--"]'):
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
heading = next(CSSSelect('h2, h3', container))
a = next(CSSSelect('a', heading))
title = self.tag_to_string(a)
url = self.abs_wsj_url(a.get('href'))
desc = ''
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
q = self.tag_to_string(p)
if 'Subscriber Content' in q:
continue
@@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe):
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
if self.test and len(articles) >= self.test[1]:
break
self.log('\tFound article:', title)
self.log('\t\t', desc)
if self.test and len(articles) >= self.test[1]:
break
return articles
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
articles = []
for a in CSSSelect('.style--strap--3DsLojSy'):
for a in CSSSelect('.style--strap--ND8Cuaip'):
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
whats_news = a.getparent()
break
@@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe):
break
feeds = []
for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
frontpage = a.get('href').endswith('frontpage')
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title:
continue
url = self.abs_wsj_url(a.get('href'))
self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url)
if frontpage:
self.wsj_find_wn_articles(feeds, root, CSSSelect)
if self.test and len(feeds) >= self.test[0]:
break
return feeds

View File

@@ -66,8 +66,7 @@ class WSJ(BasicNewsRecipe):
' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
dict(name='span', attrs={
'data-country-code': True, 'data-ticker-code': True}),
dict(name='meta link'.split()),
dict(name='button'),
dict(name='meta link button'.split()),
]
def preprocess_soup(self, soup):
@@ -169,19 +168,14 @@ class WSJ(BasicNewsRecipe):
def wsj_find_articles(self, url, ahed=False):
root = self.index_to_soup(url, as_tree=True)
CSSSelect = Select(root)
# from lxml import etree
# from calibre.utils.ipython import ipython
# open('/t/section.html', 'w').write(etree.tostring(root, encoding='unicode'))
# ipython({'root': root, 'CSSSelect': CSSSelect})
# raise SystemExit(1)
articles = []
for container in CSSSelect('article[class^="WSJTheme--story--"]'):
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
heading = next(CSSSelect('h2, h3', container))
a = next(CSSSelect('a', heading))
title = self.tag_to_string(a)
url = self.abs_wsj_url(a.get('href'))
desc = ''
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--summary--")]'):
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
q = self.tag_to_string(p)
if 'Subscriber Content' in q:
continue
@@ -190,17 +184,17 @@ class WSJ(BasicNewsRecipe):
articles.append({'title': title, 'url': url,
'description': desc, 'date': ''})
if self.test and len(articles) >= self.test[1]:
break
self.log('\tFound article:', title)
self.log('\t\t', desc)
if self.test and len(articles) >= self.test[1]:
break
return articles
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
articles = []
for a in CSSSelect('.style--strap--3DsLojSy'):
for a in CSSSelect('.style--strap--ND8Cuaip'):
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
whats_news = a.getparent()
break
@@ -246,13 +240,16 @@ class WSJ(BasicNewsRecipe):
break
feeds = []
for a in CSSSelect('ul[aria-label="Primary Navigation"] a[class^="style--section-link--"]'):
for a in CSSSelect('.WSJTheme--nav-container--2qF6xQnZ .WSJTheme--section-link--3VznjgTM'):
frontpage = a.get('href').endswith('frontpage')
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title:
continue
url = self.abs_wsj_url(a.get('href'))
self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url)
if frontpage:
self.wsj_find_wn_articles(feeds, root, CSSSelect)
if self.test and len(feeds) >= self.test[0]:
break
return feeds