Update New York Times Book Review

2025-07-09 03:04:10 -04:00 · 2019-01-16 19:29:06 +05:30 · 2019-01-16 19:29:06 +05:30 · ecb302fa04
commit ecb302fa04
parent 4344028bfc
1 changed files with 24 additions and 32 deletions
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -6,6 +6,12 @@ def classes(classes):
    return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
 def absolutize(url):
    if url.startswith('/'):
        url = 'https://www.nytimes.com' + url
    return url
 class NewYorkTimesBookReview(BasicNewsRecipe):
    title = u'New York Times Book Review'
    language = 'en'
@@ -18,42 +24,27 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    encoding = 'utf-8'
    keep_only_tags = [
-        dict(id='story'),
+            dict(name='h1'),
-    ]
+            dict(attrs={'data-testid':'photoviewer-wrapper'}),
-    remove_tags = [
+            dict(itemprop=['author creator', 'articleBody']),
        dict(attrs={'aria-label':'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden':'true'}),
        dict(attrs={'data-videoid':True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad nav-wrapper'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
    ]
    def parse_index(self):
        soup = self.index_to_soup(
-            'http://www.nytimes.com/pages/books/review/index.html')
+            'https://www.nytimes.com/pages/books/review/index.html')
        # Find TOC
-        toc = soup.find('div', attrs={'class': 'rank'})
+        toc = soup.find('section', id='collection-book-review').find('section').find('ol')
        main_articles, articles = [], []
        feeds = [('Features', main_articles), ('Latest', articles)]
-        for h2 in toc.findAll('h2', attrs={'class': 'headline'}):
+        for li in toc.findAll('li'):
            h2 = li.find('h2')
            a = h2.find('a', href=True)
            if a is not None:
                title = self.tag_to_string(a)
-                url = a['href']
+                url = absolutize(a['href'])
                desc = ''
-                p = h2.findNextSibling('p', attrs={'class': 'summary'})
+                p = h2.findNextSibling('p')
                if p:
                    desc = self.tag_to_string(p)
                main_articles.append(
@@ -61,14 +52,15 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
                self.log('Found:', title, 'at', url)
                if desc:
                    self.log('\t', desc)
-        for li in soup.find(id='latest-panel').find('ol').findAll('li'):
+        for li in soup.find(id='stream-panel').find('ol').findAll('li'):
-            a = li.find('a', attrs={'class': 'story-link'}, href=True)
+            h2 = li.find('h2')
-            if a is None:
+            a = h2.findParent('a')
-                continue
+            url = absolutize(a['href'])
-            url = a['href']
+            p = h2.findNextSibling('p')
-            m = a.find(attrs={'class': 'story-meta'})
+            title = self.tag_to_string(h2)
-            title = self.tag_to_string(m.find('h2'))
+            desc = ''
-            desc = self.tag_to_string(m.find(attrs={'class': 'summary'}))
+            if p:
                desc = self.tag_to_string(p)
            articles.append({'title': title, 'url': url, 'description': desc})
            self.log('Found:', title, 'at', url)
            if desc: