Update New York Times Book Review

2025-08-30 23:00:21 -04:00 · 2025-08-19 07:23:06 +05:30 · 2025-08-19 07:23:06 +05:30 · 206dda25a6
commit 206dda25a6
parent 5c5d168369
1 changed files with 65 additions and 84 deletions
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@ -2,18 +2,14 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>

+
 import json
+from pprint import pprint

-import mechanize
-
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
-from polyglot.urllib import urlencode

 use_wayback_machine = False
-# This is an Apollo persisted query hash which you can get
-# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
-# or by https://www.nytimes.com/section/world
-persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'


 def absolutize(url):
@ -94,51 +90,16 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
            if c.lower() == 'yes':
                self.compress_news_images = True

-    def read_nyt_metadata(self):
-        soup = self.index_to_soup('https://www.nytimes.com/pages/books/review/index.html')
-        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-        script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
-        return soup
-
-    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
-        query = {
-            'operationName': operationName,
-            'variables': json.dumps({
-                'id': qid,
-                'first': 10,
-                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
-                'isFetchMore':False,
-                'isTranslatable':False,
-                'isEspanol':False,
-                'highlightsListUri':'nyt://per/personalized-list/__null__',
-                'highlightsListFirst':0,
-                'hasHighlightsList':False
-            }, separators=',:'),
-            'extensions': json.dumps({
-                'persistedQuery': {
-                    'version':1,
-                    'sha256Hash': persistedQuery,
-                },
-            }, separators=',:')
-        }
-        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
-        br = self.browser
-        # br.set_debug_http(True)
-        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
-        headers['Accept'] = 'application/json'
-        req = mechanize.Request(url, headers=headers)
-        raw = br.open(req).read()
-        # open('/t/raw.json', 'wb').write(raw)
-        return json.loads(raw)
-
    def parse_index(self):
        # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
-        self.read_nyt_metadata()
-        query_id = '/section/books/review'
-        data = self.nyt_graphql_query(query_id)
-        return parse_toc(data, self.log)
+        soup = self.index_to_soup('https://www.nytimes.com/pages/books/review/index.html')
+        # with open('/t/raw.html', 'w') as f: f.write(str(soup))
+        feeds = parse_toc(soup)
+        for section_title, articles in feeds:
+            self.log(section_title)
+            for a in articles:
+                self.log('\t' + a['title'], a['url'])
+        return feeds

    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = 'User-Agent: Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
@ -164,43 +125,63 @@ def asset_to_article(asset):
    return {'title': title, 'url': asset['url'], 'description': asset['summary']}


-def parse_toc(data, log=print):
-    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+def preloaded_data(soup):
+    from calibre.web.site_parsers.nytimes import clean_js_json
+    candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
+    script = candidates[0]
+    script = str(script)
+    raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')  # }
+    raw = clean_js_json(raw)
+    return json.loads(raw)['initialState']
+
+
+def parse_toc(soup):
+    data = preloaded_data(soup)
+    # with open('/t/raw.json', 'w') as f: pprint(data, stream=f)
+    article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    feeds = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            for g in data[v['__ref']]['groupings']:
+                for c in g['containers']:
+                    articles = []
+                    for r in c['relations']:
+                        ref = r['asset']['__ref']
+                        if ref in article_map:
+                            articles.append(article_map[ref])
+                    if articles:
+                        feeds.append(('Highlights', articles))
+
    articles = []
-    log('Book reviews')
-    for cont in containers:
-        if cont['__typename'] != 'LegacyCollectionContainer':
-            continue
-        for rel in cont['relations']:
-            if rel.get('__typename') == 'LegacyCollectionRelation':
-                asset = rel['asset']
-                if asset['__typename'] == 'Article':
-                    articles.append(asset_to_article(asset))
-                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-    feeds = [('Book reviews', articles)]
-    articles = []
-    log('Books of the Times')
-    try:
-        containers = data['data']['legacyCollection']['collectionsPage']
-        if containers.get('embeddedCollections'):
-            containers = containers['embeddedCollections']
-        else:
-            containers = [containers]
-    except Exception as e:
-        log('Failed to parse web section', 'Books of the Times', 'with error:', e)
-        return articles
-    for cont in containers:
-        for s in cont['stream']['edges']:
-            asset = s['node']
-            if asset['__typename'] == 'Article':
-                articles.append(asset_to_article(asset))
-                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-    if articles:
-        feeds.append(('Book of the Times', articles))
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+            if not articles:
+                for c in c['collectionsPage']['embeddedCollections']:
+                    for e in c['stream']['edges']:
+                        for k, v in e.items():
+                            if k.startswith('node'):
+                                articles.append(article_map[v['__ref']])
+    feeds.append((section_title, articles))
    return feeds


 if __name__ == '__main__':
    import sys
-    data = json.loads(open(sys.argv[-1], 'rb').read())
-    parse_toc(data)
+    with open(sys.argv[-1]) as f:
+        html = f.read()
+    soup = BeautifulSoup(html)
+    feeds = parse_toc(soup)
+    pprint(feeds)