diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe
index 17dd224dc5..3b6a8f70a3 100644
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -2,18 +2,14 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal
+
 import json
+from pprint import pprint
 
-import mechanize
-
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
-from polyglot.urllib import urlencode
 
 use_wayback_machine = False
 
-# This is an Apollo persisted query hash which you can get
-# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
-# or by https://www.nytimes.com/section/world
-persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
 
 
 def absolutize(url):
@@ -94,51 +90,16 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
             if c.lower() == 'yes':
                 self.compress_news_images = True
 
-    def read_nyt_metadata(self):
-        soup = self.index_to_soup('https://www.nytimes.com/pages/books/review/index.html')
-        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-        script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
-        return soup
-
-    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
-        query = {
-            'operationName': operationName,
-            'variables': json.dumps({
-                'id': qid,
-                'first': 10,
-                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
-                'isFetchMore':False,
-                'isTranslatable':False,
-                'isEspanol':False,
-                'highlightsListUri':'nyt://per/personalized-list/__null__',
-                'highlightsListFirst':0,
-                'hasHighlightsList':False
-            }, separators=',:'),
-            'extensions': json.dumps({
-                'persistedQuery': {
-                    'version':1,
-                    'sha256Hash': persistedQuery,
-                },
-            }, separators=',:')
-        }
-        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
-        br = self.browser
-        # br.set_debug_http(True)
-        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
-        headers['Accept'] = 'application/json'
-        req = mechanize.Request(url, headers=headers)
-        raw = br.open(req).read()
-        # open('/t/raw.json', 'wb').write(raw)
-        return json.loads(raw)
-
     def parse_index(self):
         # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
-        self.read_nyt_metadata()
-        query_id = '/section/books/review'
-        data = self.nyt_graphql_query(query_id)
-        return parse_toc(data, self.log)
+        soup = self.index_to_soup('https://www.nytimes.com/pages/books/review/index.html')
+        # with open('/t/raw.html', 'w') as f: f.write(str(soup))
+        feeds = parse_toc(soup)
+        for section_title, articles in feeds:
+            self.log(section_title)
+            for a in articles:
+                self.log('\t' + a['title'], a['url'])
+        return feeds
 
     def get_browser(self, *args, **kwargs):
         kwargs['user_agent'] = 'User-Agent: Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
@@ -164,43 +125,63 @@ def asset_to_article(asset):
     return {'title': title, 'url': asset['url'], 'description': asset['summary']}
 
 
-def parse_toc(data, log=print):
-    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+def preloaded_data(soup):
+    from calibre.web.site_parsers.nytimes import clean_js_json
+    candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
+    script = candidates[0]
+    script = str(script)
+    raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')  # }
+    raw = clean_js_json(raw)
+    return json.loads(raw)['initialState']
+
+
+def parse_toc(soup):
+    data = preloaded_data(soup)
+    # with open('/t/raw.json', 'w') as f: pprint(data, stream=f)
+    article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    feeds = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            for g in data[v['__ref']]['groupings']:
+                for c in g['containers']:
+                    articles = []
+                    for r in c['relations']:
+                        ref = r['asset']['__ref']
+                        if ref in article_map:
+                            articles.append(article_map[ref])
+                    if articles:
+                        feeds.append(('Highlights', articles))
+
     articles = []
-    log('Book reviews')
-    for cont in containers:
-        if cont['__typename'] != 'LegacyCollectionContainer':
-            continue
-        for rel in cont['relations']:
-            if rel.get('__typename') == 'LegacyCollectionRelation':
-                asset = rel['asset']
-                if asset['__typename'] == 'Article':
-                    articles.append(asset_to_article(asset))
-                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-    feeds = [('Book reviews', articles)]
-    articles = []
-    log('Books of the Times')
-    try:
-        containers = data['data']['legacyCollection']['collectionsPage']
-        if containers.get('embeddedCollections'):
-            containers = containers['embeddedCollections']
-        else:
-            containers = [containers]
-    except Exception as e:
-        log('Failed to parse web section', 'Books of the Times', 'with error:', e)
-        return articles
-    for cont in containers:
-        for s in cont['stream']['edges']:
-            asset = s['node']
-            if asset['__typename'] == 'Article':
-                articles.append(asset_to_article(asset))
-                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
-    if articles:
-        feeds.append(('Book of the Times', articles))
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+            if not articles:
+                for c in c['collectionsPage']['embeddedCollections']:
+                    for e in c['stream']['edges']:
+                        for k, v in e.items():
+                            if k.startswith('node'):
+                                articles.append(article_map[v['__ref']])
+            feeds.append((section_title, articles))
     return feeds
 
 
 if __name__ == '__main__':
     import sys
-    data = json.loads(open(sys.argv[-1], 'rb').read())
-    parse_toc(data)
+    with open(sys.argv[-1]) as f:
+        html = f.read()
+    soup = BeautifulSoup(html)
+    feeds = parse_toc(soup)
+    pprint(feeds)
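
Reviewer note: the new parse_toc() no longer receives a GraphQL response; it walks the Apollo-style
normalized cache found under window.__preloadedData (the 'initialState' dict) in the section page's
HTML, and a saved copy of https://www.nytimes.com/pages/books/review/index.html can be fed to the
__main__ block above for offline testing. The sketch below is a minimal, self-contained illustration
of that traversal against a hand-built stand-in for 'initialState'. Every id, title and URL in it is
invented for illustration; only the structure (entity records keyed by 'Article:...' refs, ROOT_QUERY
entries whose keys start with 'workOrLocation', groupings -> containers -> relations, and
collectionsPage -> stream... -> edges... -> node) follows what the patched code actually reads.

# Hand-built stand-in for the 'initialState' blob that preloaded_data() would return.
# All ids, titles and URLs are invented; only the shape mirrors what parse_toc() reads.
fake_state = {
    'ROOT_QUERY': {
        '__typename': 'Query',
        'workOrLocation({"id":"/section/books/review"})': {'__ref': 'LegacyCollection:books-review'},
    },
    'LegacyCollection:books-review': {
        '__typename': 'LegacyCollection',
        'name': 'Book Review',
        'groupings': [
            {'containers': [
                {'relations': [{'asset': {'__ref': 'Article:example-1'}}]},
            ]},
        ],
        'collectionsPage': {
            'stream({"first":10})': {
                'edges': [{'node': {'__ref': 'Article:example-2'}}],
            },
        },
    },
    'Article:example-1': {'__typename': 'Article', 'title': 'Invented review A', 'url': 'https://example.com/a'},
    'Article:example-2': {'__typename': 'Article', 'title': 'Invented review B', 'url': 'https://example.com/b'},
}

# First pass: index every Article record by its ref, like article_map in parse_toc().
article_map = {k: v for k, v in fake_state.items()
               if isinstance(v, dict) and v.get('__typename') == 'Article'}

feeds = []

# "Highlights": articles referenced via groupings -> containers -> relations -> asset refs.
for k, v in fake_state['ROOT_QUERY'].items():
    if k.startswith('workOrLocation'):
        coll = fake_state[v['__ref']]
        for grouping in coll.get('groupings', ()):
            for container in grouping['containers']:
                articles = [article_map[r['asset']['__ref']]
                            for r in container['relations']
                            if r['asset']['__ref'] in article_map]
                if articles:
                    feeds.append(('Highlights', articles))

# Section feed: articles referenced via collectionsPage -> stream... -> edges... -> node refs.
for k, v in fake_state['ROOT_QUERY'].items():
    if k.startswith('workOrLocation'):
        coll = fake_state[v['__ref']]
        articles = []
        for key, stream in coll['collectionsPage'].items():
            if key.startswith('stream'):
                for ekey, edges in stream.items():
                    if ekey.startswith('edges'):
                        for edge in edges:
                            ref = edge['node']['__ref']
                            if ref.startswith('Article:'):
                                articles.append(article_map[ref])
        feeds.append((coll['name'], articles))

print(feeds)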