From b43f02fc8270ef4ed33720cd83e237e7b8220a91 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 15 Aug 2025 12:43:49 +0530
Subject: [PATCH] Use JSON data for nytimes web sections as well

---
 recipes/nytimes.recipe     | 144 +++++++++++++++++++------------------
 recipes/nytimes_sub.recipe | 144 +++++++++++++++++++------------------
 2 files changed, 152 insertions(+), 136 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 5823345b77..c59b2b90bf 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -23,32 +23,31 @@ persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fe
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    ('Arts', 'arts'),
-    # ('Books', 'books'),
-    ('Movies', 'movies'),
-    ('Music', 'arts/music'),
-    ('Television', 'arts/television'),
-    ('Style', 'style'),
-    ('Dining & Wine', 'food'),
-    ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    ('Travel', 'travel'),
-    ('Education', 'education'),
-    ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    ('Sunday Magazine', 'magazine')
+    'world',
+    'us',
+    'politics',
+    'nyregion',
+    'business',
+    'technology',
+    'sports',
+    'science',
+    'health',
+    'opinion',
+    'arts',
+    'books',
+    'movies',
+    'arts/music',
+    'arts/television',
+    'style',
+    'food',
+    'fashion',
+    'travel',
+    'education',
+    'multimedia',
+    'obituaries',
+    'magazine',
 ]
-# web_sections = [ ('Business', 'business'), ]
+# web_sections = [ 'business' ]
 
 url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
 
@@ -200,14 +199,14 @@ class NewYorkTimes(BasicNewsRecipe):
 
     def parse_web_sections(self):
         feeds = []
-        for section_title, slug in web_sections:
+        for slug in web_sections:
             url = 'https://www.nytimes.com/section/' + slug
             self.log('Download section index:', url)
             soup = self.index_to_soup(url)
             # with open('/t/raw.html', 'w') as f:
             #     f.write(str(soup))
+            section_title, articles = parse_web_section(soup)
             self.log('Section:', section_title)
-            articles = parse_web_section(soup)
             if articles:
                 feeds.append((section_title, articles))
                 for a in articles:
@@ -222,16 +221,16 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
+        date, feeds = self.parse_todays_page()
+        pdate = date.strftime('%Y/%m/%d')
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
         if self.is_web_edition:
            return self.parse_web_sections()
-        date, feeds = self.parse_todays_page()
         for s, articles in feeds:
             self.log('Section:', s)
             for a in articles:
                 self.log('\t', a['title'], a['url'])
-        pdate = date.strftime('%Y/%m/%d')
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
         return feeds
 
     def get_browser(self, *args, **kwargs):
@@ -259,39 +258,7 @@ class NewYorkTimes(BasicNewsRecipe):
             self.log('\tSkipping ', url)
 
 
-def parse_web_section(soup):
-    seen = set()
-    ans = []
-
-    def handle_h3(h3):
-        if h3.parent.name == 'a':
-            href = h3.parent['href']
-            parent = h3.parent.parent
-        else:
-            href = h3.find('a')['href']
-            parent = h3.parent
-        if href.startswith('/video/') or href in seen:
-            return
-        seen.add(href)
-        title = h3.get_text(separator=' ', strip=True)
-        desc = ''
-        for p in parent.find_all('p'):
-            desc += p.get_text(separator=' ', strip=True)
-        ans.append({'title': title, 'url': absolutize_href(href), 'description': desc})
-
-    tuple(map(handle_h3, soup.find(id='collection-highlights-container').find_all('h3')))
-    tuple(map(handle_h3, soup.find(attrs={'data-testid': 'main-collection'}).find_all('h3')))
-    return ans
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(soup):
-    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
-    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+def preloaded_data(soup):
     from calibre.web.site_parsers.nytimes import clean_js_json
     candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     script = candidates[0]
@@ -300,14 +267,53 @@ def parse_todays_page(soup):
     raw = clean_js_json(raw)
     # with open('/t/raw.json', 'w') as f:
     #     f.write(raw)
-    data = json.loads(raw)['initialState']
+    return json.loads(raw)['initialState']
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_web_section(soup):
+    data = preloaded_data(soup)
     article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    articles = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+    if not articles:
+        for c in c['collectionsPage']['embeddedCollections']:
+            for e in c['stream']['edges']:
+                for k, v in e.items():
+                    if k.startswith('node'):
+                        articles.append(article_map[v['__ref']])
+    return section_title, articles
+
+
+def parse_todays_page(soup):
+    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
+    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+    article_map = {}
+    data = preloaded_data(soup)
     for k, v in data.items():
         if v['__typename'] == 'Article':
             article_map[k] = asset_to_article(v)
     feeds = []
-    for v in data['ROOT_QUERY'].values():
-        if isinstance(v, dict):
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
             for g in data[v['__ref']]['groupings']:
                 for c in g['containers']:
                     articles = []
@@ -326,7 +332,9 @@ if __name__ == '__main__':
         html = f.read()
     soup = BeautifulSoup(html)
     if is_web_edition:
-        pprint(parse_web_section(soup))
+        section_title, articles = parse_web_section(soup)
+        print(section_title)
+        pprint(articles)
     else:
         pdate, feeds = parse_todays_page(soup)
         print(pdate)
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 615ce332df..c31bf5c466 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -23,32 +23,31 @@ persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fe
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    ('Arts', 'arts'),
-    # ('Books', 'books'),
-    ('Movies', 'movies'),
-    ('Music', 'arts/music'),
-    ('Television', 'arts/television'),
-    ('Style', 'style'),
-    ('Dining & Wine', 'food'),
-    ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    ('Travel', 'travel'),
-    ('Education', 'education'),
-    ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    ('Sunday Magazine', 'magazine')
+    'world',
+    'us',
+    'politics',
+    'nyregion',
+    'business',
+    'technology',
+    'sports',
+    'science',
+    'health',
+    'opinion',
+    'arts',
+    'books',
+    'movies',
+    'arts/music',
+    'arts/television',
+    'style',
+    'food',
+    'fashion',
+    'travel',
+    'education',
+    'multimedia',
+    'obituaries',
+    'magazine',
 ]
-# web_sections = [ ('Business', 'business'), ]
+# web_sections = [ 'business' ]
 
 url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
 
@@ -200,14 +199,14 @@ class NewYorkTimes(BasicNewsRecipe):
 
     def parse_web_sections(self):
         feeds = []
-        for section_title, slug in web_sections:
+        for slug in web_sections:
             url = 'https://www.nytimes.com/section/' + slug
             self.log('Download section index:', url)
             soup = self.index_to_soup(url)
             # with open('/t/raw.html', 'w') as f:
             #     f.write(str(soup))
+            section_title, articles = parse_web_section(soup)
             self.log('Section:', section_title)
-            articles = parse_web_section(soup)
             if articles:
                 feeds.append((section_title, articles))
                 for a in articles:
@@ -222,16 +221,16 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
+        date, feeds = self.parse_todays_page()
+        pdate = date.strftime('%Y/%m/%d')
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
         if self.is_web_edition:
             return self.parse_web_sections()
-        date, feeds = self.parse_todays_page()
         for s, articles in feeds:
             self.log('Section:', s)
             for a in articles:
                 self.log('\t', a['title'], a['url'])
-        pdate = date.strftime('%Y/%m/%d')
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
         return feeds
 
     def get_browser(self, *args, **kwargs):
@@ -259,39 +258,7 @@ class NewYorkTimes(BasicNewsRecipe):
             self.log('\tSkipping ', url)
 
 
-def parse_web_section(soup):
-    seen = set()
-    ans = []
-
-    def handle_h3(h3):
-        if h3.parent.name == 'a':
-            href = h3.parent['href']
-            parent = h3.parent.parent
-        else:
-            href = h3.find('a')['href']
-            parent = h3.parent
-        if href.startswith('/video/') or href in seen:
-            return
-        seen.add(href)
-        title = h3.get_text(separator=' ', strip=True)
-        desc = ''
-        for p in parent.find_all('p'):
-            desc += p.get_text(separator=' ', strip=True)
-        ans.append({'title': title, 'url': absolutize_href(href), 'description': desc})
-
-    tuple(map(handle_h3, soup.find(id='collection-highlights-container').find_all('h3')))
-    tuple(map(handle_h3, soup.find(attrs={'data-testid': 'main-collection'}).find_all('h3')))
-    return ans
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(soup):
-    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
-    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+def preloaded_data(soup):
     from calibre.web.site_parsers.nytimes import clean_js_json
     candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     script = candidates[0]
@@ -300,14 +267,53 @@ def parse_todays_page(soup):
     raw = clean_js_json(raw)
     # with open('/t/raw.json', 'w') as f:
     #     f.write(raw)
-    data = json.loads(raw)['initialState']
+    return json.loads(raw)['initialState']
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_web_section(soup):
+    data = preloaded_data(soup)
     article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    articles = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+    if not articles:
+        for c in c['collectionsPage']['embeddedCollections']:
+            for e in c['stream']['edges']:
+                for k, v in e.items():
+                    if k.startswith('node'):
+                        articles.append(article_map[v['__ref']])
+    return section_title, articles
+
+
+def parse_todays_page(soup):
+    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
+    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+    article_map = {}
+    data = preloaded_data(soup)
     for k, v in data.items():
         if v['__typename'] == 'Article':
             article_map[k] = asset_to_article(v)
     feeds = []
-    for v in data['ROOT_QUERY'].values():
-        if isinstance(v, dict):
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
             for g in data[v['__ref']]['groupings']:
                 for c in g['containers']:
                     articles = []
@@ -326,7 +332,9 @@ if __name__ == '__main__':
         html = f.read()
     soup = BeautifulSoup(html)
     if is_web_edition:
-        pprint(parse_web_section(soup))
+        section_title, articles = parse_web_section(soup)
+        print(section_title)
+        pprint(articles)
     else:
         pdate, feeds = parse_todays_page(soup)
         print(pdate)
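
Note (not part of the patch): below is a minimal standalone sketch of the traversal the new preloaded_data()/parse_web_section() helpers perform on the window.__preloadedData blob of a section page (initialState -> ROOT_QUERY -> workOrLocation -> collectionsPage stream edges -> Article records). The helper names here and the crude ':undefined' clean-up are assumptions made so the sketch runs outside calibre; the recipes themselves sanitise the embedded JS with calibre.web.site_parsers.nytimes.clean_js_json before json.loads().

# Illustrative sketch only; approximates the patch's JSON traversal outside calibre.
import json
import re
import sys

from bs4 import BeautifulSoup


def initial_state(html):
    # Locate the script tag carrying window.__preloadedData, as the recipes do.
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=lambda x: x and 'window.__preloadedData' in x)
    raw = script.string.split('=', 1)[1].strip().rstrip(';')
    # Crude stand-in for clean_js_json: turn JS 'undefined' values into JSON null.
    raw = re.sub(r':\s*undefined', ': null', raw)
    return json.loads(raw)['initialState']


def section_articles(state):
    # Every asset of type Article is keyed directly in the normalised state.
    articles = {
        k: {'title': v['headline']['default'], 'url': v['url'], 'description': v['summary']}
        for k, v in state.items()
        if isinstance(v, dict) and v.get('__typename') == 'Article'
    }
    # The section collection hangs off the workOrLocation entry of ROOT_QUERY;
    # its stream edges reference the Article records collected above.
    results = []
    for key, ref in state['ROOT_QUERY'].items():
        if key.startswith('workOrLocation'):
            collection = state[ref['__ref']]
            print('Section:', collection['name'])
            for k, v in collection['collectionsPage'].items():
                if k.startswith('stream'):
                    for ek, edges in v.items():
                        if ek.startswith('edges'):
                            for edge in edges:
                                r = edge['node']['__ref']
                                if r.startswith('Article:'):
                                    results.append(articles[r])
    return results


if __name__ == '__main__':
    # Usage: python sketch.py /path/to/saved-section-page.html
    with open(sys.argv[-1]) as f:
        for art in section_articles(initial_state(f.read())):
            print(art['title'], art['url'])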