Use JSON data for nytimes web sections as well
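The web-edition section indexes are now built from the window.__preloadedData JSON that NYT embeds in each section page, instead of scraping <h3> markup, so web_sections only needs the slugs and the section title comes from the JSON itself. The blob is effectively a normalized GraphQL cache: entries under ROOT_QUERY whose keys start with 'workOrLocation' reference (via '__ref') a collection node carrying the section name and a stream of edges pointing at 'Article:*' assets. A minimal sketch of that traversal, using made-up data and simplified keys (the real cache keys carry serialized query arguments, which is why the recipe matches them with startswith()):

    # Illustrative only: toy stand-in for window.__preloadedData['initialState'];
    # the key and collection names here are hypothetical.
    data = {
        'ROOT_QUERY': {
            'workOrLocation': {'__ref': 'Collection:business'},
        },
        'Collection:business': {
            'name': 'Business',
            'collectionsPage': {'stream': {'edges': [{'node': {'__ref': 'Article:1'}}]}},
        },
        'Article:1': {
            '__typename': 'Article',
            'headline': {'default': 'Example headline'},
            'url': 'https://www.nytimes.com/2025/08/15/business/example.html',
            'summary': 'Example summary.',
        },
    }

    articles = []
    for key, val in data['ROOT_QUERY'].items():
        if key.startswith('workOrLocation'):
            section = data[val['__ref']]      # follow the normalized-cache reference
            section_title = section['name']   # e.g. 'Business'
            for edge in section['collectionsPage']['stream']['edges']:
                ref = edge['node']['__ref']
                if ref.startswith('Article:'):
                    asset = data[ref]
                    articles.append({
                        'title': asset['headline']['default'],
                        'url': asset['url'],
                        'description': asset['summary'],
                    })
    print(section_title, articles)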

Kovid Goyal 2025-08-15 12:43:49 +05:30
parent 99f8c3cfec
commit b43f02fc82
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 152 additions and 136 deletions

View File

@@ -23,32 +23,31 @@ persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fe
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    ('Arts', 'arts'),
-    # ('Books', 'books'),
-    ('Movies', 'movies'),
-    ('Music', 'arts/music'),
-    ('Television', 'arts/television'),
-    ('Style', 'style'),
-    ('Dining & Wine', 'food'),
-    ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    ('Travel', 'travel'),
-    ('Education', 'education'),
-    ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    ('Sunday Magazine', 'magazine')
+    'world',
+    'us',
+    'politics',
+    'nyregion',
+    'business',
+    'technology',
+    'sports',
+    'science',
+    'health',
+    'opinion',
+    'arts',
+    'books',
+    'movies',
+    'arts/music',
+    'arts/television',
+    'style',
+    'food',
+    'fashion',
+    'travel',
+    'education',
+    'multimedia',
+    'obituaries',
+    'magazine',
 ]
-# web_sections = [ ('Business', 'business'), ]
+# web_sections = [ 'business' ]
 url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
@@ -200,14 +199,14 @@ class NewYorkTimes(BasicNewsRecipe):
     def parse_web_sections(self):
         feeds = []
-        for section_title, slug in web_sections:
+        for slug in web_sections:
             url = 'https://www.nytimes.com/section/' + slug
             self.log('Download section index:', url)
             soup = self.index_to_soup(url)
             # with open('/t/raw.html', 'w') as f:
             #     f.write(str(soup))
+            section_title, articles = parse_web_section(soup)
             self.log('Section:', section_title)
-            articles = parse_web_section(soup)
             if articles:
                 feeds.append((section_title, articles))
                 for a in articles:
@@ -222,16 +221,16 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
+        date, feeds = self.parse_todays_page()
+        pdate = date.strftime('%Y/%m/%d')
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
         if self.is_web_edition:
             return self.parse_web_sections()
-        date, feeds = self.parse_todays_page()
         for s, articles in feeds:
             self.log('Section:', s)
             for a in articles:
                 self.log('\t', a['title'], a['url'])
-        pdate = date.strftime('%Y/%m/%d')
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
         return feeds

     def get_browser(self, *args, **kwargs):
@@ -259,39 +258,7 @@ class NewYorkTimes(BasicNewsRecipe):
         self.log('\tSkipping ', url)


-def parse_web_section(soup):
-    seen = set()
-    ans = []
-
-    def handle_h3(h3):
-        if h3.parent.name == 'a':
-            href = h3.parent['href']
-            parent = h3.parent.parent
-        else:
-            href = h3.find('a')['href']
-            parent = h3.parent
-        if href.startswith('/video/') or href in seen:
-            return
-        seen.add(href)
-        title = h3.get_text(separator=' ', strip=True)
-        desc = ''
-        for p in parent.find_all('p'):
-            desc += p.get_text(separator=' ', strip=True)
-        ans.append({'title': title, 'url': absolutize_href(href), 'description': desc})
-    tuple(map(handle_h3, soup.find(id='collection-highlights-container').find_all('h3')))
-    tuple(map(handle_h3, soup.find(attrs={'data-testid': 'main-collection'}).find_all('h3')))
-    return ans
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(soup):
-    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
-    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+def preloaded_data(soup):
     from calibre.web.site_parsers.nytimes import clean_js_json
     candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     script = candidates[0]
@@ -300,14 +267,53 @@ def parse_todays_page(soup):
     raw = clean_js_json(raw)
     # with open('/t/raw.json', 'w') as f:
     #     f.write(raw)
-    data = json.loads(raw)['initialState']
+    return json.loads(raw)['initialState']
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_web_section(soup):
+    data = preloaded_data(soup)
     article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    articles = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+    if not articles:
+        for c in c['collectionsPage']['embeddedCollections']:
+            for e in c['stream']['edges']:
+                for k, v in e.items():
+                    if k.startswith('node'):
+                        articles.append(article_map[v['__ref']])
+    return section_title, articles
+
+
+def parse_todays_page(soup):
+    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
+    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+    article_map = {}
+    data = preloaded_data(soup)
     for k, v in data.items():
         if v['__typename'] == 'Article':
             article_map[k] = asset_to_article(v)
     feeds = []
-    for v in data['ROOT_QUERY'].values():
-        if isinstance(v, dict):
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
             for g in data[v['__ref']]['groupings']:
                 for c in g['containers']:
                     articles = []
@@ -326,7 +332,9 @@ if __name__ == '__main__':
         html = f.read()
     soup = BeautifulSoup(html)
     if is_web_edition:
-        pprint(parse_web_section(soup))
+        section_title, articles = parse_web_section(soup)
+        print(section_title)
+        pprint(articles)
     else:
         pdate, feeds = parse_todays_page(soup)
         print(pdate)

View File

@@ -23,32 +23,31 @@ persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fe
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    ('Arts', 'arts'),
-    # ('Books', 'books'),
-    ('Movies', 'movies'),
-    ('Music', 'arts/music'),
-    ('Television', 'arts/television'),
-    ('Style', 'style'),
-    ('Dining & Wine', 'food'),
-    ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    ('Travel', 'travel'),
-    ('Education', 'education'),
-    ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    ('Sunday Magazine', 'magazine')
+    'world',
+    'us',
+    'politics',
+    'nyregion',
+    'business',
+    'technology',
+    'sports',
+    'science',
+    'health',
+    'opinion',
+    'arts',
+    'books',
+    'movies',
+    'arts/music',
+    'arts/television',
+    'style',
+    'food',
+    'fashion',
+    'travel',
+    'education',
+    'multimedia',
+    'obituaries',
+    'magazine',
 ]
-# web_sections = [ ('Business', 'business'), ]
+# web_sections = [ 'business' ]
 url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
@@ -200,14 +199,14 @@ class NewYorkTimes(BasicNewsRecipe):
     def parse_web_sections(self):
         feeds = []
-        for section_title, slug in web_sections:
+        for slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
             self.log('Download section index:', url)
             soup = self.index_to_soup(url)
             # with open('/t/raw.html', 'w') as f:
             #     f.write(str(soup))
+            section_title, articles = parse_web_section(soup)
             self.log('Section:', section_title)
-            articles = parse_web_section(soup)
             if articles:
                 feeds.append((section_title, articles))
                 for a in articles:
@@ -222,16 +221,16 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
+        date, feeds = self.parse_todays_page()
+        pdate = date.strftime('%Y/%m/%d')
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
         if self.is_web_edition:
             return self.parse_web_sections()
-        date, feeds = self.parse_todays_page()
         for s, articles in feeds:
             self.log('Section:', s)
             for a in articles:
                 self.log('\t', a['title'], a['url'])
-        pdate = date.strftime('%Y/%m/%d')
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
         return feeds

     def get_browser(self, *args, **kwargs):
@@ -259,39 +258,7 @@ class NewYorkTimes(BasicNewsRecipe):
         self.log('\tSkipping ', url)


-def parse_web_section(soup):
-    seen = set()
-    ans = []
-
-    def handle_h3(h3):
-        if h3.parent.name == 'a':
-            href = h3.parent['href']
-            parent = h3.parent.parent
-        else:
-            href = h3.find('a')['href']
-            parent = h3.parent
-        if href.startswith('/video/') or href in seen:
-            return
-        seen.add(href)
-        title = h3.get_text(separator=' ', strip=True)
-        desc = ''
-        for p in parent.find_all('p'):
-            desc += p.get_text(separator=' ', strip=True)
-        ans.append({'title': title, 'url': absolutize_href(href), 'description': desc})
-    tuple(map(handle_h3, soup.find(id='collection-highlights-container').find_all('h3')))
-    tuple(map(handle_h3, soup.find(attrs={'data-testid': 'main-collection'}).find_all('h3')))
-    return ans
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(soup):
-    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
-    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+def preloaded_data(soup):
     from calibre.web.site_parsers.nytimes import clean_js_json
     candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     script = candidates[0]
@@ -300,14 +267,53 @@ def parse_todays_page(soup):
     raw = clean_js_json(raw)
     # with open('/t/raw.json', 'w') as f:
     #     f.write(raw)
-    data = json.loads(raw)['initialState']
+    return json.loads(raw)['initialState']
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_web_section(soup):
+    data = preloaded_data(soup)
     article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    articles = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+            section_title = c['name']
+            for k, v in c['collectionsPage'].items():
+                if k.startswith('stream'):
+                    for k, v in v.items():
+                        if k.startswith('edges'):
+                            for q in v:
+                                r = q['node']['__ref']
+                                if r.startswith('Article:'):
+                                    articles.append(article_map[r])
+    if not articles:
+        for c in c['collectionsPage']['embeddedCollections']:
+            for e in c['stream']['edges']:
+                for k, v in e.items():
+                    if k.startswith('node'):
+                        articles.append(article_map[v['__ref']])
+    return section_title, articles
+
+
+def parse_todays_page(soup):
+    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
+    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+    article_map = {}
+    data = preloaded_data(soup)
     for k, v in data.items():
         if v['__typename'] == 'Article':
             article_map[k] = asset_to_article(v)
     feeds = []
-    for v in data['ROOT_QUERY'].values():
-        if isinstance(v, dict):
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
             for g in data[v['__ref']]['groupings']:
                 for c in g['containers']:
                     articles = []
@@ -326,7 +332,9 @@ if __name__ == '__main__':
         html = f.read()
     soup = BeautifulSoup(html)
    if is_web_edition:
-        pprint(parse_web_section(soup))
+        section_title, articles = parse_web_section(soup)
+        print(section_title)
+        pprint(articles)
     else:
         pdate, feeds = parse_todays_page(soup)
         print(pdate)