Update NYTimes Web Edition for website changes

Kovid Goyal 2025-04-10 15:55:46 +05:30
parent 1bf1265b1b
commit 6cdd57289b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 153 additions and 242 deletions

View File

@@ -8,13 +8,20 @@ import datetime
 import json
 import re
 
+import mechanize
 from calibre import strftime
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from polyglot.urllib import urlencode
 
 use_wayback_machine = False
 
+# This is an Apollo persisted query hash which you can get
+# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+# or by https://www.nytimes.com/section/world
+persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
+
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
@@ -76,7 +83,7 @@ def new_tag(soup, name, attrs=()):
 class NewYorkTimes(BasicNewsRecipe):
     title = 'The New York Times (Web)'
     description = (
         'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
+        'Use advanced menu to make changes to fetch Todays Paper'
     )
     encoding = 'utf-8'
@@ -169,169 +176,83 @@ class NewYorkTimes(BasicNewsRecipe):
         self.compress_news_images = True
 
     def read_todays_paper(self):
-        INDEX = 'https://www.nytimes.com/section/todayspaper'
-        # INDEX = 'file:///t/raw.html'
-        d = self.recipe_specific_options.get('date')
-        if d and isinstance(d, str):
-            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
-        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
+        pdate = self.recipe_specific_options.get('date')
+        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
+        if pdate and isinstance(pdate, str):
+            return pdate, self.index_to_soup(templ.format(pdate))
+        # Cant figure out how to get the date so just try todays and yesterdays dates
+        date = datetime.date.today()
+        pdate = date.strftime('%Y/%m/%d')
+        try:
+            soup = self.index_to_soup(templ.format(pdate))
+        except Exception as e:
+            if getattr(e, 'code', None) == 404:
+                date -= datetime.timedelta(days=1)
+                pdate = date.strftime('%Y/%m/%d')
+                soup = self.index_to_soup(templ.format(pdate))
+            else:
+                raise
+        self.log("Using today's paper from:", pdate)
+        return pdate, soup
 
     def read_nyt_metadata(self):
-        soup = self.read_todays_paper()
-        pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
-        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
+        pdate, soup = self.read_todays_paper()
+        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
         self.timefmt = strftime(' [%d %b, %Y]', date)
-        return soup
-
-    def parse_todays_page(self):
-        soup = self.read_nyt_metadata()
+        self.nytimes_publication_date = pdate
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
-        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
-        article_map = {}
-        sections = []
-        for key in data:
-            if 'Article' in key:
-                adata = data[key]
-                if adata.get('__typename') == 'Article':
-                    url = adata.get('url')
-                    summary = adata.get('summary')
-                    headline = adata.get('headline')
-                    if url and headline:
-                        title = headline['default']
-                        article_map[adata['id']] = {
-                            'title': title, 'url': url, 'description': summary or ''}
-            elif 'LegacyCollection:' in key:
-                lc = data[key]
-                if not lc.get('active'):
-                    continue
-                for sdata in lc['groupings']:
-                    tname = sdata.get('__typename')
-                    if tname != 'LegacyCollectionGrouping':
-                        continue
-                    for cont in sdata['containers']:
-                        if cont.get('__typename') == 'LegacyCollectionContainer':
-                            section_name = cont['label@stripHtml']
-                            articles = []
-                            for rel in cont['relations']:
-                                if rel.get('__typename') == 'LegacyCollectionRelation':
-                                    asset = rel['asset']['__ref']
-                                    if asset.startswith('Article:'):
-                                        articles.append(asset.partition(':')[2])
-                            if articles:
-                                sections.append((section_name, articles))
-
-        feeds = []
-        for section_title, article_ids in sections:
-            articles = []
-            for aid in article_ids:
-                if aid in article_map:
-                    art = article_map[aid]
-                    articles.append(art)
-            if articles:
-                feeds.append((section_title, articles))
-
-        def skey(x):
-            name = x[0].strip()
-            if name == 'The Front Page':
-                return 0, ''
-            return 1, name.lower()
-        feeds.sort(key=skey)
-        for section, articles in feeds:
-            self.log('\n' + section)
-            for article in articles:
-                self.log(article['title'] + ' - ' + article['url'])
-        # raise SystemExit(1)
-        return feeds
-
-    def parse_article_group(self, container):
-        for li in container.findAll('li'):
-            article = li.find('article')
-            if article is None:
-                a = li.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(li.find(['h3', 'h2'])).strip()
-                    paras = li.findAll('p')
-                    if not title:
-                        title = self.tag_to_string(paras[0]).strip()
-                    if not title:
-                        raise ValueError('No title found in article')
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    if len(paras) > 0:
-                        desc = self.tag_to_string(paras[-1])
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-                continue
-            h2 = article.find(['h2', 'h3'])
-            if h2 is not None:
-                title = self.tag_to_string(h2)
-                a = h2.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    p = h2.findNextSibling('p')
-                    if p is not None:
-                        desc = self.tag_to_string(p)
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
-    def parse_web_section(self, soup, slug):
-
-        def log(article):
-            self.log('\t', article['title'] + article['date'], ':', article['url'])
-            if article.get('description'):
-                self.log('\t\t', article['description'])
-
-        cid = slug.split('/')[-1]
-        if cid == 'dining':
-            cid = 'food'
-        try:
-            container = soup.find(id='collection-{}'.format(cid)).find('section')
-        except AttributeError:
-            container = None
-        if container is None:
-            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
-        for ol in container.findAll('ol'):
-            for article in self.parse_article_group(ol):
-                log(article)
-                yield article
+        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
+        return soup
+
+    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
+        query = {
+            'operationName': operationName,
+            'variables': json.dumps({
+                'id': qid,
+                'first': 10,
+                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
+                'isFetchMore':False,
+                'isTranslatable':False,
+                'isEspanol':False,
+                'highlightsListUri':'nyt://per/personalized-list/__null__',
+                'highlightsListFirst':0,
+                'hasHighlightsList':False
+            }, separators=',:'),
+            'extensions': json.dumps({
+                'persistedQuery': {
+                    'version':1,
+                    'sha256Hash': persistedQuery,
+                },
+            }, separators=',:')
+        }
+        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
+        br = self.browser
+        # br.set_debug_http(True)
+        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
+        headers['Accept'] = 'application/json'
+        req = mechanize.Request(url, headers=headers)
+        raw = br.open(req).read()
+        # open('/t/raw.json', 'wb').write(raw)
+        return json.loads(raw)
+
+    def parse_todays_page(self):
+        self.read_nyt_metadata()
+        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
+        data = self.nyt_graphql_query(query_id)
+        return parse_todays_page(data, self.log)
 
     def parse_web_sections(self):
         self.read_nyt_metadata()
         feeds = []
         for section_title, slug in web_sections:
-            url = 'https://www.nytimes.com/section/' + slug
-            try:
-                soup = self.index_to_soup(self.get_nyt_page(url))
-            except Exception:
-                self.log.error('Failed to download section:', url)
-                continue
-            self.log('Found section:', section_title)
-            articles = list(self.parse_web_section(soup, slug))
+            query_id = '/section/' + slug
+            data = self.nyt_graphql_query(query_id)
+            articles = parse_web_section(data)
             if articles:
+                self.log('Found section:', section_title)
                 feeds.append((section_title, articles))
             if self.test and len(feeds) >= self.test[0]:
                 break
@@ -372,3 +293,47 @@ class NewYorkTimes(BasicNewsRecipe):
         if not re.search(r'/video/|/athletic/|/card/', url):
             return url
         self.log('\tSkipping ', url)
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_todays_page(data, log=print):
+    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+    feeds = []
+    for cont in containers:
+        if cont['__typename'] != 'LegacyCollectionContainer':
+            continue
+        section_name = cont['label'].strip()
+        if not section_name:
+            continue
+        log(section_name)
+        articles = []
+        for rel in cont['relations']:
+            if rel.get('__typename') == 'LegacyCollectionRelation':
+                asset = rel['asset']
+                if asset['__typename'] == 'Article':
+                    articles.append(asset_to_article(asset))
+                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+        if articles:
+            feeds.append((section_name, articles))
+    return feeds
+
+
+def parse_web_section(data, log=print):
+    articles = []
+    containers = data['data']['legacyCollection']['collectionsPage']['embeddedCollections']
+    for cont in containers:
+        for s in cont['stream']['edges']:
+            asset = s['node']
+            if asset['__typename'] == 'Article':
+                articles.append(asset_to_article(asset))
+                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+    return articles
+
+
+if __name__ == '__main__':
+    import sys
+    parse_web_section(json.loads(open(sys.argv[-1], 'rb').read()))
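The new code path above drops HTML scraping of section pages in favour of NYT's Apollo persisted-query GraphQL endpoint. Below is a minimal standalone sketch (not part of this commit) of the same request using only the standard library instead of mechanize: the endpoint URL and request headers are deliberately left as parameters, because the recipe discovers them at runtime from config['gqlUrlClient'] and config['gqlRequestHeaders'] inside window.__preloadedData, and only a subset of the query variables the recipe sends is shown here.

    # Standalone sketch: issue the persisted GraphQL query outside calibre.
    # gql_url and headers are assumed inputs taken from the page's embedded
    # config object; they are not hard-coded facts about the NYT API.
    import json
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    PERSISTED_QUERY = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'

    def fetch_collection(gql_url, headers, qid, operation_name='CollectionsQuery'):
        # Mirror nyt_graphql_query(): variables and extensions are JSON-encoded
        # strings passed as GET parameters alongside the operation name.
        query = {
            'operationName': operation_name,
            'variables': json.dumps(
                {'id': qid, 'first': 10, 'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED'},
                separators=',:'),
            'extensions': json.dumps(
                {'persistedQuery': {'version': 1, 'sha256Hash': PERSISTED_QUERY}},
                separators=',:'),
        }
        req = Request(gql_url + '?' + urlencode(query),
                      headers=dict(headers, Accept='application/json'))
        return json.loads(urlopen(req).read())

    # Example: fetch_collection(config['gqlUrlClient'],
    #                           config['gqlRequestHeaders'], '/section/world')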

View File

@@ -18,6 +18,10 @@ from polyglot.urllib import urlencode
 
 use_wayback_machine = False
 
+# This is an Apollo persisted query hash which you can get
+# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+# or by https://www.nytimes.com/section/world
+persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
 
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
@@ -203,12 +207,11 @@ class NewYorkTimes(BasicNewsRecipe):
         self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
         return soup
 
-    def parse_todays_page(self):
-        self.read_nyt_metadata()
+    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
         query = {
-            'operationName': 'CollectionsQuery',
+            'operationName': operationName,
             'variables': json.dumps({
-                'id': '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date),
+                'id': qid,
                 'first': 10,
                 'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
                 'isFetchMore':False,
@@ -221,9 +224,7 @@ class NewYorkTimes(BasicNewsRecipe):
             'extensions': json.dumps({
                 'persistedQuery': {
                     'version':1,
-                    # This is an Apollo persisted query hash which you can get
-                    # from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
-                    'sha256Hash': '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7',
+                    'sha256Hash': persistedQuery,
                 },
             }, separators=',:')
         }
@@ -235,94 +236,23 @@ class NewYorkTimes(BasicNewsRecipe):
         req = mechanize.Request(url, headers=headers)
         raw = br.open(req).read()
         # open('/t/raw.json', 'wb').write(raw)
-        return parse_todays_page(json.loads(raw), self.log)
-
-    def parse_article_group(self, container):
-        for li in container.findAll('li'):
-            article = li.find('article')
-            if article is None:
-                a = li.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(li.find(['h3', 'h2'])).strip()
-                    paras = li.findAll('p')
-                    if not title:
-                        title = self.tag_to_string(paras[0]).strip()
-                    if not title:
-                        raise ValueError('No title found in article')
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    if len(paras) > 0:
-                        desc = self.tag_to_string(paras[-1])
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-                continue
-            h2 = article.find(['h2', 'h3'])
-            if h2 is not None:
-                title = self.tag_to_string(h2)
-                a = h2.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    p = h2.findNextSibling('p')
-                    if p is not None:
-                        desc = self.tag_to_string(p)
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
-    def parse_web_section(self, soup, slug):
-
-        def log(article):
-            self.log('\t', article['title'] + article['date'], ':', article['url'])
-            if article.get('description'):
-                self.log('\t\t', article['description'])
-
-        cid = slug.split('/')[-1]
-        if cid == 'dining':
-            cid = 'food'
-        try:
-            container = soup.find(id='collection-{}'.format(cid)).find('section')
-        except AttributeError:
-            container = None
-        if container is None:
-            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
-        for ol in container.findAll('ol'):
-            for article in self.parse_article_group(ol):
-                log(article)
-                yield article
+        return json.loads(raw)
+
+    def parse_todays_page(self):
+        self.read_nyt_metadata()
+        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
+        data = self.nyt_graphql_query(query_id)
+        return parse_todays_page(data, self.log)
 
     def parse_web_sections(self):
         self.read_nyt_metadata()
         feeds = []
         for section_title, slug in web_sections:
-            url = 'https://www.nytimes.com/section/' + slug
-            try:
-                soup = self.index_to_soup(self.get_nyt_page(url))
-            except Exception:
-                self.log.error('Failed to download section:', url)
-                continue
-            self.log('Found section:', section_title)
-            articles = list(self.parse_web_section(soup, slug))
+            query_id = '/section/' + slug
+            data = self.nyt_graphql_query(query_id)
+            articles = parse_web_section(data)
             if articles:
+                self.log('Found section:', section_title)
                 feeds.append((section_title, articles))
             if self.test and len(feeds) >= self.test[0]:
                 break
@@ -365,6 +295,11 @@ class NewYorkTimes(BasicNewsRecipe):
         self.log('\tSkipping ', url)
 
 
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
 def parse_todays_page(data, log=print):
     containers = data['data']['legacyCollection']['groupings'][0]['containers']
     feeds = []
@@ -380,14 +315,25 @@ def parse_todays_page(data, log=print):
             if rel.get('__typename') == 'LegacyCollectionRelation':
                 asset = rel['asset']
                 if asset['__typename'] == 'Article':
-                    title = asset['headline']['default']
-                    articles.append({'title': title, 'url': asset['url'], 'description': asset['summary']})
-                    log(' ', title + ':', asset['url'])
+                    articles.append(asset_to_article(asset))
+                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
         if articles:
             feeds.append((section_name, articles))
     return feeds
 
 
+def parse_web_section(data, log=print):
+    articles = []
+    containers = data['data']['legacyCollection']['collectionsPage']['embeddedCollections']
+    for cont in containers:
+        for s in cont['stream']['edges']:
+            asset = s['node']
+            if asset['__typename'] == 'Article':
+                articles.append(asset_to_article(asset))
+                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+    return articles
+
+
 if __name__ == '__main__':
     import sys
     parse_todays_page(json.loads(open(sys.argv[-1], 'rb').read()))
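The new module-level parsers walk a fixed path through the GraphQL response. The shape they expect can be inferred from the code above; the fixture below is a minimal, hand-written illustration of that shape (not a real API response, which carries many more fields) and can be fed to parse_todays_page() when testing offline.

    # Hypothetical minimal fixture, inferred from parse_todays_page() above.
    fixture = {
        'data': {
            'legacyCollection': {
                'groupings': [{
                    'containers': [{
                        '__typename': 'LegacyCollectionContainer',
                        'label': 'The Front Page',
                        'relations': [{
                            '__typename': 'LegacyCollectionRelation',
                            'asset': {
                                '__typename': 'Article',
                                'headline': {'default': 'Example headline'},
                                'url': 'https://www.nytimes.com/2025/04/10/example.html',
                                'summary': 'Example summary',
                            },
                        }],
                    }],
                }],
            },
        },
    }

    # parse_todays_page(fixture) would return:
    # [('The Front Page', [{'title': 'Example headline',
    #                       'url': 'https://www.nytimes.com/2025/04/10/example.html',
    #                       'description': 'Example summary'}])]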