Update NYTimes todays paper recipe for website changes

Kovid Goyal 2025-04-10 15:22:08 +05:30
parent ef7e3df6c2
commit 1501da22db
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -8,10 +8,13 @@ import datetime
 import json
 import re
 
+import mechanize
+
 from calibre import strftime
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from polyglot.urllib import urlencode
 
 use_wayback_machine = False
@@ -169,82 +172,70 @@ class NewYorkTimes(BasicNewsRecipe):
             self.compress_news_images = True
 
     def read_todays_paper(self):
-        INDEX = 'https://www.nytimes.com/section/todayspaper'
-        # INDEX = 'file:///t/raw.html'
-        d = self.recipe_specific_options.get('date')
-        if d and isinstance(d, str):
-            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
-        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
+        pdate = self.recipe_specific_options.get('date')
+        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
+        if pdate and isinstance(pdate, str):
+            return pdate, self.index_to_soup(templ.format(pdate))
+        # Can't figure out how to get the date so just try today's and yesterday's dates
+        date = datetime.date.today()
+        pdate = date.strftime('%Y/%m/%d')
+        try:
+            soup = self.index_to_soup(templ.format(pdate))
+        except Exception as e:
+            if getattr(e, 'code', None) == 404:
+                date -= datetime.timedelta(days=1)
+                pdate = date.strftime('%Y/%m/%d')
+                soup = self.index_to_soup(templ.format(pdate))
+            else:
+                raise
+        self.log("Using today's paper from:", pdate)
+        return pdate, soup
 
     def read_nyt_metadata(self):
-        soup = self.read_todays_paper()
-        pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
-        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
+        pdate, soup = self.read_todays_paper()
+        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
         self.timefmt = strftime(' [%d %b, %Y]', date)
+        self.nytimes_publication_date = pdate
+        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+        script = type(u'')(script)
+        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
         return soup
 
     def parse_todays_page(self):
-        soup = self.read_nyt_metadata()
-        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-        script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
-        article_map = {}
-        sections = []
-        for key in data:
-            if 'Article' in key:
-                adata = data[key]
-                if adata.get('__typename') == 'Article':
-                    url = adata.get('url')
-                    summary = adata.get('summary')
-                    headline = adata.get('headline')
-                    if url and headline:
-                        title = headline['default']
-                        article_map[adata['id']] = {
-                            'title': title, 'url': url, 'description': summary or ''}
-            elif 'LegacyCollection:' in key:
-                lc = data[key]
-                if not lc.get('active'):
-                    continue
-                for sdata in lc['groupings']:
-                    tname = sdata.get('__typename')
-                    if tname != 'LegacyCollectionGrouping':
-                        continue
-                    for cont in sdata['containers']:
-                        if cont.get('__typename') == 'LegacyCollectionContainer':
-                            section_name = cont['label@stripHtml']
-                            articles = []
-                            for rel in cont['relations']:
-                                if rel.get('__typename') == 'LegacyCollectionRelation':
-                                    asset = rel['asset']['__ref']
-                                    if asset.startswith('Article:'):
-                                        articles.append(asset.partition(':')[2])
-                            if articles:
-                                sections.append((section_name, articles))
-        feeds = []
-        for section_title, article_ids in sections:
-            articles = []
-            for aid in article_ids:
-                if aid in article_map:
-                    art = article_map[aid]
-                    articles.append(art)
-            if articles:
-                feeds.append((section_title, articles))
-        def skey(x):
-            name = x[0].strip()
-            if name == 'The Front Page':
-                return 0, ''
-            return 1, name.lower()
-        feeds.sort(key=skey)
-        for section, articles in feeds:
-            self.log('\n' + section)
-            for article in articles:
-                self.log(article['title'] + ' - ' + article['url'])
-        # raise SystemExit(1)
-        return feeds
+        self.read_nyt_metadata()
+        query = {
+            'operationName': 'CollectionsQuery',
+            'variables': json.dumps({
+                'id': '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date),
+                'first': 10,
+                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
+                'isFetchMore': False,
+                'isTranslatable': False,
+                'isEspanol': False,
+                'highlightsListUri': 'nyt://per/personalized-list/__null__',
+                'highlightsListFirst': 0,
+                'hasHighlightsList': False,
+            }, separators=',:'),
+            'extensions': json.dumps({
+                'persistedQuery': {
+                    'version': 1,
+                    # This is an Apollo persisted query hash which you can get
+                    # from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+                    'sha256Hash': '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7',
+                },
+            }, separators=',:'),
+        }
+        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
+        br = self.browser
+        # br.set_debug_http(True)
+        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
+        headers['Accept'] = 'application/json'
+        req = mechanize.Request(url, headers=headers)
+        raw = br.open(req).read()
+        # open('/t/raw.json', 'wb').write(raw)
+        return parse_todays_page(json.loads(raw), self.log)
 
     def parse_article_group(self, container):
         for li in container.findAll('li'):
@@ -372,3 +363,31 @@ class NewYorkTimes(BasicNewsRecipe):
             if not re.search(r'/video/|/athletic/|/card/', url):
                 return url
             self.log('\tSkipping ', url)
+
+
+def parse_todays_page(data, log=print):
+    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+    feeds = []
+    for cont in containers:
+        if cont['__typename'] != 'LegacyCollectionContainer':
+            continue
+        section_name = cont['label'].strip()
+        if not section_name:
+            continue
+        log(section_name)
+        articles = []
+        for rel in cont['relations']:
+            if rel.get('__typename') == 'LegacyCollectionRelation':
+                asset = rel['asset']
+                if asset['__typename'] == 'Article':
+                    title = asset['headline']['default']
+                    articles.append({'title': title, 'url': asset['url'], 'description': asset['summary']})
+                    log(' ', title + ':', asset['url'])
+        if articles:
+            feeds.append((section_name, articles))
+    return feeds
+
+
+if __name__ == '__main__':
+    import sys
+    parse_todays_page(json.loads(open(sys.argv[-1], 'rb').read()))
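
For reference, a minimal sketch of the response shape the new module-level parse_todays_page() walks. The payload below is invented and heavily trimmed: only the keys the function actually reads (data.legacyCollection.groupings[0].containers, __typename, label, relations, asset, headline.default, url, summary) are present, and it assumes parse_todays_page() from this commit is in scope.

# Hypothetical, trimmed-down CollectionsQuery response; all values invented.
sample = {
    'data': {
        'legacyCollection': {
            'groupings': [{
                'containers': [{
                    '__typename': 'LegacyCollectionContainer',
                    'label': 'The Front Page',
                    'relations': [{
                        '__typename': 'LegacyCollectionRelation',
                        'asset': {
                            '__typename': 'Article',
                            'headline': {'default': 'An example headline'},
                            'url': 'https://www.nytimes.com/2025/04/10/example.html',
                            'summary': 'An example summary.',
                        },
                    }],
                }],
            }],
        }
    }
}

# Yields [('The Front Page', [{'title': ..., 'url': ..., 'description': ...}])]
for section, articles in parse_todays_page(sample):
    print(section, '->', [a['title'] for a in articles])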
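The leftover debugging hooks suggest an offline test loop: uncomment the open('/t/raw.json', 'wb').write(raw) line once to capture a live GraphQL response, then iterate on the parser against the saved file through the new __main__ block, along these lines (the path to the recipe file is whatever you have it saved as):

python /path/to/this/recipe /t/raw.json

Since parse_todays_page() logs each section and article title as it parses, a changed response schema surfaces immediately as missing keys, without repeatedly hitting the live endpoint.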