Fix parsing of NYTimes Today's paper

JSON format of legacy collection changed.
Kovid Goyal 2025-03-01 12:15:45 +05:30
parent 40d9236470
commit 357555689f
2 changed files with 62 additions and 57 deletions

File 1 of 2

@@ -192,8 +192,7 @@ class NewYorkTimes(BasicNewsRecipe):
         data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
         containers, sections = {}, {}
         article_map = {}
-        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
-        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        sections = []
         for key in data:
             if 'Article' in key:
                 adata = data[key]
@@ -201,36 +200,39 @@ class NewYorkTimes(BasicNewsRecipe):
                 url = adata.get('url')
                 summary = adata.get('summary')
                 headline = adata.get('headline')
-                if url and headline and 'id' in headline:
-                    title = data[headline['id']]['default']
+                if url and headline:
+                    title = headline['default']
                     article_map[adata['id']] = {
                         'title': title, 'url': url, 'description': summary or ''}
-            elif 'Legacy' in key:
-                sdata = data[key]
-                tname = sdata.get('__typename')
-                if tname == 'LegacyCollectionContainer':
-                    m = gc_pat.search(key)
-                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
-                elif tname == 'LegacyCollectionRelation':
-                    m = pat.search(key)
-                    grouping, container, relation = map(int, m.groups())
-                    asset = sdata['asset']
-                    if asset and asset['typename'] == 'Article' and grouping == 0:
-                        if container not in sections:
-                            sections[container] = []
-                        sections[container].append(asset['id'].split(':', 1)[1])
+            elif 'LegacyCollection:' in key:
+                lc = data[key]
+                if not lc.get('active'):
+                    continue
+                for sdata in lc['groupings']:
+                    tname = sdata.get('__typename')
+                    if tname != 'LegacyCollectionGrouping':
+                        continue
+                    for cont in sdata['containers']:
+                        if cont.get('__typename') == 'LegacyCollectionContainer':
+                            section_name = cont['label@stripHtml']
+                            articles = []
+                            for rel in cont['relations']:
+                                if rel.get('__typename') == 'LegacyCollectionRelation':
+                                    asset = rel['asset']['__ref']
+                                    if asset.startswith('Article:'):
+                                        articles.append(asset.partition(':')[2])
+                            if articles:
+                                sections.append((section_name, articles))

         feeds = []
-        for container_num in sorted(containers):
-            section_title = containers[container_num]
-            if container_num in sections:
-                articles = sections[container_num]
-                if articles:
-                    feeds.append((section_title, []))
-                    for artid in articles:
-                        if artid in article_map:
-                            art = article_map[artid]
-                            feeds[-1][1].append(art)
+        for section_title, article_ids in sections:
+            articles = []
+            for aid in article_ids:
+                if aid in article_map:
+                    art = article_map[aid]
+                    articles.append(art)
+            if articles:
+                feeds.append((section_title, articles))

         def skey(x):
             name = x[0].strip()
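The rewrite tracks a change in how the page embeds its data: instead of one flat cache entry per container/relation, addressed by `groupings.N.containers.N.relations.N` paths in the key (which the old regexes parsed), each `LegacyCollection:` entry now carries nested `groupings` -> `containers` -> `relations` lists, with articles referenced indirectly through `asset.__ref`. Below is a standalone, runnable sketch of the new traversal against a hand-made sample; the key and field names come from the diff above, while the ids, URL and labels are invented for illustration.

# Sketch only, not the recipe itself: walk a minimal sample of the
# new nested structure and collect (section, article ids) pairs.
sample_state = {
    'Article:100': {
        'id': '100',  # invented id
        'url': 'https://www.nytimes.com/2025/03/01/world/example.html',
        'summary': 'An example summary.',
        'headline': {'default': 'An Example Headline'},
    },
    'LegacyCollection:abc': {
        'active': True,
        'groupings': [{
            '__typename': 'LegacyCollectionGrouping',
            'containers': [{
                '__typename': 'LegacyCollectionContainer',
                'label@stripHtml': 'Front Page',
                'relations': [{
                    '__typename': 'LegacyCollectionRelation',
                    'asset': {'__ref': 'Article:100'},
                }],
            }],
        }],
    },
}


def extract_sections(data):
    article_map, sections = {}, []
    for key, val in data.items():
        if 'Article' in key:
            url, headline = val.get('url'), val.get('headline')
            if url and headline:
                article_map[val['id']] = {
                    'title': headline['default'], 'url': url,
                    'description': val.get('summary') or ''}
        elif 'LegacyCollection:' in key and val.get('active'):
            for grouping in val['groupings']:
                if grouping.get('__typename') != 'LegacyCollectionGrouping':
                    continue
                for cont in grouping['containers']:
                    if cont.get('__typename') != 'LegacyCollectionContainer':
                        continue
                    # relations point at articles indirectly, via a
                    # cache reference of the form 'Article:<id>'
                    ids = [
                        rel['asset']['__ref'].partition(':')[2]
                        for rel in cont['relations']
                        if rel.get('__typename') == 'LegacyCollectionRelation'
                        and rel['asset']['__ref'].startswith('Article:')
                    ]
                    if ids:
                        sections.append((cont['label@stripHtml'], ids))
    return article_map, sections


article_map, sections = extract_sections(sample_state)
for name, ids in sections:
    print(name, '->', [article_map[i]['title'] for i in ids if i in article_map])
# prints: Front Page -> ['An Example Headline']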

File 2 of 2

@@ -192,8 +192,7 @@ class NewYorkTimes(BasicNewsRecipe):
         data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
         containers, sections = {}, {}
         article_map = {}
-        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
-        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        sections = []
         for key in data:
             if 'Article' in key:
                 adata = data[key]
@@ -202,35 +201,38 @@ class NewYorkTimes(BasicNewsRecipe):
                 summary = adata.get('summary')
                 headline = adata.get('headline')
                 if url and headline:
-                    title = data[headline['id']]['default']
+                    title = headline['default']
                     article_map[adata['id']] = {
                         'title': title, 'url': url, 'description': summary or ''}
-            elif 'Legacy' in key:
-                sdata = data[key]
-                tname = sdata.get('__typename')
-                if tname == 'LegacyCollectionContainer':
-                    m = gc_pat.search(key)
-                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
-                elif tname == 'LegacyCollectionRelation':
-                    m = pat.search(key)
-                    grouping, container, relation = map(int, m.groups())
-                    asset = sdata['asset']
-                    if asset and asset['typename'] == 'Article' and grouping == 0:
-                        if container not in sections:
-                            sections[container] = []
-                        sections[container].append(asset['id'].split(':', 1)[1])
+            elif 'LegacyCollection:' in key:
+                lc = data[key]
+                if not lc.get('active'):
+                    continue
+                for sdata in lc['groupings']:
+                    tname = sdata.get('__typename')
+                    if tname != 'LegacyCollectionGrouping':
+                        continue
+                    for cont in sdata['containers']:
+                        if cont.get('__typename') == 'LegacyCollectionContainer':
+                            section_name = cont['label@stripHtml']
+                            articles = []
+                            for rel in cont['relations']:
+                                if rel.get('__typename') == 'LegacyCollectionRelation':
+                                    asset = rel['asset']['__ref']
+                                    if asset.startswith('Article:'):
+                                        articles.append(asset.partition(':')[2])
+                            if articles:
+                                sections.append((section_name, articles))

         feeds = []
-        for container_num in sorted(containers):
-            section_title = containers[container_num]
-            if container_num in sections:
-                articles = sections[container_num]
-                if articles:
-                    feeds.append((section_title, []))
-                    for artid in articles:
-                        if artid in article_map:
-                            art = article_map[artid]
-                            feeds[-1][1].append(art)
+        for section_title, article_ids in sections:
+            articles = []
+            for aid in article_ids:
+                if aid in article_map:
+                    art = article_map[aid]
+                    articles.append(art)
+            if articles:
+                feeds.append((section_title, articles))

         def skey(x):
             name = x[0].strip()
@@ -368,5 +370,6 @@ class NewYorkTimes(BasicNewsRecipe):
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
-        if not re.search(r'/video/|/athletic/', url):
+        if not re.search(r'/video/|/athletic/|/card/', url):
             return url
+        self.log('\tSkipping ', url)
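The second file also tightens get_article_url: "card" pages are now skipped along with video and Athletic links, and skipped URLs are logged. A minimal illustration of the filter's behavior, with invented sample URLs:

import re

# Mirrors the updated filter: video, Athletic and (newly) card pages
# are dropped; everything else is kept.
def should_keep(url):
    return not re.search(r'/video/|/athletic/|/card/', url)

for u in (
    'https://www.nytimes.com/2025/03/01/world/example.html',
    'https://www.nytimes.com/card/2025/03/01/briefing/example',
):
    print(u, '->', 'keep' if should_keep(u) else 'skip')
# the first URL is kept, the second skipped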