Mirror of https://github.com/kovidgoyal/calibre.git
Fix parsing of NYTimes Todays paper
JSON format of legacy collection changed.
parent 40d9236470
commit 357555689f
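
For orientation, here is a minimal, self-contained sketch of the shape of `initialState` that the rewritten loop appears to expect, based only on the field names visible in the diff (`groupings`, `containers`, `relations`, `label@stripHtml`, `asset.__ref`, `active`). The sample data and keys such as `Article:a1` are invented for illustration; the real NYTimes payload is far larger.

# Hypothetical miniature of the NYTimes 'initialState' blob; field names are
# taken from the diff, the concrete values are invented for illustration.
initial_state = {
    'Article:a1': {
        'id': 'a1',
        'url': 'https://www.nytimes.com/2022/01/01/world/example.html',
        'summary': 'An example summary.',
        'headline': {'default': 'Example headline'},
    },
    'LegacyCollection:todays-paper': {
        'active': True,
        'groupings': [{
            '__typename': 'LegacyCollectionGrouping',
            'containers': [{
                '__typename': 'LegacyCollectionContainer',
                'label@stripHtml': 'World',
                'relations': [{
                    '__typename': 'LegacyCollectionRelation',
                    'asset': {'__ref': 'Article:a1'},
                }],
            }],
        }],
    },
}

# Walk the structure the same way the updated recipe code does: collect
# article metadata keyed by id, then per-container lists of article ids.
article_map, sections = {}, []
for key, val in initial_state.items():
    if key.startswith('Article:'):
        article_map[val['id']] = {
            'title': val['headline']['default'],
            'url': val['url'],
            'description': val.get('summary') or '',
        }
    elif key.startswith('LegacyCollection:') and val.get('active'):
        for grouping in val['groupings']:
            if grouping.get('__typename') != 'LegacyCollectionGrouping':
                continue
            for cont in grouping['containers']:
                if cont.get('__typename') != 'LegacyCollectionContainer':
                    continue
                ids = [rel['asset']['__ref'].partition(':')[2]
                       for rel in cont['relations']
                       if rel.get('__typename') == 'LegacyCollectionRelation'
                       and rel['asset']['__ref'].startswith('Article:')]
                if ids:
                    sections.append((cont['label@stripHtml'], ids))

# Join the two maps into per-section feeds, as the recipe does afterwards.
feeds = [(title, [article_map[aid] for aid in ids if aid in article_map])
         for title, ids in sections]
print(feeds)  # -> [('World', [{'title': 'Example headline', ...}])]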
@@ -76,7 +76,7 @@ def new_tag(soup, name, attrs=()):
 class NewYorkTimes(BasicNewsRecipe):
     title = 'The New York Times (Web)'
     description = (
         'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
         'Use advanced menu to make changes to fetch Todays Paper'
     )
     encoding = 'utf-8'
@@ -192,8 +192,7 @@ class NewYorkTimes(BasicNewsRecipe):
         data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
         containers, sections = {}, {}
         article_map = {}
-        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
-        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        sections = []
         for key in data:
             if 'Article' in key:
                 adata = data[key]
@@ -201,36 +200,39 @@ class NewYorkTimes(BasicNewsRecipe):
                 url = adata.get('url')
                 summary = adata.get('summary')
                 headline = adata.get('headline')
-                if url and headline and 'id' in headline:
-                    title = data[headline['id']]['default']
+                if url and headline:
+                    title = headline['default']
                     article_map[adata['id']] = {
                         'title': title, 'url': url, 'description': summary or ''}
-            elif 'Legacy' in key:
-                sdata = data[key]
-                tname = sdata.get('__typename')
-                if tname == 'LegacyCollectionContainer':
-                    m = gc_pat.search(key)
-                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
-                elif tname == 'LegacyCollectionRelation':
-                    m = pat.search(key)
-                    grouping, container, relation = map(int, m.groups())
-                    asset = sdata['asset']
-                    if asset and asset['typename'] == 'Article' and grouping == 0:
-                        if container not in sections:
-                            sections[container] = []
-                        sections[container].append(asset['id'].split(':', 1)[1])
+            elif 'LegacyCollection:' in key:
+                lc = data[key]
+                if not lc.get('active'):
+                    continue
+                for sdata in lc['groupings']:
+                    tname = sdata.get('__typename')
+                    if tname != 'LegacyCollectionGrouping':
+                        continue
+                    for cont in sdata['containers']:
+                        if cont.get('__typename') == 'LegacyCollectionContainer':
+                            section_name = cont['label@stripHtml']
+                            articles = []
+                            for rel in cont['relations']:
+                                if rel.get('__typename') == 'LegacyCollectionRelation':
+                                    asset = rel['asset']['__ref']
+                                    if asset.startswith('Article:'):
+                                        articles.append(asset.partition(':')[2])
+                            if articles:
+                                sections.append((section_name, articles))

         feeds = []
-        for container_num in sorted(containers):
-            section_title = containers[container_num]
-            if container_num in sections:
-                articles = sections[container_num]
-                if articles:
-                    feeds.append((section_title, []))
-                    for artid in articles:
-                        if artid in article_map:
-                            art = article_map[artid]
-                            feeds[-1][1].append(art)
+        for section_title, article_ids in sections:
+            articles = []
+            for aid in article_ids:
+                if aid in article_map:
+                    art = article_map[aid]
+                    articles.append(art)
+            if articles:
+                feeds.append((section_title, articles))

         def skey(x):
             name = x[0].strip()
The commit applies the same fix to a second New York Times recipe file; its diff follows.

@@ -192,8 +192,7 @@ class NewYorkTimes(BasicNewsRecipe):
         data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
         containers, sections = {}, {}
         article_map = {}
-        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
-        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        sections = []
         for key in data:
             if 'Article' in key:
                 adata = data[key]
@@ -202,35 +201,38 @@ class NewYorkTimes(BasicNewsRecipe):
                 summary = adata.get('summary')
                 headline = adata.get('headline')
                 if url and headline:
-                    title = data[headline['id']]['default']
+                    title = headline['default']
                     article_map[adata['id']] = {
                         'title': title, 'url': url, 'description': summary or ''}
-            elif 'Legacy' in key:
-                sdata = data[key]
-                tname = sdata.get('__typename')
-                if tname == 'LegacyCollectionContainer':
-                    m = gc_pat.search(key)
-                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
-                elif tname == 'LegacyCollectionRelation':
-                    m = pat.search(key)
-                    grouping, container, relation = map(int, m.groups())
-                    asset = sdata['asset']
-                    if asset and asset['typename'] == 'Article' and grouping == 0:
-                        if container not in sections:
-                            sections[container] = []
-                        sections[container].append(asset['id'].split(':', 1)[1])
+            elif 'LegacyCollection:' in key:
+                lc = data[key]
+                if not lc.get('active'):
+                    continue
+                for sdata in lc['groupings']:
+                    tname = sdata.get('__typename')
+                    if tname != 'LegacyCollectionGrouping':
+                        continue
+                    for cont in sdata['containers']:
+                        if cont.get('__typename') == 'LegacyCollectionContainer':
+                            section_name = cont['label@stripHtml']
+                            articles = []
+                            for rel in cont['relations']:
+                                if rel.get('__typename') == 'LegacyCollectionRelation':
+                                    asset = rel['asset']['__ref']
+                                    if asset.startswith('Article:'):
+                                        articles.append(asset.partition(':')[2])
+                            if articles:
+                                sections.append((section_name, articles))

         feeds = []
-        for container_num in sorted(containers):
-            section_title = containers[container_num]
-            if container_num in sections:
-                articles = sections[container_num]
-                if articles:
-                    feeds.append((section_title, []))
-                    for artid in articles:
-                        if artid in article_map:
-                            art = article_map[artid]
-                            feeds[-1][1].append(art)
+        for section_title, article_ids in sections:
+            articles = []
+            for aid in article_ids:
+                if aid in article_map:
+                    art = article_map[aid]
+                    articles.append(art)
+            if articles:
+                feeds.append((section_title, articles))

         def skey(x):
             name = x[0].strip()
@@ -368,5 +370,6 @@ class NewYorkTimes(BasicNewsRecipe):

     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
-        if not re.search(r'/video/|/athletic/', url):
+        if not re.search(r'/video/|/athletic/|/card/', url):
             return url
+        self.log('\tSkipping ', url)