Update New York Times Book Review

They rationalised their JSON schema finally.
This commit is contained in:
Kovid Goyal 2022-09-11 18:55:32 +05:30
parent 3d82d3f70d
commit b8b4351741
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -12,17 +12,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
# {{{ parse NYT JSON # {{{ parse NYT JSON
def key_startswith(key, obj):
    """Return the value of the first entry in ``obj`` whose key starts with ``key``.

    Entries are examined in the mapping's iteration order. Returns None when
    no key has the requested prefix.
    """
    for q, val in obj.items():
        if q.startswith(key):
            return val
    return None
def is_heading(tn):
    """Return True if ``tn`` is the typename of a heading block.

    All six heading levels are accepted so that this check agrees with the
    ``Heading([1-6])Block`` pattern used when rendering paragraphs.
    """
    return tn in (
        'Heading1Block', 'Heading2Block', 'Heading3Block',
        'Heading4Block', 'Heading5Block', 'Heading6Block',
    )
def process_inline_text(lines, block, data): def process_inline_text(lines, block):
text = '' text = ''
if 'text@stripHtml' in block: if 'text@stripHtml' in block:
text = escape(block['text@stripHtml']) text = escape(block['text@stripHtml'])
@@ -32,16 +26,16 @@ def process_inline_text(lines, block, data):
text = block['text'] text = block['text']
if text: if text:
for fmt in block.get('formats', ()): for fmt in block.get('formats', ()):
tn = fmt['typename'] tn = fmt['__typename']
if tn == 'LinkFormat': if tn == 'LinkFormat':
ab = data[fmt['id']] ab = fmt
text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text) text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
elif tn == 'BoldFormat': elif tn == 'BoldFormat':
text = '<b>' + text + '</b>' text = '<b>' + text + '</b>'
lines.append(text) lines.append(text)
def process_paragraph(lines, block, data, content_key='content'): def process_paragraph(lines, block, content_key='content'):
tn = block['__typename'] tn = block['__typename']
m = re.match(r'Heading([1-6])Block', tn) m = re.match(r'Heading([1-6])Block', tn)
if m is not None: if m is not None:
@@ -52,48 +46,48 @@ def process_paragraph(lines, block, data, content_key='content'):
style = 'text-align: {}'.format(ta.lower()) style = 'text-align: {}'.format(ta.lower())
lines.append('<{} style="{}">'.format(tag, style)) lines.append('<{} style="{}">'.format(tag, style))
for item in block[content_key]: for item in block[content_key]:
tn = item['typename'] tn = item['__typename']
if tn in ('TextInline', 'Byline'): if tn in ('TextInline', 'Byline'):
process_inline_text(lines, data[item['id']], data) process_inline_text(lines, item)
lines.append('</' + tag + '>') lines.append('</' + tag + '>')
def process_timestamp(lines, block):
    """Append a ``<p class="timestamp">`` element for ``block`` to ``lines``.

    ``block['timestamp']`` is parsed with ``parse_iso8601`` (without
    converting to UTC) and rendered using the ``'%b %d, %Y'`` format.
    """
    ts = block['timestamp']
    dt = parse_iso8601(ts, as_utc=False)
    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
def process_header(lines, block):
    """Render the parts of an article header block into ``lines``.

    The optional parts are emitted in a fixed order: label, headline,
    summary, lede image, byline and timestamp. A part that is missing or
    empty is skipped. The lede media is rendered only when its
    ``__typename`` is ``'ImageBlock'``.
    """
    label = block.get('label')
    if label:
        process_paragraph(lines, label)
    headline = block.get('headline')
    if headline:
        process_paragraph(lines, headline)
    summary = block.get('summary')
    if summary:
        process_paragraph(lines, summary)
    lm = block.get('ledeMedia')
    if lm and lm.get('__typename') == 'ImageBlock':
        process_image_block(lines, lm)
    byline = block.get('byline')
    if byline:
        # Byline blocks keep their inline items under 'bylines', not 'content'
        process_paragraph(lines, byline, content_key='bylines')
    timestamp = block.get('timestampBlock')
    if timestamp:
        process_timestamp(lines, timestamp)
def process_image_block(lines, block):
    """Append a centered ``<div>`` holding the block's image and caption to ``lines``.

    The image URL is taken from the first rendition of the first crop of
    ``block['media']``. If the media has a caption, its inline text is
    placed after the ``<img>`` tag and before the closing ``</div>``.
    """
    media = block['media']
    caption = media.get('caption')
    caption_lines = []
    if caption:
        process_inline_text(caption_lines, caption)
    crops = media['crops']
    renditions = crops[0]['renditions']
    img = renditions[0]['url']
    # quoteattr supplies the surrounding quotes for the src attribute value
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(img)))
    lines.extend(caption_lines)
    lines.append('</div>')
@@ -101,18 +95,19 @@ def process_image_block(lines, block, data):
def json_to_html(raw):
    """Convert the JSON article data in ``raw`` into a minimal HTML document.

    Occurrences of ``:undefined`` in ``raw`` are replaced with ``:null``
    before parsing so that the text is valid JSON. The first value under
    ``initialData.data`` is taken to be the article, and the blocks in its
    ``sprinkledBody.content`` list are rendered in order. Blocks whose
    ``__typename`` is not handled below are skipped.

    Returns the HTML as a string, with the rendered fragments joined by
    newlines inside ``<html><body>``.
    """
    data = json.loads(raw.replace(':undefined', ':null'))
    data = data['initialData']['data']
    article = next(iter(data.values()))
    body = article['sprinkledBody']['content']
    lines = []
    for item in body:
        tn = item['__typename']
        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
            process_header(lines, item)
        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
            process_paragraph(lines, item)
        elif tn == 'ImageBlock':
            process_image_block(lines, item)
    return '<html><body>' + '\n'.join(lines) + '</body></html>'
@@ -152,6 +147,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
return html return html
def parse_index(self): def parse_index(self):
return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
soup = self.index_to_soup( soup = self.index_to_soup(
'https://www.nytimes.com/pages/books/review/index.html') 'https://www.nytimes.com/pages/books/review/index.html')