From b8b43517412b2f76a63d41e7515fe217412f7589 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Sep 2022 18:55:32 +0530 Subject: [PATCH] Update New York Times Book Review They rationalised their JSON schema finally. --- recipes/nytimesbook.recipe | 66 ++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index e9375f4fd8..13b3f481b1 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -12,17 +12,11 @@ from calibre.web.feeds.news import BasicNewsRecipe # {{{ parse NYT JSON -def key_startswith(key, obj): - for q, val in obj.items(): - if q.startswith(key): - return val - - def is_heading(tn): return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block') -def process_inline_text(lines, block, data): +def process_inline_text(lines, block): text = '' if 'text@stripHtml' in block: text = escape(block['text@stripHtml']) @@ -32,16 +26,16 @@ def process_inline_text(lines, block, data): text = block['text'] if text: for fmt in block.get('formats', ()): - tn = fmt['typename'] + tn = fmt['__typename'] if tn == 'LinkFormat': - ab = data[fmt['id']] + ab = fmt text = '{}'.format(ab['url'], ab.get('title') or '', text) elif tn == 'BoldFormat': text = '' + text + '' lines.append(text) -def process_paragraph(lines, block, data, content_key='content'): +def process_paragraph(lines, block, content_key='content'): tn = block['__typename'] m = re.match(r'Heading([1-6])Block', tn) if m is not None: @@ -52,48 +46,48 @@ def process_paragraph(lines, block, data, content_key='content'): style = 'text-align: {}'.format(ta.lower()) lines.append('<{} style="{}">'.format(tag, style)) for item in block[content_key]: - tn = item['typename'] + tn = item['__typename'] if tn in ('TextInline', 'Byline'): - process_inline_text(lines, data[item['id']], data) + process_inline_text(lines, item) lines.append('') -def process_timestamp(lines, block, data): +def process_timestamp(lines, block): ts = block['timestamp'] dt = parse_iso8601(ts, as_utc=False) lines.append('

' + escape(dt.strftime('%b %d, %Y')) + '

') -def process_header(lines, block, data): +def process_header(lines, block): label = block.get('label') if label: - process_paragraph(lines, data[label['id']], data) + process_paragraph(lines, label) headline = block.get('headline') if headline: - process_paragraph(lines, data[headline['id']], data) + process_paragraph(lines, headline) summary = block.get('summary') if summary: - process_paragraph(lines, data[summary['id']], data) + process_paragraph(lines, summary) lm = block.get('ledeMedia') - if lm and lm.get('typename') == 'ImageBlock': - process_image_block(lines, data[lm['id']], data) + if lm and lm.get('__typename') == 'ImageBlock': + process_image_block(lines, lm) byline = block.get('byline') if byline: - process_paragraph(lines, data[byline['id']], data, content_key='bylines') + process_paragraph(lines, byline, content_key='bylines') timestamp = block.get('timestampBlock') if timestamp: - process_timestamp(lines, data[timestamp['id']], data) + process_timestamp(lines, timestamp) -def process_image_block(lines, block, data): - media = data[block['media']['id']] +def process_image_block(lines, block): + media = block['media'] caption = media.get('caption') caption_lines = [] if caption: - process_inline_text(caption_lines, data[caption['id']], data) - crops = key_startswith('crops({', media) - renditions = data[crops[0]['id']]['renditions'] - img = data[renditions[0]['id']]['url'] + process_inline_text(caption_lines, caption) + crops = media['crops'] + renditions = crops[0]['renditions'] + img = renditions[0]['url'] lines.append('
'.format(quoteattr(img))) lines.extend(caption_lines) lines.append('
') @@ -101,18 +95,19 @@ def process_image_block(lines, block, data): def json_to_html(raw): data = json.loads(raw.replace(':undefined', ':null')) - data = data['initialState'] + # open('/t/raw.json', 'w').write(json.dumps(data, indent=2)) + data = data['initialData']['data'] article = next(iter(data.values())) - body = data[article['sprinkledBody']['id']] + body = article['sprinkledBody']['content'] lines = [] - for item in body['content@filterEmpty']: - tn = item['typename'] - if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock'): - process_header(lines, data[item['id']], data) + for item in body: + tn = item['__typename'] + if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'): + process_header(lines, item) elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn): - process_paragraph(lines, data[item['id']], data) + process_paragraph(lines, item) elif tn == 'ImageBlock': - process_image_block(lines, data[item['id']], data) + process_image_block(lines, item) return '' + '\n'.join(lines) + '' @@ -152,6 +147,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe): return html def parse_index(self): + return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])] soup = self.index_to_soup( 'https://www.nytimes.com/pages/books/review/index.html')