mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update New York Times Book Review
They rationalised their JSON schema finally.
This commit is contained in:
parent
3d82d3f70d
commit
b8b4351741
@ -12,17 +12,11 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
|
|
||||||
|
|
||||||
# {{{ parse NYT JSON
|
# {{{ parse NYT JSON
|
||||||
def key_startswith(key, obj):
|
|
||||||
for q, val in obj.items():
|
|
||||||
if q.startswith(key):
|
|
||||||
return val
|
|
||||||
|
|
||||||
|
|
||||||
def is_heading(tn):
    """Return True if *tn* is the ``__typename`` of a heading block.

    Recognizes ``Heading1Block`` through ``Heading6Block``.
    """
    # process_paragraph() maps typenames matching r'Heading([1-6])Block' to
    # heading tags, so accept the same six levels here; otherwise level 5/6
    # headings never reach it and are silently dropped.
    return tn in (
        'Heading1Block', 'Heading2Block', 'Heading3Block',
        'Heading4Block', 'Heading5Block', 'Heading6Block',
    )
|
||||||
|
|
||||||
|
|
||||||
def process_inline_text(lines, block, data):
|
def process_inline_text(lines, block):
|
||||||
text = ''
|
text = ''
|
||||||
if 'text@stripHtml' in block:
|
if 'text@stripHtml' in block:
|
||||||
text = escape(block['text@stripHtml'])
|
text = escape(block['text@stripHtml'])
|
||||||
@ -32,16 +26,16 @@ def process_inline_text(lines, block, data):
|
|||||||
text = block['text']
|
text = block['text']
|
||||||
if text:
|
if text:
|
||||||
for fmt in block.get('formats', ()):
|
for fmt in block.get('formats', ()):
|
||||||
tn = fmt['typename']
|
tn = fmt['__typename']
|
||||||
if tn == 'LinkFormat':
|
if tn == 'LinkFormat':
|
||||||
ab = data[fmt['id']]
|
ab = fmt
|
||||||
text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
|
text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
|
||||||
elif tn == 'BoldFormat':
|
elif tn == 'BoldFormat':
|
||||||
text = '<b>' + text + '</b>'
|
text = '<b>' + text + '</b>'
|
||||||
lines.append(text)
|
lines.append(text)
|
||||||
|
|
||||||
|
|
||||||
def process_paragraph(lines, block, data, content_key='content'):
|
def process_paragraph(lines, block, content_key='content'):
|
||||||
tn = block['__typename']
|
tn = block['__typename']
|
||||||
m = re.match(r'Heading([1-6])Block', tn)
|
m = re.match(r'Heading([1-6])Block', tn)
|
||||||
if m is not None:
|
if m is not None:
|
||||||
@ -52,48 +46,48 @@ def process_paragraph(lines, block, data, content_key='content'):
|
|||||||
style = 'text-align: {}'.format(ta.lower())
|
style = 'text-align: {}'.format(ta.lower())
|
||||||
lines.append('<{} style="{}">'.format(tag, style))
|
lines.append('<{} style="{}">'.format(tag, style))
|
||||||
for item in block[content_key]:
|
for item in block[content_key]:
|
||||||
tn = item['typename']
|
tn = item['__typename']
|
||||||
if tn in ('TextInline', 'Byline'):
|
if tn in ('TextInline', 'Byline'):
|
||||||
process_inline_text(lines, data[item['id']], data)
|
process_inline_text(lines, item)
|
||||||
lines.append('</' + tag + '>')
|
lines.append('</' + tag + '>')
|
||||||
|
|
||||||
|
|
||||||
def process_timestamp(lines, block):
    """Append a ``<p class="timestamp">`` element for *block* to *lines*.

    ``block['timestamp']`` is parsed with ``parse_iso8601`` and rendered in
    the form ``Sep 08, 2022`` (``%b %d, %Y``), HTML-escaped.
    """
    # NOTE(review): as_utc=False presumably keeps the timestamp's own offset
    # instead of converting to UTC -- confirm against parse_iso8601.
    when = parse_iso8601(block['timestamp'], as_utc=False)
    pretty = escape(when.strftime('%b %d, %Y'))
    lines.append('<p class="timestamp">' + pretty + '</p>')
|
||||||
|
|
||||||
|
|
||||||
def process_header(lines, block):
    """Append the HTML for an article header *block* to *lines*.

    Parts are emitted in a fixed order, each only when present and truthy:
    label, headline, summary, lede image, byline, timestamp.
    """
    # These three parts are all rendered as ordinary paragraphs.
    for part_name in ('label', 'headline', 'summary'):
        part = block.get(part_name)
        if part:
            process_paragraph(lines, part)

    # Lede media is rendered only when it is an ImageBlock; any other
    # typename is ignored.
    lede = block.get('ledeMedia')
    if lede and lede.get('__typename') == 'ImageBlock':
        process_image_block(lines, lede)

    # The byline block keeps its inline items under 'bylines', not 'content'.
    byline = block.get('byline')
    if byline:
        process_paragraph(lines, byline, content_key='bylines')

    ts_block = block.get('timestampBlock')
    if ts_block:
        process_timestamp(lines, ts_block)
|
||||||
|
|
||||||
|
|
||||||
def process_image_block(lines, block):
    """Append a centered ``<div>`` with the block's image and caption to *lines*.

    The image URL is the first rendition of the first crop in
    ``block['media']``. A missing ``media``, ``crops``, ``renditions`` or
    ``url`` key raises ``KeyError``; an empty crops or renditions list
    raises ``IndexError``.
    """
    media = block['media']

    # Render the caption into a scratch list first so it can be placed
    # after the <img> tag, inside the wrapping <div>.
    caption_html = []
    caption = media.get('caption')
    if caption:
        process_inline_text(caption_html, caption)

    first_crop = media['crops'][0]
    url = first_crop['renditions'][0]['url']

    # quoteattr() supplies the surrounding quotes for the src attribute.
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(url)))
    lines.extend(caption_html)
    lines.append('</div>')
|
||||||
@ -101,18 +95,19 @@ def process_image_block(lines, block, data):
|
|||||||
|
|
||||||
def json_to_html(raw):
    """Convert the article-state JSON text *raw* into a minimal HTML document.

    The article is taken to be the first value under
    ``['initialData']['data']``; the blocks in its
    ``['sprinkledBody']['content']`` list are rendered in order. Header,
    paragraph-like (paragraph, label, detail, heading) and image blocks are
    handled; blocks of any other ``__typename`` are skipped.

    Returns the string ``<html><body>...</body></html>`` with the rendered
    fragments joined by newlines.
    """
    # The payload can contain bare `undefined` values, which are not valid
    # JSON; map them to null before parsing.
    data = json.loads(raw.replace(':undefined', ':null'))
    data = data['initialData']['data']
    article = next(iter(data.values()))
    body = article['sprinkledBody']['content']
    lines = []
    for item in body:
        tn = item['__typename']
        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
            process_header(lines, item)
        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
            process_paragraph(lines, item)
        elif tn == 'ImageBlock':
            process_image_block(lines, item)
    return '<html><body>' + '\n'.join(lines) + '</body></html>'
|
||||||
|
|
||||||
|
|
||||||
@ -152,6 +147,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
|||||||
return html
|
return html
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
|
return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
'https://www.nytimes.com/pages/books/review/index.html')
|
'https://www.nytimes.com/pages/books/review/index.html')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user