This commit is contained in:
Kovid Goyal 2024-09-29 20:20:27 +05:30
commit 682624e573
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 195 additions and 178 deletions

View File

@ -12,52 +12,52 @@ def extract_json(raw):
return js['initialData']['data']['article']['sprinkledBody']['content'] return js['initialData']['data']['article']['sprinkledBody']['content']
def parse_image(i): def parse_image(i):
if i.get('crops'): crop = i.get('crops') or i.get('spanImageCrops')
yield '<div><img src="{}">'.format(i['crops'][0]['renditions'][0]['url']) if crop:
elif i.get('spanImageCrops'): yield f'<div><img src="{crop[0]["renditions"][0]["url"]}" title="{i.get("altText", "")}">'
yield '<div><img src="{}">'.format(i['spanImageCrops'][0]['renditions'][0]['url'])
if i.get('caption'): if i.get('caption'):
yield '<div class="cap">' + ''.join(parse_types(i['caption'])) yield f'<div class="cap">{"".join(parse_types(i["caption"]))}'
if i.get('credit'): if i.get('credit'):
yield '<span class="cred"> ' + i['credit'] + '</span>' yield f'<span class="cred"> {i["credit"]}</span>'
yield '</div>' yield '</div>'
elif i.get('legacyHtmlCaption'):
if i['legacyHtmlCaption'].strip():
yield f'<div class="cap">{i["legacyHtmlCaption"]}</div>'
yield '</div>' yield '</div>'
def parse_img_grid(g): def parse_img_grid(g):
for grd in g.get('gridMedia', {}): for grd in g.get('gridMedia', {}):
yield ''.join(parse_image(grd)) yield ''.join(parse_image(grd))
if g.get('caption'): if g.get('caption'):
yield '<div class="cap">{}'.format(g['caption']) yield f'<div class="cap">{g["caption"]}'
if g.get('credit'): if g.get('credit'):
yield '<span class="cred"> ' + g['credit'] + '</span>' yield f'<span class="cred"> {g["credit"]}</span>'
yield '</div>' yield '</div>'
def parse_vid(v): def parse_vid(v):
if v.get('promotionalMedia'): if v.get('promotionalMedia'):
if v.get('headline'): headline = v.get("headline", {}).get("default", "")
if v.get('url'): rendition = v.get('renditions')
yield '<div><b><a href="{}">Video</a>: '.format(v['url'])\ yield (f'<div><b><a href="{rendition[0]["url"]}">Video</a>: {headline}</b></div>'
+ v['headline'].get('default', '') + '</b></div>' if rendition else f'<div><b>{headline}</b></div>')
elif v['headline'].get('default'): yield ''.join(parse_types(v["promotionalMedia"]))
yield '<div><b>' + v['headline']['default'] + '</b></div>'
yield ''.join(parse_types(v['promotionalMedia']))
if v.get('promotionalSummary'): if v.get('promotionalSummary'):
yield '<div class="cap">' + v['promotionalSummary'] + '</div>' yield f'<div class="cap">{v["promotionalSummary"]}</div>'
def parse_emb(e): def parse_emb(e):
if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''):
dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1)
yield '<div><img src="{}">'.format('https://datawrapper.dwcdn.net/' + dw + '/full.png') + '</div>' yield f'<div><img src="https://datawrapper.dwcdn.net/{dw}/full.png"></div>'
elif e.get('promotionalMedia'): elif e.get('promotionalMedia'):
if e.get('headline'): if e.get('headline'):
yield '<div><b>' + e['headline']['default'] + '</b></div>' yield f'<div><b>{e["headline"]["default"]}</b></div>'
yield ''.join(parse_types(e['promotionalMedia'])) yield ''.join(parse_types(e["promotionalMedia"]))
if e.get('note'): if e.get('note'):
yield '<div class="cap">' + e['note'] + '</div>' yield f'<div class="cap">{e["note"]}</div>'
def parse_byline(byl): def parse_byline(byl):
for b in byl.get('bylines', {}): for b in byl.get('bylines', {}):
yield '<div>' + b['renderedRepresentation'] + '</div>' yield f'<div>{b["renderedRepresentation"]}</div>'
yield '<div><b><i>' yield '<div><b><i>'
for rl in byl.get('role', {}): for rl in byl.get('role', {}):
if ''.join(parse_cnt(rl)).strip(): if ''.join(parse_cnt(rl)).strip():
@ -70,106 +70,114 @@ def iso_date(x):
def parse_header(h): def parse_header(h):
if h.get('label'): if h.get('label'):
yield '<div class="lbl">' + ''.join(parse_types(h['label'])) + '</div>' yield f'<div class="lbl">{"".join(parse_types(h["label"]))}</div>'
if h.get('headline'): if h.get('headline'):
yield ''.join(parse_types(h['headline'])) yield ''.join(parse_types(h["headline"]))
if h.get('summary'): if h.get('summary'):
yield '<p><i>' + ''.join(parse_types(h['summary'])) + '</i></p>' yield f'<p><i>{"".join(parse_types(h["summary"]))}</i></p>'
if h.get('ledeMedia'): if h.get('ledeMedia'):
yield ''.join(parse_types(h['ledeMedia'])) yield ''.join(parse_types(h["ledeMedia"]))
if h.get('byline'): if h.get('byline'):
yield ''.join(parse_types(h['byline'])) yield ''.join(parse_types(h["byline"]))
if h.get('timestampBlock'): if h.get('timestampBlock'):
yield ''.join(parse_types(h['timestampBlock'])) yield ''.join(parse_types(h["timestampBlock"]))
def parse_fmt_type(fm): def parse_fmt_type(fm):
for f in fm.get('formats', {}): for f in fm.get('formats', {}):
if f.get('__typename', '') == 'BoldFormat': ftype = f.get("__typename", "")
if ftype == "BoldFormat":
yield '<strong>' yield '<strong>'
if f.get('__typename', '') == 'ItalicFormat': if ftype == "ItalicFormat":
yield '<em>' yield '<em>'
if f.get('__typename', '') == 'LinkFormat': if ftype == "LinkFormat":
hrf = f['url'] hrf = f["url"]
yield '<a href="{}">'.format(hrf) yield f'<a href="{hrf}">'
yield fm['text'] yield fm.get("text", "")
for f in reversed(fm.get('formats', {})): for f in reversed(fm.get('formats', {})):
if f.get('__typename', '') == 'BoldFormat': ftype = f.get("__typename", "")
if ftype == "BoldFormat":
yield '</strong>' yield '</strong>'
if f.get('__typename', '') == 'ItalicFormat': if ftype == "ItalicFormat":
yield '</em>' yield '</em>'
if f.get('__typename', '') == 'LinkFormat': if ftype == "LinkFormat":
yield '</a>' yield '</a>'
def parse_cnt(cnt): def parse_cnt(cnt):
if cnt.get('formats'): for k in cnt:
yield ''.join(parse_fmt_type(cnt)) if isinstance(cnt[k], list):
elif cnt.get('content'): if k == 'formats':
for cnt_ in cnt['content']: yield ''.join(parse_fmt_type(cnt))
yield from parse_types(cnt_) else:
elif cnt.get('text'): for cnt_ in cnt[k]:
yield from parse_types(cnt_)
if isinstance(cnt[k], dict):
yield from parse_types(cnt[k])
if cnt.get('text') and 'formats' not in cnt:
yield cnt['text'] yield cnt['text']
def parse_types(x): def parse_types(x):
if 'Header' in x.get('__typename', ''): typename = x.get('__typename', '')
if 'Header' in typename:
yield '\n'.join(parse_header(x)) yield '\n'.join(parse_header(x))
elif x.get('__typename', '') == 'Heading1Block': elif typename.startswith('Heading'):
yield '<h1>' + ''.join(parse_cnt(x)) + '</h1>' htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1)
elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: yield f'<{htag}>{"".join(parse_cnt(x))}</{htag}>'
yield '<h4>' + ''.join(parse_cnt(x)) + '</h4>'
elif x.get('__typename', '') == 'ParagraphBlock': elif typename == 'ParagraphBlock':
yield '<p>' + ''.join(parse_cnt(x)) + '</p>' yield f'<p>{"".join(parse_cnt(x))}</p>'
elif x.get('__typename', '') == 'BylineBlock': elif typename == 'BylineBlock':
yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>' yield f'<div class="byl"><br/>{"".join(parse_byline(x))}</div>'
elif x.get('__typename', '') == 'LabelBlock': elif typename == 'LabelBlock':
yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>' yield f'<div class="sc">{"".join(parse_cnt(x))}</div>'
elif x.get('__typename', '') == 'BlockquoteBlock': elif typename == 'BlockquoteBlock':
yield '<blockquote>' + ''.join(parse_cnt(x)) + '</blockquote>' yield f'<blockquote>{"".join(parse_cnt(x))}</blockquote>'
elif x.get('__typename', '') == 'TimestampBlock': elif typename == 'TimestampBlock':
yield '<div class="time">' + iso_date(x['timestamp']) + '</div>' yield f'<div class="time">{iso_date(x["timestamp"])}</div>'
elif x.get('__typename', '') == 'LineBreakInline': elif typename == 'LineBreakInline':
yield '<br/>' yield '<br/>'
elif x.get('__typename', '') == 'RuleBlock': elif typename == 'RuleBlock':
yield '<hr/>' yield '<hr/>'
elif x.get('__typename', '') == 'Image': elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}:
yield ''.join(parse_image(x)) yield "".join(parse_types(x['media']))
elif x.get('__typename', '') == 'ImageBlock':
yield ''.join(parse_types(x['media']))
elif x.get('__typename', '') == 'GridBlock':
yield ''.join(parse_img_grid(x))
elif x.get('__typename', '') == 'VideoBlock': elif typename == 'Image':
yield ''.join(parse_types(x['media'])) yield "".join(parse_image(x))
elif x.get('__typename', '') == 'Video':
yield ''.join(parse_vid(x))
elif x.get('__typename', '') == 'InteractiveBlock':
yield ''.join(parse_types(x['media']))
elif x.get('__typename', '') == 'EmbeddedInteractive':
yield ''.join(parse_emb(x))
elif x.get('__typename', '') == 'ListBlock': elif typename == 'GridBlock':
yield '<ul>' + ''.join(parse_cnt(x)) + '</ul>' yield "".join(parse_img_grid(x))
elif x.get('__typename', '') == 'ListItemBlock':
yield '<li>' + ''.join(parse_cnt(x)) + '</li>'
elif x.get('__typename', '') == 'CapsuleBlock': elif typename == 'Video':
yield "".join(parse_vid(x))
elif typename == 'EmbeddedInteractive':
yield "".join(parse_emb(x))
elif typename == 'ListBlock':
yield f'<ul>{"".join(parse_cnt(x))}</ul>'
elif typename == 'ListItemBlock':
yield f'<li>{"".join(parse_cnt(x))}</li>'
elif typename == 'CapsuleBlock':
if x['capsuleContent'].get('body'): if x['capsuleContent'].get('body'):
yield ''.join(parse_cnt(x['capsuleContent']['body'])) yield "".join(parse_cnt(x['capsuleContent']['body']))
elif x.get('__typename', '') == 'Capsule': elif typename == 'Capsule':
yield ''.join(parse_cnt(x['body'])) yield "".join(parse_cnt(x['body']))
elif x.get('__typename', '') in { elif typename in {
'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock',
'SummaryBlock', 'VisualStackBlock'
}: }:
yield ''.join(parse_cnt(x)) yield "".join(parse_cnt(x))
elif x.get('__typename'): elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}:
if ''.join(parse_cnt(x)).strip(): if x.get('media'):
yield '<p><i>' + ''.join(parse_cnt(x)) + '</i></p>' yield "".join(parse_types(x['media']))
elif "".join(parse_cnt(x)).strip():
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'
def article_parse(data): def article_parse(data):
yield "<html><body>" yield "<html><body>"
@ -178,7 +186,7 @@ def article_parse(data):
yield "</body></html>" yield "</body></html>"
class nytFeeds(BasicNewsRecipe): class NytFeeds(BasicNewsRecipe):
title = 'NYT News' title = 'NYT News'
__author__ = 'unkn0wn' __author__ = 'unkn0wn'
description = ( description = (
@ -236,7 +244,7 @@ class nytFeeds(BasicNewsRecipe):
extra_css = ''' extra_css = '''
.byl, .time { font-size:small; color:#202020; } .byl, .time { font-size:small; color:#202020; }
.cap { font-size:small; text-align:center; } .cap { font-size:small; }
.cred { font-style:italic; font-size:small; } .cred { font-style:italic; font-size:small; }
em, blockquote { color: #202020; } em, blockquote { color: #202020; }
.sc { font-variant: small-caps; } .sc { font-variant: small-caps; }
@ -305,3 +313,4 @@ class nytFeeds(BasicNewsRecipe):
if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url): if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url):
return url return url
self.log('\tSkipped URL: ', url) self.log('\tSkipped URL: ', url)
return None

View File

@ -9,57 +9,57 @@ from xml.sax.saxutils import escape, quoteattr
from calibre.utils.iso8601 import parse_iso8601 from calibre.utils.iso8601 import parse_iso8601
module_version = 6 # needed for live updates module_version = 7 # needed for live updates
pprint pprint
def parse_image(i): def parse_image(i):
if i.get('crops'): crop = i.get('crops') or i.get('spanImageCrops')
yield '<div><img src="{}">'.format(i['crops'][0]['renditions'][0]['url']) if crop:
elif i.get('spanImageCrops'): yield f'<div><img src="{crop[0]["renditions"][0]["url"]}" title="{i.get("altText", "")}">'
yield '<div><img src="{}">'.format(i['spanImageCrops'][0]['renditions'][0]['url'])
if i.get('caption'): if i.get('caption'):
yield '<div class="cap">' + ''.join(parse_types(i['caption'])) yield f'<div class="cap">{"".join(parse_types(i["caption"]))}'
if i.get('credit'): if i.get('credit'):
yield '<span class="cred"> ' + i['credit'] + '</span>' yield f'<span class="cred"> {i["credit"]}</span>'
yield '</div>' yield '</div>'
elif i.get('legacyHtmlCaption'):
if i['legacyHtmlCaption'].strip():
yield f'<div class="cap">{i["legacyHtmlCaption"]}</div>'
yield '</div>' yield '</div>'
def parse_img_grid(g): def parse_img_grid(g):
for grd in g.get('gridMedia', {}): for grd in g.get('gridMedia', {}):
yield ''.join(parse_image(grd)) yield ''.join(parse_image(grd))
if g.get('caption'): if g.get('caption'):
yield '<div class="cap">{}'.format(g['caption']) yield f'<div class="cap">{g["caption"]}'
if g.get('credit'): if g.get('credit'):
yield '<span class="cred"> ' + g['credit'] + '</span>' yield f'<span class="cred"> {g["credit"]}</span>'
yield '</div>' yield '</div>'
def parse_vid(v): def parse_vid(v):
if v.get('promotionalMedia'): if v.get('promotionalMedia'):
if v.get('headline'): headline = v.get("headline", {}).get("default", "")
if v.get('url'): rendition = v.get('renditions')
yield '<div><b><a href="{}">Video</a>: '.format(v['url'])\ yield (f'<div><b><a href="{rendition[0]["url"]}">Video</a>: {headline}</b></div>'
+ v['headline'].get('default', '') + '</b></div>' if rendition else f'<div><b>{headline}</b></div>')
elif v['headline'].get('default'): yield ''.join(parse_types(v["promotionalMedia"]))
yield '<div><b>' + v['headline']['default'] + '</b></div>'
yield ''.join(parse_types(v['promotionalMedia']))
if v.get('promotionalSummary'): if v.get('promotionalSummary'):
yield '<div class="cap">' + v['promotionalSummary'] + '</div>' yield f'<div class="cap">{v["promotionalSummary"]}</div>'
def parse_emb(e): def parse_emb(e):
if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''):
dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1)
yield '<div><img src="{}">'.format('https://datawrapper.dwcdn.net/' + dw + '/full.png') + '</div>' yield f'<div><img src="https://datawrapper.dwcdn.net/{dw}/full.png"></div>'
elif e.get('promotionalMedia'): elif e.get('promotionalMedia'):
if e.get('headline'): if e.get('headline'):
yield '<div><b>' + e['headline']['default'] + '</b></div>' yield f'<div><b>{e["headline"]["default"]}</b></div>'
yield ''.join(parse_types(e['promotionalMedia'])) yield ''.join(parse_types(e["promotionalMedia"]))
if e.get('note'): if e.get('note'):
yield '<div class="cap">' + e['note'] + '</div>' yield f'<div class="cap">{e["note"]}</div>'
def parse_byline(byl): def parse_byline(byl):
for b in byl.get('bylines', {}): for b in byl.get('bylines', {}):
yield '<div>' + b['renderedRepresentation'] + '</div>' yield f'<div>{b["renderedRepresentation"]}</div>'
yield '<div><b><i>' yield '<div><b><i>'
for rl in byl.get('role', {}): for rl in byl.get('role', {}):
if ''.join(parse_cnt(rl)).strip(): if ''.join(parse_cnt(rl)).strip():
@ -72,106 +72,114 @@ def iso_date(x):
def parse_header(h): def parse_header(h):
if h.get('label'): if h.get('label'):
yield '<div class="lbl">' + ''.join(parse_types(h['label'])) + '</div>' yield f'<div class="lbl">{"".join(parse_types(h["label"]))}</div>'
if h.get('headline'): if h.get('headline'):
yield ''.join(parse_types(h['headline'])) yield ''.join(parse_types(h["headline"]))
if h.get('summary'): if h.get('summary'):
yield '<p><i>' + ''.join(parse_types(h['summary'])) + '</i></p>' yield f'<p><i>{"".join(parse_types(h["summary"]))}</i></p>'
if h.get('ledeMedia'): if h.get('ledeMedia'):
yield ''.join(parse_types(h['ledeMedia'])) yield ''.join(parse_types(h["ledeMedia"]))
if h.get('byline'): if h.get('byline'):
yield ''.join(parse_types(h['byline'])) yield ''.join(parse_types(h["byline"]))
if h.get('timestampBlock'): if h.get('timestampBlock'):
yield ''.join(parse_types(h['timestampBlock'])) yield ''.join(parse_types(h["timestampBlock"]))
def parse_fmt_type(fm): def parse_fmt_type(fm):
for f in fm.get('formats', {}): for f in fm.get('formats', {}):
if f.get('__typename', '') == 'BoldFormat': ftype = f.get("__typename", "")
if ftype == "BoldFormat":
yield '<strong>' yield '<strong>'
if f.get('__typename', '') == 'ItalicFormat': if ftype == "ItalicFormat":
yield '<em>' yield '<em>'
if f.get('__typename', '') == 'LinkFormat': if ftype == "LinkFormat":
hrf = f['url'] hrf = f["url"]
yield '<a href="{}">'.format(hrf) yield f'<a href="{hrf}">'
yield fm['text'] yield fm.get("text", "")
for f in reversed(fm.get('formats', {})): for f in reversed(fm.get('formats', {})):
if f.get('__typename', '') == 'BoldFormat': ftype = f.get("__typename", "")
if ftype == "BoldFormat":
yield '</strong>' yield '</strong>'
if f.get('__typename', '') == 'ItalicFormat': if ftype == "ItalicFormat":
yield '</em>' yield '</em>'
if f.get('__typename', '') == 'LinkFormat': if ftype == "LinkFormat":
yield '</a>' yield '</a>'
def parse_cnt(cnt): def parse_cnt(cnt):
if cnt.get('formats'): for k in cnt:
yield ''.join(parse_fmt_type(cnt)) if isinstance(cnt[k], list):
elif cnt.get('content'): if k == 'formats':
for cnt_ in cnt['content']: yield ''.join(parse_fmt_type(cnt))
yield from parse_types(cnt_) else:
elif cnt.get('text'): for cnt_ in cnt[k]:
yield from parse_types(cnt_)
if isinstance(cnt[k], dict):
yield from parse_types(cnt[k])
if cnt.get('text') and 'formats' not in cnt:
yield cnt['text'] yield cnt['text']
def parse_types(x): def parse_types(x):
if 'Header' in x.get('__typename', ''): typename = x.get('__typename', '')
if 'Header' in typename:
yield '\n'.join(parse_header(x)) yield '\n'.join(parse_header(x))
elif x.get('__typename', '') == 'Heading1Block': elif typename.startswith('Heading'):
yield '<h1>' + ''.join(parse_cnt(x)) + '</h1>' htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1)
elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: yield f'<{htag}>{"".join(parse_cnt(x))}</{htag}>'
yield '<h4>' + ''.join(parse_cnt(x)) + '</h4>'
elif x.get('__typename', '') == 'ParagraphBlock': elif typename == 'ParagraphBlock':
yield '<p>' + ''.join(parse_cnt(x)) + '</p>' yield f'<p>{"".join(parse_cnt(x))}</p>'
elif x.get('__typename', '') == 'BylineBlock': elif typename == 'BylineBlock':
yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>' yield f'<div class="byl"><br/>{"".join(parse_byline(x))}</div>'
elif x.get('__typename', '') == 'LabelBlock': elif typename == 'LabelBlock':
yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>' yield f'<div class="sc">{"".join(parse_cnt(x))}</div>'
elif x.get('__typename', '') == 'BlockquoteBlock': elif typename == 'BlockquoteBlock':
yield '<blockquote>' + ''.join(parse_cnt(x)) + '</blockquote>' yield f'<blockquote>{"".join(parse_cnt(x))}</blockquote>'
elif x.get('__typename', '') == 'TimestampBlock': elif typename == 'TimestampBlock':
yield '<div class="time">' + iso_date(x['timestamp']) + '</div>' yield f'<div class="time">{iso_date(x["timestamp"])}</div>'
elif x.get('__typename', '') == 'LineBreakInline': elif typename == 'LineBreakInline':
yield '<br/>' yield '<br/>'
elif x.get('__typename', '') == 'RuleBlock': elif typename == 'RuleBlock':
yield '<hr/>' yield '<hr/>'
elif x.get('__typename', '') == 'Image': elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}:
yield ''.join(parse_image(x)) yield "".join(parse_types(x['media']))
elif x.get('__typename', '') == 'ImageBlock':
yield ''.join(parse_image(x['media']))
elif x.get('__typename', '') == 'GridBlock':
yield ''.join(parse_img_grid(x))
elif x.get('__typename', '') == 'VideoBlock': elif typename == 'Image':
yield ''.join(parse_types(x['media'])) yield "".join(parse_image(x))
elif x.get('__typename', '') == 'Video':
yield ''.join(parse_vid(x))
elif x.get('__typename', '') == 'InteractiveBlock': elif typename == 'GridBlock':
yield ''.join(parse_types(x['media'])) yield "".join(parse_img_grid(x))
elif x.get('__typename', '') == 'EmbeddedInteractive':
yield ''.join(parse_emb(x))
elif x.get('__typename', '') == 'ListBlock': elif typename == 'Video':
yield '<ul>' + ''.join(parse_cnt(x)) + '</ul>' yield "".join(parse_vid(x))
elif x.get('__typename', '') == 'ListItemBlock':
yield '<li>' + ''.join(parse_cnt(x)) + '</li>'
elif x.get('__typename', '') == 'CapsuleBlock': elif typename == 'EmbeddedInteractive':
yield "".join(parse_emb(x))
elif typename == 'ListBlock':
yield f'<ul>{"".join(parse_cnt(x))}</ul>'
elif typename == 'ListItemBlock':
yield f'<li>{"".join(parse_cnt(x))}</li>'
elif typename == 'CapsuleBlock':
if x['capsuleContent'].get('body'): if x['capsuleContent'].get('body'):
yield ''.join(parse_cnt(x['capsuleContent']['body'])) yield "".join(parse_cnt(x['capsuleContent']['body']))
elif x.get('__typename', '') == 'Capsule': elif typename == 'Capsule':
yield ''.join(parse_cnt(x['body'])) yield "".join(parse_cnt(x['body']))
elif x.get('__typename', '') in { elif typename in {
'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock',
'SummaryBlock', 'VisualStackBlock'
}: }:
yield ''.join(parse_cnt(x)) yield "".join(parse_cnt(x))
elif x.get('__typename'): elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}:
if ''.join(parse_cnt(x)).strip(): if x.get('media'):
yield '<p><i>' + ''.join(parse_cnt(x)) + '</i></p>' yield "".join(parse_types(x['media']))
elif "".join(parse_cnt(x)).strip():
yield f'<p><i>{"".join(parse_cnt(x))}</i></p>'
def article_parse(data): def article_parse(data):
yield "<html><body>" yield "<html><body>"