diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index 4b9fd17551..0ac1ed5fcd 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -12,52 +12,52 @@ def extract_json(raw): return js['initialData']['data']['article']['sprinkledBody']['content'] def parse_image(i): - if i.get('crops'): - yield '
'.format(i['crops'][0]['renditions'][0]['url']) - elif i.get('spanImageCrops'): - yield '
'.format(i['spanImageCrops'][0]['renditions'][0]['url']) + crop = i.get('crops') or i.get('spanImageCrops') + if crop: + yield f'
' if i.get('caption'): - yield '
' + ''.join(parse_types(i['caption'])) + yield f'
{"".join(parse_types(i["caption"]))}' if i.get('credit'): - yield ' ' + i['credit'] + '' + yield f' {i["credit"]}' yield '
' + elif i.get('legacyHtmlCaption'): + if i['legacyHtmlCaption'].strip(): + yield f'
{i["legacyHtmlCaption"]}
' yield '
' def parse_img_grid(g): for grd in g.get('gridMedia', {}): yield ''.join(parse_image(grd)) if g.get('caption'): - yield '
{}'.format(g['caption']) + yield f'
{g["caption"]}' if g.get('credit'): - yield ' ' + g['credit'] + '' + yield f' {g["credit"]}' yield '
' def parse_vid(v): if v.get('promotionalMedia'): - if v.get('headline'): - if v.get('url'): - yield '
Video: '.format(v['url'])\ - + v['headline'].get('default', '') + '
' - elif v['headline'].get('default'): - yield '
' + v['headline']['default'] + '
' - yield ''.join(parse_types(v['promotionalMedia'])) + headline = v.get("headline", {}).get("default", "") + rendition = v.get('renditions') + yield (f'
Video: {headline}
' + if rendition else f'
{headline}
') + yield ''.join(parse_types(v["promotionalMedia"])) if v.get('promotionalSummary'): - yield '
' + v['promotionalSummary'] + '
' + yield f'
{v["promotionalSummary"]}
' def parse_emb(e): if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) - yield '
'.format('https://datawrapper.dwcdn.net/' + dw + '/full.png') + '
' + yield f'
' elif e.get('promotionalMedia'): if e.get('headline'): - yield '
' + e['headline']['default'] + '
' - yield ''.join(parse_types(e['promotionalMedia'])) + yield f'
{e["headline"]["default"]}
' + yield ''.join(parse_types(e["promotionalMedia"])) if e.get('note'): - yield '
' + e['note'] + '
' + yield f'
{e["note"]}
' def parse_byline(byl): for b in byl.get('bylines', {}): - yield '
' + b['renderedRepresentation'] + '
' + yield f'
{b["renderedRepresentation"]}
' yield '
' for rl in byl.get('role', {}): if ''.join(parse_cnt(rl)).strip(): @@ -70,106 +70,114 @@ def iso_date(x): def parse_header(h): if h.get('label'): - yield '
' + ''.join(parse_types(h['label'])) + '
' + yield f'
{"".join(parse_types(h["label"]))}
' if h.get('headline'): - yield ''.join(parse_types(h['headline'])) + yield ''.join(parse_types(h["headline"])) if h.get('summary'): - yield '

' + ''.join(parse_types(h['summary'])) + '

' + yield f'

{"".join(parse_types(h["summary"]))}

' if h.get('ledeMedia'): - yield ''.join(parse_types(h['ledeMedia'])) + yield ''.join(parse_types(h["ledeMedia"])) if h.get('byline'): - yield ''.join(parse_types(h['byline'])) + yield ''.join(parse_types(h["byline"])) if h.get('timestampBlock'): - yield ''.join(parse_types(h['timestampBlock'])) + yield ''.join(parse_types(h["timestampBlock"])) def parse_fmt_type(fm): for f in fm.get('formats', {}): - if f.get('__typename', '') == 'BoldFormat': + ftype = f.get("__typename", "") + if ftype == "BoldFormat": yield '' - if f.get('__typename', '') == 'ItalicFormat': + if ftype == "ItalicFormat": yield '' - if f.get('__typename', '') == 'LinkFormat': - hrf = f['url'] - yield ''.format(hrf) - yield fm['text'] + if ftype == "LinkFormat": + hrf = f["url"] + yield f'' + yield fm.get("text", "") for f in reversed(fm.get('formats', {})): - if f.get('__typename', '') == 'BoldFormat': + ftype = f.get("__typename", "") + if ftype == "BoldFormat": yield '' - if f.get('__typename', '') == 'ItalicFormat': + if ftype == "ItalicFormat": yield '' - if f.get('__typename', '') == 'LinkFormat': + if ftype == "LinkFormat": yield '' def parse_cnt(cnt): - if cnt.get('formats'): - yield ''.join(parse_fmt_type(cnt)) - elif cnt.get('content'): - for cnt_ in cnt['content']: - yield from parse_types(cnt_) - elif cnt.get('text'): + for k in cnt: + if isinstance(cnt[k], list): + if k == 'formats': + yield ''.join(parse_fmt_type(cnt)) + else: + for cnt_ in cnt[k]: + yield from parse_types(cnt_) + if isinstance(cnt[k], dict): + yield from parse_types(cnt[k]) + if cnt.get('text') and 'formats' not in cnt: yield cnt['text'] def parse_types(x): - if 'Header' in x.get('__typename', ''): + typename = x.get('__typename', '') + + if 'Header' in typename: yield '\n'.join(parse_header(x)) - elif x.get('__typename', '') == 'Heading1Block': - yield '

' + ''.join(parse_cnt(x)) + '

' - elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: - yield '

' + ''.join(parse_cnt(x)) + '

' + elif typename.startswith('Heading'): + htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1) + yield f'<{htag}>{"".join(parse_cnt(x))}' - elif x.get('__typename', '') == 'ParagraphBlock': - yield '

' + ''.join(parse_cnt(x)) + '

' + elif typename == 'ParagraphBlock': + yield f'

{"".join(parse_cnt(x))}

' - elif x.get('__typename', '') == 'BylineBlock': - yield '

' + ''.join(parse_byline(x)) + '
' - elif x.get('__typename', '') == 'LabelBlock': - yield '
' + ''.join(parse_cnt(x)) + '
' - elif x.get('__typename', '') == 'BlockquoteBlock': - yield '
' + ''.join(parse_cnt(x)) + '
' - elif x.get('__typename', '') == 'TimestampBlock': - yield '
' + iso_date(x['timestamp']) + '
' - elif x.get('__typename', '') == 'LineBreakInline': + elif typename == 'BylineBlock': + yield f'

{"".join(parse_byline(x))}
' + elif typename == 'LabelBlock': + yield f'
{"".join(parse_cnt(x))}
' + elif typename == 'BlockquoteBlock': + yield f'
{"".join(parse_cnt(x))}
' + elif typename == 'TimestampBlock': + yield f'
{iso_date(x["timestamp"])}
' + elif typename == 'LineBreakInline': yield '
' - elif x.get('__typename', '') == 'RuleBlock': + elif typename == 'RuleBlock': yield '
' - elif x.get('__typename', '') == 'Image': - yield ''.join(parse_image(x)) - elif x.get('__typename', '') == 'ImageBlock': - yield ''.join(parse_types(x['media'])) - elif x.get('__typename', '') == 'GridBlock': - yield ''.join(parse_img_grid(x)) + elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}: + yield "".join(parse_types(x['media'])) - elif x.get('__typename', '') == 'VideoBlock': - yield ''.join(parse_types(x['media'])) - elif x.get('__typename', '') == 'Video': - yield ''.join(parse_vid(x)) - - elif x.get('__typename', '') == 'InteractiveBlock': - yield ''.join(parse_types(x['media'])) - elif x.get('__typename', '') == 'EmbeddedInteractive': - yield ''.join(parse_emb(x)) + elif typename == 'Image': + yield "".join(parse_image(x)) - elif x.get('__typename', '') == 'ListBlock': - yield '
    ' + ''.join(parse_cnt(x)) + '
' - elif x.get('__typename', '') == 'ListItemBlock': - yield '
  • ' + ''.join(parse_cnt(x)) + '
  • ' + elif typename == 'GridBlock': + yield "".join(parse_img_grid(x)) - elif x.get('__typename', '') == 'CapsuleBlock': + elif typename == 'Video': + yield "".join(parse_vid(x)) + + elif typename == 'EmbeddedInteractive': + yield "".join(parse_emb(x)) + + elif typename == 'ListBlock': + yield f'
      {"".join(parse_cnt(x))}
    ' + elif typename == 'ListItemBlock': + yield f'
  • {"".join(parse_cnt(x))}
  • ' + + elif typename == 'CapsuleBlock': if x['capsuleContent'].get('body'): - yield ''.join(parse_cnt(x['capsuleContent']['body'])) - elif x.get('__typename', '') == 'Capsule': - yield ''.join(parse_cnt(x['body'])) + yield "".join(parse_cnt(x['capsuleContent']['body'])) + elif typename == 'Capsule': + yield "".join(parse_cnt(x['body'])) - elif x.get('__typename', '') in { - 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' + elif typename in { + 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', + 'SummaryBlock', 'VisualStackBlock' }: - yield ''.join(parse_cnt(x)) + yield "".join(parse_cnt(x)) - elif x.get('__typename'): - if ''.join(parse_cnt(x)).strip(): - yield '

    ' + ''.join(parse_cnt(x)) + '

    ' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: + if x.get('media'): + yield "".join(parse_types(x['media'])) + elif "".join(parse_cnt(x)).strip(): + yield f'

    {"".join(parse_cnt(x))}

    ' def article_parse(data): yield "" @@ -178,7 +186,7 @@ def article_parse(data): yield "" -class nytFeeds(BasicNewsRecipe): +class NytFeeds(BasicNewsRecipe): title = 'NYT News' __author__ = 'unkn0wn' description = ( @@ -236,7 +244,7 @@ class nytFeeds(BasicNewsRecipe): extra_css = ''' .byl, .time { font-size:small; color:#202020; } - .cap { font-size:small; text-align:center; } + .cap { font-size:small; } .cred { font-style:italic; font-size:small; } em, blockquote { color: #202020; } .sc { font-variant: small-caps; } @@ -305,3 +313,4 @@ class nytFeeds(BasicNewsRecipe): if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url): return url self.log('\tSkipped URL: ', url) + return None diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 961a87b96e..b82f7a124b 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,57 +9,57 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 6 # needed for live updates +module_version = 7 # needed for live updates pprint def parse_image(i): - if i.get('crops'): - yield '
    '.format(i['crops'][0]['renditions'][0]['url']) - elif i.get('spanImageCrops'): - yield '
    '.format(i['spanImageCrops'][0]['renditions'][0]['url']) + crop = i.get('crops') or i.get('spanImageCrops') + if crop: + yield f'
    ' if i.get('caption'): - yield '
    ' + ''.join(parse_types(i['caption'])) + yield f'
    {"".join(parse_types(i["caption"]))}' if i.get('credit'): - yield ' ' + i['credit'] + '' + yield f' {i["credit"]}' yield '
    ' + elif i.get('legacyHtmlCaption'): + if i['legacyHtmlCaption'].strip(): + yield f'
    {i["legacyHtmlCaption"]}
    ' yield '
    ' def parse_img_grid(g): for grd in g.get('gridMedia', {}): yield ''.join(parse_image(grd)) if g.get('caption'): - yield '
    {}'.format(g['caption']) + yield f'
    {g["caption"]}' if g.get('credit'): - yield ' ' + g['credit'] + '' + yield f' {g["credit"]}' yield '
    ' def parse_vid(v): if v.get('promotionalMedia'): - if v.get('headline'): - if v.get('url'): - yield '
    Video: '.format(v['url'])\ - + v['headline'].get('default', '') + '
    ' - elif v['headline'].get('default'): - yield '
    ' + v['headline']['default'] + '
    ' - yield ''.join(parse_types(v['promotionalMedia'])) + headline = v.get("headline", {}).get("default", "") + rendition = v.get('renditions') + yield (f'
    Video: {headline}
    ' + if rendition else f'
    {headline}
    ') + yield ''.join(parse_types(v["promotionalMedia"])) if v.get('promotionalSummary'): - yield '
    ' + v['promotionalSummary'] + '
    ' + yield f'
    {v["promotionalSummary"]}
    ' def parse_emb(e): if e.get('html') and 'datawrapper.dwcdn.net' in e.get('html', ''): dw = re.search(r'datawrapper.dwcdn.net/(.{5})', e['html']).group(1) - yield '
    '.format('https://datawrapper.dwcdn.net/' + dw + '/full.png') + '
    ' + yield f'
    ' elif e.get('promotionalMedia'): if e.get('headline'): - yield '
    ' + e['headline']['default'] + '
    ' - yield ''.join(parse_types(e['promotionalMedia'])) + yield f'
    {e["headline"]["default"]}
    ' + yield ''.join(parse_types(e["promotionalMedia"])) if e.get('note'): - yield '
    ' + e['note'] + '
    ' + yield f'
    {e["note"]}
    ' def parse_byline(byl): for b in byl.get('bylines', {}): - yield '
    ' + b['renderedRepresentation'] + '
    ' + yield f'
    {b["renderedRepresentation"]}
    ' yield '
    ' for rl in byl.get('role', {}): if ''.join(parse_cnt(rl)).strip(): @@ -72,106 +72,114 @@ def iso_date(x): def parse_header(h): if h.get('label'): - yield '
    ' + ''.join(parse_types(h['label'])) + '
    ' + yield f'
    {"".join(parse_types(h["label"]))}
    ' if h.get('headline'): - yield ''.join(parse_types(h['headline'])) + yield ''.join(parse_types(h["headline"])) if h.get('summary'): - yield '

    ' + ''.join(parse_types(h['summary'])) + '

    ' + yield f'

    {"".join(parse_types(h["summary"]))}

    ' if h.get('ledeMedia'): - yield ''.join(parse_types(h['ledeMedia'])) + yield ''.join(parse_types(h["ledeMedia"])) if h.get('byline'): - yield ''.join(parse_types(h['byline'])) + yield ''.join(parse_types(h["byline"])) if h.get('timestampBlock'): - yield ''.join(parse_types(h['timestampBlock'])) + yield ''.join(parse_types(h["timestampBlock"])) def parse_fmt_type(fm): for f in fm.get('formats', {}): - if f.get('__typename', '') == 'BoldFormat': + ftype = f.get("__typename", "") + if ftype == "BoldFormat": yield '' - if f.get('__typename', '') == 'ItalicFormat': + if ftype == "ItalicFormat": yield '' - if f.get('__typename', '') == 'LinkFormat': - hrf = f['url'] - yield ''.format(hrf) - yield fm['text'] + if ftype == "LinkFormat": + hrf = f["url"] + yield f'' + yield fm.get("text", "") for f in reversed(fm.get('formats', {})): - if f.get('__typename', '') == 'BoldFormat': + ftype = f.get("__typename", "") + if ftype == "BoldFormat": yield '' - if f.get('__typename', '') == 'ItalicFormat': + if ftype == "ItalicFormat": yield '' - if f.get('__typename', '') == 'LinkFormat': + if ftype == "LinkFormat": yield '' def parse_cnt(cnt): - if cnt.get('formats'): - yield ''.join(parse_fmt_type(cnt)) - elif cnt.get('content'): - for cnt_ in cnt['content']: - yield from parse_types(cnt_) - elif cnt.get('text'): + for k in cnt: + if isinstance(cnt[k], list): + if k == 'formats': + yield ''.join(parse_fmt_type(cnt)) + else: + for cnt_ in cnt[k]: + yield from parse_types(cnt_) + if isinstance(cnt[k], dict): + yield from parse_types(cnt[k]) + if cnt.get('text') and 'formats' not in cnt: yield cnt['text'] def parse_types(x): - if 'Header' in x.get('__typename', ''): + typename = x.get('__typename', '') + + if 'Header' in typename: yield '\n'.join(parse_header(x)) - elif x.get('__typename', '') == 'Heading1Block': - yield '

    ' + ''.join(parse_cnt(x)) + '

    ' - elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: - yield '

    ' + ''.join(parse_cnt(x)) + '

    ' + elif typename.startswith('Heading'): + htag = 'h' + re.match(r'Heading([1-6])Block', typename).group(1) + yield f'<{htag}>{"".join(parse_cnt(x))}' - elif x.get('__typename', '') == 'ParagraphBlock': - yield '

    ' + ''.join(parse_cnt(x)) + '

    ' + elif typename == 'ParagraphBlock': + yield f'

    {"".join(parse_cnt(x))}

    ' - elif x.get('__typename', '') == 'BylineBlock': - yield '

    ' + ''.join(parse_byline(x)) + '
    ' - elif x.get('__typename', '') == 'LabelBlock': - yield '
    ' + ''.join(parse_cnt(x)) + '
    ' - elif x.get('__typename', '') == 'BlockquoteBlock': - yield '
    ' + ''.join(parse_cnt(x)) + '
    ' - elif x.get('__typename', '') == 'TimestampBlock': - yield '
    ' + iso_date(x['timestamp']) + '
    ' - elif x.get('__typename', '') == 'LineBreakInline': + elif typename == 'BylineBlock': + yield f'

    {"".join(parse_byline(x))}
    ' + elif typename == 'LabelBlock': + yield f'
    {"".join(parse_cnt(x))}
    ' + elif typename == 'BlockquoteBlock': + yield f'
    {"".join(parse_cnt(x))}
    ' + elif typename == 'TimestampBlock': + yield f'
    {iso_date(x["timestamp"])}
    ' + elif typename == 'LineBreakInline': yield '
    ' - elif x.get('__typename', '') == 'RuleBlock': + elif typename == 'RuleBlock': yield '
    ' - elif x.get('__typename', '') == 'Image': - yield ''.join(parse_image(x)) - elif x.get('__typename', '') == 'ImageBlock': - yield ''.join(parse_image(x['media'])) - elif x.get('__typename', '') == 'GridBlock': - yield ''.join(parse_img_grid(x)) + elif typename in {'ImageBlock', 'VideoBlock', 'InteractiveBlock'}: + yield "".join(parse_types(x['media'])) - elif x.get('__typename', '') == 'VideoBlock': - yield ''.join(parse_types(x['media'])) - elif x.get('__typename', '') == 'Video': - yield ''.join(parse_vid(x)) + elif typename == 'Image': + yield "".join(parse_image(x)) - elif x.get('__typename', '') == 'InteractiveBlock': - yield ''.join(parse_types(x['media'])) - elif x.get('__typename', '') == 'EmbeddedInteractive': - yield ''.join(parse_emb(x)) + elif typename == 'GridBlock': + yield "".join(parse_img_grid(x)) - elif x.get('__typename', '') == 'ListBlock': - yield '
      ' + ''.join(parse_cnt(x)) + '
    ' - elif x.get('__typename', '') == 'ListItemBlock': - yield '
  • ' + ''.join(parse_cnt(x)) + '
  • ' + elif typename == 'Video': + yield "".join(parse_vid(x)) - elif x.get('__typename', '') == 'CapsuleBlock': + elif typename == 'EmbeddedInteractive': + yield "".join(parse_emb(x)) + + elif typename == 'ListBlock': + yield f'
      {"".join(parse_cnt(x))}
    ' + elif typename == 'ListItemBlock': + yield f'
  • {"".join(parse_cnt(x))}
  • ' + + elif typename == 'CapsuleBlock': if x['capsuleContent'].get('body'): - yield ''.join(parse_cnt(x['capsuleContent']['body'])) - elif x.get('__typename', '') == 'Capsule': - yield ''.join(parse_cnt(x['body'])) + yield "".join(parse_cnt(x['capsuleContent']['body'])) + elif typename == 'Capsule': + yield "".join(parse_cnt(x['body'])) - elif x.get('__typename', '') in { - 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' + elif typename in { + 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', + 'SummaryBlock', 'VisualStackBlock' }: - yield ''.join(parse_cnt(x)) + yield "".join(parse_cnt(x)) - elif x.get('__typename'): - if ''.join(parse_cnt(x)).strip(): - yield '

    ' + ''.join(parse_cnt(x)) + '

    ' + elif typename and typename not in {'RelatedLinksBlock', 'Dropzone'}: + if x.get('media'): + yield "".join(parse_types(x['media'])) + elif "".join(parse_cnt(x)).strip(): + yield f'

    {"".join(parse_cnt(x))}

    ' def article_parse(data): yield ""