diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index bbcfde127e..67b9189064 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -12,17 +12,13 @@ def extract_json(raw): return js['initialData']['data']['article']['sprinkledBody']['content'] def parse_image(i): - if i['__typename'] == 'Image': - yield '
' - yield ''.format(i['crops'][0]['renditions'][0]['url']) - if i.get('caption'): - yield '
{}'.format( - i['caption'].get('text', '') - ) - if i.get('credit'): - yield ' ' + i['credit'] + '' - yield '
' + yield '
'.format(i['crops'][0]['renditions'][0]['url']) + if i.get('caption'): + yield '
' + ''.join(parse_types(i['caption'])) + if i.get('credit'): + yield ' ' + i['credit'] + '' yield '
' + yield '
' def parse_img_grid(g): for grd in g.get('gridMedia', {}): @@ -33,92 +29,114 @@ def parse_img_grid(g): yield ' ' + g['credit'] + '' yield '
' -def parse_cnt(cnt): - txt = '' - if cnt['__typename'] == 'TextInline': - if cnt.get('formats'): - for fmt in cnt.get('formats', {}): - if fmt['__typename'] == 'ItalicFormat': - txt += '' - if fmt['__typename'] == 'LinkFormat': - txt += ''.format(fmt['url']) - txt += cnt['text'] - elif cnt['__typename'] == 'LineBreakInline': - txt += '
' - if '' in txt and '
' - elif '' in txt: - yield txt + '' - elif '' - else: - yield txt - def parse_byline(byl): for b in byl.get('bylines', {}): yield '
' + b['renderedRepresentation'] + '
' for rl in byl.get('role', {}): - yield '
' + ''.join(parse_cnt(rl)) + '
' + if ''.join(parse_cnt(rl)).strip(): + yield '
' + ''.join(parse_cnt(rl)) + '
' def iso_date(x): dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone) return dt.strftime('%b %d, %Y at %I:%M %p') -def header_parse(h): +def parse_header(h): if h.get('label'): - if h['label'].get('content'): - for cl in h['label']['content']: - yield '
' + ''.join(parse_cnt(cl)) + '
' - for ch in h['headline']['content']: - yield '

' + ''.join(parse_cnt(ch)) + '

' + yield '
' + ''.join(parse_types(h['label'])) + '
' + if h.get('headline'): + yield ''.join(parse_types(h['headline'])) if h.get('summary'): - for cs in h['summary']['content']: - yield '

' + ''.join(parse_cnt(cs)) + '

' + yield '

' + ''.join(parse_types(h['summary'])) + '

' if h.get('ledeMedia'): - if h['ledeMedia'].get('__typename', '') == 'ImageBlock': - yield ''.join(parse_image(h['ledeMedia']['media'])) + yield ''.join(parse_types(h['ledeMedia'])) if h.get('byline'): - yield '

' - yield '\t' + '\t'.join(parse_byline(h['byline'])) - if h.get('timestampBlock'): - yield '\t
' + iso_date(h['timestampBlock']['timestamp']) + '
' - yield '
' + yield ''.join(parse_types(h['byline'])) + if h.get('timestampBlock'): + yield ''.join(parse_types(h['timestampBlock'])) + +def parse_fmt_type(fm): + for f in fm.get('formats', {}): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + hrf = f['url'] + yield '
'.format(hrf) + yield fm['text'] + for f in reversed(fm.get('formats', {})): + if f.get('__typename', '') == 'BoldFormat': + yield '' + if f.get('__typename', '') == 'ItalicFormat': + yield '' + if f.get('__typename', '') == 'LinkFormat': + yield '' + +def parse_cnt(cnt): + if cnt.get('formats'): + yield ''.join(parse_fmt_type(cnt)) + elif cnt.get('content'): + for cnt_ in cnt['content']: + yield from parse_types(cnt_) + elif cnt.get('text'): + yield cnt['text'] + +def parse_types(x): + if 'Header' in x.get('__typename', ''): + yield '\n'.join(parse_header(x)) + + elif x.get('__typename', '') == 'Heading1Block': + yield '

' + ''.join(parse_cnt(x)) + '

' + elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}: + yield '

' + ''.join(parse_cnt(x)) + '

' + + elif x.get('__typename', '') == 'ParagraphBlock': + yield '

' + ''.join(parse_cnt(x)) + '

' + + elif x.get('__typename', '') == 'BylineBlock': + yield '

' + ''.join(parse_byline(x)) + '
' + elif x.get('__typename', '') == 'LabelBlock': + yield '
' + ''.join(parse_cnt(x)) + '
' + elif x.get('__typename', '') == 'BlockquoteBlock': + yield '
' + ''.join(parse_cnt(x)) + '
' + elif x.get('__typename', '') == 'TimestampBlock': + yield '
' + iso_date(x['timestamp']) + '
' + elif x.get('__typename', '') == 'LineBreakInline': + yield '
' + elif x.get('__typename', '') == 'RuleBlock': + yield '
' + + elif x.get('__typename', '') == 'Image': + yield ''.join(parse_image(x)) + elif x.get('__typename', '') == 'ImageBlock': + yield ''.join(parse_image(x['media'])) + elif x.get('__typename', '') == 'GridBlock': + yield ''.join(parse_img_grid(x)) + + elif x.get('__typename', '') == 'ListBlock': + yield '' + elif x.get('__typename', '') == 'ListItemBlock': + yield '
  • ' + ''.join(parse_cnt(x)) + '
  • ' + + elif x.get('__typename', '') == 'CapsuleBlock': + if x['capsuleContent'].get('body'): + yield ''.join(parse_cnt(x['capsuleContent']['body'])) + elif x.get('__typename', '') == 'Capsule': + yield ''.join(parse_cnt(x['body'])) + + elif x.get('__typename', '') in { + 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock' + }: + yield ''.join(parse_cnt(x)) + + elif x.get('__typename'): + if ''.join(parse_cnt(x)).strip(): + yield '

    ' + ''.join(parse_cnt(x)) + '

    ' def article_parse(data): yield "" - for x in data: - if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock', 'HeaderFullBleedHorizontalBlock'}: - yield '\n'.join(header_parse(x)) - elif x.get('__typename', '') == 'ParagraphBlock': - p_txt = '' - for para in x['content']: - p_txt += ''.join(parse_cnt(para)) - if p_txt.strip(): - yield '

    ' + p_txt + '

    ' - elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block'}: - h4_txt = '' - for h2 in x['content']: - h4_txt += ''.join(parse_cnt(h2)) - if h4_txt.strip(): - yield '

    ' + h4_txt + '

    ' - elif x.get('__typename', '') == 'Heading1Block': - h1_txt = '' - for h1 in x['content']: - h1_txt += ''.join(parse_cnt(h1)) - if h1_txt.strip(): - yield '

    ' + h1_txt + '

    ' - elif x.get('__typename', '') == 'BylineBlock': - yield '
    \n
    \t' + '\t'.join(parse_byline(x)) + '
    ' - elif x.get('__typename', '') == 'ImageBlock': - yield ''.join(parse_image(x['media'])) - elif x.get('__typename', '') == 'GridBlock': - yield ''.join(parse_img_grid(x)) - elif x.get('content'): - o_txt = '' - for i in x['content']: - o_txt += ''.join(parse_cnt(i)) - if o_txt.strip(): - yield '

    ' + o_txt + '

    ' + for d in data: + yield from parse_types(d) yield "" @@ -159,7 +177,7 @@ class nytFeeds(BasicNewsRecipe): 'default': 'no' }, 'res': { - 'short': 'For hi-res images, select a resolution from the\nfollowing options: popup, jumbo, mobileMasterAt3x, superJumbo', + 'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo', 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default(articleLarge), use articleInline.', } } @@ -179,10 +197,12 @@ class nytFeeds(BasicNewsRecipe): self.compress_news_images = True extra_css = ''' - .byl { font-size:small; color:#202020; } + .byl, .time { font-size:small; color:#202020; } .cap { font-size:small; text-align:center; } .cred { font-style:italic; font-size:small; } .sub { font-style:italic; } + em, blockquote { color: #202020; } + .sc { font-variant: small-caps; } .lbl { font-size:small; color:#404040; } img { display:block; margin:0 auto; } ''' @@ -216,7 +236,11 @@ class nytFeeds(BasicNewsRecipe): def preprocess_html(self, soup): w = self.recipe_specific_options.get('res') if w and isinstance(w, str): - res = '-' + w + '.jpg' + res = '-' + w for img in soup.findAll('img', attrs={'src':True}): - img['src'] = img['src'].rsplit('-article', 1)[0] + res + ext = img['src'].split('?')[0].split('.')[-1] + img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext + for c in soup.findAll('div', attrs={'class':'cap'}): + for p in c.findAll(['p', 'div']): + p.name = 'span' return soup