diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index c93203ad17..bbcfde127e 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -34,16 +34,25 @@ def parse_img_grid(g): yield '' def parse_cnt(cnt): + txt = '' if cnt['__typename'] == 'TextInline': if cnt.get('formats'): for fmt in cnt.get('formats', {}): + if fmt['__typename'] == 'ItalicFormat': + txt += '' if fmt['__typename'] == 'LinkFormat': - hrf = fmt['url'] - yield ''.format(hrf) + cnt['text'] + '' - else: - yield cnt['text'] - else: - yield cnt['text'] + txt += ''.format(fmt['url']) + txt += cnt['text'] + elif cnt['__typename'] == 'LineBreakInline': + txt += '
' + if '' in txt and '
' + elif '' in txt: + yield txt + '' + elif '' + else: + yield txt def parse_byline(byl): for b in byl.get('bylines', {}): @@ -69,8 +78,8 @@ def header_parse(h): if h['ledeMedia'].get('__typename', '') == 'ImageBlock': yield ''.join(parse_image(h['ledeMedia']['media'])) if h.get('byline'): - yield '
' - yield '\t'.join(parse_byline(h['byline'])) + yield '

' + yield '\t' + '\t'.join(parse_byline(h['byline'])) if h.get('timestampBlock'): yield '\t
' + iso_date(h['timestampBlock']['timestamp']) + '
' yield '
' @@ -81,33 +90,35 @@ def article_parse(data): if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock', 'HeaderFullBleedHorizontalBlock'}: yield '\n'.join(header_parse(x)) elif x.get('__typename', '') == 'ParagraphBlock': - yield '

' + p_txt = '' for para in x['content']: - yield '\t'.join(parse_cnt(para)) - yield '

' + p_txt += ''.join(parse_cnt(para)) + if p_txt.strip(): + yield '

' + p_txt + '

' elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block'}: - yield '

' + h4_txt = '' for h2 in x['content']: - yield '\t'.join(parse_cnt(h2)) - yield '

' + h4_txt += ''.join(parse_cnt(h2)) + if h4_txt.strip(): + yield '

' + h4_txt + '

' elif x.get('__typename', '') == 'Heading1Block': - yield '

' + h1_txt = '' for h1 in x['content']: - yield '\t'.join(parse_cnt(h1)) - yield '

' + h1_txt += ''.join(parse_cnt(h1)) + if h1_txt.strip(): + yield '

' + h1_txt + '

' elif x.get('__typename', '') == 'BylineBlock': - yield '
' - yield '\t'.join(parse_byline(x)) - yield '
' + yield '
\n
\t' + '\t'.join(parse_byline(x)) + '
' elif x.get('__typename', '') == 'ImageBlock': yield ''.join(parse_image(x['media'])) elif x.get('__typename', '') == 'GridBlock': yield ''.join(parse_img_grid(x)) elif x.get('content'): - yield '

' + o_txt = '' for i in x['content']: - yield '\t'.join(parse_cnt(i)) - yield '

' + o_txt += ''.join(parse_cnt(i)) + if o_txt.strip(): + yield '

' + o_txt + '

' yield "" @@ -207,5 +218,5 @@ class nytFeeds(BasicNewsRecipe): if w and isinstance(w, str): res = '-' + w + '.jpg' for img in soup.findAll('img', attrs={'src':True}): - img['src'] = img['src'].rsplit('-', 1)[0] + res + img['src'] = img['src'].rsplit('-article', 1)[0] + res return soup