diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe
index bbcfde127e..67b9189064 100644
--- a/recipes/nytfeeds.recipe
+++ b/recipes/nytfeeds.recipe
@@ -12,17 +12,13 @@ def extract_json(raw):
return js['initialData']['data']['article']['sprinkledBody']['content']
def parse_image(i):
- if i['__typename'] == 'Image':
- yield '
'
- yield '

'.format(i['crops'][0]['renditions'][0]['url'])
- if i.get('caption'):
- yield '
{}'.format(
- i['caption'].get('text', '')
- )
- if i.get('credit'):
- yield ' ' + i['credit'] + ''
- yield '
'
+ yield '

'.format(i['crops'][0]['renditions'][0]['url'])
+ if i.get('caption'):
+ yield '
' + ''.join(parse_types(i['caption']))
+ if i.get('credit'):
+ yield ' ' + i['credit'] + ''
yield '
'
+ yield '
'
def parse_img_grid(g):
for grd in g.get('gridMedia', {}):
@@ -33,92 +29,114 @@ def parse_img_grid(g):
yield '
' + g['credit'] + ''
yield '
'
-def parse_cnt(cnt):
- txt = ''
- if cnt['__typename'] == 'TextInline':
- if cnt.get('formats'):
- for fmt in cnt.get('formats', {}):
- if fmt['__typename'] == 'ItalicFormat':
- txt += ''
- if fmt['__typename'] == 'LinkFormat':
- txt += ''.format(fmt['url'])
- txt += cnt['text']
- elif cnt['__typename'] == 'LineBreakInline':
- txt += '
'
- if '' in txt and ''
- elif '' in txt:
- yield txt + ''
- elif ''
- else:
- yield txt
-
def parse_byline(byl):
for b in byl.get('bylines', {}):
yield '' + b['renderedRepresentation'] + '
'
for rl in byl.get('role', {}):
- yield '' + ''.join(parse_cnt(rl)) + '
'
+ if ''.join(parse_cnt(rl)).strip():
+ yield '' + ''.join(parse_cnt(rl)) + '
'
def iso_date(x):
dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
return dt.strftime('%b %d, %Y at %I:%M %p')
-def header_parse(h):
+def parse_header(h):
if h.get('label'):
- if h['label'].get('content'):
- for cl in h['label']['content']:
- yield '' + ''.join(parse_cnt(cl)) + '
'
- for ch in h['headline']['content']:
- yield '' + ''.join(parse_cnt(ch)) + '
'
+ yield '' + ''.join(parse_types(h['label'])) + '
'
+ if h.get('headline'):
+ yield ''.join(parse_types(h['headline']))
if h.get('summary'):
- for cs in h['summary']['content']:
- yield '' + ''.join(parse_cnt(cs)) + '
'
+ yield '' + ''.join(parse_types(h['summary'])) + '
'
if h.get('ledeMedia'):
- if h['ledeMedia'].get('__typename', '') == 'ImageBlock':
- yield ''.join(parse_image(h['ledeMedia']['media']))
+ yield ''.join(parse_types(h['ledeMedia']))
if h.get('byline'):
- yield ''
- yield '\t' + '\t'.join(parse_byline(h['byline']))
- if h.get('timestampBlock'):
- yield '\t
' + iso_date(h['timestampBlock']['timestamp']) + '
'
- yield '
'
+ yield ''.join(parse_types(h['byline']))
+ if h.get('timestampBlock'):
+ yield ''.join(parse_types(h['timestampBlock']))
+
+def parse_fmt_type(fm):
+ for f in fm.get('formats', {}):
+ if f.get('__typename', '') == 'BoldFormat':
+ yield ''
+ if f.get('__typename', '') == 'ItalicFormat':
+ yield ''
+ if f.get('__typename', '') == 'LinkFormat':
+ hrf = f['url']
+ yield ''.format(hrf)
+ yield fm['text']
+ for f in reversed(fm.get('formats', {})):
+ if f.get('__typename', '') == 'BoldFormat':
+ yield ''
+ if f.get('__typename', '') == 'ItalicFormat':
+ yield ''
+ if f.get('__typename', '') == 'LinkFormat':
+ yield ''
+
+def parse_cnt(cnt):
+ if cnt.get('formats'):
+ yield ''.join(parse_fmt_type(cnt))
+ elif cnt.get('content'):
+ for cnt_ in cnt['content']:
+ yield from parse_types(cnt_)
+ elif cnt.get('text'):
+ yield cnt['text']
+
+def parse_types(x):
+ if 'Header' in x.get('__typename', ''):
+ yield '\n'.join(parse_header(x))
+
+ elif x.get('__typename', '') == 'Heading1Block':
+ yield '' + ''.join(parse_cnt(x)) + '
'
+ elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block', 'Heading4Block'}:
+ yield '' + ''.join(parse_cnt(x)) + '
'
+
+ elif x.get('__typename', '') == 'ParagraphBlock':
+ yield '' + ''.join(parse_cnt(x)) + '
'
+
+ elif x.get('__typename', '') == 'BylineBlock':
+ yield '
' + ''.join(parse_byline(x)) + '
'
+ elif x.get('__typename', '') == 'LabelBlock':
+ yield '' + ''.join(parse_cnt(x)) + '
'
+ elif x.get('__typename', '') == 'BlockquoteBlock':
+ yield '' + ''.join(parse_cnt(x)) + '
'
+ elif x.get('__typename', '') == 'TimestampBlock':
+ yield '' + iso_date(x['timestamp']) + '
'
+ elif x.get('__typename', '') == 'LineBreakInline':
+ yield '
'
+ elif x.get('__typename', '') == 'RuleBlock':
+ yield '
'
+
+ elif x.get('__typename', '') == 'Image':
+ yield ''.join(parse_image(x))
+ elif x.get('__typename', '') == 'ImageBlock':
+ yield ''.join(parse_image(x['media']))
+ elif x.get('__typename', '') == 'GridBlock':
+ yield ''.join(parse_img_grid(x))
+
+ elif x.get('__typename', '') == 'ListBlock':
+ yield '' + ''.join(parse_cnt(x)) + '
'
+ elif x.get('__typename', '') == 'ListItemBlock':
+ yield '' + ''.join(parse_cnt(x)) + ''
+
+ elif x.get('__typename', '') == 'CapsuleBlock':
+ if x['capsuleContent'].get('body'):
+ yield ''.join(parse_cnt(x['capsuleContent']['body']))
+ elif x.get('__typename', '') == 'Capsule':
+ yield ''.join(parse_cnt(x['body']))
+
+ elif x.get('__typename', '') in {
+ 'TextInline', 'TextOnlyDocumentBlock', 'DocumentBlock', 'SummaryBlock'
+ }:
+ yield ''.join(parse_cnt(x))
+
+ elif x.get('__typename'):
+ if ''.join(parse_cnt(x)).strip():
+ yield '' + ''.join(parse_cnt(x)) + '
'
def article_parse(data):
yield ""
- for x in data:
- if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock', 'HeaderFullBleedHorizontalBlock'}:
- yield '\n'.join(header_parse(x))
- elif x.get('__typename', '') == 'ParagraphBlock':
- p_txt = ''
- for para in x['content']:
- p_txt += ''.join(parse_cnt(para))
- if p_txt.strip():
- yield '' + p_txt + '
'
- elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block'}:
- h4_txt = ''
- for h2 in x['content']:
- h4_txt += ''.join(parse_cnt(h2))
- if h4_txt.strip():
- yield '' + h4_txt + '
'
- elif x.get('__typename', '') == 'Heading1Block':
- h1_txt = ''
- for h1 in x['content']:
- h1_txt += ''.join(parse_cnt(h1))
- if h1_txt.strip():
- yield '' + h1_txt + '
'
- elif x.get('__typename', '') == 'BylineBlock':
- yield '\n
\t' + '\t'.join(parse_byline(x)) + '
'
- elif x.get('__typename', '') == 'ImageBlock':
- yield ''.join(parse_image(x['media']))
- elif x.get('__typename', '') == 'GridBlock':
- yield ''.join(parse_img_grid(x))
- elif x.get('content'):
- o_txt = ''
- for i in x['content']:
- o_txt += ''.join(parse_cnt(i))
- if o_txt.strip():
- yield '' + o_txt + '
'
+ for d in data:
+ yield from parse_types(d)
yield ""
@@ -159,7 +177,7 @@ class nytFeeds(BasicNewsRecipe):
'default': 'no'
},
'res': {
- 'short': 'For hi-res images, select a resolution from the\nfollowing options: popup, jumbo, mobileMasterAt3x, superJumbo',
+ 'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default(articleLarge), use articleInline.',
}
}
@@ -179,10 +197,12 @@ class nytFeeds(BasicNewsRecipe):
self.compress_news_images = True
extra_css = '''
- .byl { font-size:small; color:#202020; }
+ .byl, .time { font-size:small; color:#202020; }
.cap { font-size:small; text-align:center; }
.cred { font-style:italic; font-size:small; }
.sub { font-style:italic; }
+ em, blockquote { color: #202020; }
+ .sc { font-variant: small-caps; }
.lbl { font-size:small; color:#404040; }
img { display:block; margin:0 auto; }
'''
@@ -216,7 +236,11 @@ class nytFeeds(BasicNewsRecipe):
def preprocess_html(self, soup):
w = self.recipe_specific_options.get('res')
if w and isinstance(w, str):
- res = '-' + w + '.jpg'
+ res = '-' + w
for img in soup.findAll('img', attrs={'src':True}):
- img['src'] = img['src'].rsplit('-article', 1)[0] + res
+ ext = img['src'].split('?')[0].split('.')[-1]
+ img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+ for c in soup.findAll('div', attrs={'class':'cap'}):
+ for p in c.findAll(['p', 'div']):
+ p.name = 'span'
return soup