From ae2f42a0a0ead0edf26755386050ea7f92e2f967 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 15 Sep 2024 23:25:28 +0530 Subject: [PATCH] ... --- recipes/nytfeeds.recipe | 47 +++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/recipes/nytfeeds.recipe b/recipes/nytfeeds.recipe index fd148a03c7..3c86f705bb 100644 --- a/recipes/nytfeeds.recipe +++ b/recipes/nytfeeds.recipe @@ -15,11 +15,11 @@ def parse_image(i): if i['__typename'] == 'Image': yield '
' yield ''.format(i['crops'][0]['renditions'][0]['url']) - if 'caption' in i and i['caption']: + if i.get('caption'): yield '
{}'.format( i['caption'].get('text', '') ) - if 'credit' in i and i['credit']: + if i.get('credit'): yield ' ' + i['credit'] + '' yield '
' yield '
' @@ -27,15 +27,15 @@ def parse_image(i): def parse_img_grid(g): for grd in g.get('gridMedia', {}): yield ''.join(parse_image(grd)) - if 'caption' in g and g['caption']: + if g.get('caption'): yield '
{}'.format(g['caption']) - if 'credit' in g and g['credit']: + if g.get('credit'): yield ' ' + g['credit'] + '' yield '
' def parse_cnt(cnt): if cnt['__typename'] == 'TextInline': - if 'formats' in cnt and cnt['formats']: + if cnt.get('formats'): for fmt in cnt.get('formats', {}): if fmt['__typename'] == 'LinkFormat': hrf = fmt['url'] @@ -58,34 +58,43 @@ def iso_date(x): def header_parse(h): for ch in h['headline']['content']: yield '

' + ''.join(parse_cnt(ch)) + '

' - if 'summary' in h and h['summary']: + if h.get('summary'): for cs in h['summary']['content']: yield '

' + ''.join(parse_cnt(cs)) + '

' - if 'ledeMedia' in h and h['ledeMedia']: + if h.get('ledeMedia'): if h['ledeMedia'].get('__typename', '') == 'ImageBlock': yield ''.join(parse_image(h['ledeMedia']['media'])) - if 'byline' in h and h['byline']: + if h.get('byline'): yield '
' yield '\t'.join(parse_byline(h['byline'])) - if 'timestampBlock' in h and h['timestampBlock']: + if h.get('timestampBlock'): yield '\t
' + iso_date(h['timestampBlock']['timestamp']) + '
' yield '
' def article_parse(data): yield "" for x in data: - if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock'}: + if x.get('__typename', '') in {'HeaderBasicBlock', 'HeaderFullBleedVerticalBlock', 'HeaderFullBleedHorizontalBlock'}: yield '\n'.join(header_parse(x)) elif x.get('__typename', '') == 'ParagraphBlock': yield '

' for para in x['content']: yield '\t'.join(parse_cnt(para)) yield '

' - elif x.get('__typename', '') == 'Heading2Block': + elif x.get('__typename', '') in {'Heading2Block', 'Heading3Block'}: yield '

' - for para in x['content']: - yield '\t'.join(parse_cnt(para)) + for h2 in x['content']: + yield '\t'.join(parse_cnt(h2)) yield '

' + elif x.get('__typename', '') == 'Heading1Block': + yield '

' + for h1 in x['content']: + yield '\t'.join(parse_cnt(h1)) + yield '

' + elif x.get('__typename', '') == 'BylineBlock': + yield '
' + yield '\t'.join(parse_byline(x)) + yield '
' elif x.get('__typename', '') == 'ImageBlock': yield ''.join(parse_image(x['media'])) elif x.get('__typename', '') == 'GridBlock': @@ -133,6 +142,10 @@ class nytFeeds(BasicNewsRecipe): 'short': 'Reverse the order of articles in each feed?', 'long': 'enter yes', 'default': 'no' + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: popup, jumbo, mobileMasterAt3x, superJumbo', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default(articleLarge), use articleInline.', } } @@ -182,3 +195,11 @@ class nytFeeds(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): data = extract_json(raw_html) return '\n'.join(article_parse(data)) + + def preprocess_html(self, soup): + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '-' + w + '.jpg' + for img in soup.findAll('img', attrs={'src':True}): + img['src'] = img['src'].rsplit('-', 1)[0] + res + return soup