From 2822ec364b275e19df952f1e17d31827da7f6bf2 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:47:46 +0530
Subject: [PATCH 1/4] Nat Geo fix images

---
 recipes/natgeo.recipe    | 126 +++++++++++++++++++++++++--------------
 recipes/natgeomag.recipe | 126 +++++++++++++++++++++++++--------------
 2 files changed, 164 insertions(+), 88 deletions(-)

diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index b8a42b1311..345830095a 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -20,8 +20,6 @@
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
     data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
-    if 'article' in data:
-        return data['article']
     return data['prismarticle']
 
@@ -35,63 +33,100 @@ def parse_contributors(grp):
 
 def parse_lead_image(media):
     if 'image' in media:
+        yield '<p>'
         if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                 escape(media['image']['src'], True), escape(media['image']['dsc'], True))
         else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+        if 'caption' in media and 'credit' in media:
+            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
+        elif 'caption' in media:
             yield '<div class="cap">' + media['caption'] + '</div>'
-        if 'credit' in media:
-            yield '<div class="cred">' + media['credit'] + '</div>'
         yield '</p>'
 
 
-def parse_body(item):
-    c = item['cntnt']
-    if item.get('type') == 'inline':
-        if c.get('cmsType') == 'listicle':
-            if 'title' in c:
-                yield '<h3>' + escape(c['title']) + '</h3>'
-            yield c['text']
-        elif c.get('cmsType') == 'image':
-            for line in parse_lead_image(c):
-                yield line
-        elif c.get('cmsType') == 'imagegroup':
-            for imgs in c['images']:
-                for line in parse_lead_image(imgs):
-                    yield line
-        elif c.get('cmsType') == 'pullquote':
-            if 'quote' in c:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
-        elif c.get('cmsType') == 'editorsNote':
-            if 'note' in c:
-                yield '<div class="byline">' + c['note'] + '</div>'
-    else:
-        if c['mrkup'].strip().startswith('<'):
-            yield c['mrkup']
-        else:
-            yield '<{tag}>{markup}</{tag}>'.format(
-                tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+    if inl.get('content', {}).get('name', '') == 'Image':
+        props = inl['content']['props']
+        yield '<p>'
+        if 'image' in props:
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
+        if 'caption' in props:
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
+        yield '</p>'
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
+        if 'images' in inl['content']['props']:
+            for imgs in inl['content']['props']['images']:
+                yield '<p>'
+                if 'src' in imgs:
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
+                if 'caption' in imgs:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
+
+
+def parse_cont(content):
+    for cont in content.get('content', {}):
+        if isinstance(cont, dict):
+            yield from parse_body(cont)
+        if isinstance(cont, str):
+            yield cont
+
+
+def parse_body(x):
+    if isinstance(x, dict):
+        if 'type' in x:
+            tag = x['type']
+            if tag == 'inline':
+                for inl in parse_inline(x):
+                    yield inl
+            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+            else:
+                yield '<' + tag + '>'
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+    elif isinstance(x, list):
+        for y in x:
+            if isinstance(y, dict):
+                yield from parse_body(y)
 
 
 def parse_article(edg):
     sc = edg['schma']
-    yield '<h3 class="cat">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="cat">' + escape(edg['sctn']) + '</div>'
     yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div>'
+    yield '<p>'
     for line in parse_contributors(edg['cntrbGrp']):
         yield line
     ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
     yield '<div class="time">Published: ' + escape(ts) + '</div>'
     if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
+    yield '</p>'
     if edg.get('ldMda', {}).get('cmsType') == 'image':
         for line in parse_lead_image(edg['ldMda']):
             yield line
-    for item in edg['bdy']:
-        for line in parse_body(item):
-            yield line
+    for main in edg['prismData']['mainComponents']:
+        if main['name'] == 'Body':
+            for item in main['props']['body']:
+                if isinstance(item, dict):
+                    if item.get('type', '') == 'inline':
+                        for inl in parse_inline(item):
+                            yield inl
+                elif isinstance(item, list):
+                    for line in item:
+                        for p in parse_body(line):
+                            yield p
 
 
 def article_parse(data):
@@ -131,11 +166,12 @@ class NatGeo(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
 
     extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
         .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cap { font-size:small; }
+        img {display:block; margin:0 auto;}
+        .cred { font-style:italic; font-size:small; color:#404040; }
+        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
     '''
 
     def get_cover_url(self):
@@ -186,9 +222,11 @@ class NatGeo(BasicNewsRecipe):
         return '\n'.join(article_parse(data))
 
     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
         for img in soup.findAll('img', src=True):
             # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1000&h=1000'
+            img['src'] = img['src'] + '?w=600&h=600'
         return soup
 
     def populate_article_metadata(self, article, soup, first):
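
Note: the rewrite above targets NatGeo's newer prism payload, where an article body is a tree of nodes shaped roughly like {'type': tag, 'attrs': {...}, 'content': [children or strings]} that parse_body/parse_cont walk recursively. A minimal sketch of the traversal on an invented node (illustrative only, not a real NatGeo payload):

    node = {
        'type': 'p',
        'content': [
            'Read the ',
            {'type': 'a', 'attrs': {'href': 'https://example.com'}, 'content': ['full story']},
            ' online.',
        ],
    }
    print(''.join(parse_body(node)))
    # <p>Read the <a href = "https://example.com">full story</a> online.</p>
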
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 4e5fc6bb4d..5f69b1f3b1 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -24,8 +24,6 @@
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
     data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
-    if 'article' in data:
-        return data['article']
     return data['prismarticle']
 
@@ -39,63 +37,100 @@ def parse_contributors(grp):
 
 def parse_lead_image(media):
     if 'image' in media:
+        yield '<p>'
         if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                 escape(media['image']['src'], True), escape(media['image']['dsc'], True))
         else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+        if 'caption' in media and 'credit' in media:
+            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
+        elif 'caption' in media:
             yield '<div class="cap">' + media['caption'] + '</div>'
-        if 'credit' in media:
-            yield '<div class="cred">' + media['credit'] + '</div>'
         yield '</p>'
 
 
-def parse_body(item):
-    c = item['cntnt']
-    if item.get('type') == 'inline':
-        if c.get('cmsType') == 'listicle':
-            if 'title' in c:
-                yield '<h3>' + escape(c['title']) + '</h3>'
-            yield c['text']
-        elif c.get('cmsType') == 'image':
-            for line in parse_lead_image(c):
-                yield line
-        elif c.get('cmsType') == 'imagegroup':
-            for imgs in c['images']:
-                for line in parse_lead_image(imgs):
-                    yield line
-        elif c.get('cmsType') == 'pullquote':
-            if 'quote' in c:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
-        elif c.get('cmsType') == 'editorsNote':
-            if 'note' in c:
-                yield '<div class="byline">' + c['note'] + '</div>'
-    else:
-        if c['mrkup'].strip().startswith('<'):
-            yield c['mrkup']
-        else:
-            yield '<{tag}>{markup}</{tag}>'.format(
-                tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+    if inl.get('content', {}).get('name', '') == 'Image':
+        props = inl['content']['props']
+        yield '<p>'
+        if 'image' in props:
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
+        if 'caption' in props:
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
+        yield '</p>'
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
+        if 'images' in inl['content']['props']:
+            for imgs in inl['content']['props']['images']:
+                yield '<p>'
+                if 'src' in imgs:
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
+                if 'caption' in imgs:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
+
+
+def parse_cont(content):
+    for cont in content.get('content', {}):
+        if isinstance(cont, dict):
+            yield from parse_body(cont)
+        if isinstance(cont, str):
+            yield cont
+
+
+def parse_body(x):
+    if isinstance(x, dict):
+        if 'type' in x:
+            tag = x['type']
+            if tag == 'inline':
+                for inl in parse_inline(x):
+                    yield inl
+            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+            else:
+                yield '<' + tag + '>'
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+    elif isinstance(x, list):
+        for y in x:
+            if isinstance(y, dict):
+                yield from parse_body(y)
 
 
 def parse_article(edg):
     sc = edg['schma']
-    yield '<h3 class="cat">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="cat">' + escape(edg['sctn']) + '</div>'
     yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div>'
+    yield '<p>'
     for line in parse_contributors(edg['cntrbGrp']):
         yield line
     ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
     yield '<div class="time">Published: ' + escape(ts) + '</div>'
     if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
+    yield '</p>'
     if edg.get('ldMda', {}).get('cmsType') == 'image':
         for line in parse_lead_image(edg['ldMda']):
             yield line
-    for item in edg['bdy']:
-        for line in parse_body(item):
-            yield line
+    for main in edg['prismData']['mainComponents']:
+        if main['name'] == 'Body':
+            for item in main['props']['body']:
+                if isinstance(item, dict):
+                    if item.get('type', '') == 'inline':
+                        for inl in parse_inline(item):
+                            yield inl
+                elif isinstance(item, list):
+                    for line in item:
+                        for p in parse_body(line):
+                            yield p
 
 
 def article_parse(data):
@@ -134,11 +169,12 @@ class NatGeo(BasicNewsRecipe):
     resolve_internal_links = True
 
     extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
         .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cap { font-size:small; }
+        img {display:block; margin:0 auto;}
+        .cred { font-style:italic; font-size:small; color:#404040; }
+        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    '''
 
     def parse_index(self):
@@ -183,9 +219,11 @@ class NatGeo(BasicNewsRecipe):
         return '\n'.join(article_parse(data))
 
     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
         for img in soup.findAll('img', src=True):
             # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1200&h=1200'
+            img['src'] = img['src'] + '?w=600&h=600'
         return soup
 
     def populate_article_metadata(self, article, soup, first):

From 5f69db86eab28913a9b24f0975d179409487c946 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:51:45 +0530
Subject: [PATCH 2/4] ...

---
 recipes/natgeo.recipe    |   3 +-
 recipes/natgeohis.recipe | 131 +++++++++++++++++++++++++--------------
 recipes/natgeomag.recipe |   3 +-
 3 files changed, 86 insertions(+), 51 deletions(-)

diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index 345830095a..f3bab4bc22 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -19,8 +19,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
-    return data['prismarticle']
+    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):
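
Note: extract_json slices the page's inline state script between "window['__natgeo__']" and the closing </script> tag, then parses everything from the first '{' with the trailing ';' stripped. A toy round-trip on synthetic markup (not a real NatGeo page):

    import json

    raw = '<script>window[\'__natgeo__\']={"page":{"content":{"prismarticle":{"id":1}}}};</script>'
    s = raw.find("window['__natgeo__']")
    script = raw[s:raw.find('</script>', s)]
    data = json.loads(script[script.find('{'):].rstrip(';'))
    print(data['page']['content']['prismarticle'])  # {'id': 1}

(As simplified in this hunk, the function assigns data without returning it; PATCH 3 below turns the assignment into a return.)
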
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index 2055309e2c..e56c01b825 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -18,10 +18,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
-    if 'article' in data:
-        return data['article']
-    return data['prismarticle']
+    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):
@@ -34,63 +31,100 @@ def parse_contributors(grp):
 
 def parse_lead_image(media):
     if 'image' in media:
+        yield '<p>'
         if 'dsc' in media['image']:
-            yield '<p><div><img src="{}" alt="{}"></div>'.format(
+            yield '<div><img src="{}" alt="{}"></div>'.format(
                 escape(media['image']['src'], True), escape(media['image']['dsc'], True))
         else:
-            yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media:
+            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
+        if 'caption' in media and 'credit' in media:
+            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
+        elif 'caption' in media:
             yield '<div class="cap">' + media['caption'] + '</div>'
-        if 'credit' in media:
-            yield '<div class="cred">' + media['credit'] + '</div>'
         yield '</p>'
 
 
-def parse_body(item):
-    c = item['cntnt']
-    if item.get('type') == 'inline':
-        if c.get('cmsType') == 'listicle':
-            if 'title' in c:
-                yield '<h3>' + escape(c['title']) + '</h3>'
-            yield c['text']
-        elif c.get('cmsType') == 'image':
-            for line in parse_lead_image(c):
-                yield line
-        elif c.get('cmsType') == 'imagegroup':
-            for imgs in c['images']:
-                for line in parse_lead_image(imgs):
-                    yield line
-        elif c.get('cmsType') == 'pullquote':
-            if 'quote' in c:
-                yield '<blockquote>' + c['quote'] + '</blockquote>'
-        elif c.get('cmsType') == 'editorsNote':
-            if 'note' in c:
-                yield '<div class="byline">' + c['note'] + '</div>'
-    else:
-        if c['mrkup'].strip().startswith('<'):
-            yield c['mrkup']
-        else:
-            yield '<{tag}>{markup}</{tag}>'.format(
-                tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+    if inl.get('content', {}).get('name', '') == 'Image':
+        props = inl['content']['props']
+        yield '<p>'
+        if 'image' in props:
+            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
+        if 'caption' in props:
+            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
+        yield '</p>'
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
+        if 'images' in inl['content']['props']:
+            for imgs in inl['content']['props']['images']:
+                yield '<p>'
+                if 'src' in imgs:
+                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
+                if 'caption' in imgs:
+                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
+
+
+def parse_cont(content):
+    for cont in content.get('content', {}):
+        if isinstance(cont, dict):
+            yield from parse_body(cont)
+        if isinstance(cont, str):
+            yield cont
+
+
+def parse_body(x):
+    if isinstance(x, dict):
+        if 'type' in x:
+            tag = x['type']
+            if tag == 'inline':
+                for inl in parse_inline(x):
+                    yield inl
+            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+            else:
+                yield '<' + tag + '>'
+                for yld in parse_cont(x):
+                    yield yld
+                yield '</' + tag + '>'
+    elif isinstance(x, list):
+        for y in x:
+            if isinstance(y, dict):
+                yield from parse_body(y)
 
 
 def parse_article(edg):
     sc = edg['schma']
-    yield '<h3 class="cat">' + escape(edg['sctn']) + '</h3>'
+    yield '<div class="cat">' + escape(edg['sctn']) + '</div>'
     yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div><br>'
+    yield '<div class="sub">' + escape(sc['sclDsc']) + '</div>'
+    yield '<p>'
     for line in parse_contributors(edg['cntrbGrp']):
         yield line
     ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
     yield '<div class="time">Published: ' + escape(ts) + '</div>'
     if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div><br>'
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
+    yield '</p>'
     if edg.get('ldMda', {}).get('cmsType') == 'image':
         for line in parse_lead_image(edg['ldMda']):
             yield line
-    for item in edg['bdy']:
-        for line in parse_body(item):
-            yield line
+    for main in edg['prismData']['mainComponents']:
+        if main['name'] == 'Body':
+            for item in main['props']['body']:
+                if isinstance(item, dict):
+                    if item.get('type', '') == 'inline':
+                        for inl in parse_inline(item):
+                            yield inl
+                elif isinstance(item, list):
+                    for line in item:
+                        for p in parse_body(line):
+                            yield p
 
 
 def article_parse(data):
@@ -120,7 +154,7 @@ class NatGeo(BasicNewsRecipe):
     encoding = 'utf8'
     publisher = 'nationalgeographic.com'
     category = 'science, nat geo'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal, unkn0wn'
     description = 'Inspiring people to care about the planet since 1888'
     timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
@@ -131,11 +165,12 @@ class NatGeo(BasicNewsRecipe):
     resolve_internal_links = True
 
     extra_css = '''
-        .sub, blockquote { color:#404040; }
+        blockquote { color:#404040; }
         .byline, i { font-style:italic; color:#202020; }
-        .cap {text-align:center; font-size:small; }
-        .cred {text-align:center; font-size:small; color:#404040; }
-        .auth, .time { font-size:small; color:#5c5c5c; }
+        .cap { font-size:small; }
+        img {display:block; margin:0 auto;}
+        .cred { font-style:italic; font-size:small; color:#404040; }
+        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
     '''
 
     def get_cover_url(self):
@@ -161,9 +196,11 @@ class NatGeo(BasicNewsRecipe):
         return '\n'.join(article_parse(data))
 
     def preprocess_html(self, soup):
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
         for img in soup.findAll('img', src=True):
             # for high res images use '?w=2000&h=2000'
-            img['src'] = img['src'] + '?w=1000&h=1000'
+            img['src'] = img['src'] + '?w=600&h=600'
         return soup
 
     def populate_article_metadata(self, article, soup, first):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 5f69b1f3b1..3736001438 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -23,8 +23,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
-    return data['prismarticle']
+    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):

From 96aead4da953b1897a10c50993e6e846193dcdaa Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:54:59 +0530
Subject: [PATCH 3/4] update bloomberg

---
 recipes/bloomberg-business-week.recipe | 4 ++--
 recipes/bloomberg.recipe               | 4 ++--
 recipes/natgeo.recipe                  | 2 +-
 recipes/natgeohis.recipe               | 2 +-
 recipes/natgeomag.recipe               | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe
index c90f9bfbc7..297a107792 100644
--- a/recipes/bloomberg-business-week.recipe
+++ b/recipes/bloomberg-business-week.recipe
@@ -30,7 +30,7 @@ def get_contents(x):
         return '<blockquote>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
     elif otype == 'media':
         if x['subType'] == 'photo':
-            return '<div class="img"><img src="{}"></div><div class="cap">{}</div><div class="cred">{}</div>'.format(
+            return '<div class="img"><img src="{}"></div><div class="cap">{} <span class="cred">{}</span></div>'.format(
                 x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
         elif x['subType'] == 'chart':
             if x['data'] and x['data']['chart']:
@@ -78,7 +78,7 @@ class Bloomberg(BasicNewsRecipe):
     extra_css = '''
         .auth {font-size:small; font-weight:bold;}
         .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
-        .subhead {font-style:italic; color:#404040;}
+        .subhead, .cap span {font-style:italic; color:#404040;}
         em, .col {color:#202020;}
         .cat {font-size:small; color:gray;}
         .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
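
Note: the photo branch now folds the credit into the caption div as a styled <span> (picked up by the new '.cap span' CSS rule) instead of emitting a separate credit div. Roughly, for a made-up media node:

    x = {'data': {'photo': {
        'src': 'https://example.com/pic.jpg',
        'caption': 'A skyline at dusk.',
        'credit': 'Photographer: Jane Doe',
    }}}
    html = '<div class="img"><img src="{}"></div><div class="cap">{} <span class="cred">{}</span></div>'.format(
        x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
    # caption and credit now share one centred block instead of two stacked divs
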
diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 0a1841bb3f..e525d3cbb4 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -31,7 +31,7 @@ def get_contents(x):
         return '<blockquote>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
     elif otype == 'media':
         if x['subType'] == 'photo':
-            return '<div class="img"><img src="{}"></div><div class="cap">{}</div><div class="cred">{}</div>'.format(
+            return '<div class="img"><img src="{}"></div><div class="cap">{} <span class="cred">{}</span></div>'.format(
                 x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
         elif x['subType'] == 'chart':
             if x['data'] and x['data']['chart']:
@@ -77,7 +77,7 @@ class Bloomberg(BasicNewsRecipe):
     extra_css = '''
         .auth {font-size:small; font-weight:bold;}
         .time, .chart {font-size:small;}
-        .subhead {font-style:italic; color:#404040;}
+        .subhead, .cap span {font-style:italic; color:#404040;}
         em, .col {color:#202020;}
         .cat {font-size:small; color:gray;}
         .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index f3bab4bc22..a5f0a15cb4 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -19,7 +19,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index e56c01b825..b005e9e339 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -18,7 +18,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 3736001438..98d444ada7 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -23,7 +23,7 @@ def classes(classes):
 def extract_json(raw):
     s = raw.find("window['__natgeo__']")
     script = raw[s:raw.find('</script>', s)]
-    data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+    return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
 
 
 def parse_contributors(grp):

From 01b16cd9f8b5926ef2fdf8946a36e55931e6309c Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 18:31:00 +0530
Subject: [PATCH 4/4] ... fix html.

---
 recipes/natgeo.recipe    | 7 +++----
 recipes/natgeohis.recipe | 7 +++----
 recipes/natgeomag.recipe | 7 +++----
 3 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index a5f0a15cb4..3580693ad9 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -66,6 +66,7 @@ def parse_inline(inl):
                     yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
                         imgs['caption']['text'], ' ' + imgs['caption']['credit']
                     )
+                yield '</p>'
 
 
 def parse_cont(content):
@@ -81,8 +82,7 @@ def parse_body(x):
         if 'type' in x:
             tag = x['type']
             if tag == 'inline':
-                for inl in parse_inline(x):
-                    yield inl
+                yield ''.join(parse_inline(x))
             elif 'attrs' in x and 'href' in x.get('attrs', {}):
                 yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                 for yld in parse_cont(x):
@@ -124,8 +124,7 @@ def parse_article(edg):
                         yield inl
                 elif isinstance(item, list):
                     for line in item:
-                        for p in parse_body(line):
-                            yield p
+                        yield ''.join(parse_body(line))
 
 
 def article_parse(data):
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index b005e9e339..89a2b59ea0 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -65,6 +65,7 @@ def parse_inline(inl):
                     yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
                         imgs['caption']['text'], ' ' + imgs['caption']['credit']
                     )
+                yield '</p>'
 
 
 def parse_cont(content):
@@ -80,8 +81,7 @@ def parse_body(x):
         if 'type' in x:
             tag = x['type']
             if tag == 'inline':
-                for inl in parse_inline(x):
-                    yield inl
+                yield ''.join(parse_inline(x))
             elif 'attrs' in x and 'href' in x.get('attrs', {}):
                 yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                 for yld in parse_cont(x):
@@ -123,8 +123,7 @@ def parse_article(edg):
                         yield inl
                 elif isinstance(item, list):
                     for line in item:
-                        for p in parse_body(line):
-                            yield p
+                        yield ''.join(parse_body(line))
 
 
 def article_parse(data):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 98d444ada7..537263eb65 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -70,6 +70,7 @@ def parse_inline(inl):
                     yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
                         imgs['caption']['text'], ' ' + imgs['caption']['credit']
                     )
+                yield '</p>'
 
 
 def parse_cont(content):
@@ -85,8 +86,7 @@ def parse_body(x):
         if 'type' in x:
             tag = x['type']
             if tag == 'inline':
-                for inl in parse_inline(x):
-                    yield inl
+                yield ''.join(parse_inline(x))
             elif 'attrs' in x and 'href' in x.get('attrs', {}):
                 yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                 for yld in parse_cont(x):
@@ -128,8 +128,7 @@ def parse_article(edg):
                         yield inl
                 elif isinstance(item, list):
                     for line in item:
-                        for p in parse_body(line):
-                            yield p
+                        yield ''.join(parse_body(line))
 
 
 def article_parse(data):
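
Note: replacing the fragment-by-fragment yields with yield ''.join(parse_inline(x)) and yield ''.join(parse_body(line)) hands each inline block or body node to the recipe as one complete HTML string, so the '\n'.join(article_parse(data)) in preprocess_raw_html no longer scatters one element's markup across several lines. A quick check with an invented Image node:

    inl = {'content': {'name': 'Image', 'props': {
        'image': {'src': 'https://example.com/a.jpg'},
        'caption': {'text': 'A caption.', 'credit': 'Jane Doe'},
    }}}
    print(''.join(parse_inline(inl)))
    # one string: <p><div class="img"><img src="https://example.com/a.jpg"></div>
    #             <div class="cap">A caption.<span class="cred"> Jane Doe</span></div></p>
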