From 2822ec364b275e19df952f1e17d31827da7f6bf2 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:47:46 +0530
Subject: [PATCH 1/4] Nat Geo
fix images
---
recipes/natgeo.recipe | 126 +++++++++++++++++++++++++--------------
recipes/natgeomag.recipe | 126 +++++++++++++++++++++++++--------------
2 files changed, 164 insertions(+), 88 deletions(-)
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index b8a42b1311..345830095a 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -20,8 +20,6 @@ def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
- if 'article' in data:
- return data['article']
return data['prismarticle']
@@ -35,63 +33,100 @@ def parse_contributors(grp):
def parse_lead_image(media):
if 'image' in media:
+ yield '
'
if 'dsc' in media['image']:
- yield '
'.format(
+ yield ''.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else:
- yield ''.format(escape(media['image']['src'], True))
- if 'caption' in media:
+ yield ''.format(escape(media['image']['src'], True))
+ if 'caption' in media and 'credit' in media:
+ yield '' + media['caption'] + ' ' + media['credit'] + '
'
+ elif 'caption' in media:
yield '' + media['caption'] + '
'
- if 'credit' in media:
- yield '' + media['credit'] + '
'
yield ''
-def parse_body(item):
- c = item['cntnt']
- if item.get('type') == 'inline':
- if c.get('cmsType') == 'listicle':
- if 'title' in c:
- yield '' + escape(c['title']) + '
'
- yield c['text']
- elif c.get('cmsType') == 'image':
- for line in parse_lead_image(c):
- yield line
- elif c.get('cmsType') == 'imagegroup':
- for imgs in c['images']:
- for line in parse_lead_image(imgs):
- yield line
- elif c.get('cmsType') == 'pullquote':
- if 'quote' in c:
- yield '' + c['quote'] + '
'
- elif c.get('cmsType') == 'editorsNote':
- if 'note' in c:
- yield '' + c['note'] + '
'
- else:
- if c['mrkup'].strip().startswith('<'):
- yield c['mrkup']
- else:
- yield '<{tag}>{markup}</{tag}>'.format(
- tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+ if inl.get('content', {}).get('name', '') == 'Image':
+ props = inl['content']['props']
+ yield ''
+ if 'image' in props:
+ yield '
'.format(props['image']['src'])
+ if 'caption' in props:
+ yield '{}{}
'.format(
+ props['caption']['text'], ' ' + props['caption']['credit']
+ )
+ yield ''
+ if inl.get('content', {}).get('name', '') == 'ImageGroup':
+ if 'images' in inl['content']['props']:
+ for imgs in inl['content']['props']['images']:
+ yield ''
+ if 'src' in imgs:
+ yield '
'.format(imgs['src'])
+ if 'caption' in imgs:
+ yield '{}{}
'.format(
+ imgs['caption']['text'], ' ' + imgs['caption']['credit']
+ )
+
+
+def parse_cont(content):
+ for cont in content.get('content', {}):
+ if isinstance(cont, dict):
+ yield from parse_body(cont)
+ if isinstance(cont, str):
+ yield cont
+
+
+def parse_body(x):
+ if isinstance(x, dict):
+ if 'type' in x:
+ tag = x['type']
+ if tag == 'inline':
+ for inl in parse_inline(x):
+ yield inl
+ elif 'attrs' in x and 'href' in x.get('attrs', {}):
+ yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ else:
+ yield '<' + tag + '>'
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ elif isinstance(x, list):
+ for y in x:
+ if isinstance(y, dict):
+ yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
- yield '' + escape(edg['sctn']) + '
'
+ yield '' + escape(edg['sctn']) + '
'
yield '' + escape(sc['sclTtl']) + '
'
- yield '' + escape(sc['sclDsc']) + '
'
+ yield '' + escape(sc['sclDsc']) + '
'
+ yield ''
for line in parse_contributors(edg['cntrbGrp']):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '
Published: ' + escape(ts) + '
'
if 'readTime' in edg:
- yield '' + escape(edg['readTime']) + '
'
+ yield '' + escape(edg['readTime']) + '
'
+ yield ''
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
- for item in edg['bdy']:
- for line in parse_body(item):
- yield line
+ for main in edg['prismData']['mainComponents']:
+ if main['name'] == 'Body':
+ for item in main['props']['body']:
+ if isinstance(item, dict):
+ if item.get('type', '') == 'inline':
+ for inl in parse_inline(item):
+ yield inl
+ elif isinstance(item, list):
+ for line in item:
+ for p in parse_body(line):
+ yield p
def article_parse(data):
@@ -131,11 +166,12 @@ class NatGeo(BasicNewsRecipe):
ignore_duplicate_articles = {'url'}
extra_css = '''
- .sub, blockquote { color:#404040; }
+ blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
- .cap {text-align:center; font-size:small; }
- .cred {text-align:center; font-size:small; color:#404040; }
- .auth, .time { font-size:small; color:#5c5c5c; }
+ .cap { font-size:small; }
+ img {display:block; margin:0 auto;}
+ .cred { font-style:italic; font-size:small; color:#404040; }
+ .auth, .time, .sub { font-size:small; color:#5c5c5c; }
'''
def get_cover_url(self):
@@ -186,9 +222,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
+ for h2 in soup.findAll('h2'):
+ h2.name = 'h4'
for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000'
- img['src'] = img['src'] + '?w=1000&h=1000'
+ img['src'] = img['src'] + '?w=600&h=600'
return soup
def populate_article_metadata(self, article, soup, first):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 4e5fc6bb4d..5f69b1f3b1 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -24,8 +24,6 @@ def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
- if 'article' in data:
- return data['article']
return data['prismarticle']
@@ -39,63 +37,100 @@ def parse_contributors(grp):
def parse_lead_image(media):
if 'image' in media:
+ yield ''
if 'dsc' in media['image']:
- yield '
'.format(
+ yield ''.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else:
- yield ''.format(escape(media['image']['src'], True))
- if 'caption' in media:
+ yield ''.format(escape(media['image']['src'], True))
+ if 'caption' in media and 'credit' in media:
+ yield '' + media['caption'] + ' ' + media['credit'] + '
'
+ elif 'caption' in media:
yield '' + media['caption'] + '
'
- if 'credit' in media:
- yield '' + media['credit'] + '
'
yield ''
-def parse_body(item):
- c = item['cntnt']
- if item.get('type') == 'inline':
- if c.get('cmsType') == 'listicle':
- if 'title' in c:
- yield '' + escape(c['title']) + '
'
- yield c['text']
- elif c.get('cmsType') == 'image':
- for line in parse_lead_image(c):
- yield line
- elif c.get('cmsType') == 'imagegroup':
- for imgs in c['images']:
- for line in parse_lead_image(imgs):
- yield line
- elif c.get('cmsType') == 'pullquote':
- if 'quote' in c:
- yield '' + c['quote'] + '
'
- elif c.get('cmsType') == 'editorsNote':
- if 'note' in c:
- yield '' + c['note'] + '
'
- else:
- if c['mrkup'].strip().startswith('<'):
- yield c['mrkup']
- else:
- yield '<{tag}>{markup}</{tag}>'.format(
- tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+ if inl.get('content', {}).get('name', '') == 'Image':
+ props = inl['content']['props']
+ yield ''
+ if 'image' in props:
+ yield '
'.format(props['image']['src'])
+ if 'caption' in props:
+ yield '{}{}
'.format(
+ props['caption']['text'], ' ' + props['caption']['credit']
+ )
+ yield ''
+ if inl.get('content', {}).get('name', '') == 'ImageGroup':
+ if 'images' in inl['content']['props']:
+ for imgs in inl['content']['props']['images']:
+ yield ''
+ if 'src' in imgs:
+ yield '
'.format(imgs['src'])
+ if 'caption' in imgs:
+ yield '{}{}
'.format(
+ imgs['caption']['text'], ' ' + imgs['caption']['credit']
+ )
+
+
+def parse_cont(content):
+ for cont in content.get('content', {}):
+ if isinstance(cont, dict):
+ yield from parse_body(cont)
+ if isinstance(cont, str):
+ yield cont
+
+
+def parse_body(x):
+ if isinstance(x, dict):
+ if 'type' in x:
+ tag = x['type']
+ if tag == 'inline':
+ for inl in parse_inline(x):
+ yield inl
+ elif 'attrs' in x and 'href' in x.get('attrs', {}):
+ yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ else:
+ yield '<' + tag + '>'
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ elif isinstance(x, list):
+ for y in x:
+ if isinstance(y, dict):
+ yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
- yield '' + escape(edg['sctn']) + '
'
+ yield '' + escape(edg['sctn']) + '
'
yield '' + escape(sc['sclTtl']) + '
'
- yield '' + escape(sc['sclDsc']) + '
'
+ yield '' + escape(sc['sclDsc']) + '
'
+ yield ''
for line in parse_contributors(edg['cntrbGrp']):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '
Published: ' + escape(ts) + '
'
if 'readTime' in edg:
- yield '' + escape(edg['readTime']) + '
'
+ yield '' + escape(edg['readTime']) + '
'
+ yield ''
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
- for item in edg['bdy']:
- for line in parse_body(item):
- yield line
+ for main in edg['prismData']['mainComponents']:
+ if main['name'] == 'Body':
+ for item in main['props']['body']:
+ if isinstance(item, dict):
+ if item.get('type', '') == 'inline':
+ for inl in parse_inline(item):
+ yield inl
+ elif isinstance(item, list):
+ for line in item:
+ for p in parse_body(line):
+ yield p
def article_parse(data):
@@ -134,11 +169,12 @@ class NatGeo(BasicNewsRecipe):
resolve_internal_links = True
extra_css = '''
- .sub, blockquote { color:#404040; }
+ blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
- .cap {text-align:center; font-size:small; }
- .cred {text-align:center; font-size:small; color:#404040; }
- .auth, .time { font-size:small; color:#5c5c5c; }
+ .cap { font-size:small; }
+ img {display:block; margin:0 auto;}
+ .cred { font-style:italic; font-size:small; color:#404040; }
+ .auth, .time, .sub { font-size:small; color:#5c5c5c; }
'''
def parse_index(self):
@@ -183,9 +219,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
+ for h2 in soup.findAll('h2'):
+ h2.name = 'h4'
for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000'
- img['src'] = img['src'] + '?w=1200&h=1200'
+ img['src'] = img['src'] + '?w=600&h=600'
return soup
def populate_article_metadata(self, article, soup, first):
From 5f69db86eab28913a9b24f0975d179409487c946 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:51:45 +0530
Subject: [PATCH 2/4] ...
---
recipes/natgeo.recipe | 3 +-
recipes/natgeohis.recipe | 131 +++++++++++++++++++++++++--------------
recipes/natgeomag.recipe | 3 +-
3 files changed, 86 insertions(+), 51 deletions(-)
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index 345830095a..f3bab4bc22 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -19,8 +19,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
- return data['prismarticle']
+ data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index 2055309e2c..e56c01b825 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -18,10 +18,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
- if 'article' in data:
- return data['article']
- return data['prismarticle']
+ data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
@@ -34,63 +31,100 @@ def parse_contributors(grp):
def parse_lead_image(media):
if 'image' in media:
+ yield ''
if 'dsc' in media['image']:
- yield '
'.format(
+ yield ''.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else:
- yield ''.format(escape(media['image']['src'], True))
- if 'caption' in media:
+ yield ''.format(escape(media['image']['src'], True))
+ if 'caption' in media and 'credit' in media:
+ yield '' + media['caption'] + ' ' + media['credit'] + '
'
+ elif 'caption' in media:
yield '' + media['caption'] + '
'
- if 'credit' in media:
- yield '' + media['credit'] + '
'
yield ''
-def parse_body(item):
- c = item['cntnt']
- if item.get('type') == 'inline':
- if c.get('cmsType') == 'listicle':
- if 'title' in c:
- yield '' + escape(c['title']) + '
'
- yield c['text']
- elif c.get('cmsType') == 'image':
- for line in parse_lead_image(c):
- yield line
- elif c.get('cmsType') == 'imagegroup':
- for imgs in c['images']:
- for line in parse_lead_image(imgs):
- yield line
- elif c.get('cmsType') == 'pullquote':
- if 'quote' in c:
- yield '' + c['quote'] + '
'
- elif c.get('cmsType') == 'editorsNote':
- if 'note' in c:
- yield '' + c['note'] + '
'
- else:
- if c['mrkup'].strip().startswith('<'):
- yield c['mrkup']
- else:
- yield '<{tag}>{markup}</{tag}>'.format(
- tag=item['type'], markup=c['mrkup'])
+def parse_inline(inl):
+ if inl.get('content', {}).get('name', '') == 'Image':
+ props = inl['content']['props']
+ yield ''
+ if 'image' in props:
+ yield '
'.format(props['image']['src'])
+ if 'caption' in props:
+ yield '{}{}
'.format(
+ props['caption']['text'], ' ' + props['caption']['credit']
+ )
+ yield ''
+ if inl.get('content', {}).get('name', '') == 'ImageGroup':
+ if 'images' in inl['content']['props']:
+ for imgs in inl['content']['props']['images']:
+ yield ''
+ if 'src' in imgs:
+ yield '
'.format(imgs['src'])
+ if 'caption' in imgs:
+ yield '{}{}
'.format(
+ imgs['caption']['text'], ' ' + imgs['caption']['credit']
+ )
+
+
+def parse_cont(content):
+ for cont in content.get('content', {}):
+ if isinstance(cont, dict):
+ yield from parse_body(cont)
+ if isinstance(cont, str):
+ yield cont
+
+
+def parse_body(x):
+ if isinstance(x, dict):
+ if 'type' in x:
+ tag = x['type']
+ if tag == 'inline':
+ for inl in parse_inline(x):
+ yield inl
+ elif 'attrs' in x and 'href' in x.get('attrs', {}):
+ yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ else:
+ yield '<' + tag + '>'
+ for yld in parse_cont(x):
+ yield yld
+ yield '</' + tag + '>'
+ elif isinstance(x, list):
+ for y in x:
+ if isinstance(y, dict):
+ yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
- yield '' + escape(edg['sctn']) + '
'
+ yield '' + escape(edg['sctn']) + '
'
yield '' + escape(sc['sclTtl']) + '
'
- yield '' + escape(sc['sclDsc']) + '
'
+ yield '' + escape(sc['sclDsc']) + '
'
+ yield ''
for line in parse_contributors(edg['cntrbGrp']):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '
Published: ' + escape(ts) + '
'
if 'readTime' in edg:
- yield '' + escape(edg['readTime']) + '
'
+ yield '' + escape(edg['readTime']) + '
'
+ yield ''
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
- for item in edg['bdy']:
- for line in parse_body(item):
- yield line
+ for main in edg['prismData']['mainComponents']:
+ if main['name'] == 'Body':
+ for item in main['props']['body']:
+ if isinstance(item, dict):
+ if item.get('type', '') == 'inline':
+ for inl in parse_inline(item):
+ yield inl
+ elif isinstance(item, list):
+ for line in item:
+ for p in parse_body(line):
+ yield p
def article_parse(data):
@@ -120,7 +154,7 @@ class NatGeo(BasicNewsRecipe):
encoding = 'utf8'
publisher = 'nationalgeographic.com'
category = 'science, nat geo'
- __author__ = 'Kovid Goyal'
+ __author__ = 'Kovid Goyal, unkn0wn'
description = 'Inspiring people to care about the planet since 1888'
timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True
@@ -131,11 +165,12 @@ class NatGeo(BasicNewsRecipe):
resolve_internal_links = True
extra_css = '''
- .sub, blockquote { color:#404040; }
+ blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
- .cap {text-align:center; font-size:small; }
- .cred {text-align:center; font-size:small; color:#404040; }
- .auth, .time { font-size:small; color:#5c5c5c; }
+ .cap { font-size:small; }
+ img {display:block; margin:0 auto;}
+ .cred { font-style:italic; font-size:small; color:#404040; }
+ .auth, .time, .sub { font-size:small; color:#5c5c5c; }
'''
def get_cover_url(self):
@@ -161,9 +196,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
+ for h2 in soup.findAll('h2'):
+ h2.name = 'h4'
for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000'
- img['src'] = img['src'] + '?w=1000&h=1000'
+ img['src'] = img['src'] + '?w=600&h=600'
return soup
def populate_article_metadata(self, article, soup, first):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 5f69b1f3b1..3736001438 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -23,8 +23,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
- return data['prismarticle']
+ data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
From 96aead4da953b1897a10c50993e6e846193dcdaa Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 16:54:59 +0530
Subject: [PATCH 3/4] update bloomberg
---
recipes/bloomberg-business-week.recipe | 4 ++--
recipes/bloomberg.recipe | 4 ++--
recipes/natgeo.recipe | 2 +-
recipes/natgeohis.recipe | 2 +-
recipes/natgeomag.recipe | 2 +-
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe
index c90f9bfbc7..297a107792 100644
--- a/recipes/bloomberg-business-week.recipe
+++ b/recipes/bloomberg-business-week.recipe
@@ -30,7 +30,7 @@ def get_contents(x):
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'media':
if x['subType'] == 'photo':
- return ''.format(
+ return ''.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']:
@@ -78,7 +78,7 @@ class Bloomberg(BasicNewsRecipe):
extra_css = '''
.auth {font-size:small; font-weight:bold;}
.time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
- .subhead {font-style:italic; color:#404040;}
+ .subhead, .cap span {font-style:italic; color:#404040;}
em, .col {color:#202020;}
.cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 0a1841bb3f..e525d3cbb4 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -31,7 +31,7 @@ def get_contents(x):
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'media':
if x['subType'] == 'photo':
- return ''.format(
+ return ''.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']:
@@ -77,7 +77,7 @@ class Bloomberg(BasicNewsRecipe):
extra_css = '''
.auth {font-size:small; font-weight:bold;}
.time, .chart {font-size:small;}
- .subhead {font-style:italic; color:#404040;}
+ .subhead, .cap span {font-style:italic; color:#404040;}
em, .col {color:#202020;}
.cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index f3bab4bc22..a5f0a15cb4 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -19,7 +19,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+ return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index e56c01b825..b005e9e339 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -18,7 +18,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+ return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 3736001438..98d444ada7 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -23,7 +23,7 @@ def classes(classes):
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
- data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
+ return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
From 01b16cd9f8b5926ef2fdf8946a36e55931e6309c Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 10 Mar 2024 18:31:00 +0530
Subject: [PATCH 4/4] ...
fix html.
---
recipes/natgeo.recipe | 7 +++----
recipes/natgeohis.recipe | 7 +++----
recipes/natgeomag.recipe | 7 +++----
3 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index a5f0a15cb4..3580693ad9 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -66,6 +66,7 @@ def parse_inline(inl):
yield '{}{}
'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
+ yield ''
def parse_cont(content):
@@ -81,8 +82,7 @@ def parse_body(x):
if 'type' in x:
tag = x['type']
if tag == 'inline':
- for inl in parse_inline(x):
- yield inl
+ yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
@@ -124,8 +124,7 @@ def parse_article(edg):
yield inl
elif isinstance(item, list):
for line in item:
- for p in parse_body(line):
- yield p
+ yield ''.join(parse_body(line))
def article_parse(data):
diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
index b005e9e339..89a2b59ea0 100644
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -65,6 +65,7 @@ def parse_inline(inl):
yield '{}{}
'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
+ yield ''
def parse_cont(content):
@@ -80,8 +81,7 @@ def parse_body(x):
if 'type' in x:
tag = x['type']
if tag == 'inline':
- for inl in parse_inline(x):
- yield inl
+ yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
@@ -123,8 +123,7 @@ def parse_article(edg):
yield inl
elif isinstance(item, list):
for line in item:
- for p in parse_body(line):
- yield p
+ yield ''.join(parse_body(line))
def article_parse(data):
diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
index 98d444ada7..537263eb65 100644
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -70,6 +70,7 @@ def parse_inline(inl):
yield '{}{}
'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
+ yield ''
def parse_cont(content):
@@ -85,8 +86,7 @@ def parse_body(x):
if 'type' in x:
tag = x['type']
if tag == 'inline':
- for inl in parse_inline(x):
- yield inl
+ yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
@@ -128,8 +128,7 @@ def parse_article(edg):
yield inl
elif isinstance(item, list):
for line in item:
- for p in parse_body(line):
- yield p
+ yield ''.join(parse_body(line))
def article_parse(data):