This commit is contained in:
unkn0w7n 2024-03-10 09:52:46 +05:30
parent 6a88069f01
commit 4559d2cda7
2 changed files with 30 additions and 24 deletions

View File

@ -19,8 +19,10 @@ def classes(classes):
def extract_json(raw):
    """Return the article payload embedded in a page's ``window['__natgeo__']`` script.

    The text from the ``window['__natgeo__']`` marker up to the next
    ``</script>`` is isolated, everything from its first ``{`` onward is
    parsed as JSON (trailing ``;`` characters removed), and the
    ``['page']['content']`` mapping is looked up.

    Returns ``content['article']`` when that key is present, otherwise
    ``content['prismarticle']``.

    Raises ``KeyError`` when the ``page``/``content`` path or both payload
    keys are missing; JSON decoding errors from ``json.loads`` propagate.
    """
    start = raw.find("window['__natgeo__']")
    script = raw[start:raw.find('</script>', start)]
    content = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
    # Fall back to 'prismarticle' only when 'article' is absent.
    if 'article' in content:
        return content['article']
    return content['prismarticle']
def parse_contributors(grp):
@ -32,16 +34,17 @@ def parse_contributors(grp):
def parse_lead_image(media):
    """Yield HTML fragments for the lead image described by *media*.

    Nothing is yielded when *media* has no ``'image'`` entry. Otherwise the
    fragments are, in order: an opening ``<p>`` containing the ``<img>``
    (with an ``alt`` attribute when the image has a ``'dsc'`` value), an
    optional ``<div class="cap">`` caption, an optional
    ``<div class="cred">`` credit, and the closing ``</p>``.

    The image ``src`` and ``dsc`` values are passed through ``escape``.
    """
    # Guard first: without an image there is no <p> to open or close.
    if 'image' not in media:
        return
    image = media['image']
    if 'dsc' in image:
        yield '<p><div><img src="{}" alt="{}"></div>'.format(
            escape(image['src'], True), escape(image['dsc'], True))
    else:
        yield '<p><div><img src="{}"></div>'.format(escape(image['src'], True))
    # NOTE(review): caption and credit are emitted unescaped -- presumably
    # they already hold HTML markup; confirm against the data source.
    if 'caption' in media:
        yield '<div class="cap">' + media['caption'] + '</div>'
    if 'credit' in media:
        yield '<div class="cred">' + media['credit'] + '</div>'
    yield '</p>'
def parse_body(item):

View File

@ -18,8 +18,10 @@ def classes(classes):
def extract_json(raw):
    """Return the article payload embedded in a page's ``window['__natgeo__']`` script.

    The text from the ``window['__natgeo__']`` marker up to the next
    ``</script>`` is isolated, everything from its first ``{`` onward is
    parsed as JSON (trailing ``;`` characters removed), and the
    ``['page']['content']`` mapping is looked up.

    Returns ``content['article']`` when that key is present, otherwise
    ``content['prismarticle']``.

    Raises ``KeyError`` when the ``page``/``content`` path or both payload
    keys are missing; JSON decoding errors from ``json.loads`` propagate.
    """
    marker_at = raw.find("window['__natgeo__']")
    script = raw[marker_at:raw.find('</script>', marker_at)]
    content = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
    # Fall back to 'prismarticle' only when 'article' is absent.
    if 'article' in content:
        return content['article']
    return content['prismarticle']
def parse_contributors(grp):
@ -31,16 +33,17 @@ def parse_contributors(grp):
def parse_lead_image(media):
    """Yield HTML fragments for the lead image described by *media*.

    Nothing is yielded when *media* has no ``'image'`` entry. Otherwise the
    fragments are, in order: an opening ``<p>`` containing the ``<img>``
    (with an ``alt`` attribute when the image has a ``'dsc'`` value), an
    optional ``<div class="cap">`` caption, an optional
    ``<div class="cred">`` credit, and the closing ``</p>``.

    The image ``src`` and ``dsc`` values are passed through ``escape``.
    """
    # Guard first: without an image there is no <p> to open or close.
    if 'image' not in media:
        return
    image = media['image']
    if 'dsc' in image:
        yield '<p><div><img src="{}" alt="{}"></div>'.format(
            escape(image['src'], True), escape(image['dsc'], True))
    else:
        yield '<p><div><img src="{}"></div>'.format(escape(image['src'], True))
    # NOTE(review): caption and credit are emitted unescaped -- presumably
    # they already hold HTML markup; confirm against the data source.
    if 'caption' in media:
        yield '<div class="cap">' + media['caption'] + '</div>'
    if 'credit' in media:
        yield '<div class="cred">' + media['credit'] + '</div>'
    yield '</p>'
def parse_body(item):