This commit is contained in:
Kovid Goyal 2024-03-10 09:55:32 +05:30
commit 2aec8675cd
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 45 additions and 36 deletions

View File

@@ -19,8 +19,10 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
return json.loads( data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
script[script.find('{'):].rstrip(';'))['page']['content']['article'] if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@@ -32,16 +34,17 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'dsc' in media['image']: if 'image' in media:
yield '<p><div><img src="{}" alt="{}"></div>'.format( if 'dsc' in media['image']:
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) yield '<p><div><img src="{}" alt="{}"></div>'.format(
else: escape(media['image']['src'], True), escape(media['image']['dsc'], True))
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) else:
if 'caption' in media: yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
yield '<div class="cap">' + media['caption'] + '</div>' if 'caption' in media:
if 'credit' in media: yield '<div class="cap">' + media['caption'] + '</div>'
yield '<div class="cred">' + media['credit'] + '</div>' if 'credit' in media:
yield '</p>' yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>'
def parse_body(item): def parse_body(item):

View File

@@ -18,8 +18,10 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
return json.loads( data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
script[script.find('{'):].rstrip(';'))['page']['content']['article'] if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@@ -31,16 +33,17 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'dsc' in media['image']: if 'image' in media:
yield '<p><div><img src="{}" alt="{}"></div>'.format( if 'dsc' in media['image']:
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) yield '<p><div><img src="{}" alt="{}"></div>'.format(
else: escape(media['image']['src'], True), escape(media['image']['dsc'], True))
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) else:
if 'caption' in media: yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
yield '<div class="cap">' + media['caption'] + '</div>' if 'caption' in media:
if 'credit' in media: yield '<div class="cap">' + media['caption'] + '</div>'
yield '<div class="cred">' + media['credit'] + '</div>' if 'credit' in media:
yield '</p>' yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>'
def parse_body(item): def parse_body(item):

View File

@@ -23,8 +23,10 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
return json.loads( data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content']
script[script.find('{'):].rstrip(';'))['page']['content']['article'] if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@@ -36,16 +38,17 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'dsc' in media['image']: if 'image' in media:
yield '<p><div><img src="{}" alt="{}"></div>'.format( if 'dsc' in media['image']:
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) yield '<p><div><img src="{}" alt="{}"></div>'.format(
else: escape(media['image']['src'], True), escape(media['image']['dsc'], True))
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) else:
if 'caption' in media: yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True))
yield '<div class="cap">' + media['caption'] + '</div>' if 'caption' in media:
if 'credit' in media: yield '<div class="cap">' + media['caption'] + '</div>'
yield '<div class="cred">' + media['credit'] + '</div>' if 'credit' in media:
yield '</p>' yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>'
def parse_body(item): def parse_body(item):