This commit is contained in:
Kovid Goyal 2024-03-10 19:12:57 +05:30
commit 8cd60c1a96
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 251 additions and 143 deletions

View File

@ -30,7 +30,7 @@ def get_contents(x):
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>' return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
elif otype == 'media': elif otype == 'media':
if x['subType'] == 'photo': if x['subType'] == 'photo':
return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format( return '<div><div class="img"><img src="{}"></div><div class="cap">{} <span>{}</span></div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit']) x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart': elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']: if x['data'] and x['data']['chart']:
@ -78,7 +78,7 @@ class Bloomberg(BasicNewsRecipe):
extra_css = ''' extra_css = '''
.auth {font-size:small; font-weight:bold;} .auth {font-size:small; font-weight:bold;}
.time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;} .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
.subhead {font-style:italic; color:#404040;} .subhead, .cap span {font-style:italic; color:#404040;}
em, .col {color:#202020;} em, .col {color:#202020;}
.cat {font-size:small; color:gray;} .cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;} .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}

View File

@ -31,7 +31,7 @@ def get_contents(x):
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>' return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
elif otype == 'media': elif otype == 'media':
if x['subType'] == 'photo': if x['subType'] == 'photo':
return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format( return '<div><div class="img"><img src="{}"></div><div class="cap">{} <span>{}</span></div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit']) x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart': elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']: if x['data'] and x['data']['chart']:
@ -77,7 +77,7 @@ class Bloomberg(BasicNewsRecipe):
extra_css = ''' extra_css = '''
.auth {font-size:small; font-weight:bold;} .auth {font-size:small; font-weight:bold;}
.time, .chart {font-size:small;} .time, .chart {font-size:small;}
.subhead {font-style:italic; color:#404040;} .subhead, .cap span {font-style:italic; color:#404040;}
em, .col {color:#202020;} em, .col {color:#202020;}
.cat {font-size:small; color:gray;} .cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;} .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}

View File

@ -19,10 +19,7 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content'] return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@ -35,63 +32,99 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'image' in media: if 'image' in media:
yield '<p>'
if 'dsc' in media['image']: if 'dsc' in media['image']:
yield '<p><div><img src="{}" alt="{}"></div>'.format( yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else: else:
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media: if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>' yield '<div class="cap">' + media['caption'] + '</div>'
if 'credit' in media:
yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>' yield '</p>'
def parse_body(item): def parse_inline(inl):
c = item['cntnt'] if inl.get('content', {}).get('name', '') == 'Image':
if item.get('type') == 'inline': props = inl['content']['props']
if c.get('cmsType') == 'listicle': yield '<p>'
if 'title' in c: if 'image' in props:
yield '<h3>' + escape(c['title']) + '</h3>' yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
yield c['text'] if 'caption' in props:
elif c.get('cmsType') == 'image': yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
for line in parse_lead_image(c): props['caption']['text'], ' ' + props['caption']['credit']
yield line )
elif c.get('cmsType') == 'imagegroup': yield '</p>'
for imgs in c['images']: if inl.get('content', {}).get('name', '') == 'ImageGroup':
for line in parse_lead_image(imgs): if 'images' in inl['content']['props']:
yield line for imgs in inl['content']['props']['images']:
elif c.get('cmsType') == 'pullquote': yield '<p>'
if 'quote' in c: if 'src' in imgs:
yield '<blockquote>' + c['quote'] + '</blockquote>' yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
elif c.get('cmsType') == 'editorsNote': if 'caption' in imgs:
if 'note' in c: yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
yield '<blockquote>' + c['note'] + '</blockquote>' imgs['caption']['text'], ' ' + imgs['caption']['credit']
else: )
if c['mrkup'].strip().startswith('<'): yield '</p>'
yield c['mrkup']
else:
yield '<{tag}>{markup}</{tag}>'.format( def parse_cont(content):
tag=item['type'], markup=c['mrkup']) for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg): def parse_article(edg):
sc = edg['schma'] sc = edg['schma']
yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>' yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>' yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>' yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg['cntrbGrp']): for line in parse_contributors(edg['cntrbGrp']):
yield line yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>' yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg: if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div><br>' yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image': if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']): for line in parse_lead_image(edg['ldMda']):
yield line yield line
for item in edg['bdy']: for main in edg['prismData']['mainComponents']:
for line in parse_body(item): if main['name'] == 'Body':
yield line for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
for inl in parse_inline(item):
yield inl
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data): def article_parse(data):
@ -131,11 +164,12 @@ class NatGeo(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
extra_css = ''' extra_css = '''
.sub, blockquote { color:#404040; } blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; } .byline, i { font-style:italic; color:#202020; }
.cap {text-align:center; font-size:small; } .cap { font-size:small; }
.cred {text-align:center; font-size:small; color:#404040; } img {display:block; margin:0 auto;}
.auth, .time { font-size:small; color:#5c5c5c; } .cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
''' '''
def get_cover_url(self): def get_cover_url(self):
@ -186,9 +220,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data)) return '\n'.join(article_parse(data))
def preprocess_html(self, soup): def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'
for img in soup.findAll('img', src=True): for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000' # for high res images use '?w=2000&h=2000'
img['src'] = img['src'] + '?w=1000&h=1000' img['src'] = img['src'] + '?w=600&h=600'
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):

View File

@ -18,10 +18,7 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content'] return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@ -34,63 +31,99 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'image' in media: if 'image' in media:
yield '<p>'
if 'dsc' in media['image']: if 'dsc' in media['image']:
yield '<p><div><img src="{}" alt="{}"></div>'.format( yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else: else:
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media: if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>' yield '<div class="cap">' + media['caption'] + '</div>'
if 'credit' in media:
yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>' yield '</p>'
def parse_body(item): def parse_inline(inl):
c = item['cntnt'] if inl.get('content', {}).get('name', '') == 'Image':
if item.get('type') == 'inline': props = inl['content']['props']
if c.get('cmsType') == 'listicle': yield '<p>'
if 'title' in c: if 'image' in props:
yield '<h3>' + escape(c['title']) + '</h3>' yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
yield c['text'] if 'caption' in props:
elif c.get('cmsType') == 'image': yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
for line in parse_lead_image(c): props['caption']['text'], ' ' + props['caption']['credit']
yield line )
elif c.get('cmsType') == 'imagegroup': yield '</p>'
for imgs in c['images']: if inl.get('content', {}).get('name', '') == 'ImageGroup':
for line in parse_lead_image(imgs): if 'images' in inl['content']['props']:
yield line for imgs in inl['content']['props']['images']:
elif c.get('cmsType') == 'pullquote': yield '<p>'
if 'quote' in c: if 'src' in imgs:
yield '<blockquote>' + c['quote'] + '</blockquote>' yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
elif c.get('cmsType') == 'editorsNote': if 'caption' in imgs:
if 'note' in c: yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
yield '<blockquote>' + c['note'] + '</blockquote>' imgs['caption']['text'], ' ' + imgs['caption']['credit']
else: )
if c['mrkup'].strip().startswith('<'): yield '</p>'
yield c['mrkup']
else:
yield '<{tag}>{markup}</{tag}>'.format( def parse_cont(content):
tag=item['type'], markup=c['mrkup']) for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg): def parse_article(edg):
sc = edg['schma'] sc = edg['schma']
yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>' yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>' yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>' yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg['cntrbGrp']): for line in parse_contributors(edg['cntrbGrp']):
yield line yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>' yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg: if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div><br>' yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image': if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']): for line in parse_lead_image(edg['ldMda']):
yield line yield line
for item in edg['bdy']: for main in edg['prismData']['mainComponents']:
for line in parse_body(item): if main['name'] == 'Body':
yield line for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
for inl in parse_inline(item):
yield inl
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data): def article_parse(data):
@ -120,7 +153,7 @@ class NatGeo(BasicNewsRecipe):
encoding = 'utf8' encoding = 'utf8'
publisher = 'nationalgeographic.com' publisher = 'nationalgeographic.com'
category = 'science, nat geo' category = 'science, nat geo'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal, unkn0wn'
description = 'Inspiring people to care about the planet since 1888' description = 'Inspiring people to care about the planet since 1888'
timefmt = ' [%a, %d %b, %Y]' timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True no_stylesheets = True
@ -131,11 +164,12 @@ class NatGeo(BasicNewsRecipe):
resolve_internal_links = True resolve_internal_links = True
extra_css = ''' extra_css = '''
.sub, blockquote { color:#404040; } blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; } .byline, i { font-style:italic; color:#202020; }
.cap {text-align:center; font-size:small; } .cap { font-size:small; }
.cred {text-align:center; font-size:small; color:#404040; } img {display:block; margin:0 auto;}
.auth, .time { font-size:small; color:#5c5c5c; } .cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
''' '''
def get_cover_url(self): def get_cover_url(self):
@ -161,9 +195,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data)) return '\n'.join(article_parse(data))
def preprocess_html(self, soup): def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'
for img in soup.findAll('img', src=True): for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000' # for high res images use '?w=2000&h=2000'
img['src'] = img['src'] + '?w=1000&h=1000' img['src'] = img['src'] + '?w=600&h=600'
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):

View File

@ -23,10 +23,7 @@ def classes(classes):
def extract_json(raw): def extract_json(raw):
s = raw.find("window['__natgeo__']") s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)] script = raw[s:raw.find('</script>', s)]
data = json.loads(script[script.find('{'):].rstrip(';'))['page']['content'] return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
if 'article' in data:
return data['article']
return data['prismarticle']
def parse_contributors(grp): def parse_contributors(grp):
@ -39,63 +36,99 @@ def parse_contributors(grp):
def parse_lead_image(media): def parse_lead_image(media):
if 'image' in media: if 'image' in media:
yield '<p>'
if 'dsc' in media['image']: if 'dsc' in media['image']:
yield '<p><div><img src="{}" alt="{}"></div>'.format( yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)) escape(media['image']['src'], True), escape(media['image']['dsc'], True))
else: else:
yield '<p><div><img src="{}"></div>'.format(escape(media['image']['src'], True)) yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media: if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>' yield '<div class="cap">' + media['caption'] + '</div>'
if 'credit' in media:
yield '<div class="cred">' + media['credit'] + '</div>'
yield '</p>' yield '</p>'
def parse_body(item): def parse_inline(inl):
c = item['cntnt'] if inl.get('content', {}).get('name', '') == 'Image':
if item.get('type') == 'inline': props = inl['content']['props']
if c.get('cmsType') == 'listicle': yield '<p>'
if 'title' in c: if 'image' in props:
yield '<h3>' + escape(c['title']) + '</h3>' yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
yield c['text'] if 'caption' in props:
elif c.get('cmsType') == 'image': yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
for line in parse_lead_image(c): props['caption']['text'], ' ' + props['caption']['credit']
yield line )
elif c.get('cmsType') == 'imagegroup': yield '</p>'
for imgs in c['images']: if inl.get('content', {}).get('name', '') == 'ImageGroup':
for line in parse_lead_image(imgs): if 'images' in inl['content']['props']:
yield line for imgs in inl['content']['props']['images']:
elif c.get('cmsType') == 'pullquote': yield '<p>'
if 'quote' in c: if 'src' in imgs:
yield '<blockquote>' + c['quote'] + '</blockquote>' yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
elif c.get('cmsType') == 'editorsNote': if 'caption' in imgs:
if 'note' in c: yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
yield '<blockquote>' + c['note'] + '</blockquote>' imgs['caption']['text'], ' ' + imgs['caption']['credit']
else: )
if c['mrkup'].strip().startswith('<'): yield '</p>'
yield c['mrkup']
else:
yield '<{tag}>{markup}</{tag}>'.format( def parse_cont(content):
tag=item['type'], markup=c['mrkup']) for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg): def parse_article(edg):
sc = edg['schma'] sc = edg['schma']
yield '<h3 class="sub">' + escape(edg['sctn']) + '</h3>' yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>' yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div><br>' yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg['cntrbGrp']): for line in parse_contributors(edg['cntrbGrp']):
yield line yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>' yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg: if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div><br>' yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image': if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']): for line in parse_lead_image(edg['ldMda']):
yield line yield line
for item in edg['bdy']: for main in edg['prismData']['mainComponents']:
for line in parse_body(item): if main['name'] == 'Body':
yield line for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
for inl in parse_inline(item):
yield inl
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data): def article_parse(data):
@ -134,11 +167,12 @@ class NatGeo(BasicNewsRecipe):
resolve_internal_links = True resolve_internal_links = True
extra_css = ''' extra_css = '''
.sub, blockquote { color:#404040; } blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; } .byline, i { font-style:italic; color:#202020; }
.cap {text-align:center; font-size:small; } .cap { font-size:small; }
.cred {text-align:center; font-size:small; color:#404040; } img {display:block; margin:0 auto;}
.auth, .time { font-size:small; color:#5c5c5c; } .cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
''' '''
def parse_index(self): def parse_index(self):
@ -183,9 +217,11 @@ class NatGeo(BasicNewsRecipe):
return '\n'.join(article_parse(data)) return '\n'.join(article_parse(data))
def preprocess_html(self, soup): def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'
for img in soup.findAll('img', src=True): for img in soup.findAll('img', src=True):
# for high res images use '?w=2000&h=2000' # for high res images use '?w=2000&h=2000'
img['src'] = img['src'] + '?w=1200&h=1200' img['src'] = img['src'] + '?w=600&h=600'
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):