unkn0w7n 2024-10-06 12:19:34 +05:30
parent 7c2aeb5422
commit 91c0aa0b1f
5 changed files with 259 additions and 477 deletions


@@ -1,151 +1,11 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import json
from pprint import pformat
from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre import prepare_string_for_xml as escape
from calibre.utils.iso8601 import parse_iso8601
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
for item in grp:
line = '<div class="auth">' + escape(item['title']) + ' '
for c in item['contributors']:
line += escape(c['displayName'])
yield line + '</div>'
def parse_lead_image(media):
if 'image' in media:
yield '<p>'
if 'dsc' in media['image']:
yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)
)
else:
yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>'
yield '</p>'
def parse_inline(inl):
if inl.get('content', {}).get('name', '') == 'Image':
props = inl['content']['props']
yield '<p>'
if 'image' in props:
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
for imgs in inl['content']['props']['images']:
yield '<p>'
if 'src' in imgs:
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
)
yield '</p>'
def parse_cont(content):
for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg.get('cntrbGrp', {})):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
for main in edg['prismData']['mainComponents']:
if main['name'] == 'Body':
for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
yield ''.join(parse_inline(item))
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data):
yield "<html><body>"
for frm in data['frms']:
if not frm:
continue
for mod in frm.get('mods', ()):
for edg in mod.get('edgs', ()):
if edg.get('cmsType') == 'ImmersiveLeadTile':
if 'image' in edg.get('cmsImage', {}):
for line in parse_lead_image(edg['cmsImage']):
yield line
if edg.get('cmsType') == 'ArticleBodyTile':
for line in parse_article(edg):
yield line
yield "</body></html>"
class NatGeo(BasicNewsRecipe):
title = 'National Geographic'
description = 'News articles from The National Geographic, Download Monthly.'
language = 'en'
encoding = 'utf8'
@@ -167,24 +27,40 @@ class NatGeo(BasicNewsRecipe):
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
'default': '600',
}
}
@property
def natgeo_parser(self):
ans = getattr(self, '_natgeo_parser', None)
if ans is None:
from calibre.live import load_module
self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
return ans
def preprocess_raw_html(self, raw_html, url):
return self.natgeo_parser.extract_html(raw_html)
extra_css = """
blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
.cap { font-size:small; }
img {display:block; margin:0 auto;}
.cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
"""
def get_cover_url(self):
# soup = self.index_to_soup('https://www.nationalgeographic.com/magazine/')
# png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', soup.decode('utf-8'))
from datetime import date
url = (
'https://www.nationalgeographic.com/magazine/issue/'
+ (date.today().strftime('%B-%Y')).lower()
)
soup = self.index_to_soup(url)
png = soup.find('meta', attrs={'property': 'og:image'})['content'].split('?')
return png[0] + '?w=1000&h=1000'
@@ -195,7 +71,7 @@ class NatGeo(BasicNewsRecipe):
'https://www.nationalgeographic.com/environment',
'https://www.nationalgeographic.com/history',
'https://www.nationalgeographic.com/science',
'https://www.nationalgeographic.com/travel',
]
feeds = []
@@ -217,16 +93,14 @@ class NatGeo(BasicNewsRecipe):
section = self.tag_to_string(article.find(**classes('SectionLabel')))
if section.startswith('Paid Content'):
continue
title = self.tag_to_string(
article.find(**classes('PromoTile__Title--truncated'))
)
articles = ans.setdefault(section, [])
articles.append({'title': title, 'url': url})
self.log(pformat(ans))
return list(ans.items())
def preprocess_raw_html(self, raw_html, url):
data = extract_json(raw_html)
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'


@@ -1,150 +1,10 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe, classes
import json
from calibre import prepare_string_for_xml as escape
from calibre.utils.iso8601 import parse_iso8601
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
for item in grp:
line = '<div class="auth">' + escape(item['title']) + ' '
for c in item['contributors']:
line += escape(c['displayName'])
yield line + '</div>'
def parse_lead_image(media):
if 'image' in media:
yield '<p>'
if 'dsc' in media['image']:
yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)
)
else:
yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>'
yield '</p>'
def parse_inline(inl):
if inl.get('content', {}).get('name', '') == 'Image':
props = inl['content']['props']
yield '<p>'
if 'image' in props:
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
for imgs in inl['content']['props']['images']:
yield '<p>'
if 'src' in imgs:
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
)
yield '</p>'
def parse_cont(content):
for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg.get('cntrbGrp', {})):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
for main in edg['prismData']['mainComponents']:
if main['name'] == 'Body':
for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
yield ''.join(parse_inline(item))
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data):
yield "<html><body>"
for frm in data['frms']:
if not frm:
continue
for mod in frm.get('mods', ()):
for edg in mod.get('edgs', ()):
if edg.get('cmsType') == 'ImmersiveLeadTile':
if 'image' in edg.get('cmsImage', {}):
for line in parse_lead_image(edg['cmsImage']):
yield line
if edg.get('cmsType') == 'ArticleBodyTile':
for line in parse_article(edg):
yield line
yield "</body></html>"
class NatGeo(BasicNewsRecipe):
title = 'National Geographic History'
description = (
'From Caesar to Napoleon, the Pyramids to the Parthenon, the Trojan War to the Civil War—National Geographic '
'HISTORY draws readers in with more than 5,000 years of people, places, and things to explore.'
@@ -167,18 +27,30 @@ class NatGeo(BasicNewsRecipe):
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
'default': '600',
}
}
@property
def natgeo_parser(self):
ans = getattr(self, '_natgeo_parser', None)
if ans is None:
from calibre.live import load_module
self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
return ans
def preprocess_raw_html(self, raw_html, url):
return self.natgeo_parser.extract_html(raw_html)
extra_css = """
blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
.cap { font-size:small; }
img {display:block; margin:0 auto;}
.cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
"""
def get_cover_url(self):
soup = self.index_to_soup('https://ngsingleissues.nationalgeographic.com/history')
@@ -186,22 +58,22 @@ class NatGeo(BasicNewsRecipe):
return wrap.img['src']
def parse_index(self):
soup = self.index_to_soup(
'https://www.nationalgeographic.com/history/history-magazine'
)
ans = []
for article in soup.findAll('article'):
a = article.find('a')
url = a['href']
if url.startswith('/'):
url = 'https://www.nationalgeographic.com' + url
title = self.tag_to_string(
article.find(**classes('PromoTile__Title--truncated'))
)
ans.append({'title': title, 'url': url})
self.log(title, ' ', url)
return [('Articles', ans)]
def preprocess_raw_html(self, raw_html, url):
data = extract_json(raw_html)
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'


@@ -1,152 +1,12 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import json
from datetime import date
from pprint import pformat
from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre import prepare_string_for_xml as escape
from calibre.utils.iso8601 import parse_iso8601
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s:raw.find('</script>', s)]
return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle']
def parse_contributors(grp):
for item in grp:
line = '<div class="auth">' + escape(item['title']) + ' '
for c in item['contributors']:
line += escape(c['displayName'])
yield line + '</div>'
def parse_lead_image(media):
if 'image' in media:
yield '<p>'
if 'dsc' in media['image']:
yield '<div><img src="{}" alt="{}"></div>'.format(
escape(media['image']['src'], True), escape(media['image']['dsc'], True)
)
else:
yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
if 'caption' in media and 'credit' in media:
yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>'
yield '</p>'
def parse_inline(inl):
if inl.get('content', {}).get('name', '') == 'Image':
props = inl['content']['props']
yield '<p>'
if 'image' in props:
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
for imgs in inl['content']['props']['images']:
yield '<p>'
if 'src' in imgs:
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
)
yield '</p>'
def parse_cont(content):
for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
for yld in parse_cont(x):
yield yld
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
for line in parse_contributors(edg.get('cntrbGrp', {})):
yield line
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image':
for line in parse_lead_image(edg['ldMda']):
yield line
for main in edg['prismData']['mainComponents']:
if main['name'] == 'Body':
for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
yield ''.join(parse_inline(item))
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data):
yield "<html><body>"
for frm in data['frms']:
if not frm:
continue
for mod in frm.get('mods', ()):
for edg in mod.get('edgs', ()):
if edg.get('cmsType') == 'ImmersiveLeadTile':
if 'image' in edg.get('cmsImage', {}):
for line in parse_lead_image(edg['cmsImage']):
yield line
if edg.get('cmsType') == 'ArticleBodyTile':
for line in parse_article(edg):
yield line
yield "</body></html>"
class NatGeo(BasicNewsRecipe):
title = 'National Geographic Magazine'
description = 'The National Geographic, an American monthly magazine'
language = 'en'
encoding = 'utf8'
@@ -163,26 +23,38 @@ class NatGeo(BasicNewsRecipe):
remove_empty_feeds = True
resolve_internal_links = True
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (Month-YYYY format)',
'long': 'For example, March-2023',
},
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
'default': '600',
},
}
@property
def natgeo_parser(self):
ans = getattr(self, '_natgeo_parser', None)
if ans is None:
from calibre.live import load_module
self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
return ans
def preprocess_raw_html(self, raw_html, url):
return self.natgeo_parser.extract_html(raw_html)
extra_css = """
blockquote { color:#404040; }
.byline, i { font-style:italic; color:#202020; }
.cap { font-size:small; }
img {display:block; margin:0 auto;}
.cred { font-style:italic; font-size:small; color:#404040; }
.auth, .time, .sub { font-size:small; color:#5c5c5c; }
"""
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (Month-YYYY format)',
'long': 'For example, March-2023'
},
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
'default': '600'
}
}
def parse_index(self):
edition = date.today().strftime('%B-%Y')
@@ -195,11 +67,19 @@ class NatGeo(BasicNewsRecipe):
soup = self.index_to_soup(url)
# png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', soup.decode('utf-8'))
# self.cover_url = png[0] + '?w=1000&h=1000'
self.cover_url = (
soup.find('meta', attrs={'property': 'og:image'})['content'].split('?')[0]
+ '?w=1000'
)
# self.title = 'National Geographic ' + self.tag_to_string(name)
ans = {}
if photoart := soup.find(
attrs={
'class': lambda x: x
and 'BgImagePromo__Container__Text__Link' in x.split()
}
):
section = 'Photo Essay'
title = self.tag_to_string(photoart)
url = photoart['href']
@@ -211,7 +91,9 @@ class NatGeo(BasicNewsRecipe):
if promo.find('a', attrs={'href': True}) and promo.a.get('href'):
url = promo.a['href']
section = self.tag_to_string(promo.find(**classes('SectionLabel')))
title = self.tag_to_string(
promo.find(**classes('Card__Content__Heading'))
)
articles = ans.setdefault(section, [])
articles.append({'title': title, 'url': url})
for gird in soup.findAll(attrs={'class': 'GridPromoTile'}):
@@ -223,16 +105,14 @@ class NatGeo(BasicNewsRecipe):
if '/graphics/' in url:
continue
section = self.tag_to_string(article.find(**classes('SectionLabel')))
title = self.tag_to_string(
article.find(**classes('PromoTile__Title--truncated'))
)
articles = ans.setdefault(section, [])
articles.append({'title': title, 'url': url})
self.log(pformat(ans))
return list(ans.items())
def preprocess_raw_html(self, raw_html, url):
data = extract_json(raw_html)
return '\n'.join(article_parse(data))
def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'


@@ -28,10 +28,10 @@ from calibre.gui2 import gprefs
from calibre.gui2.tweak_book.editor.canvas import Canvas
def reduce_to_ratio(w, h, r):
# shrink (w, h) to the largest size with aspect ratio r (= width / height), rounding to whole pixels
h = min(h, w / r)
w = r * h
return int(round(w)), int(round(h))
class Region(QDialog):


@@ -0,0 +1,156 @@
#!/usr/bin/env python
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)
import json
from pprint import pprint
from calibre import prepare_string_for_xml as escape
from calibre.utils.iso8601 import parse_iso8601
module_version = 1 # needed for live updates
pprint  # bare reference keeps the pprint debugging import from being flagged as unused
def extract_json(raw):
s = raw.find("window['__natgeo__']")
script = raw[s : raw.find('</script>', s)]
return json.loads(script[script.find('{') :].rstrip(';'))['page']['content'][
'prismarticle'
]
def parse_contributors(grp):
for item in grp:
line = '<div class="auth">' + escape(item['title']) + ' '
for c in item['contributors']:
line += escape(c['displayName'])
yield line + '</div>'
def parse_lead_image(media):
if 'image' in media:
yield '<p>'
if 'dsc' in media['image']:
yield (
f'<div><img src="{escape(media["image"]["src"], True)}" '
f'alt="{escape(media["image"]["dsc"], True)}"></div>'
)
else:
yield f'<div><img src="{escape(media["image"]["src"], True)}"></div>'
if 'caption' in media and 'credit' in media:
yield (
'<div class="cap">'
+ media['caption']
+ '<span class="cred"> '
+ media['credit']
+ '</span></div>'
)
elif 'caption' in media:
yield '<div class="cap">' + media['caption'] + '</div>'
yield '</p>'
def parse_inline(inl):
if inl.get('content', {}).get('name', '') == 'Image':
props = inl['content']['props']
yield '<p>'
if 'image' in props:
yield f'<div class="img"><img src="{props["image"]["src"]}"></div>'
if 'caption' in props:
yield (
f'<div class="cap">{props["caption"].get("text", "")}<span '
f'class="cred"> {props["caption"].get("credit", "")}</span></div>'
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
for imgs in inl['content']['props']['images']:
yield '<p>'
if 'src' in imgs:
yield f'<div class="img"><img src="{imgs["src"]}"></div>'
if 'caption' in imgs:
yield (
f'<div class="cap">{imgs["caption"].get("text", "")}<span '
f'class="cred"> {imgs["caption"].get("credit", "")}</span></div>'
)
yield '</p>'
def parse_cont(content):
for cont in content.get('content', {}):
if isinstance(cont, dict):
yield from parse_body(cont)
if isinstance(cont, str):
yield cont
def parse_body(x):
if isinstance(x, dict):
if 'type' in x:
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + f' href="{x["attrs"]["href"]}">'
yield from parse_cont(x)
yield '</' + tag + '>'
else:
yield '<' + tag + '>'
yield from parse_cont(x)
yield '</' + tag + '>'
elif isinstance(x, list):
for y in x:
if isinstance(y, dict):
yield from parse_body(y)
def parse_article(edg):
sc = edg['schma']
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
yield '<p>'
yield from parse_contributors(edg.get('cntrbGrp', {}))
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
yield '<div class="time">Published: ' + escape(ts) + '</div>'
if 'readTime' in edg:
yield '<div class="time">' + escape(edg['readTime']) + '</div>'
yield '</p>'
if edg.get('ldMda', {}).get('cmsType') == 'image':
yield from parse_lead_image(edg['ldMda'])
for main in edg['prismData']['mainComponents']:
if main['name'] == 'Body':
for item in main['props']['body']:
if isinstance(item, dict):
if item.get('type', '') == 'inline':
yield ''.join(parse_inline(item))
elif isinstance(item, list):
for line in item:
yield ''.join(parse_body(line))
def article_parse(data):
yield '<html><body>'
for frm in data['frms']:
if not frm:
continue
for mod in frm.get('mods', ()):
for edg in mod.get('edgs', ()):
if edg.get('cmsType') == 'ImmersiveLeadTile':
if 'image' in edg.get('cmsImage', {}):
yield from parse_lead_image(edg['cmsImage'])
if edg.get('cmsType') == 'ArticleBodyTile':
yield from parse_article(edg)
yield '</body></html>'
def extract_html(raw_html):
data = extract_json(raw_html)
return '\n'.join(article_parse(data))
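A minimal usage sketch (not part of the commit; variable names here are illustrative): the three recipes above consume this module lazily through calibre's live-module loader, as their natgeo_parser property shows, so parser fixes can ship independently of a calibre release.
from calibre.live import load_module

natgeo = load_module('calibre.web.site_parsers.natgeo')  # loads this site parser, picking up live updates
html = natgeo.extract_html(raw_html)  # raw_html: the downloaded article page source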