From dd9b8ba1c845412307516c5f93cb9e1a11767379 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 5 Sep 2023 17:44:55 +0530 Subject: [PATCH 1/2] National Geographic --- recipes/icons/natgeohis.png | Bin 0 -> 111 bytes recipes/icons/natgeomag.png | Bin 0 -> 111 bytes recipes/natgeo.recipe | 73 +++++++++++++---- recipes/natgeohis.recipe | 141 ++++++++++++++++++++++++++++++++ recipes/natgeomag.recipe | 159 ++++++++++++++++++++++++++++++++++++ 5 files changed, 357 insertions(+), 16 deletions(-) create mode 100644 recipes/icons/natgeohis.png create mode 100644 recipes/icons/natgeomag.png create mode 100644 recipes/natgeohis.recipe create mode 100644 recipes/natgeomag.recipe diff --git a/recipes/icons/natgeohis.png b/recipes/icons/natgeohis.png new file mode 100644 index 0000000000000000000000000000000000000000..8a7f1b583d2b1179a699179d8255fad6901e3a3d GIT binary patch literal 111 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJTYyi9E0F&GieZ1=G#4O?u_VYZ zn8D%MjWi%f+|$J|q~ccc0XB(;4BHu6' + yield line + '' def parse_lead_image(media): - yield '
{}
'.format( - escape(media['image']['src'], True), escape(media['image']['dsc'], True)) - yield '

' + escape(media['caption']) + '

' + if 'dsc' in media['image']: + yield '
{}
'.format( + escape(media['image']['src'], True), escape(media['image']['dsc'], True)) + else: + yield '
'.format(escape(media['image']['src'], True)) + if 'caption' in media: + yield '
' + media['caption'] + '
' if 'credit' in media: - yield '

' + escape(media['credit']) + '

' + yield '
' + media['credit'] + '
' def parse_body(item): c = item['cntnt'] if item.get('type') == 'inline': if c.get('cmsType') == 'listicle': - yield '

' + escape(c['title']) + "

" + if 'title' in c: + yield '

' + escape(c['title']) + "

" yield c['text'] elif c.get('cmsType') == 'image': for line in parse_lead_image(c): yield line else: - yield '<{tag}>{markup}'.format( - tag=item['type'], markup=c['mrkup']) + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) def parse_article(edg): sc = edg['schma'] - yield '

' + escape(edg['sctn']) + '

' + yield '

' + escape(edg['sctn']) + '

' yield '

' + escape(sc['sclTtl']) + '

' - yield '
' + escape(sc['sclDsc']) + '
' + yield '
' + escape(sc['sclDsc']) + '

' for line in parse_contributors(edg['cntrbGrp']): yield line ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') - yield '

Published: ' + escape(ts) + '

' + yield '
Published: ' + escape(ts) + '
' if 'readTime' in edg: - yield '

' + escape(edg['readTime']) + '

' + yield '
' + escape(edg['readTime']) + '

' if edg.get('ldMda', {}).get('cmsType') == 'image': for line in parse_lead_image(edg['ldMda']): yield line @@ -87,26 +95,54 @@ def article_parse(data): class NatGeo(BasicNewsRecipe): title = u'National Geographic' - description = 'Daily news articles from The National Geographic' + description = 'News articles from The National Geographic, Download Monthly.' language = 'en' encoding = 'utf8' publisher = 'nationalgeographic.com' category = 'science, nat geo' - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal, unkn0wn' description = 'Inspiring people to care about the planet since 1888' timefmt = ' [%a, %d %b, %Y]' no_stylesheets = True use_embedded_content = False remove_attributes = ['style'] remove_javascript = False + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + + extra_css = ''' + .sub { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap {text-align:center; font-size:small; } + .cred {text-align:center; font-size:small; color:#404040; } + .auth, .time { font-size:small; color:#5c5c5c; } + ''' def parse_index(self): - soup = self.index_to_soup('https://www.nationalgeographic.com/latest-stories/') + pages = [ + 'https://www.nationalgeographic.com/animals', + 'https://www.nationalgeographic.com/environment', + 'https://www.nationalgeographic.com/history', + 'https://www.nationalgeographic.com/science', + 'https://www.nationalgeographic.com/travel' + ] + + feeds = [] + + for sec in pages: + soup = self.index_to_soup(sec) + parsed = self.articles_from_soup(soup) + if parsed: + feeds += parsed + return feeds + + def articles_from_soup(self, soup): ans = {} for article in soup.findAll('article'): a = article.find('a') url = a['href'] section = self.tag_to_string(article.find(**classes('SectionLabel'))) + if section.startswith('Paid Content'): + continue title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) articles = ans.setdefault(section, []) articles.append({'title': title, 'url': url}) @@ -116,3 +152,8 @@ class NatGeo(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): data = extract_json(raw_html) return '\n'.join(article_parse(data)) + + def preprocess_html(self, soup): + for img in soup.findAll('img', src=True): + img['src'] = img['src'] + '?w=700&h=700' + return soup diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe new file mode 100644 index 0000000000..808127cc84 --- /dev/null +++ b/recipes/natgeohis.recipe @@ -0,0 +1,141 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +from pprint import pformat + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre import prepare_string_for_xml as escape +from calibre.utils.iso8601 import parse_iso8601 + + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + +def extract_json(raw): + s = raw.find("window['__natgeo__']") + script = raw[s:raw.find('', s)] + return json.loads( + script[script.find('{'):].rstrip(';'))['page']['content']['article'] + + +def parse_contributors(grp): + for item in grp: + line = '
' + escape(item['title']) + ' ' + for c in item['contributors']: + line += escape(c['displayName']) + yield line + '
' + + +def parse_lead_image(media): + if 'dsc' in media['image']: + yield '
{}
'.format( + escape(media['image']['src'], True), escape(media['image']['dsc'], True)) + else: + yield '
'.format(escape(media['image']['src'], True)) + if 'caption' in media: + yield '
' + media['caption'] + '
' + if 'credit' in media: + yield '
' + media['credit'] + '
' + + +def parse_body(item): + c = item['cntnt'] + if item.get('type') == 'inline': + if c.get('cmsType') == 'listicle': + if 'title' in c: + yield '

' + escape(c['title']) + "

" + yield c['text'] + elif c.get('cmsType') == 'image': + for line in parse_lead_image(c): + yield line + else: + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) + + +def parse_article(edg): + sc = edg['schma'] + yield '

' + escape(edg['sctn']) + '

' + yield '

' + escape(sc['sclTtl']) + '

' + yield '
' + escape(sc['sclDsc']) + '

' + for line in parse_contributors(edg['cntrbGrp']): + yield line + ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') + yield '
Published: ' + escape(ts) + '
' + if 'readTime' in edg: + yield '
' + escape(edg['readTime']) + '

' + if edg.get('ldMda', {}).get('cmsType') == 'image': + for line in parse_lead_image(edg['ldMda']): + yield line + for item in edg['bdy']: + for line in parse_body(item): + yield line + + +def article_parse(data): + yield "" + for frm in data['frms']: + if not frm: + continue + for mod in frm.get('mods', ()): + for edg in mod.get('edgs', ()): + if edg.get('cmsType') == 'ArticleBodyTile': + for line in parse_article(edg): + yield line + yield "" + + +class NatGeo(BasicNewsRecipe): + title = u'National Geographic History' + description = ( + 'From Caesar to Napoleon, the Pyramids to the Parthenon, the Trojan War to the Civil War—National Geographic ' + 'HISTORY draws readers in with more than 5,000 years of people, places, and things to explore.' + ) + language = 'en' + encoding = 'utf8' + publisher = 'nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'Kovid Goyal' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] + remove_javascript = False + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + + extra_css = ''' + .sub { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap {text-align:center; font-size:small; } + .cred {text-align:center; font-size:small; color:#404040; } + .auth, .time { font-size:small; color:#5c5c5c; } + ''' + + def parse_index(self): + soup = self.index_to_soup('https://www.nationalgeographic.com/history/history-magazine') + ans = [] + for article in soup.findAll('article'): + a = article.find('a') + url = a['href'] + title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) + ans.append({'title': title, 'url': url}) + self.log(title, ' ', url) + return [('Articles', ans)] + + def preprocess_raw_html(self, raw_html, url): + data = extract_json(raw_html) + return '\n'.join(article_parse(data)) + + def preprocess_html(self, soup): + for img in soup.findAll('img', src=True): + img['src'] = img['src'] + '?w=1000&h=1000' + return soup diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe new file mode 100644 index 0000000000..1e10f585bb --- /dev/null +++ b/recipes/natgeomag.recipe @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +import json, re +from pprint import pformat + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre import prepare_string_for_xml as escape +from calibre.utils.iso8601 import parse_iso8601 + + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + +def extract_json(raw): + s = raw.find("window['__natgeo__']") + script = raw[s:raw.find('', s)] + return json.loads( + script[script.find('{'):].rstrip(';'))['page']['content']['article'] + + +def parse_contributors(grp): + for item in grp: + line = '
' + escape(item['title']) + ' ' + for c in item['contributors']: + line += escape(c['displayName']) + yield line + '
' + + +def parse_lead_image(media): + if 'dsc' in media['image']: + yield '
{}
'.format( + escape(media['image']['src'], True), escape(media['image']['dsc'], True)) + else: + yield '
'.format(escape(media['image']['src'], True)) + if 'caption' in media: + yield '
' + media['caption'] + '
' + if 'credit' in media: + yield '
' + media['credit'] + '
' + + +def parse_body(item): + c = item['cntnt'] + if item.get('type') == 'inline': + if c.get('cmsType') == 'listicle': + if 'title' in c: + yield '

' + escape(c['title']) + "

" + yield c['text'] + elif c.get('cmsType') == 'image': + for line in parse_lead_image(c): + yield line + else: + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) + + +def parse_article(edg): + sc = edg['schma'] + yield '

' + escape(edg['sctn']) + '

' + yield '

' + escape(sc['sclTtl']) + '

' + yield '
' + escape(sc['sclDsc']) + '

' + for line in parse_contributors(edg['cntrbGrp']): + yield line + ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') + yield '
Published: ' + escape(ts) + '
' + if 'readTime' in edg: + yield '
' + escape(edg['readTime']) + '

' + if edg.get('ldMda', {}).get('cmsType') == 'image': + for line in parse_lead_image(edg['ldMda']): + yield line + for item in edg['bdy']: + for line in parse_body(item): + yield line + + +def article_parse(data): + yield "" + for frm in data['frms']: + if not frm: + continue + for mod in frm.get('mods', ()): + for edg in mod.get('edgs', ()): + if edg.get('cmsType') == 'ArticleBodyTile': + for line in parse_article(edg): + yield line + yield "" + + +class NatGeo(BasicNewsRecipe): + title = u'National Geographic Magazine' + description = 'The National Geographic, an American monthly magazine' + language = 'en' + encoding = 'utf8' + publisher = 'nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'Kovid Goyal, unkn0wn' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] + remove_javascript = False + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + + extra_css = ''' + .sub { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap {text-align:center; font-size:small; } + .cred {text-align:center; font-size:small; color:#404040; } + .auth, .time { font-size:small; color:#5c5c5c; } + ''' + + def parse_index(self): + issues = self.index_to_soup('https://www.nationalgeographic.com/magazine') + mag = issues.find('a', attrs={'href':lambda x: x and x.startswith( + 'https://www.nationalgeographic.com/magazine/issue/' + )}) + self.timefmt = ' [' + self.tag_to_string(mag).replace(' Issue', '') + ']' + soup = self.index_to_soup(mag['href']) + png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-magazine-\S+?\.jpg', soup.decode('utf-8')) + self.cover_url = png[0] + '?w=1000&h=1000' + + name = soup.find(attrs={'class':lambda x: x and 'Header__Description' in x.split()}) + self.title = 'National Geographic ' + self.tag_to_string(name) + ans = {} + ans2 = None + if photoart := soup.find(attrs={'class':lambda x: x and 'BgImagePromo__Container__Text__Link' in x.split()}): + ans2 = [] + title = self.tag_to_string(photoart) + url = 'https://www.nationalgeographic.com' + photoart['href'] + ans2.append(('Photo Essay', [{'title': title, 'url': url}])) + for gird in soup.findAll(attrs={'class':'GridPromoTile'}): + for article in soup.findAll('article'): + a = article.find('a') + url = a['href'] + section = self.tag_to_string(article.find(**classes('SectionLabel'))) + title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) + articles = ans.setdefault(section, []) + articles.append({'title': title, 'url': url}) + self.log(pformat(ans)) + if ans2: + return list(ans.items()) + ans2 + return list(ans.items()) + + def preprocess_raw_html(self, raw_html, url): + data = extract_json(raw_html) + return '\n'.join(article_parse(data)) + + def preprocess_html(self, soup): + for img in soup.findAll('img', src=True): + img['src'] = img['src'] + '?w=1000&h=1000' + return soup From b72ebeb0956bc368d33886f8468aa2eeca913a0c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:18:38 +0530 Subject: [PATCH 2/2] ... --- recipes/natgeohis.recipe | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe index 808127cc84..d562e6b69a 100644 --- a/recipes/natgeohis.recipe +++ b/recipes/natgeohis.recipe @@ -120,6 +120,11 @@ class NatGeo(BasicNewsRecipe): .auth, .time { font-size:small; color:#5c5c5c; } ''' + def get_cover_url(self): + soup = self.index_to_soup('https://ngsingleissues.nationalgeographic.com/history') + wrap = soup.find(attrs={'class':'product-image-wrapper'}) + return wrap.img['src'] + def parse_index(self): soup = self.index_to_soup('https://www.nationalgeographic.com/history/history-magazine') ans = [] @@ -134,7 +139,7 @@ class NatGeo(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): data = extract_json(raw_html) return '\n'.join(article_parse(data)) - + def preprocess_html(self, soup): for img in soup.findAll('img', src=True): img['src'] = img['src'] + '?w=1000&h=1000'