diff --git a/recipes/icons/natgeo_kids.png b/recipes/icons/natgeo_kids.png new file mode 100644 index 0000000000..de34804a9a Binary files /dev/null and b/recipes/icons/natgeo_kids.png differ diff --git a/recipes/icons/natgeo_traveller.png b/recipes/icons/natgeo_traveller.png new file mode 100644 index 0000000000..69834ab831 Binary files /dev/null and b/recipes/icons/natgeo_traveller.png differ diff --git a/recipes/natgeo_kids.recipe b/recipes/natgeo_kids.recipe new file mode 100644 index 0000000000..8884f8488d --- /dev/null +++ b/recipes/natgeo_kids.recipe @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 + +from calibre.web.feeds.news import BasicNewsRecipe + + +class NatGeo(BasicNewsRecipe): + title = 'National Geographic Kids' + description = 'The National Geographic, an American monthly magazine' + language = 'en' + encoding = 'utf8' + publisher = 'kids.nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'unkn0wn' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + use_embedded_content = False + remove_javascript = True + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + remove_empty_feeds = True + resolve_internal_links = True + ignore_duplicate_articles = {'title', 'url'} + + recipe_specific_options = { + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + }, + } + + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ + blockquote { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap { font-size:small; } + img {display:block; margin:0 auto;} + .cred { font-style:italic; font-size:small; color:#404040; } + .auth, .time, .sub { font-size:small; color:#5c5c5c; } + """ + + def parse_index(self): + index = 'https://kids.nationalgeographic.com/' + sections = [ + 'Front Page', 'animals', 'history', 'science', + 'space', 'homework-help', 'crafts', + ] + feeds = [] + for sec in sections: + section = sec.capitalize() + self.log(section) + url = index + sec + if sec.startswith('Front'): + url = index + self.log('Fetching articles from ', url) + soup = self.index_to_soup(url) + articles = [] + for a in soup.findAll('a', attrs={'href': lambda x: x and '/article/' in x}): + if a.find('img') and '/games/' in a['href']: + continue + url = a['href'] + title = self.tag_to_string(a) + self.log('\t', title, '\n\t\t', url) + articles.append({'title': title, 'url': url}) + if articles: + feeds.append((section, articles)) + return feeds + + def preprocess_html(self, soup): + for h2 in soup.findAll('h2'): + h2.name = 'h4' + for img in soup.findAll('img', src=True): + res = '?w=600' + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '?w=' + w + img['src'] = img['src'] + res + return soup + + def populate_article_metadata(self, article, soup, first): + summ = soup.find(attrs={'class': 'byline'}) + if summ: + article.summary = self.tag_to_string(summ) + article.text_summary = self.tag_to_string(summ) diff --git a/recipes/natgeo_traveller.recipe b/recipes/natgeo_traveller.recipe new file mode 100644 index 0000000000..eed9e5a0c2 --- /dev/null +++ b/recipes/natgeo_traveller.recipe @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from pprint import pformat + +from calibre.web.feeds.news import BasicNewsRecipe, classes + + +class NatGeo(BasicNewsRecipe): + title = 'National Geographic Traveller' + description = 'News articles from The National Geographic Traveller, Download Monthly.' + language = 'en' + encoding = 'utf8' + publisher = 'nationalgeographic.com' + category = 'science, nat geo' + __author__ = 'unkn0wn' + description = 'Inspiring people to care about the planet since 1888' + timefmt = ' [%a, %d %b, %Y]' + no_stylesheets = True + use_embedded_content = False + remove_attributes = ['style'] + remove_javascript = False + masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600' + remove_empty_feeds = True + resolve_internal_links = True + ignore_duplicate_articles = {'url'} + + recipe_specific_options = { + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + } + } + + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ + blockquote { color:#404040; } + .byline, i { font-style:italic; color:#202020; } + .cap { font-size:small; } + img {display:block; margin:0 auto;} + .cred { font-style:italic; font-size:small; color:#404040; } + .auth, .time, .sub { font-size:small; color:#5c5c5c; } + """ + + def parse_index(self): + pages = [ + 'https://www.nationalgeographic.com/travel/topic/national-geographic-traveller-uk' + ] + + feeds = [] + + for sec in pages: + soup = self.index_to_soup(sec) + parsed = self.articles_from_soup(soup) + if parsed: + feeds += parsed + return feeds + + def articles_from_soup(self, soup): + ans = {} + for article in soup.findAll('article'): + a = article.find('a') + url = a['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + url + section = self.tag_to_string(article.find(**classes('SectionLabel'))) + if section.startswith('Paid Content'): + continue + title = self.tag_to_string( + article.find(**classes('PromoTile__Title--truncated')) + ) + articles = ans.setdefault(section, []) + articles.append({'title': title, 'url': url}) + self.log(pformat(ans)) + return list(ans.items()) + + def preprocess_html(self, soup): + for h2 in soup.findAll('h2'): + h2.name = 'h4' + for img in soup.findAll('img', src=True): + res = '?w=600' + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + res = '?w=' + w + img['src'] = img['src'] + res + return soup + + def populate_article_metadata(self, article, soup, first): + summ = soup.find(attrs={'class': 'byline'}) + if summ: + article.summary = self.tag_to_string(summ) + article.text_summary = self.tag_to_string(summ) diff --git a/src/calibre/web/site_parsers/natgeo.py b/src/calibre/web/site_parsers/natgeo.py index 007515725d..3e60a3e96e 100644 --- a/src/calibre/web/site_parsers/natgeo.py +++ b/src/calibre/web/site_parsers/natgeo.py @@ -15,9 +15,11 @@ pprint def extract_json(raw): s = raw.find("window['__natgeo__']") script = raw[s : raw.find('', s)] - return json.loads(script[script.find('{') :].rstrip(';'))['page']['content'][ - 'prismarticle' - ] + content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content'] + if content.get('prismarticle'): + return content['prismarticle'] + if content.get('article'): + return content['article'] def parse_contributors(grp): @@ -104,12 +106,37 @@ def parse_body(x): if isinstance(y, dict): yield from parse_body(y) +def parse_bdy(item): + c = item['cntnt'] + if item.get('type') == 'inline': + if c.get('cmsType') == 'listicle': + if 'title' in c: + yield '

' + escape(c['title']) + '

' + yield c['text'] + elif c.get('cmsType') == 'image': + yield from parse_lead_image(c) + elif c.get('cmsType') == 'imagegroup': + for imgs in c['images']: + yield from parse_lead_image(imgs) + elif c.get('cmsType') == 'pullquote': + if 'quote' in c: + yield '
' + c['quote'] + '
' + elif c.get('cmsType') == 'editorsNote': + if 'note' in c: + yield '
' + c['note'] + '
' + else: + if c['mrkup'].strip().startswith('<'): + yield c['mrkup'] + else: + yield '<{tag}>{markup}'.format( + tag=item['type'], markup=c['mrkup']) def parse_article(edg): sc = edg['schma'] yield '
' + escape(edg['sctn']) + '
' yield '

' + escape(sc['sclTtl']) + '

' - yield '
' + escape(sc['sclDsc']) + '
' + if sc.get('sclDsc'): + yield '
' + escape(sc['sclDsc']) + '
' yield '

' yield from parse_contributors(edg.get('cntrbGrp', {})) ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y') @@ -119,15 +146,19 @@ def parse_article(edg): yield '

' if edg.get('ldMda', {}).get('cmsType') == 'image': yield from parse_lead_image(edg['ldMda']) - for main in edg['prismData']['mainComponents']: - if main['name'] == 'Body': - for item in main['props']['body']: - if isinstance(item, dict): - if item.get('type', '') == 'inline': - yield ''.join(parse_inline(item)) - elif isinstance(item, list): - for line in item: - yield ''.join(parse_body(line)) + if edg.get('prismData'): + for main in edg['prismData']['mainComponents']: + if main['name'] == 'Body': + for item in main['props']['body']: + if isinstance(item, dict): + if item.get('type', '') == 'inline': + yield ''.join(parse_inline(item)) + elif isinstance(item, list): + for line in item: + yield ''.join(parse_body(line)) + elif edg.get('bdy'): + for item in edg['bdy']: + yield from parse_bdy(item) def article_parse(data): diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 56b1db6f1d..172088210d 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 10 # needed for live updates +module_version = 11 # needed for live updates pprint @@ -183,6 +183,7 @@ def parse_types(x): 'RelatedLinksBlock', 'EmailSignupBlock', 'Dropzone', + 'AudioBlock', }: yield ''.join(parse_cnt(x))