more natgeo recipes

2025-07-07 10:14:46 -04:00 · 2024-10-13 10:52:41 +05:30 · 2024-10-13 10:52:41 +05:30 · 16e73eed0a
commit 16e73eed0a
parent 45525d7f34
6 changed files with 244 additions and 14 deletions
--- a/recipes/icons/natgeo_kids.png
+++ b/recipes/icons/natgeo_kids.png
--- a/recipes/icons/natgeo_traveller.png
+++ b/recipes/icons/natgeo_traveller.png
--- a/recipes/natgeo_kids.recipe
+++ b/recipes/natgeo_kids.recipe
@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class NatGeo(BasicNewsRecipe):
+    """Calibre news recipe for National Geographic Kids.
+
+    Each section page of kids.nationalgeographic.com is scraped for links
+    whose href contains ``/article/``. Downloaded article pages are turned
+    into HTML by the ``calibre.web.site_parsers.natgeo`` module.
+    """
+
+    title = 'National Geographic Kids'
+    language = 'en'
+    encoding = 'utf8'
+    publisher = 'kids.nationalgeographic.com'
+    category = 'science, nat geo'
+    __author__ = 'unkn0wn'
+    description = 'Inspiring people to care about the planet since 1888'
+    timefmt = ' [%a, %d %b, %Y]'
+    use_embedded_content = False
+    remove_javascript = True
+    masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600'
+    remove_empty_feeds = True
+    resolve_internal_links = True
+    ignore_duplicate_articles = {'title', 'url'}
+
+    recipe_specific_options = {
+        'res': {
+            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
+            'default': '600',
+        },
+    }
+
+    @property
+    def natgeo_parser(self):
+        """Return the natgeo site-parser module, loading it on first access.
+
+        The module is obtained through ``calibre.live.load_module`` and then
+        cached on the instance as ``_natgeo_parser``.
+        """
+        ans = getattr(self, '_natgeo_parser', None)
+        if ans is None:
+            from calibre.live import load_module
+
+            self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
+        return ans
+
+    def preprocess_raw_html(self, raw_html, url):
+        """Return the result of the parser's ``extract_html`` for the raw page."""
+        return self.natgeo_parser.extract_html(raw_html)
+
+    extra_css = """
+        blockquote { color:#404040; }
+        .byline, i { font-style:italic; color:#202020; }
+        .cap { font-size:small; }
+        img {display:block; margin:0 auto;}
+        .cred { font-style:italic; font-size:small; color:#404040; }
+        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
+    """
+
+    def parse_index(self):
+        """Build the feed list by scraping each section page for article links.
+
+        Returns a list of ``(section title, articles)`` tuples, where every
+        article is a dict with ``title`` and ``url`` keys. Sections in which
+        no article link was found are left out.
+        """
+        from urllib.parse import urljoin
+
+        index = 'https://kids.nationalgeographic.com/'
+        sections = [
+            'Front Page', 'animals', 'history', 'science',
+            'space', 'homework-help', 'crafts',
+        ]
+        feeds = []
+        for sec in sections:
+            section = sec.capitalize()
+            self.log(section)
+            # 'Front Page' is only a label: it maps to the site root.
+            page_url = index if sec.startswith('Front') else index + sec
+            self.log('Fetching articles from ', page_url)
+            soup = self.index_to_soup(page_url)
+            articles = []
+            for a in soup.findAll('a', attrs={'href': lambda x: x and '/article/' in x}):
+                href = a['href']
+                # Skip image links whose href points into /games/.
+                if a.find('img') and '/games/' in href:
+                    continue
+                # Absolute hrefs pass through unchanged; relative ones are
+                # resolved against the page they were found on.
+                article_url = urljoin(page_url, href)
+                title = self.tag_to_string(a)
+                self.log('\t', title, '\n\t\t', article_url)
+                articles.append({'title': title, 'url': article_url})
+            if articles:
+                feeds.append((section, articles))
+        return feeds
+
+    def preprocess_html(self, soup):
+        """Demote ``<h2>`` to ``<h4>`` and append a width query to every image URL.
+
+        The width is ``600`` unless the ``res`` recipe option holds a
+        non-empty string, in which case that string is used.
+        """
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        # The suffix is identical for every image, so it is computed once.
+        # NOTE(review): presumably calibre replaces recipe_specific_options
+        # with plain string values at run time - confirm.
+        res = '?w=600'
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '?w=' + w
+        for img in soup.findAll('img', src=True):
+            img['src'] = img['src'] + res
+        return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        """Use the text of the ``byline`` element, if any, as the article summary."""
+        summ = soup.find(attrs={'class': 'byline'})
+        if summ:
+            article.summary = article.text_summary = self.tag_to_string(summ)
--- a/recipes/natgeo_traveller.recipe
+++ b/recipes/natgeo_traveller.recipe
@ -0,0 +1,103 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from pprint import pformat
+
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+
+
+class NatGeo(BasicNewsRecipe):
+    """Calibre news recipe for National Geographic Traveller.
+
+    Article tiles are read from the Traveller topic page on
+    nationalgeographic.com and grouped by their section label. Downloaded
+    article pages are turned into HTML by the
+    ``calibre.web.site_parsers.natgeo`` module.
+    """
+
+    title = 'National Geographic Traveller'
+    language = 'en'
+    encoding = 'utf8'
+    publisher = 'nationalgeographic.com'
+    category = 'science, nat geo'
+    __author__ = 'unkn0wn'
+    description = 'Inspiring people to care about the planet since 1888'
+    timefmt = ' [%a, %d %b, %Y]'
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_attributes = ['style']
+    remove_javascript = False
+    masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600'
+    remove_empty_feeds = True
+    resolve_internal_links = True
+    ignore_duplicate_articles = {'url'}
+
+    recipe_specific_options = {
+        'res': {
+            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
+            'default': '600',
+        }
+    }
+
+    @property
+    def natgeo_parser(self):
+        """Return the natgeo site-parser module, loading it on first access.
+
+        The module is obtained through ``calibre.live.load_module`` and then
+        cached on the instance as ``_natgeo_parser``.
+        """
+        ans = getattr(self, '_natgeo_parser', None)
+        if ans is None:
+            from calibre.live import load_module
+
+            self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
+        return ans
+
+    def preprocess_raw_html(self, raw_html, url):
+        """Return the result of the parser's ``extract_html`` for the raw page."""
+        return self.natgeo_parser.extract_html(raw_html)
+
+    extra_css = """
+        blockquote { color:#404040; }
+        .byline, i { font-style:italic; color:#202020; }
+        .cap { font-size:small; }
+        img {display:block; margin:0 auto;}
+        .cred { font-style:italic; font-size:small; color:#404040; }
+        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
+    """
+
+    def parse_index(self):
+        """Return the feeds collected from every index page.
+
+        Each page is fetched and handed to ``articles_from_soup``; the
+        resulting ``(section, articles)`` tuples are concatenated.
+        """
+        pages = [
+            'https://www.nationalgeographic.com/travel/topic/national-geographic-traveller-uk'
+        ]
+
+        feeds = []
+
+        for sec in pages:
+            soup = self.index_to_soup(sec)
+            feeds.extend(self.articles_from_soup(soup))
+        return feeds
+
+    def articles_from_soup(self, soup):
+        """Group the ``<article>`` tiles of a page by their section label.
+
+        Returns a list of ``(section, articles)`` tuples, where every article
+        is a dict with ``title`` and ``url`` keys. Tiles without a link and
+        tiles whose label starts with ``Paid Content`` are skipped.
+        """
+        ans = {}
+        for article in soup.findAll('article'):
+            a = article.find('a')
+            url = a.get('href') if a is not None else None
+            if not url:
+                # A tile without a usable link cannot become an article entry.
+                continue
+            if url.startswith('/'):
+                url = 'https://www.nationalgeographic.com' + url
+            section = self.tag_to_string(article.find(**classes('SectionLabel')))
+            if section.startswith('Paid Content'):
+                continue
+            title = self.tag_to_string(
+                article.find(**classes('PromoTile__Title--truncated'))
+            )
+            ans.setdefault(section, []).append({'title': title, 'url': url})
+        self.log(pformat(ans))
+        return list(ans.items())
+
+    def preprocess_html(self, soup):
+        """Demote ``<h2>`` to ``<h4>`` and append a width query to every image URL.
+
+        The width is ``600`` unless the ``res`` recipe option holds a
+        non-empty string, in which case that string is used.
+        """
+        for h2 in soup.findAll('h2'):
+            h2.name = 'h4'
+        # The suffix is identical for every image, so it is computed once.
+        # NOTE(review): presumably calibre replaces recipe_specific_options
+        # with plain string values at run time - confirm.
+        res = '?w=600'
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '?w=' + w
+        for img in soup.findAll('img', src=True):
+            img['src'] = img['src'] + res
+        return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        """Use the text of the ``byline`` element, if any, as the article summary."""
+        summ = soup.find(attrs={'class': 'byline'})
+        if summ:
+            article.summary = article.text_summary = self.tag_to_string(summ)
--- a/src/calibre/web/site_parsers/natgeo.py
+++ b/src/calibre/web/site_parsers/natgeo.py
@ -15,9 +15,11 @@ pprint
 def extract_json(raw):
    s = raw.find("window['__natgeo__']")
    script = raw[s : raw.find('</script>', s)]
-    return json.loads(script[script.find('{') :].rstrip(';'))['page']['content'][
-        'prismarticle'
-    ]
+    page_content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content']
+    # Prefer the 'prismarticle' payload and fall back to 'article'; the
+    # result is None when neither key holds a non-empty value.
+    return page_content.get('prismarticle') or page_content.get('article') or None


 def parse_contributors(grp):
@ -104,12 +106,37 @@ def parse_body(x):
            if isinstance(y, dict):
                yield from parse_body(y)

+def parse_bdy(item):
+    """Yield HTML fragments for a single entry of an article's ``bdy`` list.
+
+    Entries of type ``inline`` are rendered according to their ``cmsType``
+    (listicle, image, imagegroup, pullquote or editorsNote; any other value
+    yields nothing). Every other entry yields its ``mrkup`` unchanged when
+    it already starts with ``<``, otherwise wrapped in a tag named after the
+    entry's ``type``.
+    """
+    content = item['cntnt']
+    if item.get('type') != 'inline':
+        markup = content['mrkup']
+        if not markup.strip().startswith('<'):
+            markup = '<{tag}>{markup}</{tag}>'.format(
+                tag=item['type'], markup=markup)
+        yield markup
+        return
+
+    kind = content.get('cmsType')
+    if kind == 'listicle':
+        if 'title' in content:
+            yield '<h3>' + escape(content['title']) + '</h3>'
+        yield content['text']
+    elif kind == 'image':
+        yield from parse_lead_image(content)
+    elif kind == 'imagegroup':
+        for image in content['images']:
+            yield from parse_lead_image(image)
+    elif kind in ('pullquote', 'editorsNote'):
+        # Both render as a blockquote; only the key holding the text differs.
+        key = 'quote' if kind == 'pullquote' else 'note'
+        if key in content:
+            yield '<blockquote>' + content[key] + '</blockquote>'

 def parse_article(edg):
    sc = edg['schma']
    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
+    if sc.get('sclDsc'):
+        yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
    yield '<p>'
    yield from parse_contributors(edg.get('cntrbGrp', {}))
    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
@ -119,15 +146,19 @@ def parse_article(edg):
    yield '</p>'
    if edg.get('ldMda', {}).get('cmsType') == 'image':
        yield from parse_lead_image(edg['ldMda'])
-    for main in edg['prismData']['mainComponents']:
-        if main['name'] == 'Body':
-            for item in main['props']['body']:
-                if isinstance(item, dict):
-                    if item.get('type', '') == 'inline':
-                        yield ''.join(parse_inline(item))
-                elif isinstance(item, list):
-                    for line in item:
-                        yield ''.join(parse_body(line))
+    if edg.get('prismData'):
+        for main in edg['prismData']['mainComponents']:
+            if main['name'] == 'Body':
+                for item in main['props']['body']:
+                    if isinstance(item, dict):
+                        if item.get('type', '') == 'inline':
+                            yield ''.join(parse_inline(item))
+                    elif isinstance(item, list):
+                        for line in item:
+                            yield ''.join(parse_body(line))
+    elif edg.get('bdy'):
+        for item in edg['bdy']:
+            yield from parse_bdy(item)


 def article_parse(data):
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr

 from calibre.utils.iso8601 import parse_iso8601

-module_version = 10  # needed for live updates
+module_version = 11  # needed for live updates
 pprint


@ -183,6 +183,7 @@ def parse_types(x):
        'RelatedLinksBlock',
        'EmailSignupBlock',
        'Dropzone',
+        'AudioBlock',
    }:
        yield ''.join(parse_cnt(x))