mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 18:24:30 -04:00
more natgeo recipes
This commit is contained in:
parent
45525d7f34
commit
16e73eed0a
BIN
recipes/icons/natgeo_kids.png
Normal file
BIN
recipes/icons/natgeo_kids.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 8.2 KiB |
BIN
recipes/icons/natgeo_traveller.png
Normal file
BIN
recipes/icons/natgeo_traveller.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 176 B |
95
recipes/natgeo_kids.recipe
Normal file
95
recipes/natgeo_kids.recipe
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
|
class NatGeo(BasicNewsRecipe):
    """
    Download recipe for National Geographic Kids (kids.nationalgeographic.com).

    Article HTML is extracted by the live-updatable site parser module
    ``calibre.web.site_parsers.natgeo`` (see :meth:`natgeo_parser`).
    """

    title = 'National Geographic Kids'
    language = 'en'
    encoding = 'utf8'
    publisher = 'kids.nationalgeographic.com'
    category = 'science, nat geo'
    __author__ = 'unkn0wn'
    # NOTE(review): the original assigned ``description`` twice; only the
    # second assignment took effect, so the dead first one was removed.
    description = 'Inspiring people to care about the planet since 1888'
    timefmt = ' [%a, %d %b, %Y]'
    use_embedded_content = False
    remove_javascript = True
    masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600'
    remove_empty_feeds = True
    resolve_internal_links = True
    ignore_duplicate_articles = {'title', 'url'}

    # User-tunable option: image width (in px) appended to every img URL.
    recipe_specific_options = {
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600',
        },
    }

    @property
    def natgeo_parser(self):
        # Lazily load and cache the live-updatable NatGeo HTML parser module.
        ans = getattr(self, '_natgeo_parser', None)
        if ans is None:
            from calibre.live import load_module

            self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
        return ans

    def preprocess_raw_html(self, raw_html, url):
        # Delegate all article extraction to the shared site parser.
        return self.natgeo_parser.extract_html(raw_html)

    extra_css = """
        blockquote { color:#404040; }
        .byline, i { font-style:italic; color:#202020; }
        .cap { font-size:small; }
        img {display:block; margin:0 auto;}
        .cred { font-style:italic; font-size:small; color:#404040; }
        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    """

    def parse_index(self):
        """Scrape each section landing page and collect article links."""
        index = 'https://kids.nationalgeographic.com/'
        sections = [
            'Front Page', 'animals', 'history', 'science',
            'space', 'homework-help', 'crafts',
        ]
        feeds = []
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            url = index + sec
            if sec.startswith('Front'):
                # 'Front Page' is not a real path segment; use the site root.
                url = index
            self.log('Fetching articles from ', url)
            soup = self.index_to_soup(url)
            articles = []
            for a in soup.findAll('a', attrs={'href': lambda x: x and '/article/' in x}):
                # Skip image-tile links that point into the games section.
                if a.find('img') and '/games/' in a['href']:
                    continue
                url = a['href']
                title = self.tag_to_string(a)
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_html(self, soup):
        # Demote h2 headings so article sections sit below the h1 title.
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        # Append the requested (or default 600px) width to every image URL.
        for img in soup.findAll('img', src=True):
            res = '?w=600'
            w = self.recipe_specific_options.get('res')
            if w and isinstance(w, str):
                res = '?w=' + w
            img['src'] = img['src'] + res
        return soup

    def populate_article_metadata(self, article, soup, first):
        # Use the byline as the article summary shown in the book index.
        summ = soup.find(attrs={'class': 'byline'})
        if summ:
            article.summary = self.tag_to_string(summ)
            article.text_summary = self.tag_to_string(summ)
|
103
recipes/natgeo_traveller.recipe
Normal file
103
recipes/natgeo_traveller.recipe
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
from pprint import pformat
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
|
|
||||||
|
|
||||||
|
class NatGeo(BasicNewsRecipe):
    """
    Download recipe for National Geographic Traveller (UK), scraped from the
    traveller topic page on nationalgeographic.com.

    Article HTML is extracted by the live-updatable site parser module
    ``calibre.web.site_parsers.natgeo`` (see :meth:`natgeo_parser`).
    """

    title = 'National Geographic Traveller'
    language = 'en'
    encoding = 'utf8'
    publisher = 'nationalgeographic.com'
    category = 'science, nat geo'
    __author__ = 'unkn0wn'
    # NOTE(review): the original assigned ``description`` twice; only the
    # second assignment took effect, so the dead first one was removed.
    description = 'Inspiring people to care about the planet since 1888'
    timefmt = ' [%a, %d %b, %Y]'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style']
    remove_javascript = False
    masthead_url = 'https://i.natgeofe.com/n/e76f5368-6797-4794-b7f6-8d757c79ea5c/ng-logo-2fl.png?w=600&h=600'
    remove_empty_feeds = True
    resolve_internal_links = True
    ignore_duplicate_articles = {'url'}

    # User-tunable option: image width (in px) appended to every img URL.
    recipe_specific_options = {
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600',
        }
    }

    @property
    def natgeo_parser(self):
        # Lazily load and cache the live-updatable NatGeo HTML parser module.
        ans = getattr(self, '_natgeo_parser', None)
        if ans is None:
            from calibre.live import load_module

            self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo')
        return ans

    def preprocess_raw_html(self, raw_html, url):
        # Delegate all article extraction to the shared site parser.
        return self.natgeo_parser.extract_html(raw_html)

    extra_css = """
        blockquote { color:#404040; }
        .byline, i { font-style:italic; color:#202020; }
        .cap { font-size:small; }
        img {display:block; margin:0 auto;}
        .cred { font-style:italic; font-size:small; color:#404040; }
        .auth, .time, .sub { font-size:small; color:#5c5c5c; }
    """

    def parse_index(self):
        """Collect articles from the Traveller UK topic page(s)."""
        pages = [
            'https://www.nationalgeographic.com/travel/topic/national-geographic-traveller-uk'
        ]

        feeds = []

        for sec in pages:
            soup = self.index_to_soup(sec)
            parsed = self.articles_from_soup(soup)
            if parsed:
                feeds += parsed
        return feeds

    def articles_from_soup(self, soup):
        """Group the page's promo tiles into (section, articles) feeds."""
        ans = {}
        for article in soup.findAll('article'):
            a = article.find('a')
            url = a['href']
            if url.startswith('/'):
                url = 'https://www.nationalgeographic.com' + url
            section = self.tag_to_string(article.find(**classes('SectionLabel')))
            # Skip sponsored tiles.
            if section.startswith('Paid Content'):
                continue
            title = self.tag_to_string(
                article.find(**classes('PromoTile__Title--truncated'))
            )
            articles = ans.setdefault(section, [])
            articles.append({'title': title, 'url': url})
        self.log(pformat(ans))
        return list(ans.items())

    def preprocess_html(self, soup):
        # Demote h2 headings so article sections sit below the h1 title.
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        # Append the requested (or default 600px) width to every image URL.
        for img in soup.findAll('img', src=True):
            res = '?w=600'
            w = self.recipe_specific_options.get('res')
            if w and isinstance(w, str):
                res = '?w=' + w
            img['src'] = img['src'] + res
        return soup

    def populate_article_metadata(self, article, soup, first):
        # Use the byline as the article summary shown in the book index.
        summ = soup.find(attrs={'class': 'byline'})
        if summ:
            article.summary = self.tag_to_string(summ)
            article.text_summary = self.tag_to_string(summ)
|
@ -15,9 +15,11 @@ pprint
|
|||||||
def extract_json(raw):
    # Pull the embedded page-state JSON out of the inline <script> that sets
    # window['__natgeo__'], and return the article payload: the new
    # "prismarticle" layout when present, otherwise the older "article" one.
    start = raw.find("window['__natgeo__']")
    script = raw[start : raw.find('</script>', start)]
    content = json.loads(script[script.find('{') :].rstrip(';'))['page']['content']
    for key in ('prismarticle', 'article'):
        payload = content.get(key)
        if payload:
            return payload
|
|
||||||
|
|
||||||
def parse_contributors(grp):
|
def parse_contributors(grp):
|
||||||
@ -104,12 +106,37 @@ def parse_body(x):
|
|||||||
if isinstance(y, dict):
|
if isinstance(y, dict):
|
||||||
yield from parse_body(y)
|
yield from parse_body(y)
|
||||||
|
|
||||||
|
def parse_bdy(item):
    # Render one entry of the legacy ``bdy`` article body as HTML fragments.
    c = item['cntnt']
    if item.get('type') != 'inline':
        # Plain markup item: pass HTML through as-is, wrap bare text in the
        # item's own tag name.
        markup = c['mrkup']
        if markup.strip().startswith('<'):
            yield markup
        else:
            yield '<{tag}>{markup}</{tag}>'.format(
                tag=item['type'], markup=markup)
        return
    cms = c.get('cmsType')
    if cms == 'listicle':
        if 'title' in c:
            yield '<h3>' + escape(c['title']) + '</h3>'
        yield c['text']
    elif cms == 'image':
        yield from parse_lead_image(c)
    elif cms == 'imagegroup':
        for grouped in c['images']:
            yield from parse_lead_image(grouped)
    elif cms == 'pullquote':
        if 'quote' in c:
            yield '<blockquote>' + c['quote'] + '</blockquote>'
    elif cms == 'editorsNote':
        if 'note' in c:
            yield '<blockquote>' + c['note'] + '</blockquote>'
|
||||||
|
|
||||||
def parse_article(edg):
|
def parse_article(edg):
|
||||||
sc = edg['schma']
|
sc = edg['schma']
|
||||||
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
|
yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
|
||||||
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
|
yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
|
||||||
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
|
if sc.get('sclDsc'):
|
||||||
|
yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
|
||||||
yield '<p>'
|
yield '<p>'
|
||||||
yield from parse_contributors(edg.get('cntrbGrp', {}))
|
yield from parse_contributors(edg.get('cntrbGrp', {}))
|
||||||
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
|
ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
|
||||||
@ -119,15 +146,19 @@ def parse_article(edg):
|
|||||||
yield '</p>'
|
yield '</p>'
|
||||||
if edg.get('ldMda', {}).get('cmsType') == 'image':
|
if edg.get('ldMda', {}).get('cmsType') == 'image':
|
||||||
yield from parse_lead_image(edg['ldMda'])
|
yield from parse_lead_image(edg['ldMda'])
|
||||||
for main in edg['prismData']['mainComponents']:
|
if edg.get('prismData'):
|
||||||
if main['name'] == 'Body':
|
for main in edg['prismData']['mainComponents']:
|
||||||
for item in main['props']['body']:
|
if main['name'] == 'Body':
|
||||||
if isinstance(item, dict):
|
for item in main['props']['body']:
|
||||||
if item.get('type', '') == 'inline':
|
if isinstance(item, dict):
|
||||||
yield ''.join(parse_inline(item))
|
if item.get('type', '') == 'inline':
|
||||||
elif isinstance(item, list):
|
yield ''.join(parse_inline(item))
|
||||||
for line in item:
|
elif isinstance(item, list):
|
||||||
yield ''.join(parse_body(line))
|
for line in item:
|
||||||
|
yield ''.join(parse_body(line))
|
||||||
|
elif edg.get('bdy'):
|
||||||
|
for item in edg['bdy']:
|
||||||
|
yield from parse_bdy(item)
|
||||||
|
|
||||||
|
|
||||||
def article_parse(data):
|
def article_parse(data):
|
||||||
|
@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
|
|||||||
|
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
module_version = 10 # needed for live updates
|
module_version = 11 # needed for live updates
|
||||||
pprint
|
pprint
|
||||||
|
|
||||||
|
|
||||||
@ -183,6 +183,7 @@ def parse_types(x):
|
|||||||
'RelatedLinksBlock',
|
'RelatedLinksBlock',
|
||||||
'EmailSignupBlock',
|
'EmailSignupBlock',
|
||||||
'Dropzone',
|
'Dropzone',
|
||||||
|
'AudioBlock',
|
||||||
}:
|
}:
|
||||||
yield ''.join(parse_cnt(x))
|
yield ''.join(parse_cnt(x))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user