Update National Geographic

2025-07-09 03:04:10 -04:00 · 2019-08-08 10:05:59 +05:30 · 2019-08-08 10:05:59 +05:30 · 383d6cde29
commit 383d6cde29
parent 69ea54fa15
1 changed files with 47 additions and 18 deletions
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -3,6 +3,8 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 from collections import defaultdict
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -20,17 +22,35 @@ def new_tag(soup, name, attrs=()):
    return Tag(soup, name, attrs=attrs or None)
 def entry_to_article(entry):
    """Convert one feed entry into an article dict plus its section name.

    ``entry`` is a dict decoded from the feed JSON. Its ``uri`` becomes the
    article URL and its ``components`` list is scanned for:

    * ``title``  -> ``article['title']`` (from ``title.text``)
    * ``kicker`` -> section name (from ``kicker.vertical.name``)
    * ``dek``    -> ``article['description']`` (from ``dek.text``)

    Returns ``(article, section)``; ``section`` defaults to ``'Articles'``
    when no kicker names one. Returns ``(None, None)`` when the entry has no
    URL or no usable title. Missing or null sub-objects are tolerated so that
    a single malformed entry cannot abort processing of the whole feed.
    """
    url = entry.get('uri')
    if not url:
        return None, None
    section = 'Articles'
    article = {'url': url}
    # ``or ()`` / ``or {}`` also cover keys that are present but null in the
    # JSON, which ``.get(key, default)`` alone would not.
    for component in entry.get('components') or ():
        content_type = component.get('content_type')
        if content_type == 'title':
            text = (component.get('title') or {}).get('text')
            if text:
                article['title'] = text
        elif content_type == 'kicker':
            vertical = (component.get('kicker') or {}).get('vertical') or {}
            if vertical.get('name'):
                section = vertical['name']
        elif content_type == 'dek':
            text = (component.get('dek') or {}).get('text')
            if text:
                article['description'] = text
    # An entry without a title is not worth listing.
    if 'title' in article:
        return article, section
    return None, None
 class NatGeo(BasicNewsRecipe):
    # Recipe metadata and download options, declared as class attributes.
    title = u'National Geographic'
    # NOTE(review): `description` is assigned a second time further down in
    # this attribute list; the later assignment is the value that remains on
    # the class, so this one has no effect.
    description = 'Daily news articles from The National Geographic'
    language = 'en'
    # presumably an age limit (in days) and a per-feed cap read by
    # BasicNewsRecipe -- confirm against the calibre recipe API docs
    oldest_article = 20
    max_articles_per_feed = 25
    encoding = 'utf8'
    publisher = 'nationalgeographic.com'
    category = 'science, nat geo'
    __author__ = 'Kovid Goyal'
    masthead_url = 'http://s.ngeo.com/wpf/sites/themes/global/i/presentation/ng_logo_small.png'
    # Effective description: replaces the earlier `description` assignment.
    description = 'Inspiring people to care about the planet since 1888'
    # strftime-style pattern; presumably used by calibre to stamp the
    # download date onto the title -- TODO confirm
    timefmt = ' [%a, %d %b, %Y]'
    no_stylesheets = True
@@ -39,25 +59,34 @@ class NatGeo(BasicNewsRecipe):
    remove_javascript = False
    keep_only_tags = [
-            classes('mainArt byline'),
+        classes('main-title article__dek byline-component publishDate mainArt byline'),
-            dict(id='article__body'),
+        dict(id='article__body'),
    ]
    remove_tags = [
-            classes('hide-from-mobile ad-holder enlarge-button'),
+        classes('hide-from-mobile ad-holder enlarge-button'),
-            dict(name='svg meta'.split()),
+        dict(name='svg meta'.split()),
    ]
-    feeds = [
+    def parse_index(self):
-        (u'Daily News', u'http://feeds.nationalgeographic.com/ng/News/News_Main')
+        feeds = defaultdict(list)
-    ]
+        br = self.get_browser()
-
+        # br.open('https://www.nationalgeographic.com/latest-stories/').read()
-    def parse_feeds(self):
+        res = br.open_novisit(
-        feeds = BasicNewsRecipe.parse_feeds(self)
+            'https://www.nationalgeographic.com/latest-stories/_jcr_content/content/hubfeed.promo-hub-feed-all-stories.json?offset=0&max=18')
-        for feed in feeds:
+        entries = json.loads(res.read())
-            for article in feed.articles[:]:
+        for entry in entries:
-                if 'Presented' in article.title or 'Pictures' in article.title:
+            art, section = entry_to_article(entry)
-                    feed.articles.remove(article)
+            if art is None:
-        return feeds
+                continue
            feeds[section].append(art)
        ans = [(sec, feeds[sec]) for sec in sorted(feeds) if feeds[sec]]
        for (sec, articles) in ans:
            self.log('Found section:', sec)
            for art in articles:
                self.log('\t', art['title'], art['url'])
                if 'description' in art:
                    self.log('\t\t', art['description'])
        return ans
    def preprocess_html(self, soup):
        for div in soup.findAll(attrs={'data-pestle-module': 'PictureFill'}):