#!/usr/bin/env python # vim:fileencoding=utf-8 from __future__ import absolute_import, division, print_function, unicode_literals import json from collections import defaultdict from calibre.ebooks.BeautifulSoup import Tag from calibre.web.feeds.news import BasicNewsRecipe def classes(classes): q = frozenset(classes.split(' ')) return dict(attrs={ 'class': lambda x: x and frozenset(x.split()).intersection(q)}) def new_tag(soup, name, attrs=()): impl = getattr(soup, 'new_tag', None) if impl is not None: return impl(name, attrs=dict(attrs)) return Tag(soup, name, attrs=attrs or None) def entry_to_article(entry): url = entry.get('uri') if not url: return None, None section = 'Articles' article = {'url': url} for component in entry.get('components', ()): if component.get('content_type') == 'title': article['title'] = component['title']['text'] elif component.get('content_type') == 'kicker': v = component['kicker'].get('vertical') or {} if v.get('name'): section = v['name'] elif component.get('content_type') == 'dek': if component['dek'].get('text'): article['description'] = component['dek']['text'] if 'title' in article: return article, section return None, None class NatGeo(BasicNewsRecipe): title = u'National Geographic' description = 'Daily news articles from The National Geographic' language = 'en' encoding = 'utf8' publisher = 'nationalgeographic.com' category = 'science, nat geo' __author__ = 'Kovid Goyal' description = 'Inspiring people to care about the planet since 1888' timefmt = ' [%a, %d %b, %Y]' no_stylesheets = True use_embedded_content = False remove_attributes = ['style'] remove_javascript = False keep_only_tags = [ classes('main-title article__dek byline-component publishDate mainArt byline'), dict(id='article__body'), ] remove_tags = [ classes('hide-from-mobile ad-holder enlarge-button'), dict(name='svg meta'.split()), ] def parse_index(self): feeds = defaultdict(list) br = self.get_browser() # br.open('https://www.nationalgeographic.com/latest-stories/').read() res = br.open_novisit( 'https://www.nationalgeographic.com/latest-stories/_jcr_content/content/hubfeed.promo-hub-feed-all-stories.json?offset=0&max=18') entries = json.loads(res.read()) for entry in entries: art, section = entry_to_article(entry) if art is None: continue feeds[section].append(art) ans = [(sec, feeds[sec]) for sec in sorted(feeds) if feeds[sec]] for (sec, articles) in ans: self.log('Found section:', sec) for art in articles: self.log('\t', art['title'], art['url']) if 'description' in art: self.log('\t\t', art['description']) return ans def preprocess_html(self, soup): for div in soup.findAll(attrs={'data-pestle-module': 'PictureFill'}): script = div.find('script') src = json.loads(self.tag_to_string(script))['src'] div.name = 'img' div['src'] = src for div in soup.findAll(attrs={'data-src': True, 'class': 'delayed-image-load'}): url = div['data-src'] idx = url.find('.jpg/{width') if idx != -1: url = url[:idx + 4] img = new_tag(soup, "img") img['src'] = url div.append(img) for script in soup.findAll('script'): script.extract() return soup