#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal

import json

from calibre.utils.date import parse_date, utcnow
from calibre.web.feeds.news import BasicNewsRecipe, classes


class AssociatedPress(BasicNewsRecipe):
    """News recipe that builds its index from the Associated Press JSON feed.

    One feed is produced per AP section tag. The article list for each
    section is read from the ``afs-prod.appspot.com`` feed endpoint and
    filtered by ``oldest_article``.
    """

    title = u'Associated Press'
    description = 'Global news'
    __author__ = 'Kovid Goyal'
    use_embedded_content = False
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = False
    # Maximum article age in days; enforced by hand in parse_section().
    oldest_article = 1.5

    keep_only_tags = [
        classes('Page-headline Page-lead Page-storyBody Page-authorinfo'),
    ]
    remove_tags = [
        classes('Page-actions Enhancement'),
        dict(name='source'),
    ]
    remove_attributes = ['srcset']
    extra_css = ''' .Figure-caption { font-style: italic; font-size: smaller; margin-left: 1rem; margin-right: 1rem; } '''

    def parse_index(self):
        """Return the feeds for this recipe, one per AP section.

        Each entry is a ``[feed title, articles]`` pair, where ``articles``
        is the list returned by :meth:`parse_section`. The number of
        sections is capped at ``self.test[0]`` when ``self.test`` is set,
        and at 100 otherwise.
        """
        # Hand-written titles for tags that do not read well as-is. They are
        # used verbatim: str.capitalize() lower-cases everything after the
        # first character and would turn 'U.S. News' into 'U.s. news'.
        display_names = {
            'topnews': 'Top News',
            'intlnews': 'International',
            'usnews': 'U.S. News',
        }
        fronts = (
            'topnews sports politics entertainment usnews oddities'
            ' Travel technology lifestyle business Health science intlnews'
        ).split()
        limit = self.test[0] if self.test else 100

        feeds = []
        for front in fronts:
            name = display_names.get(front) or front.capitalize()
            feeds.append([name, self.parse_section(front)])
            if len(feeds) >= limit:
                break
        return feeds

    def parse_section(self, front):
        """Fetch one AP section and return its articles.

        :param front: section tag; appended to ``apf-`` to form the feed tag.
        :return: list of ``{'title': ..., 'url': ...}`` dicts, in feed order.

        Articles without a URL or a usable title are skipped, as are
        articles whose ``updated`` timestamp is older than
        ``self.oldest_article`` days. Articles carrying no ``updated``
        value are kept.
        """
        url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front
        self.log('Processing section:', front, 'at', url)
        data = json.loads(self.index_to_soup(url, raw=True))

        max_age_seconds = 24 * 3600 * self.oldest_article
        articles = []
        for card in data.get('cards') or ():
            # A card without 'contents' should not abort the whole section.
            for article in card.get('contents') or ():
                article_url = article.get('localLinkUrl')
                if not article_url:
                    continue
                # Fall back when the headline is missing *or* empty.
                headline = (
                    article.get('headline')
                    or article.get('flattenedFirstWords')
                    or ''
                )
                # Keep only the text after the last em dash; if nothing
                # follows the dash, keep the full headline instead.
                title = headline.split('\u2014')[-1].strip() or headline.strip()
                if not title:
                    continue
                updated = article.get('updated')
                if updated:
                    age = utcnow() - parse_date(updated, assume_utc=True)
                    if age.total_seconds() > max_age_seconds:
                        self.log('Skipping', title, 'as it is too old')
                        continue
                self.log('\tFound article:', title, 'at', article_url)
                articles.append({'title': title, 'url': article_url})
        self.log('')
        return articles