#!/usr/bin/env python # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2017, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals import json import re from calibre.web.feeds.news import BasicNewsRecipe def extract_article(raw): ms = re.search(r"window\['titanium-state'\]", raw) me = re.search(r"window\['titanium-cacheConfig'\]", raw) raw = raw[ms.start():me.start()] raw = raw[raw.find('{'):] data = json.loads(raw) data = tuple(data['content']['data'].values())[0] story_html = '

' + data['headline'] + '

\n' story_html += '

' + data['bylines'] + '

\n' story_html += '

' + data['published'] + '

\n' for m in data.get('media', ()): sizes = m['imageRenderedSizes'] if sizes: sz = 800 if 800 in sizes else sizes[0] url = m['gcsBaseUrl'] + '{}{}'.format(sz, m['imageFileExtension']) story_html += '\n

\n' story_html += '

' + m['caption'] + '

\n' story_html += '\n

' + data['storyHTML'] + '

' return '' + story_html + '' class AssociatedPress(BasicNewsRecipe): title = u'Associated Press' description = 'Global news' __author__ = 'Kovid Goyal' use_embedded_content = False language = 'en' encoding = 'utf-8' no_stylesheets = True ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = False def parse_index(self): feeds = [] limit = self.test[0] if self.test else 100 for front in ( 'topnews sports politics entertainment usnews oddities' ' Travel technology lifestyle business Health science intlnews'.split() ): name = { 'topnews': 'Top News', 'intlnews': 'International', 'usnews': 'U.S. News' }.get(front, front).capitalize() feeds.append([name, self.parse_section(front)]) if len(feeds) >= limit: break return feeds def parse_section(self, front): url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front self.log('Processing section:', front, 'at', url) data = self.index_to_soup(url, raw=True) data = json.loads(data) cards = data.get('cards', ()) articles = [] for card in cards: for article in card['contents']: url = article['localLinkUrl'] title = article.get('headline', article.get('flattenedFirstWords')) if not title: continue title = title.split('\u2014')[-1] self.log('\tFound article:', title, 'at', url) articles.append({'title': title, 'url': url}) self.log('') return articles def preprocess_raw_html(self, raw_html, url): return extract_article(raw_html)