#!/usr/bin/env python2 # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2017, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals import json from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag def classes(classes): q = frozenset(classes.split(' ')) return dict( attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} ) def new_tag(soup, name, attrs=()): impl = getattr(soup, 'new_tag', None) if impl is not None: return impl(name, attrs=dict(attrs)) return Tag(soup, name, attrs=attrs or None) class AssociatedPress(BasicNewsRecipe): title = u'Associated Press' description = 'Global news' __author__ = 'Krittika Goyal' use_embedded_content = False language = 'en' encoding = 'utf-8' no_stylesheets = True ignore_duplicate_articles = {'title', 'url'} remove_empty_feeds = False keep_only_tags = [ classes('CardHeadline LeadFeature Article'), ] remove_tags = [ classes('ad-placeholder modalImageButton modalVideoButton'), dict(name=['button', 'svg']), ] def parse_index(self): feeds = [] limit = self.test[0] if self.test else 100 for front in ( 'topnews sports politics entertainment usnews oddities' ' Travel technology lifestyle business Health science intlnews'.split() ): name = { 'topnews': 'Top News', 'intlnews': 'International', 'usnews': 'U.S. News' }.get(front, front).capitalize() feeds.append([name, self.parse_section(front)]) if len(feeds) >= limit: break return feeds def parse_section(self, front): url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front self.log('Processing section:', front, 'at', url) data = self.index_to_soup(url, raw=True) data = json.loads(data) cards = data.get('cards', ()) articles = [] for card in cards: for article in card['contents']: url = article['localLinkUrl'] title = article.get('headline', article.get('flattenedFirstWords')) if not title: continue title = title.split('\u2014')[-1] self.log('\tFound article:', title, 'at', url) articles.append({'title': title, 'url': url}) self.log('') return articles def preprocess_html(self, soup, *a): for meta in soup.findAll('meta', attrs=dict(name="twitter:image:alt")): for div in soup.findAll(**classes('LeadFeature')): img = new_tag(soup, 'img') img['src'] = meta['content'] div.insert(0, img) return soup