#!/usr/bin/env python # vim:fileencoding=utf-8 from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL v3' __copyright__ = '2015, Kovid Goyal ' from collections import OrderedDict from calibre.web.feeds.news import BasicNewsRecipe def classes(classes): q = frozenset(classes.split(' ')) return dict(attrs={ 'class': lambda x: x and frozenset(x.split()).intersection(q)}) class ForeignPolicy(BasicNewsRecipe): title = u'Foreign Policy' language = 'en' __author__ = 'Kovid Goyal' description = 'International News' no_stylesheets = True remove_javascript = True keep_only_tags = [ dict(name='h1'), classes('dek-heading meta-data figure-image post-content-main'), dict(attrs={'class': lambda x: x and set(x.split()).intersection( {'wide_header_bg', 'wide_header_text'})}), ] remove_tags = [ dict(name=['meta', 'link']), dict(attrs={'class': ['fp-lightbox--overlay']}), dict(attrs={'class': lambda x: x and 'share-links' in x}), ] remove_tags_after = [classes('post-content-main')] def parse_index(self): soup = self.index_to_soup('https://foreignpolicy.com/the-magazine') img = soup.find('img', src=True, attrs={'alt': lambda x: x and 'foreign-policy-magazine-cover' in x}) self.cover_url = img['src'] current_section = None amap = OrderedDict() for x in soup.findAll(name=('h2', 'h3')): if x.name == 'h2': current_section = self.tag_to_string(x) self.log(current_section) if current_section.lower() == 'recent issues': break else: title = self.tag_to_string(x) a = x.parent url = a['href'] self.log('\t', title, 'url') amap.setdefault(current_section, []).append({'title': title, 'url': url}) ans = [] for sec_name in sorted(amap, key=lambda x: x.lower()): articles = amap[sec_name] if articles: ans.append((sec_name, articles)) return ans def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-srcset': True}): img['src'] = img['data-srcset'].split()[0] body = soup.find('body') div = soup.find( attrs={'class': lambda x: x and 'wide_header_bg' in x.split()}) if div is not None: div.extract() body.insert(0, div) div = soup.find( attrs={'class': lambda x: x and 'wide_header_text' in x.split()}) if div is not None: div.extract() body.insert(0, div) for div in soup.findAll(id='footer-logo'): div.parent.extract() return soup