#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

from calibre.web.feeds.news import BasicNewsRecipe, classes


class E1843(BasicNewsRecipe):
    """Recipe that downloads 1843, The Economist's magazine.

    The article list is scraped from the 1843 landing page: every ``<h3>``
    heading that wraps a link becomes one article, and the ``<p>`` following
    the link's parent (when present) is used as its description.
    """

    title = '1843'
    __author__ = 'Kovid Goyal'
    description = 'The ideas, culture and lifestyle magazine from The Economist'
    language = 'en_GB'
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    # economist.com has started throttling after about 60% of the total has
    # downloaded, failing with "connection reset by peer" (104) errors, so
    # downloads are slowed down.
    delay = 1

    # Only the element with id="content" is kept from each page.
    keep_only_tags = [
        dict(id='content')
    ]
    # Adverts, sharing widgets, related-article lists and newsletter sign-up
    # boxes are stripped out.
    remove_tags = [
        classes('advert ad ds-share-list article__wordmark related-articles newsletter-signup'),
        dict(attrs={'data-test-id':'sharing-modal'}),
    ]

    def parse_index(self):
        """Build the article list from the 1843 landing page.

        Returns:
            A one-element list ``[('Articles', articles)]`` where each
            article is a dict with ``title``, ``url`` and ``description``
            keys.
        """
        soup = self.index_to_soup('https://economist.com/1843')
        ans = []
        main = soup.find(id='content')
        if main is None:
            # The page layout changed; scan the whole document rather than
            # crashing on a missing container.
            self.log.warn('No element with id="content" found, scanning the whole page')
            main = soup
        for h3 in main.find_all('h3'):
            a = h3.find('a')
            if a is None:
                # Heading without a link: not an article teaser.
                continue
            url = a.get('href')
            if not url:
                continue
            if url.startswith('/'):
                # Site-relative link: make it absolute.
                url = 'https://economist.com' + url
            title = self.tag_to_string(a)
            self.log(title, ' at ', url)
            desc = ''
            d = a.parent.find_next_sibling('p')
            if d is not None:
                desc = self.tag_to_string(d)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]

    def postprocess_html(self, soup, *args):
        """Remove the "More from 1843 magazine" section from an article.

        The section is located through the link whose text is exactly
        'More from 1843 magazine'; the link's grandparent element is removed
        from the tree.

        Args:
            soup: Parsed article page.
            *args: Extra positional arguments, ignored.

        Returns:
            The same ``soup`` object, modified in place.
        """
        link = soup.find('a', string='More from 1843 magazine')
        if link is not None:
            more = link.parent.parent
            if more is not None:
                more.extract()
        return soup