#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

from calibre.web.feeds.news import BasicNewsRecipe


def absolutize(href):
    """Return ``href`` as an absolute URL on the Globe and Mail site.

    Protocol-relative links (``//host/path``) get the ``https:`` scheme,
    site-relative links (``/path``) get the site origin prepended, and
    anything else is returned unchanged.
    """
    if href.startswith('//'):
        # Already names a host; prepending the site origin would corrupt it.
        return 'https:' + href
    if href.startswith('/'):
        return 'https://www.theglobeandmail.com' + href
    return href


def classes(classes):
    """Build a tag matcher that selects on any of several CSS class names.

    ``classes`` is a whitespace separated string of class names. The result
    is a dict with a single ``attrs`` key whose ``class`` entry is a
    predicate; the predicate is truthy when the class attribute value it is
    given shares at least one name with ``classes``.
    """
    q = frozenset(classes.split())

    def matches(x):
        # ``x`` is falsy for tags without a class attribute; returning it
        # as-is keeps the result falsy without calling ``split`` on None.
        return x and frozenset(x.split()).intersection(q)

    return dict(attrs={'class': matches})


class GlobeMail(BasicNewsRecipe):
    """Recipe that collects articles from The Globe and Mail section pages."""

    title = u'The Globe and Mail'
    __author__ = 'Kovid Goyal'
    encoding = 'utf-8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    remove_attributes = ['style']

    keep_only_tags = [
        dict(name='h1'),
        dict(name='main', attrs={
            'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
    ]
    remove_tags = [
        classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions pb-f-article-meta'),
    ]

    def parse_index(self):
        """Return a list of ``(section title, articles)`` tuples.

        Each section page is fetched and scanned with ``parse_gm_section``;
        sections that yield no articles are left out of the result.
        """
        ans = []
        for section in 'canada opinion politics sports life arts world real-estate'.split():
            # When self.test is set, stop once self.test[0] sections have
            # been collected (presumably calibre's test-mode section limit
            # -- confirm against the BasicNewsRecipe documentation).
            if self.test and len(ans) >= self.test[0]:
                break
            soup = self.index_to_soup(
                'https://www.theglobeandmail.com/{}/'.format(section))
            self.log('Processing section:', section)
            articles = list(self.parse_gm_section(soup))
            if articles:
                ans.append((section.capitalize(), articles))
        return ans

    def parse_gm_section(self, soup):
        """Yield a ``{'title', 'url'}`` dict for each headline link in ``soup``.

        Headline links are ``<a href=...>`` tags whose ``data-lt-lid``
        attribute starts with ``Headline.``. Links that have no
        ``c-card__hed-text`` div, or whose headline text is blank, are
        skipped so that no untitled articles are produced.
        """
        def is_headline(x):
            return x and x.startswith('Headline.')

        for a in soup.findAll('a', href=True, attrs={'data-lt-lid': is_headline}):
            headline = a.find('div', 'c-card__hed-text')
            if headline is None:
                continue
            title = self.tag_to_string(headline)
            if not title or not title.strip():
                continue
            url = absolutize(a['href'])
            self.log('  ', title, 'at', url)
            yield {'title': title, 'url': url}