calibre/recipes/globe_and_mail.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from calibre.web.feeds.news import BasicNewsRecipe


def absolutize(href):
    """Return *href* as an absolute URL for theglobeandmail.com.

    Protocol-relative links (``//host/path``) only get the ``https:`` scheme
    added, site-relative links (``/path``) get the site's origin prepended,
    and any other value is returned unchanged.
    """
    if href.startswith('//'):
        # A leading double slash already names a host; prefixing the site
        # origin here would produce a broken URL.
        return 'https:' + href
    if href.startswith('/'):
        return 'https://www.theglobeandmail.com' + href
    return href


def classes(classes):
    """Build a tag-matching spec for any of the given CSS class names.

    *classes* is a string of class names separated by single spaces. The
    result is a dict with one ``attrs`` key whose ``class`` entry is a
    predicate: called with a tag's class attribute value, it is truthy when
    that value shares at least one class name with *classes*.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # A missing or empty class attribute never matches; hand the falsy
        # value straight back.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}


class GlobeMail(BasicNewsRecipe):
    """News recipe that collects articles from The Globe and Mail.

    The index is built by fetching a fixed list of section pages from
    www.theglobeandmail.com and collecting the headline links found on each.
    """

    title = u'The Globe and Mail'
    __author__ = 'Kovid Goyal'
    encoding = 'utf-8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    remove_attributes = ['style']

    # Article pages are reduced to the <h1> headline and the <main> element
    # that carries the 'article-primary-content-chain' class.
    keep_only_tags = [
            dict(name='h1'),
            dict(name='main', attrs={'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
    ]
    # Elements carrying any of these classes are dropped from the article.
    remove_tags = [
            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions pb-f-article-meta'),
    ]

    def parse_index(self):
        """Return a list of ``(section title, articles)`` tuples.

        Each section page is fetched and scanned with ``parse_gm_section``;
        sections that produce no articles are left out. When ``self.test``
        is set, fetching stops once ``self.test[0]`` sections have been
        collected.
        """
        ans = []
        for section in 'canada opinion politics sports life arts world real-estate'.split():
            if self.test and len(ans) >= self.test[0]:
                break
            soup = self.index_to_soup('https://www.theglobeandmail.com/{}/'.format(section))
            self.log('Processing section:', section)
            articles = list(self.parse_gm_section(soup))
            if articles:
                ans.append((section.capitalize(), articles))
        return ans

    def parse_gm_section(self, soup):
        """Yield a ``{'title': ..., 'url': ...}`` dict per headline link in *soup*.

        Headline links are ``<a href>`` tags whose ``data-lt-lid`` attribute
        starts with ``Headline.``. Links that lack a ``c-card__hed-text``
        div, or whose headline text is blank, are skipped so that no
        untitled article is produced.
        """
        for a in soup.findAll('a', href=True, attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')}):
            headline = a.find('div', 'c-card__hed-text')
            if headline is None:
                # No headline element inside this link, so there is nothing
                # to use as a title.
                continue
            title = self.tag_to_string(headline)
            if not title or not title.strip():
                continue
            url = absolutize(a['href'])
            self.log('  ', title, 'at', url)
            yield {'title': title, 'url': url}