#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal
#
# calibre news recipe for The Globe and Mail.
#
# Based on recipes/globe_and_mail.recipe as introduced by commit
# c2bac39446d221b3cc6b689cd29894224824cb31 ("Update Globe & Mail",
# Kovid Goyal, 2018-04-14), which replaced the RSS-feed based recipe with one
# that scrapes the section index pages of the web site.

from calibre.web.feeds.news import BasicNewsRecipe

# Root of the site; used to fetch section pages and to resolve relative links.
BASE_URL = 'https://www.theglobeandmail.com'

# Site sections to download, in output order. Each one is fetched from
# BASE_URL/<section>/ and becomes one feed in the generated book.
SECTIONS = ('canada', 'opinion', 'politics', 'sports', 'life', 'arts', 'world')


def absolutize(href):
    """Return *href* as an absolute URL on the Globe and Mail site.

    * ``//host/path`` (protocol-relative) only gets the ``https:`` scheme.
      Prefixing it with the site root, as a plain ``startswith('/')`` test
      would do, yields a broken ``https://www.theglobeandmail.com//host/path``.
    * ``/path`` (site-relative) is prefixed with ``BASE_URL``.
    * Anything else is returned unchanged.
    """
    if href.startswith('//'):
        return 'https:' + href
    if href.startswith('/'):
        return BASE_URL + href
    return href


def classes(classes):
    """Build a tag matcher that selects on CSS class names.

    :param classes: whitespace separated class names.
    :return: a ``dict(attrs={'class': <callable>})`` spec; the callable is
        truthy for a ``class`` attribute value that shares at least one class
        name with *classes*, and falsy for a missing/empty attribute.
    """
    # split() without an argument collapses runs of whitespace, so stray
    # double spaces in the argument do not add an empty name to the set.
    wanted = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(wanted)})


class GlobeMail(BasicNewsRecipe):
    """Download The Globe and Mail by scraping its section index pages."""

    title = u'Globe & Mail'
    __author__ = 'Kovid Goyal'
    encoding = 'utf-8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    # The same story can be linked from several sections; see calibre's
    # recipe API documentation for the exact de-duplication semantics.
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    remove_attributes = ['style']

    # Article pages: keep the headline and the primary content container.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='main', **classes('article-primary-content-chain')),
    ]
    # Advertising and article action widgets inside the kept content.
    remove_tags = [
        classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions'),
    ]

    def parse_index(self):
        """Build the list of feeds, one per entry in ``SECTIONS``.

        :return: a list of ``(section title, [article dict, ...])`` tuples.
            Sections in which no headline link was found are left out.
        """
        index = []
        for section in SECTIONS:
            # NOTE(review): self.test is presumably calibre's test-mode
            # setting whose first item caps the number of feeds - confirm
            # against the BasicNewsRecipe documentation.
            if self.test and len(index) >= self.test[0]:
                break
            soup = self.index_to_soup('{}/{}/'.format(BASE_URL, section))
            self.log('Processing section:', section)
            articles = list(self.parse_gm_section(soup))
            if articles:
                index.append((section.capitalize(), articles))
        return index

    def parse_gm_section(self, soup):
        """Yield one article dict per headline link in a section page.

        :param soup: parsed section index page, as returned by
            ``index_to_soup``.
        :return: generator of ``{'title': ..., 'url': ...}`` dicts with
            absolute URLs.
        """
        # Headline links are the anchors that have an href and whose
        # data-lt-lid attribute starts with "Headline.".
        headline_links = soup.findAll(
            'a', href=True,
            attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')})
        for a in headline_links:
            title = (self.tag_to_string(a) or '').strip()
            if not title:
                # A link without any text cannot be presented as an article.
                continue
            url = absolutize(a['href'])
            self.log(' ', title, 'at', url)
            yield {'title': title, 'url': url}