Update Globe & Mail

2025-07-09 03:04:10 -04:00 · 2018-04-14 07:29:21 +05:30 · 2018-04-14 07:29:21 +05:30 · c2bac39446
commit c2bac39446
parent 8e89d862fb
1 changed file with 42 additions and 50 deletions
--- a/recipes/globe_and_mail.recipe
+++ b/recipes/globe_and_mail.recipe
@@ -1,63 +1,55 @@
-#!/usr/bin/env  python2
+#!/usr/bin/env python2
-__license__ = 'GPL v3'
+# vim:fileencoding=utf-8
-
+# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
 __copyright__ = '2010, Szing'
 __docformat__ = 'restructuredtext en'
 '''
 globeandmail.com
 '''
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
-class AdvancedUserRecipe1287083651(BasicNewsRecipe):
+def absolutize(href):
    if href.startswith('/'):
        href = 'https://www.theglobeandmail.com' + href
    return href
 def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
 class GlobeMail(BasicNewsRecipe):
    title = u'Globe & Mail'
    __author__ = 'Kovid Goyal'
-    oldest_article = 2
+    encoding = 'utf-8'
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
-    use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
-    auto_cleanup = True
+    remove_attributes = ['style']
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'
-    feeds = [
+    keep_only_tags = [
-        (u'Top National Stories',
+            dict(name='h1'),
-         u'http://www.theglobeandmail.com/news/national/?service=rss'),
+            dict(name='main', attrs={'class': lambda x: x and 'article-primary-content-chain' in x.split()}),
-        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
+    ]
-        (u'Commentary', u'http://www.theglobeandmail.com/report-on-business/commentary/?service=rss'),
+    remove_tags = [
-        (u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
+            classes('c-ad pb-f-commercial-dfp-ads pb-f-article-actions'),
        (u'Facts & Arguments',
            u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Top Polical Stories',
            u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
        (u'Drive', u'http://www.theglobeandmail.com/auto/?service=rss')
    ]
-    preprocess_regexps = [
+    def parse_index(self):
-        (re.compile(r'<head.*?</head>', re.DOTALL), lambda m: '<head></head>'),
+        ans = []
-        (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
+        for section in 'canada opinion politics sports life arts world'.split():
-    ]
+            if self.test and len(ans) >= self.test[0]:
                break
            soup = self.index_to_soup('https://www.theglobeandmail.com/{}/'.format(section))
            self.log('Processing section:', section)
            articles = list(self.parse_gm_section(soup))
            if articles:
                ans.append((section.capitalize(), articles))
        return ans
-    def populate_article_metadata(self, article, soup, first):
+    def parse_gm_section(self, soup):
-        if first and hasattr(self, 'add_toc_thumbnail'):
+        for a in soup.findAll('a', href=True, attrs={'data-lt-lid': lambda x: x and x.startswith('Headline.')}):
-            picdiv = soup.find('img')
+            title = self.tag_to_string(a)
-            if picdiv is not None:
+            url = absolutize(a['href'])
-                self.add_toc_thumbnail(article, picdiv['src'])
+            self.log('  ', title, 'at', url)
-
+            yield {'title': title, 'url': url}
    # Use the mobile version rather than the web version
    def print_version(self, url):
        return url.rpartition('?')[0] + '?service=mobile'