Update The Economist

The Economist is apparently doing some A/B testing with a new React-based design for its print edition page.
Kovid Goyal 2017-02-11 13:47:07 +05:30
parent 06b4445307
commit 4f4af3edf1
2 changed files with 216 additions and 78 deletions (the two recipe files receive an identical diff, shown once below)


@@ -5,12 +5,24 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 economist.com
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-from collections import OrderedDict
-import re
-import cookielib
+import cookielib
+import re
+from collections import OrderedDict
+
+from calibre.ebooks.BeautifulSoup import NavigableString, Tag
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class NoArticles(Exception):
+    pass
+
+
+def process_url(url, print_version=True):
+    if print_version:
+        url += '/print'
+    if url.startswith('/'):
+        url = 'https://www.economist.com' + url
+    return url
 
 
 class Economist(BasicNewsRecipe):
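
The new module-level process_url helper added above makes the relative hrefs from the index absolute and, by default, appends the print-version suffix. A minimal usage sketch; the article path is borrowed from the test URL in the recipe's own comment, trimmed to a relative path for illustration:

    # Relative URLs gain the domain; '/print' is appended unless disabled.
    process_url('/news/americas/21699494-guide-cutting-corners-way-jos')
    # -> 'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos/print'
    process_url('/printedition', print_version=False)
    # -> 'https://www.economist.com/printedition'
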
@@ -20,8 +32,10 @@ class Economist(BasicNewsRecipe):
     __author__ = "Kovid Goyal"
     INDEX = 'https://www.economist.com/printedition'
-    description = ('Global news and current affairs from a European'
-                   ' perspective. Best downloaded on Friday mornings (GMT)')
+    description = (
+        'Global news and current affairs from a European'
+        ' perspective. Best downloaded on Friday mornings (GMT)'
+    )
 
     extra_css = '''
     .headline {font-size: x-large;}
     h2 { font-size: small; }
@@ -45,17 +59,22 @@ class Economist(BasicNewsRecipe):
     oldest_article = 7.0
     resolve_internal_links = True
     remove_tags = [
-        dict(name=['script', 'noscript', 'title',
-                   'iframe', 'cf_floatingcontent']),
-        dict(attrs={'class': ['dblClkTrk', 'ec-article-info',
-                              'share_inline_header', 'related-items',
-                              'main-content-container', 'ec-topic-widget']}),
-        {'class': lambda x: x and 'share-links-header' in x},
+        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
+        dict(
+            attrs={
+                'class': [
+                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
+                    'related-items', 'main-content-container', 'ec-topic-widget'
+                ]
+            }
+        ),
+        {
+            'class': lambda x: x and 'share-links-header' in x
+        },
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
-    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
-                           lambda x:'</html>')]
+    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL), lambda x: '</html>')]
 
     # economist.com has started throttling after about 60% of the total has
     # downloaded with connection reset by peer (104) errors.
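
The preprocess_regexps rule collapsed onto one line above is unchanged in behaviour: it truncates everything after the closing </html> tag before the page is parsed. A standalone sketch of the same substitution (the sample markup is invented):

    import re

    pat = re.compile('</html>.*', re.DOTALL)
    raw = '<html><body>article text</body></html><script>trailing junk</script>'
    # Callable replacement, as in the recipe: the whole match becomes a bare closing tag.
    print(pat.sub(lambda m: '</html>', raw))
    # -> <html><body>article text</body></html>
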
@@ -68,30 +87,81 @@ class Economist(BasicNewsRecipe):
         # Add a cookie indicating we have accepted Economist's cookie
         # policy (needed when running from some European countries)
         ck = cookielib.Cookie(
-            version=0, name='notice_preferences', value='2:', port=None,
-            port_specified=False, domain='.economist.com',
-            domain_specified=False, domain_initial_dot=True, path='/',
-            path_specified=False, secure=False, expires=None, discard=False,
-            comment=None, comment_url=None, rest={'HttpOnly': None},
-            rfc2109=False)
+            version=0,
+            name='notice_preferences',
+            value='2:',
+            port=None,
+            port_specified=False,
+            domain='.economist.com',
+            domain_specified=False,
+            domain_initial_dot=True,
+            path='/',
+            path_specified=False,
+            secure=False,
+            expires=None,
+            discard=False,
+            comment=None,
+            comment_url=None,
+            rest={'HttpOnly': None},
+            rfc2109=False
+        )
         br.cookiejar.set_cookie(ck)
+        br.set_handle_gzip(True)
         return br
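
The one functional addition among the get_browser changes is br.set_handle_gzip(True). calibre's get_browser() returns a mechanize-based browser, and this call makes it advertise gzip support and transparently decompress responses, shrinking each transfer. A minimal sketch using mechanize directly (assuming mechanize is available; the open() call is left commented to avoid a live request):

    import mechanize

    br = mechanize.Browser()
    br.set_handle_gzip(True)  # send Accept-Encoding: gzip and decompress replies
    # resp = br.open('https://www.economist.com/printedition')
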
 
     def parse_index(self):
-        return self.economist_parse_index()
-
-    def economist_parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos'}])]
-        soup = self.index_to_soup(self.INDEX)
+        raw = self.index_to_soup(self.INDEX, raw=True)
+        # with open('/t/raw.html', 'wb') as f:
+        #     f.write(raw)
+        soup = self.index_to_soup(raw)
+        ans = self.economist_parse_index(soup)
+        if not ans:
+            raise NoArticles(
+                'Could not find any articles, either the '
+                'economist.com server is having trouble and you should '
+                'try later or the website format has changed and the '
+                'recipe needs to be updated.'
+            )
+        return ans
+
+    def economist_parse_index(self, soup):
+        img = soup.find(attrs={'class': 'print-edition__cover-widget__image'})
+        if img is not None:
+            self.cover_url = process_url(img['src'], False)
+        else:
-        div = soup.find('div', attrs={'class': 'issue-image'})
-        if div is not None:
-            img = div.find('img', src=True)
-            if img is not None:
-                self.cover_url = re.sub('thumbnail', 'full', img['src'])
+            div = soup.find('div', attrs={'class': 'issue-image'})
+            if div is not None:
+                img = div.find('img', src=True)
+                if img is not None:
+                    self.cover_url = re.sub('thumbnail', 'full', img['src'])
+
+        sections = soup.findAll(
+            'div', attrs={'class': 'list__title', 'data-reactid': True}
+        )
+        if sections:
+            feeds = []
+            for section in sections:
+                articles = []
+                secname = self.tag_to_string(section)
+                self.log(secname)
+                for a in section.findNextSiblings('a', href=True):
+                    title = (
+                        self.tag_to_string(
+                            a.find(attrs={'class': 'print-edition__link-title'})
+                        ) or self.tag_to_string(a)
+                    )
+                    articles.append({'title': title, 'url': process_url(a['href'])})
+                    self.log(' ', title, articles[-1]['url'])
+                if articles:
+                    feeds.append((secname, articles))
+            return feeds
+
+        self.economist_parse_old_index(soup)
+
+    def economist_parse_old_index(self, soup):
         feeds = OrderedDict()
-        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in
-                                           x}):
+        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
             h4 = section.find('h4')
             if h4 is None:
                 continue
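
The rewritten index parser above targets the new React markup: section headings are div elements with class list__title and a data-reactid attribute, and a section's articles are following sibling a tags whose display title sits in a child with class print-edition__link-title. A self-contained sketch of that traversal, using bs4's modern method names in place of the camelCase ones calibre's bundled BeautifulSoup provides; the HTML fragment is invented to match those class names:

    from bs4 import BeautifulSoup

    html = '''
    <div>
      <div class="list__title" data-reactid="42">Leaders</div>
      <a href="/news/leaders/123-example">
        <span class="print-edition__link-title">An example article</span>
      </a>
    </div>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    for section in soup.find_all('div', attrs={'class': 'list__title', 'data-reactid': True}):
        print(section.get_text(strip=True))       # section name
        for a in section.find_next_siblings('a', href=True):
            t = a.find(attrs={'class': 'print-edition__link-title'})
            print(' ', t.get_text(strip=True), a['href'])  # title and relative URL
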
@@ -116,8 +186,12 @@ class Economist(BasicNewsRecipe):
                 if title:
                     title = prefix + title
                 self.log('\tFound article:', title)
-                articles.append({'title': title, 'url': url,
-                                 'description': '', 'date': ''})
+                articles.append({
+                    'title': title,
+                    'url': url,
+                    'description': '',
+                    'date': ''
+                })
 
             if articles:
                 if section_title not in feeds:
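
Both the old parser shown here and the new one feed the same calibre contract: parse_index must return a list of (section_title, articles) tuples, where each article is a dict with at least 'title' and 'url' (the old parser also supplies empty 'description' and 'date' fields, as this hunk shows). Schematically, with invented values:

    [
        ('Leaders', [
            {'title': 'An example article',
             'url': 'https://www.economist.com/news/leaders/123-example/print',
             'description': '',
             'date': ''},
        ]),
    ]
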
@@ -125,11 +199,6 @@ class Economist(BasicNewsRecipe):
             feeds[section_title] += articles
 
         ans = [(key, val) for key, val in feeds.iteritems()]
-        if not ans:
-            raise Exception('Could not find any articles, either the '
-                            'economist.com server is having trouble and you should '
-                            'try later or the website format has changed and the '
-                            'recipe needs to be updated.')
         return ans
 
     def eco_find_image_tables(self, soup):
