Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Update Economist recipes for index page markup change

Fixes #1882131: [Fetch "The Economist" News fails as they have totally revamped their website format](https://bugs.launchpad.net/calibre/+bug/1882131)

commit 8f8a7b89c1 (parent 1c66024316)
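The diff below reworks economist_parse_index(): the cover is now taken from the srcset of the <img> inside the weekly-edition-header__image div (last, widest candidate) instead of rewriting a '200-width' path, and sections/articles are found via the new layout-weekly-edition-section and headline-link / weekly-edition-wtw__link classes. A minimal sketch of the new cover-URL logic, using bs4 directly rather than calibre's recipe machinery, and with purely illustrative markup (not real economist.com HTML):

# Minimal sketch of the new cover-URL logic from the diff below, outside calibre.
# The HTML snippet is illustrative only; real economist.com markup may differ.
from bs4 import BeautifulSoup

sample = '''
<div class="weekly-edition-header__image">
  <img srcset="https://example.com/cover-200.jpg 200w,
               https://example.com/cover-640.jpg 640w"
       src="https://example.com/cover-200.jpg">
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
div = soup.find(attrs={'class': 'weekly-edition-header__image'})
if div is not None:
    img = div.find('img', srcset=True)
    # As in the updated recipe: take the last srcset candidate (assumed to be
    # the widest rendition) and keep only its URL, dropping the width descriptor.
    cover_url = img['srcset'].split(',')[-1].split()[0]
    print('Got cover:', cover_url)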
@@ -6,7 +6,6 @@ try:
 except ImportError:
     from cookielib import Cookie
 import json
-from collections import OrderedDict
 
 from html5_parser import parse
 from lxml import etree
@@ -250,76 +249,29 @@ class Economist(BasicNewsRecipe):
         return ans
 
     def economist_parse_index(self, soup):
-        img = soup.find(attrs={'srcset': True, 'class': lambda x: x and 'print-edition__cover-widget__image' in x.split()})
-        if img is not None:
-            for part in img['srcset'].split():
-                if part.startswith('/'):
-                    part = part.replace('200-width', '640-width')
-                    self.cover_url = 'https://www.economist.com' + part
-                    self.log('Got cover:', self.cover_url)
-                    break
+        div = soup.find(attrs={'class': 'weekly-edition-header__image'})
+        if div is not None:
+            img = div.find('img', srcset=True)
+            self.cover_url = img['srcset'].split(',')[-1].split()[0]
+            self.log('Got cover:', self.cover_url)
 
-        sections = soup.findAll('div', attrs={'class': 'list__title'})
-        if sections:
-            feeds = []
-            for section in sections:
-                articles = []
-                secname = self.tag_to_string(section)
-                self.log(secname)
-                for a in section.findNextSiblings('a', href=True):
-                    spans = a.findAll('span')
-                    if len(spans) == 2:
-                        title = u'{}: {}'.format(*map(self.tag_to_string, spans))
-                    else:
-                        title = self.tag_to_string(a)
-                    articles.append({'title': title, 'url': process_url(a['href'])})
-                    self.log(' ', title, articles[-1]['url'])
-                if articles:
-                    feeds.append((secname, articles))
-            return feeds
-        return self.economist_parse_old_index(soup)
-
-    def economist_parse_old_index(self, soup):
-        feeds = OrderedDict()
-        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
-            h4 = section.find('h4')
-            if h4 is None:
-                continue
-            section_title = self.tag_to_string(h4).strip()
-            if not section_title:
-                continue
-            self.log('Found section: %s' % section_title)
+        feeds = []
+        for section in soup.findAll('div', **classes('layout-weekly-edition-section')):
+            h2 = section.find('h2')
+            secname = self.tag_to_string(h2)
+            self.log(secname)
             articles = []
-            subsection = ''
-            for node in section.findAll(attrs={'class': 'article'}):
-                subsec = node.findPreviousSibling('h5')
-                if subsec is not None:
-                    subsection = self.tag_to_string(subsec)
-                prefix = (subsection + ': ') if subsection else ''
-                a = node.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.economist.com' + url
-                    url += '/print'
+            for a in section.findAll('a', href=True, **classes('headline-link weekly-edition-wtw__link')):
+                spans = a.findAll('span')
+                if len(spans) == 2:
+                    title = u'{}: {}'.format(*map(self.tag_to_string, spans))
+                else:
                     title = self.tag_to_string(a)
-                    if title:
-                        title = prefix + title
-                        self.log('\tFound article:', title)
-                        articles.append({
-                            'title': title,
-                            'url': url,
-                            'description': '',
-                            'date': ''
-                        })
-
+                articles.append({'title': title, 'url': process_url(a['href'])})
+                self.log(' ', title, articles[-1]['url'])
             if articles:
-                if section_title not in feeds:
-                    feeds[section_title] = []
-                feeds[section_title] += articles
-
-        ans = [(key, val) for key, val in feeds.items()]
-        return ans
+                feeds.append((secname, articles))
+        return feeds
 
     def eco_find_image_tables(self, soup):
         for x in soup.findAll('table', align=['right', 'center']):
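The new section walk can likewise be tried outside calibre. The sketch below mirrors the loop added above, substituting bs4 CSS selectors for calibre's classes() helper (which matches any of the space-separated class names); the markup is illustrative, and the real recipe additionally passes each href through its process_url() helper:

# Standalone sketch of the new index parsing, using bs4/soupsieve selectors
# instead of calibre's classes() helper. Illustrative markup only.
from bs4 import BeautifulSoup

sample = '''
<div class="layout-weekly-edition-section">
  <h2>Leaders</h2>
  <a class="headline-link" href="/leaders/2020/06/04/example-article">
    <span>Example flytitle</span><span>Example headline</span>
  </a>
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
feeds = []
for section in soup.find_all('div', class_='layout-weekly-edition-section'):
    secname = section.h2.get_text(strip=True)
    articles = []
    # Either link class is accepted, mirroring
    # classes('headline-link weekly-edition-wtw__link') in the recipe.
    for a in section.select('a.headline-link, a.weekly-edition-wtw__link'):
        spans = a.find_all('span')
        if len(spans) == 2:
            # Two spans: flytitle and headline, joined as "flytitle: headline".
            title = '{}: {}'.format(*(s.get_text(strip=True) for s in spans))
        else:
            title = a.get_text(strip=True)
        # The recipe runs the href through process_url(); here it is kept as-is.
        articles.append({'title': title, 'url': a['href']})
    if articles:
        feeds.append((secname, articles))
print(feeds)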