Update Spectator recipe to use new markup

However, the Spectator server randomly serves the old markup, but it is
impossible to get main.js, so things don't actually work
This commit is contained in:
Kovid Goyal 2022-07-27 14:37:18 +05:30
parent 28070b6661
commit 525988b151
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -4,21 +4,15 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import time
import re from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
from mechanize import Request
from calibre.web.feeds.recipes import BasicNewsRecipe
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
def absolutize(url):
    """Make a site-relative Spectator link absolute; pass other URLs through."""
    # Only paths beginning with '/' need the host prefix.
    return 'https://spectator.co.uk' + url if url.startswith('/') else url
class Spectator(BasicNewsRecipe): class Spectator(BasicNewsRecipe):
@ -27,100 +21,50 @@ class Spectator(BasicNewsRecipe):
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
description = 'Magazine' description = 'Magazine'
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
use_embedded_content = True
keep_only_tags = [
prefixed_classes('ContentPageHeader_main__ ContentPageHeader_metadata__ ContentPageHero_container__ ContentPageBody_body__container__'),
dict(name='noscript'),
]
remove_attributes = ['style']
def parse_index(self): def parse_index(self):
br = self.get_browser() soup = self.index_to_soup('https://www.spectator.co.uk/magazine/latest')
main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8') raw = str(soup)
data = {} # open('/t/raw.html', 'w').write(raw)
fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl') section, articles = 'Featured', []
pat = r'this.({})\s*=\s*"(.+?)"'.format('|'.join(fields)) feeds = []
for m in re.finditer(pat, main_js): for art in soup.findAll(**prefixed_classes(
data[m.group(1)] = m.group(2) 'MagazineContent_spectator-magazine__section-title__ MagazineContent_spectator-magazine-content__article-card__')):
self.log('Got Spectator data:', data) cls = art['class']
headers = { if not isinstance(cls, str):
'api_key': data['apiKey'], cls = ' '.join(cls)
'origin': data['siteUrl'], if 'section-title' in cls:
'access_token': data['apiSecret'], if articles:
'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', feeds.append((section, articles))
'Accept-encoding': 'gzip, deflate', section = self.tag_to_string(art).strip()
'Accept': '*/*', articles = []
} self.log(section)
continue
def make_url(utype, query, includes=(), limit=None): a = art.find('a', href=True)
ans = data[utype] + '/entries?environment=' + data['contentEnvironment'] url = absolutize(a['href'])
if limit is not None: title = self.tag_to_string(a).strip()
ans += '&limit={}'.format(limit) hd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__headline__'))
for inc in includes: if hd:
ans += '&include[]=' + inc title = self.tag_to_string(hd).strip()
ans += '&query=' + quote(json.dumps(query)) desc = ''
return ans dd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__media-teaser__'))
if dd:
def get_result(url): desc = self.tag_to_string(dd).strip()
self.log('Fetching:', url) self.log('\t', title, url)
req = Request(url, headers=headers) if desc:
raw = br.open_novisit(req).read().decode('utf-8') self.log('\t\t', desc)
return json.loads(raw)['entries'] articles.append({'title': title, 'url': url, 'description': desc})
if not feeds and '<script src="/main.js' in raw:
# Get current issue ua = random_user_agent(allow_ie=False)
url = data['magazineIssueContentUrl'] + '/entries?environment=' + data['contentEnvironment'] + "&desc=issue_date&limit=1&only[BASE][]=url" self.log('Got old style main.js page, retrying with user agent:', ua)
result = get_result(url) self.browser.set_user_agent(ua)
slug = result[0]['url'] time.sleep(1)
uid = result[0]['uid'] # noqa return self.parse_index()
date = slug.split('/')[-1] return feeds
self.log('Downloading issue:', date)
# Cover information
url = make_url(
'magazineIssueContentUrl',
{'url': slug},
limit=1
)
self.cover_url = get_result(url)[0]['magazine_cover']['url']
self.log('Found cover:', self.cover_url)
# List of articles
url = make_url(
'contentUrl',
{
"magazine_content_production_only.magazine_issue": {
"$in_query": {"url": slug},
"_content_type_uid": "magazine_issue"
},
"_content_type_uid": "article"
},
includes=(
'topic', 'magazine_content_production_only.magazine_issue',
'magazine_content_production_only.magazine_subsection', 'author'
)
)
result = get_result(url)
articles = {}
for entry in result:
title = entry['title']
url = absolutize(entry['url'])
blocks = []
a = blocks.append
byline = entry.get('byline') or ''
if byline:
a('<h3>{}</h3>'.format(byline))
if entry.get('author'):
for au in reversed(entry['author']):
cac = ''
if au.get('caricature'):
cac = '<div><img style="max-width: 80px" src="{}"></div>'.format(au['caricature']['url'])
a('<div>{} <a href="{}">{}</a></div>'.format(cac, absolutize(au['url']), au['title']))
if entry.get('hero_image'):
hi = entry['hero_image'][0]
a('<div style="text-align: center"><img src="{}"></div>'.format(hi['url']))
a(entry['text_body'])
section = 'Unknown'
if entry.get('topic'):
topic = entry['topic'][0]
section = topic['title']
articles.setdefault(section, []).append({
'title': title, 'url': url, 'description': byline, 'content': '\n\n'.join(blocks)})
return [(sec, articles[sec]) for sec in sorted(articles)]