Update Spectator recipe to use new markup

However, the Spectator server randomly serves the old markup, but it is
impossible to get main.js, so things don't actually work
This commit is contained in:
Kovid Goyal 2022-07-27 14:37:18 +05:30
parent 28070b6661
commit 525988b151
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -4,21 +4,15 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import time
import re from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
from mechanize import Request
from calibre.web.feeds.recipes import BasicNewsRecipe
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
def absolutize(url):
    """Make a site-relative Spectator link absolute; pass other URLs through."""
    # Only paths beginning with '/' need the host prefix.
    return 'https://spectator.co.uk' + url if url.startswith('/') else url
class Spectator(BasicNewsRecipe): class Spectator(BasicNewsRecipe):
@ -27,100 +21,50 @@ class Spectator(BasicNewsRecipe):
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
description = 'Magazine' description = 'Magazine'
language = 'en' language = 'en'
no_stylesheets = True no_stylesheets = True
use_embedded_content = True
keep_only_tags = [
prefixed_classes('ContentPageHeader_main__ ContentPageHeader_metadata__ ContentPageHero_container__ ContentPageBody_body__container__'),
dict(name='noscript'),
]
remove_attributes = ['style']
def parse_index(self): def parse_index(self):
br = self.get_browser() soup = self.index_to_soup('https://www.spectator.co.uk/magazine/latest')
main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8') raw = str(soup)
data = {} # open('/t/raw.html', 'w').write(raw)
fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl') section, articles = 'Featured', []
pat = r'this.({})\s*=\s*"(.+?)"'.format('|'.join(fields)) feeds = []
for m in re.finditer(pat, main_js): for art in soup.findAll(**prefixed_classes(
data[m.group(1)] = m.group(2) 'MagazineContent_spectator-magazine__section-title__ MagazineContent_spectator-magazine-content__article-card__')):
self.log('Got Spectator data:', data) cls = art['class']
headers = { if not isinstance(cls, str):
'api_key': data['apiKey'], cls = ' '.join(cls)
'origin': data['siteUrl'], if 'section-title' in cls:
'access_token': data['apiSecret'], if articles:
'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8', feeds.append((section, articles))
'Accept-encoding': 'gzip, deflate', section = self.tag_to_string(art).strip()
'Accept': '*/*', articles = []
} self.log(section)
continue
def make_url(utype, query, includes=(), limit=None): a = art.find('a', href=True)
ans = data[utype] + '/entries?environment=' + data['contentEnvironment'] url = absolutize(a['href'])
if limit is not None: title = self.tag_to_string(a).strip()
ans += '&limit={}'.format(limit) hd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__headline__'))
for inc in includes: if hd:
ans += '&include[]=' + inc title = self.tag_to_string(hd).strip()
ans += '&query=' + quote(json.dumps(query)) desc = ''
return ans dd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__media-teaser__'))
if dd:
def get_result(url): desc = self.tag_to_string(dd).strip()
self.log('Fetching:', url) self.log('\t', title, url)
req = Request(url, headers=headers) if desc:
raw = br.open_novisit(req).read().decode('utf-8') self.log('\t\t', desc)
return json.loads(raw)['entries'] articles.append({'title': title, 'url': url, 'description': desc})
if not feeds and '<script src="/main.js' in raw:
# Get current issue ua = random_user_agent(allow_ie=False)
url = data['magazineIssueContentUrl'] + '/entries?environment=' + data['contentEnvironment'] + "&desc=issue_date&limit=1&only[BASE][]=url" self.log('Got old style main.js page, retrying with user agent:', ua)
result = get_result(url) self.browser.set_user_agent(ua)
slug = result[0]['url'] time.sleep(1)
uid = result[0]['uid'] # noqa return self.parse_index()
date = slug.split('/')[-1] return feeds
self.log('Downloading issue:', date)
# Cover information
url = make_url(
'magazineIssueContentUrl',
{'url': slug},
limit=1
)
self.cover_url = get_result(url)[0]['magazine_cover']['url']
self.log('Found cover:', self.cover_url)
# List of articles
url = make_url(
'contentUrl',
{
"magazine_content_production_only.magazine_issue": {
"$in_query": {"url": slug},
"_content_type_uid": "magazine_issue"
},
"_content_type_uid": "article"
},
includes=(
'topic', 'magazine_content_production_only.magazine_issue',
'magazine_content_production_only.magazine_subsection', 'author'
)
)
result = get_result(url)
articles = {}
for entry in result:
title = entry['title']
url = absolutize(entry['url'])
blocks = []
a = blocks.append
byline = entry.get('byline') or ''
if byline:
a('<h3>{}</h3>'.format(byline))
if entry.get('author'):
for au in reversed(entry['author']):
cac = ''
if au.get('caricature'):
cac = '<div><img style="max-width: 80px" src="{}"></div>'.format(au['caricature']['url'])
a('<div>{} <a href="{}">{}</a></div>'.format(cac, absolutize(au['url']), au['title']))
if entry.get('hero_image'):
hi = entry['hero_image'][0]
a('<div style="text-align: center"><img src="{}"></div>'.format(hi['url']))
a(entry['text_body'])
section = 'Unknown'
if entry.get('topic'):
topic = entry['topic'][0]
section = topic['title']
articles.setdefault(section, []).append({
'title': title, 'url': url, 'description': byline, 'content': '\n\n'.join(blocks)})
return [(sec, articles[sec]) for sec in sorted(articles)]