Mirror of https://github.com/kovidgoyal/calibre.git
Update Spectator recipe to use new markup
However, the Spectator server randomly serves the old markup, and since it's impossible to fetch main.js in that case, things don't actually work.
commit 525988b151
parent 28070b6661
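The fallback this message describes is easiest to see in isolation. Below is a minimal sketch distilled from the diff that follows; extract_new_markup_feeds() is a hypothetical stand-in for the section/article parsing done inline in the real parse_index().

import time

from calibre import random_user_agent


def parse_index(self):
    soup = self.index_to_soup('https://www.spectator.co.uk/magazine/latest')
    # Hypothetical stand-in for the inline parsing in the real recipe;
    # assume it returns [] whenever the new markup is absent.
    feeds = extract_new_markup_feeds(soup)
    if not feeds and '<script src="/main.js' in str(soup):
        # The server sent an old-style main.js page: pick a fresh user
        # agent, pause briefly, then re-request in the hope of getting
        # the new markup.
        self.browser.set_user_agent(random_user_agent(allow_ie=False))
        time.sleep(1)
        return self.parse_index()
    return feeds

Note that the retry is unbounded: if the server keeps answering with the old markup, parse_index() keeps recursing.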
@@ -4,21 +4,15 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

-import json
-import re
+import time

-from mechanize import Request
-
-from calibre.web.feeds.recipes import BasicNewsRecipe
-
-try:
-    from urllib.parse import quote
-except ImportError:
-    from urllib import quote
+from calibre import random_user_agent
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


 def absolutize(url):
-    return 'https://spectator.co.uk' + url
+    if url.startswith('/'):
+        url = 'https://spectator.co.uk' + url
+    return url


 class Spectator(BasicNewsRecipe):
@@ -27,100 +21,50 @@ class Spectator(BasicNewsRecipe):
     __author__ = 'Kovid Goyal'
     description = 'Magazine'
     language = 'en'

     no_stylesheets = True
-    use_embedded_content = True
+    keep_only_tags = [
+        prefixed_classes('ContentPageHeader_main__ ContentPageHeader_metadata__ ContentPageHero_container__ ContentPageBody_body__container__'),
+        dict(name='noscript'),
+    ]
+    remove_attributes = ['style']

     def parse_index(self):
-        br = self.get_browser()
-        main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8')
-        data = {}
-        fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl')
-        pat = r'this.({})\s*=\s*"(.+?)"'.format('|'.join(fields))
-        for m in re.finditer(pat, main_js):
-            data[m.group(1)] = m.group(2)
-        self.log('Got Spectator data:', data)
-        headers = {
-            'api_key': data['apiKey'],
-            'origin': data['siteUrl'],
-            'access_token': data['apiSecret'],
-            'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
-            'Accept-encoding': 'gzip, deflate',
-            'Accept': '*/*',
-        }
-
-        def make_url(utype, query, includes=(), limit=None):
-            ans = data[utype] + '/entries?environment=' + data['contentEnvironment']
-            if limit is not None:
-                ans += '&limit={}'.format(limit)
-            for inc in includes:
-                ans += '&include[]=' + inc
-            ans += '&query=' + quote(json.dumps(query))
-            return ans
-
-        def get_result(url):
-            self.log('Fetching:', url)
-            req = Request(url, headers=headers)
-            raw = br.open_novisit(req).read().decode('utf-8')
-            return json.loads(raw)['entries']
-
-        # Get current issue
-        url = data['magazineIssueContentUrl'] + '/entries?environment=' + data['contentEnvironment'] + "&desc=issue_date&limit=1&only[BASE][]=url"
-        result = get_result(url)
-        slug = result[0]['url']
-        uid = result[0]['uid']  # noqa
-        date = slug.split('/')[-1]
-        self.log('Downloading issue:', date)
-
-        # Cover information
-        url = make_url(
-            'magazineIssueContentUrl',
-            {'url': slug},
-            limit=1
-        )
-        self.cover_url = get_result(url)[0]['magazine_cover']['url']
-        self.log('Found cover:', self.cover_url)
-
-        # List of articles
-        url = make_url(
-            'contentUrl',
-            {
-                "magazine_content_production_only.magazine_issue": {
-                    "$in_query": {"url": slug},
-                    "_content_type_uid": "magazine_issue"
-                },
-                "_content_type_uid": "article"
-            },
-            includes=(
-                'topic', 'magazine_content_production_only.magazine_issue',
-                'magazine_content_production_only.magazine_subsection', 'author'
-            )
-        )
-        result = get_result(url)
-        articles = {}
-        for entry in result:
-            title = entry['title']
-            url = absolutize(entry['url'])
-            blocks = []
-            a = blocks.append
-            byline = entry.get('byline') or ''
-            if byline:
-                a('<h3>{}</h3>'.format(byline))
-            if entry.get('author'):
-                for au in reversed(entry['author']):
-                    cac = ''
-                    if au.get('caricature'):
-                        cac = '<div><img style="max-width: 80px" src="{}"></div>'.format(au['caricature']['url'])
-                    a('<div>{} <a href="{}">{}</a></div>'.format(cac, absolutize(au['url']), au['title']))
-
-            if entry.get('hero_image'):
-                hi = entry['hero_image'][0]
-                a('<div style="text-align: center"><img src="{}"></div>'.format(hi['url']))
-            a(entry['text_body'])
-            section = 'Unknown'
-            if entry.get('topic'):
-                topic = entry['topic'][0]
-                section = topic['title']
-            articles.setdefault(section, []).append({
-                'title': title, 'url': url, 'description': byline, 'content': '\n\n'.join(blocks)})
-        return [(sec, articles[sec]) for sec in sorted(articles)]
+        soup = self.index_to_soup('https://www.spectator.co.uk/magazine/latest')
+        raw = str(soup)
+        # open('/t/raw.html', 'w').write(raw)
+        section, articles = 'Featured', []
+        feeds = []
+        for art in soup.findAll(**prefixed_classes(
+                'MagazineContent_spectator-magazine__section-title__ MagazineContent_spectator-magazine-content__article-card__')):
+            cls = art['class']
+            if not isinstance(cls, str):
+                cls = ' '.join(cls)
+            if 'section-title' in cls:
+                if articles:
+                    feeds.append((section, articles))
+                section = self.tag_to_string(art).strip()
+                articles = []
+                self.log(section)
+                continue
+            a = art.find('a', href=True)
+            url = absolutize(a['href'])
+            title = self.tag_to_string(a).strip()
+            hd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__headline__'))
+            if hd:
+                title = self.tag_to_string(hd).strip()
+            desc = ''
+            dd = art.find(**prefixed_classes('ArticleCard_spectator-article-card__media-teaser__'))
+            if dd:
+                desc = self.tag_to_string(dd).strip()
+            self.log('\t', title, url)
+            if desc:
+                self.log('\t\t', desc)
+            articles.append({'title': title, 'url': url, 'description': desc})
+        if not feeds and '<script src="/main.js' in raw:
+            ua = random_user_agent(allow_ie=False)
+            self.log('Got old style main.js page, retrying with user agent:', ua)
+            self.browser.set_user_agent(ua)
+            time.sleep(1)
+            return self.parse_index()
+        return feeds
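The new markup uses CSS-modules style class names such as ContentPageHeader_main__<hash>, where the trailing hash changes from build to build, so the recipe matches classes by their stable prefix via calibre's prefixed_classes() helper. A rough, illustrative equivalent is sketched below; the real helper lives in calibre.web.feeds.news and may differ in detail.

def prefixed_classes(classes):
    # Accept an element if any of its CSS classes starts with one of the
    # given space-separated prefixes.
    prefixes = classes.split()

    def matcher(x):
        if x:
            # The class attribute may be a plain string or a list of strings.
            candidates = x.split() if isinstance(x, str) else x
            return any(c.startswith(p) for c in candidates for p in prefixes)
        return False

    # The returned dict can be splatted into soup.findAll(**...) or used
    # directly as an entry in keep_only_tags.
    return {'attrs': {'class': matcher}}

With this, soup.findAll(**prefixed_classes('ArticleCard_spectator-article-card__headline__')) keeps finding the headline node no matter which hash suffix the current site build appends to the class name.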