# File: calibre/recipes/scmp.recipe (106 lines, 4.1 KiB, plaintext)

'''
scmp.com
'''
from mechanize import Request
import json
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
def classes(classes):
    """Build ``find``-style keyword arguments matching any of the given classes.

    ``classes`` is a single-space-separated string of CSS class names. The
    result is a dict with one ``attrs`` entry whose ``class`` value is a
    predicate; the predicate receives an element's class string and yields a
    truthy value when that string shares at least one name with ``classes``.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Falsy input (None, '') is handed back untouched, so the predicate
        # stays falsy for elements that carry no class attribute.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
def new_tag(soup, name, attrs=()):
    """Create a tag called ``name`` that belongs to ``soup``.

    When ``soup`` provides its own ``new_tag`` factory, that factory is used
    and ``attrs`` is handed over as a dict. Otherwise the tag is built by
    calling ``Tag`` directly, with empty ``attrs`` passed on as ``None``.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on this soup object: fall back to constructing Tag itself.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
class SCMP(BasicNewsRecipe):
    """Recipe for the South China Morning Post, built from scmp.com RSS feeds.

    Signing in is optional: the login flow in ``get_browser`` only runs when
    both a username and a password have been supplied.
    """

    # Publication metadata.
    title = 'South China Morning Post'
    __author__ = 'llam'
    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
    publisher = 'South China Morning Post Publishers Ltd.'
    publication_type = 'newspaper'
    language = 'en_CN'
    encoding = 'utf-8'

    # Download settings.
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 200
    use_embedded_content = False
    remove_empty_feeds = True
    needs_subscription = 'optional'
    no_stylesheets = True

    # Page elements to keep / drop when cleaning an article.
    keep_only_tags = [
        {'name': 'h1'},
        classes('info__subHeadline article-author main__right'),
    ]
    remove_tags = [
        {'name': 'button'},
    ]

    # (section title, RSS url) pairs.
    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
    ]

    def get_browser(self):
        """Return the recipe browser, signed in when credentials are available.

        Without both a username and a password the stock browser is returned
        as is. Otherwise the credentials are posted as JSON to the account
        login endpoint, and the ``nonce`` from the reply is used to open the
        centralized sign-in URL on www.scmp.com.

        Raises:
            Exception: the login request failed with an error that exposes a
                readable body; the body is included in the message.
            ValueError: either step answered with a status code other than 200.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            return br

        # HTTP debugging switches, left disabled:
        # br.set_debug_http(True)
        # br.set_debug_responses(True)
        # br.set_debug_redirects(True)
        login_url = 'https://account.scmp.com/login'
        login_headers = {
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/json;charset=UTF-8',
            'Referer': login_url,
        }
        payload = json.dumps({'username': self.username, 'password': self.password})
        login_request = Request(login_url, headers=login_headers, data=payload)

        self.log('Sending login request...')
        try:
            login_response = br.open(login_request)
        except Exception as failure:
            if not hasattr(failure, 'read'):
                raise
            # The error carries a response body; surface it in the message.
            body = failure.read().decode('utf-8', 'replace')
            raise Exception('Login request failed with error: {} and body: {}'.format(failure, body))
        if login_response.code != 200:
            raise ValueError('Failed to login, check your username and password')

        # Second step: present the nonce from the login reply to the main site.
        nonce = json.loads(login_response.read())['nonce']
        signin_headers = {
            'referer': login_url,
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-site',
            'sec-fetch-user': '?1',
        }
        signin_request = Request(
            'https://www.scmp.com/centralize/signin?nonce=' + nonce,
            headers=signin_headers,
        )
        signin_response = br.open(signin_request)
        if signin_response.code != 200:
            raise ValueError('Failed to login, check your username and password')
        return br

    def preprocess_html(self, soup):
        """Fix up image sources in ``soup`` and return the same soup.

        Every ``img`` carrying a ``data-original`` attribute gets that value
        copied into ``src``. In addition, if the page has a
        ``twitter:image:src`` meta tag with content and an element with class
        ``image-wrapper__placeholder``, a new ``img`` pointing at the meta
        content is appended to the placeholder's parent and the placeholder
        is removed.
        """
        for lazy_img in soup.findAll("img", attrs={'data-original': True}):
            lazy_img['src'] = lazy_img['data-original']

        meta = soup.find('meta', attrs={'name': 'twitter:image:src'}, content=True)
        if meta is None:
            return soup
        placeholder = soup.find(**classes('image-wrapper__placeholder'))
        if placeholder is None:
            return soup

        container = placeholder.parent
        lead_img = new_tag(soup, 'img')
        lead_img['src'] = meta['content']
        container.append(lead_img)
        placeholder.extract()
        return soup