mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
106 lines
4.1 KiB
Plaintext
106 lines
4.1 KiB
Plaintext
'''
|
|
scmp.com
|
|
'''
|
|
|
|
from mechanize import Request
|
|
import json
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import Tag
|
|
|
|
|
|
def classes(classes):
    '''
    Build an ``attrs`` matcher that selects tags by CSS class name.

    :param classes: string of whitespace-separated class names to look for.
    :return: a dict of the form ``{'attrs': {'class': predicate}}``. The
        predicate receives a tag's class string and returns a truthy value
        (the non-empty set of shared names) when the tag carries at least
        one of the wanted classes; it returns a falsy value for ``None``,
        an empty string, or a class string with no name in common.
    '''
    # split() without an argument, so that repeated spaces do not put an
    # empty string into the wanted set and tabs/newlines separate names too.
    q = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
|
|
|
|
|
def new_tag(soup, name, attrs=()):
    '''
    Create a new tag called ``name`` for the document ``soup``.

    :param soup: the parsed document object the tag is created for.
    :param name: the tag name, e.g. ``'img'``.
    :param attrs: attributes for the new tag, in any form accepted by
        ``dict()`` (defaults to no attributes).
    :return: whatever ``soup.new_tag`` returns when the soup object has
        such a method, otherwise a directly constructed ``Tag``.
    '''
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        # The soup object supplies its own factory: hand it a real dict.
        return impl(name, attrs=dict(attrs))
    # No factory available: build the Tag directly, passing None instead
    # of an empty attribute collection.
    return Tag(soup, name, attrs=attrs or None)
|
|
|
|
|
|
class SCMP(BasicNewsRecipe):
    '''
    Recipe for the South China Morning Post (scmp.com).

    Articles come from the RSS feeds listed in ``feeds``. When both a
    username and a password are set, ``get_browser`` logs in to
    account.scmp.com before any article is fetched; otherwise the browser
    is used without logging in.
    '''

    title = 'South China Morning Post'
    __author__ = 'llam'
    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
    publisher = 'South China Morning Post Publishers Ltd.'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_CN'
    remove_empty_feeds = True
    # Credentials are optional: get_browser() only logs in when both are set.
    needs_subscription = 'optional'
    publication_type = 'newspaper'

    # Keep the headline plus the elements carrying any of these classes.
    keep_only_tags = [
        dict(name='h1'),
        classes('info__subHeadline article-author main__right'),
    ]
    remove_tags = [
        dict(name='button')
    ]

    def get_browser(self):
        '''
        Return the browser used for downloads, logged in when possible.

        With both ``self.username`` and ``self.password`` set, the login is
        a two-step exchange: the credentials are POSTed as JSON to
        account.scmp.com, which answers with a nonce; that nonce is then
        presented to www.scmp.com/centralize/signin.

        :return: the browser obtained from ``BasicNewsRecipe.get_browser``.
        :raises ValueError: if either login request does not answer with
            status 200, or the first answer contains no nonce.
        :raises Exception: if the first login request fails with an error
            that carries a readable body; the body is put in the message.
        '''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # Uncomment to trace the login exchange:
            # br.set_debug_http(True)
            # br.set_debug_responses(True)
            # br.set_debug_redirects(True)
            rq = Request('https://account.scmp.com/login', headers={
                'Accept': 'application/json, text/plain, */*',
                'Content-Type': 'application/json;charset=UTF-8',
                'Referer': 'https://account.scmp.com/login',
            }, data=json.dumps({'username': self.username, 'password': self.password}))
            self.log('Sending login request...')
            try:
                res = br.open(rq)
            except Exception as err:
                # Errors that expose read() carry a response body; include
                # it in the message so the failure reason is visible.
                if hasattr(err, 'read'):
                    raise Exception('Login request failed with error: {} and body: {}'.format(err, err.read().decode('utf-8', 'replace')))
                raise
            if res.code != 200:
                raise ValueError('Failed to login, check your username and password')
            # A 200 answer without a nonce cannot be used for the second
            # step; report it as a login failure rather than a bare KeyError.
            try:
                nonce = json.loads(res.read())['nonce']
            except (KeyError, TypeError):
                raise ValueError('Failed to login, server response did not contain a nonce')
            rq = Request('https://www.scmp.com/centralize/signin?nonce=' + nonce, headers={
                'referer': 'https://account.scmp.com/login',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-site',
                'sec-fetch-user': '?1'})
            res = br.open(rq)
            if res.code != 200:
                raise ValueError('Failed to login, check your username and password')
        return br

    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
    ]

    def preprocess_html(self, soup):
        '''
        Fix up image references in a downloaded article.

        Every ``<img>`` that has a ``data-original`` attribute gets that
        value copied into ``src``. In addition, when the page has a
        ``twitter:image:src`` meta tag with content and an element with the
        class ``image-wrapper__placeholder``, an ``<img>`` pointing at the
        meta tag's content is appended to the placeholder's parent and the
        placeholder itself is removed.

        :param soup: the parsed article.
        :return: the same ``soup`` object, modified in place.
        '''
        for img in soup.findAll("img", attrs={'data-original': True}):
            img['src'] = img['data-original']
        meta = soup.find('meta', attrs={'name': 'twitter:image:src'}, content=True)
        if meta is not None:
            wrapper = soup.find(**classes('image-wrapper__placeholder'))
            if wrapper is not None:
                p = wrapper.parent
                img = new_tag(soup, 'img')
                img['src'] = meta['content']
                p.append(img)
                wrapper.extract()
        return soup
|