Work on webengine browser backend

commit 0930904897 (parent f17dd504b3)
@@ -9,9 +9,12 @@
 (function() {
     "use strict";
 
+    var messages = [];
+    var live_requests = {};
+
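+    // Results are passed back to Python by logging a token-prefixed JSON line
+    // to the console; Worker.javaScriptConsoleMessage() intercepts it there.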
     function send_msg(data) {
-        var token = 'TOKEN';
-        var msg = token + ' ' + JSON.stringify(data);
+        const token = 'TOKEN';
+        const msg = token + ' ' + JSON.stringify(data);
         console.log(msg);
     }
 
@@ -21,7 +24,49 @@
         send_msg({type: 'print', text: text});
     }
 
-    if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
-        send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
-    }
+    function notify_that_messages_are_available() {
+        send_msg({type: 'messages_available', count: messages.length});
+    }
+
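+    // download() runs inside the page loaded by Worker.start_download(), so
+    // fetch() uses that page's origin and cookie store. The AbortController is
+    // registered in live_requests so abort_download() can cancel the request.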
+    async function download(req, data) {
+        try {
+            const controller = new AbortController();
+            live_requests[req.id] = controller;
+            const fetch_options = {
+                method: req.method.toUpperCase(),
+                signal: controller.signal,
+            };
+            const response = await fetch(req.url, fetch_options);
+            const headers = [];
+            for (const pair of response.headers) {
+                headers.push(pair);
+            }
+            const body = await response.arrayBuffer();
+            delete live_requests[req.id];
+            messages.push({type: 'finished', req: req, status_code: response.status, status_msg: response.statusText, url: response.url, headers: headers, response_type: response.type, body: body});
+            notify_that_messages_are_available();
+        } catch (error) {
+            delete live_requests[req.id];
+            messages.push({type: 'finished', error: error.message, req: req, url: req.url});
+            notify_that_messages_are_available();
+        }
+    }
+
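+    // Aborting causes the pending fetch() to reject with an AbortError, which
+    // is reported through the normal error path above.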
+    function abort_download(req_id) {
+        var controller = live_requests[req_id];
+        if (controller) {
+            controller.abort();
+            return true;
+        }
+        return false;
+    }
+
+    function get_messages() {
+        var ans = messages;
+        messages = [];
+        return ans;
+    }
+
+    const payload = JSON.parse(document.getElementById('payload').textContent);
+    window.get_messages = get_messages;
+    window.abort_download = abort_download;
+    download(payload.req, payload.data);
 })();
src/calibre/scraper/webengine_backend.py (new file, 241 lines)
@@ -0,0 +1,241 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>

import base64
import html
import json
import os
import secrets
import sys
from contextlib import suppress
from time import monotonic

from qt.core import QApplication, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal
from qt.webengine import QWebEnginePage, QWebEngineScript

from calibre.scraper.qt_backend import Request
from calibre.scraper.qt_backend import worker as qt_worker
from calibre.scraper.simple_backend import create_base_profile
from calibre.utils.resources import get_path as P
from calibre.utils.webengine import create_script, insert_scripts

default_timeout: float = 60.  # seconds

def qurl_to_string(url: QUrl | str) -> str:
    return bytes(QUrl(url).toEncoded()).decode()


def qurl_to_key(url: QUrl | str) -> str:
    return qurl_to_string(url).rstrip('/')


Headers = list[tuple[str, str]]

class DownloadRequest(QObject):

    worth_retry: bool = False
    response_received = pyqtSignal(object)

    def __init__(self, url: str, output_path: str, timeout: float, req_id: int, parent: 'FetchBackend'):
        super().__init__(parent)
        self.url, self.filename = url, os.path.basename(output_path)
        self.output_path = output_path
        self.req_id: int = req_id
        self.created_at = self.last_activity_at = monotonic()
        self.timeout = timeout

    def handle_response(self, r: dict) -> None:
        result = {
            'action': 'finished', 'id': self.req_id, 'url': self.url, 'output': self.output_path,
            'final_url': r['url'], 'headers': r.get('headers', []), 'worth_retry': self.worth_retry,
        }
        if 'error' in r:
            result['error'] = r['error']
        else:
            if r['response_type'] != 'basic':
                print(f'WARNING: response type for {self.url} indicates headers are restricted: {r["response_type"]}')
            with open(self.output_path, 'wb') as f:
                f.write(memoryview(r['body']))

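# Each Worker is a QWebEnginePage. It loads a small synthetic host page
# containing the request payload; the injected scraper.js user script then
# performs the actual fetch() and reports back via token-prefixed console
# messages handled in javaScriptConsoleMessage() below.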
class Worker(QWebEnginePage):

    working_on_request: DownloadRequest | None = None

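    # Auto-dismiss JS dialogs so a scraped page can never block the worker
    # waiting for user interaction.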
    def javaScriptAlert(self, url, msg):
        pass

    def javaScriptConfirm(self, url, msg):
        return True

    def javaScriptPrompt(self, url, msg, defval):
        return True, defval

    def javaScriptConsoleMessage(self, level: QWebEnginePage.JavaScriptConsoleMessageLevel, message: str, line_num: int, source_id: str) -> None:
        if source_id == 'userscript:scraper.js':
            if level == QWebEnginePage.JavaScriptConsoleMessageLevel.InfoMessageLevel and message.startswith(self.token):
                msg = json.loads(message.partition(' ')[2])
                t = msg.get('type')
                if t == 'print':
                    print(msg['text'])
                elif t == 'messages_available':
                    self.runjs('window.get_messages()', self.on_messages)
        else:
            print(f'{source_id}:{line_num}:{message}')
        return

    def runjs(self, js: str, callback) -> None:
        self.runJavaScript(js, QWebEngineScript.ScriptWorldId.ApplicationWorld, callback)

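    # The request payload is embedded in the synthetic host page as an escaped
    # JSON <div>; scraper.js reads it from there and starts the fetch as soon
    # as the page is loaded.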
    def start_download(self, output_dir: str, req: Request, data: str) -> DownloadRequest:
        filename = os.path.basename(req['filename'])
        # TODO: Implement POST requests with data
        # TODO: Implement timeout
        payload = json.dumps({'req': req, 'data': data})
        content = f'''<!DOCTYPE html>
<html><head></head><body><div id="payload">{html.escape(payload)}</div></body></html>
'''
        self.setContent(content.encode(), 'text/html;charset=utf-8', QUrl(req['url']))
        self.working_on_request = DownloadRequest(req['url'], os.path.join(output_dir, filename), req['timeout'], req['id'], self.parent())
        return self.working_on_request

    def on_messages(self, messages: list[dict]) -> None:
        for m in messages:
            if m['type'] == 'finished':
                self.working_on_request.handle_response(m)
                self.working_on_request = None

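# FetchBackend coordinates the Worker pages. Commands arrive via the signals
# below, connected as queued connections because they are emitted from outside
# the event loop's thread; responses are written back as JSON lines on stdout
# by send_response().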
class FetchBackend(QObject):

    request_download = pyqtSignal(object)
    input_finished = pyqtSignal(str)
    set_cookies = pyqtSignal(object)
    set_user_agent_signal = pyqtSignal(str)
    download_finished = pyqtSignal(object)

    def __init__(self, output_dir: str = '', cache_name: str = '', parent: QObject = None, user_agent: str = '', verify_ssl_certificates: bool = True) -> None:
        super().__init__(parent)
        profile = create_base_profile(cache_name)
        self.token = secrets.token_hex()
        js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8').replace('TOKEN', self.token)
        insert_scripts(profile, create_script('scraper.js', js))
        if user_agent:
            profile.setHttpUserAgent(user_agent)
        self.output_dir = output_dir or os.getcwd()
        self.profile = profile
        self.workers: list[Worker] = []
        self.pending_requests: list[tuple[Request, str]] = []
        self.live_requests: set[DownloadRequest] = set()
        sys.excepthook = self.excepthook
        self.request_download.connect(self.download, type=Qt.ConnectionType.QueuedConnection)
        self.set_cookies.connect(self._set_cookies, type=Qt.ConnectionType.QueuedConnection)
        self.set_user_agent_signal.connect(self.set_user_agent, type=Qt.ConnectionType.QueuedConnection)
        self.input_finished.connect(self.on_input_finished, type=Qt.ConnectionType.QueuedConnection)
        self.all_request_cookies: list[QNetworkCookie] = []
        self.timeout_timer = t = QTimer(self)
        t.setInterval(50)
        t.timeout.connect(self.enforce_timeouts)

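    # Any uncaught exception exits the Qt event loop so the worker process
    # terminates instead of hanging.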
    def excepthook(self, cls: type, exc: Exception, tb) -> None:
        if not isinstance(exc, KeyboardInterrupt):
            sys.__excepthook__(cls, exc, tb)
        QApplication.instance().exit(1)

    def on_input_finished(self, error_msg: str) -> None:
        if error_msg:
            self.send_response({'action': 'input_error', 'error': error_msg})
        QApplication.instance().exit(1)

    def enforce_timeouts(self):
        # TODO: Start timer on download and port this method
        now = monotonic()
        timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now))
        for dr in timed_out:
            if dr.webengine_download_request is None:
                dr.cancel_on_start = True
            else:
                dr.webengine_download_request.cancel()
            self.live_requests.discard(dr)
        if self.live_requests:
            self.timeout_timer.start()

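    # Scheduling: hand the request to an idle Worker if one exists, grow the
    # pool up to five pages, otherwise queue it (draining the queue on finish
    # is still a TODO).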
    def download(self, req: Request) -> None:
        qurl = QUrl(req['url'])
        cs = self.profile.cookieStore()
        for c in self.all_request_cookies:
            c = QNetworkCookie(c)
            c.normalize(qurl)
            cs.setCookie(c)
        data_path = req['data_path']
        data = ''
        if data_path:
            with open(data_path, 'rb') as f:
                data = base64.standard_b64encode(f.read()).decode()
        if not self.workers:
            self.workers.append(self.create_worker())
        for w in self.workers:
            if w.working_on_request is None:
                w.start_download(self.output_dir, req, data)
                return
        if len(self.workers) < 5:
            self.workers.append(self.create_worker())
            self.workers[-1].start_download(self.output_dir, req, data)
            return
        # TODO: Drain pending requests on finish
        self.pending_requests.append((req, data))

    def create_worker(self) -> Worker:
        ans = Worker(self.profile, self)
        ans.token = self.token + ' '
        return ans

    def send_response(self, r: dict[str, str]) -> None:
        with suppress(OSError):
            print(json.dumps(r), flush=True, file=sys.__stdout__)

    def set_user_agent(self, new_val: str) -> None:
        self.profile.setHttpUserAgent(new_val)

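    # Cookies with an explicit domain go straight into the profile's cookie
    # store; domainless ones are remembered and normalized against each
    # request URL in download().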
    def add_cookie(self, c: QNetworkCookie) -> None:
        cs = self.profile.cookieStore()
        if c.domain():
            cs.setCookie(c)
        else:
            self.all_request_cookies.append(c)

    def _set_cookie_from_header(self, cookie_string: str) -> None:
        for c in QNetworkCookie.parseCookies(cookie_string.encode()):
            self.add_cookie(c)

    def _set_cookies(self, cookies: list[dict[str, str]]) -> None:
        for c in cookies:
            if 'header' in c:
                self._set_cookie_from_header(c['header'])
            else:
                self.set_simple_cookie(c['name'], c['value'], c.get('domain'), c.get('path'))

    def set_simple_cookie(self, name: str, value: str, domain: str | None = None, path: str | None = '/'):
        c = QNetworkCookie()
        c.setName(name.encode())
        c.setValue(value.encode())
        if domain is not None:
            c.setDomain(domain)
        if path is not None:
            c.setPath(path)
        self.add_cookie(c)


def worker(tdir: str, user_agent: str, verify_ssl_certificates: bool) -> None:
    return qt_worker(tdir, user_agent, verify_ssl_certificates, FetchBackend)


def develop(url: str) -> None:
    from calibre.scraper.qt import WebEngineBrowser
    br = WebEngineBrowser()
    raw = br.open(url).read()
    print(len(raw))


if __name__ == '__main__':
    develop(sys.argv[-1])
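# develop() is a quick smoke test: it fetches the given URL through
# WebEngineBrowser and prints the number of bytes received. Run this module in
# a calibre development environment with the target URL as the last argument.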