Get basic fetching working

This commit is contained in:
Kovid Goyal 2024-08-16 11:58:14 +05:30
parent 3c5e4ed751
commit 885961acc0
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 153 additions and 74 deletions

View File

@ -18,19 +18,33 @@
console.log(msg); console.log(msg);
} }
function debug() {
var args = Array.prototype.slice.call(arguments);
var text = args.join(' ');
send_msg({type: 'print', text: text});
}
function notify_that_messages_are_available() { function notify_that_messages_are_available() {
send_msg({type: 'messages_available', count: messages.length}); send_msg({type: 'messages_available', count: messages.length});
} }
function notify(type, msg) {
msg.type = type;
messages.push(msg);
notify_that_messages_are_available();
}
async function* stream_to_async_iterable(stream) {
const reader = stream.getReader()
try {
while (true) {
const {done, value} = await reader.read()
if (done) return
yield value
}
} finally {
reader.releaseLock()
}
}
async function download(req, data) { async function download(req, data) {
try { try {
const controller = new AbortController(); const controller = new AbortController();
live_requests[req.id] = controller;
var fetch_options = { var fetch_options = {
method: req.method.toUpperCase(), method: req.method.toUpperCase(),
signal: controller.signal, signal: controller.signal,
@ -40,13 +54,17 @@
for (const pair of response.headers) { for (const pair of response.headers) {
headers.push(pair); headers.push(pair);
} }
const body = await response.arrayBuffer(); notify('metadata_received', {
status_code: response.status, status_msg: response.statusText,
url: response.url, headers: headers, response_type: response.type,
})
for await (const chunk of stream_to_async_iterable(response.body)) {
notify('chunk_received', {chunk: chunk})
}
delete live_requests[req.id]; delete live_requests[req.id];
messages.push({type: 'finished', req: req, status_code: response.status, status_msg: response.statusText, url: response.url, headers: headers, type: response.type, body: body}); notify('finished', {})
notify_that_messages_are_available();
} catch (error) { } catch (error) {
messages.push({type: 'finished', error: error.message, req: req, url: req.url}); notify('error', {error: error.message});
notify_that_messages_are_available();
} }
} }

View File

@ -23,37 +23,41 @@ class FakeResponse:
self.queue = Queue() self.queue = Queue()
self.done = False self.done = False
self.final_url = '' self.final_url = ''
self._reason = ''
self._status = None self._status = None
self._headers = [] self._headers = []
self.data = BytesIO() self._data = BytesIO()
def _wait(self): def _wait(self):
if self.done: if self.done:
return return
self.done = True self.done = True
res = self.queue.get() res = self.queue.get()
del self.queue
if res['action'] == 'input_error': if res['action'] == 'input_error':
raise Exception(res['error']) raise Exception(res['error'])
self.final_url = res['final_url'] self.final_url = res['final_url']
self._status = res.get('http_code') self._status = res.get('http_code')
self._reason = res.get('http_status_message')
self._headers = res['headers'] self._headers = res['headers']
if 'error' in res: if 'error' in res:
ex = URLError(res['error']) ex = URLError(res['error'])
ex.worth_retry = bool(res.get('worth_retry')) ex.worth_retry = bool(res.get('worth_retry'))
raise ex raise ex
self.data = open(res['output'], 'rb') with suppress(FileNotFoundError):
self._data = open(res['output'], 'rb')
def read(self, *a, **kw): def read(self, *a, **kw):
self._wait() self._wait()
ans = self.data.read(*a, **kw) ans = self._data.read(*a, **kw)
return ans return ans
def seek(self, *a, **kw): def seek(self, *a, **kw):
self._wait() self._wait()
return self.data.seek(*a, **kw) return self._data.seek(*a, **kw)
def tell(self, *a, **kw): def tell(self, *a, **kw):
return self.data.tell(*a, **kw) return self._data.tell(*a, **kw)
@property @property
def url(self) -> str: def url(self) -> str:
@ -75,6 +79,11 @@ class FakeResponse:
ans[k] = v ans[k] = v
return ans return ans
@property
def reason(self) -> str:
self._wait()
return self._reason or ''
def getcode(self) -> int | None: def getcode(self) -> int | None:
return self.status return self.status
@ -85,7 +94,7 @@ class FakeResponse:
return self.headers return self.headers
def close(self): def close(self):
self.data.close() self._data.close()
class Browser: class Browser:

View File

@ -71,6 +71,16 @@ class CookieJar(QNetworkCookieJar):
return ans + super().cookiesForUrl(url) return ans + super().cookiesForUrl(url)
def too_slow_or_timed_out(timeout: float, last_activity_at: float, created_at: float, downloaded_bytes: int, now: float) -> bool:
if timeout and last_activity_at + timeout < now:
return True
time_taken = now - created_at
if time_taken > default_timeout:
rate = downloaded_bytes / time_taken
return rate < 10
return False
class DownloadRequest(QObject): class DownloadRequest(QObject):
worth_retry: bool = False worth_retry: bool = False
@ -83,14 +93,17 @@ class DownloadRequest(QObject):
self.req_id: int = req_id self.req_id: int = req_id
self.created_at = self.last_activity_at = monotonic() self.created_at = self.last_activity_at = monotonic()
self.timeout = timeout self.timeout = timeout
self.bytes_received = 0
self.reply.downloadProgress.connect(self.on_download_progress, type=Qt.ConnectionType.QueuedConnection) self.reply.downloadProgress.connect(self.on_download_progress, type=Qt.ConnectionType.QueuedConnection)
self.reply.uploadProgress.connect(self.on_upload_progress, type=Qt.ConnectionType.QueuedConnection) self.reply.uploadProgress.connect(self.on_upload_progress, type=Qt.ConnectionType.QueuedConnection)
# self.reply.readyRead.connect(self.on_data_available) # self.reply.readyRead.connect(self.on_data_available)
def on_download_progress(self, bytes_received: int, bytes_total: int) -> None: def on_download_progress(self, bytes_received: int, bytes_total: int) -> None:
self.bytes_received = bytes_received
self.last_activity_at = monotonic() self.last_activity_at = monotonic()
def on_upload_progress(self, bytes_received: int, bytes_total: int) -> None: def on_upload_progress(self, bytes_received: int, bytes_total: int) -> None:
self.bytes_received = bytes_received
self.last_activity_at = monotonic() self.last_activity_at = monotonic()
def save_data(self) -> None: def save_data(self) -> None:
@ -132,14 +145,7 @@ class DownloadRequest(QObject):
return result return result
def too_slow_or_timed_out(self, now: float) -> bool: def too_slow_or_timed_out(self, now: float) -> bool:
if self.timeout and self.last_activity_at + self.timeout < now: return too_slow_or_timed_out(self.timeout, self.last_activity_at, self.created_at, self.bytes_received, now)
return True
time_taken = now - self.created_at
if time_taken > default_timeout:
downloaded = self.webengine_download_request.receivedBytes()
rate = downloaded / time_taken
return rate < 10
return False
class FetchBackend(QNetworkAccessManager): class FetchBackend(QNetworkAccessManager):
@ -186,8 +192,8 @@ class FetchBackend(QNetworkAccessManager):
timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now)) timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now))
for dr in timed_out: for dr in timed_out:
dr.reply.abort() dr.reply.abort()
if self.live_requests: if not self.live_requests:
self.timeout_timer.start() self.timeout_timer.stop()
def current_user_agent(self) -> str: def current_user_agent(self) -> str:
return self.user_agent return self.user_agent

View File

@ -7,35 +7,24 @@ import json
import os import os
import secrets import secrets
import sys import sys
from collections import deque
from contextlib import suppress from contextlib import suppress
from http import HTTPStatus
from time import monotonic from time import monotonic
from qt.core import QApplication, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal
from qt.webengine import QWebEnginePage, QWebEngineScript from qt.webengine import QWebEnginePage, QWebEngineScript
from calibre.scraper.qt_backend import Request from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
from calibre.scraper.qt_backend import worker as qt_worker from calibre.scraper.qt_backend import worker as qt_worker
from calibre.scraper.simple_backend import create_base_profile from calibre.scraper.simple_backend import create_base_profile
from calibre.utils.resources import get_path as P from calibre.utils.resources import get_path as P
from calibre.utils.webengine import create_script, insert_scripts from calibre.utils.webengine import create_script, insert_scripts
default_timeout: float = 60. # seconds
def qurl_to_string(url: QUrl | str) -> str:
return bytes(QUrl(url).toEncoded()).decode()
def qurl_to_key(url: QUrl | str) -> str:
return qurl_to_string(url).rstrip('/')
Headers = list[tuple[str, str]]
class DownloadRequest(QObject): class DownloadRequest(QObject):
worth_retry: bool = False aborted_on_timeout: bool = False
response_received = pyqtSignal(object) response_received = pyqtSignal(object)
def __init__(self, url: str, output_path: str, timeout: float, req_id: int, parent: 'FetchBackend'): def __init__(self, url: str, output_path: str, timeout: float, req_id: int, parent: 'FetchBackend'):
@ -45,23 +34,50 @@ class DownloadRequest(QObject):
self.req_id: int = req_id self.req_id: int = req_id
self.created_at = self.last_activity_at = monotonic() self.created_at = self.last_activity_at = monotonic()
self.timeout = timeout self.timeout = timeout
self.bytes_received = 0
def handle_response(self, r: dict) -> None: self.result = {
result = {
'action': 'finished', 'id': self.req_id, 'url': self.url, 'output': self.output_path, 'action': 'finished', 'id': self.req_id, 'url': self.url, 'output': self.output_path,
'final_url': r['url'], 'headers': r.get('headers', []), 'worth_retry': self.worth_retry, 'headers': [], 'final_url': self.url, 'worth_retry': False,
} }
if 'error' in r:
result['error'] = r['error'] def metadata_received(self, r: dict) -> None:
if r['response_type'] != 'basic':
print(f'WARNING: response type for {self.url} indicates headers are restrcited: {r["type"]}')
self.result['worth_retry'] = r['status_code'] in (
HTTPStatus.TOO_MANY_REQUESTS, HTTPStatus.REQUEST_TIMEOUT, HTTPStatus.SERVICE_UNAVAILABLE, HTTPStatus.GATEWAY_TIMEOUT)
self.result['final_url'] = r['url']
self.result['headers'] = r['headers']
self.result['http_code'] = r['status_code']
self.result['http_status_message'] = r['status_msg']
def chunk_received(self, chunk: QByteArray) -> None:
mv = memoryview(chunk)
self.bytes_received += len(mv)
with open(self.output_path, 'ab') as f:
f.write(mv)
def as_result(self, r: dict | None = {}) -> dict:
if self.aborted_on_timeout:
self.result['error'] = 'Timed out'
self.result['worth_retry'] = True
else: else:
if r['type'] != 'basic': if r:
print(f'WARNING: response type for {self.url} indicates headers are restrcited: {r["type"]}') self.result['error'] = r['error']
with open(self.output_path, 'wb') as f: self.result['worth_retry'] = True # usually some kind of network error
f.write(memoryview(r['data'])) return self.result
def too_slow_or_timed_out(self, now: float) -> bool:
return too_slow_or_timed_out(self.timeout, self.last_activity_at, self.created_at, self.bytes_received, now)
class Worker(QWebEnginePage): class Worker(QWebEnginePage):
working_on_request: DownloadRequest | None = None working_on_request: DownloadRequest | None = None
messages_dispatch = pyqtSignal(object)
result_received = pyqtSignal(object)
def __init__(self, profile, parent):
super().__init__(profile, parent)
self.messages_dispatch.connect(self.on_messages)
def javaScriptAlert(self, url, msg): def javaScriptAlert(self, url, msg):
pass pass
@ -77,10 +93,8 @@ class Worker(QWebEnginePage):
if level == QWebEnginePage.JavaScriptConsoleMessageLevel.InfoMessageLevel and message.startswith(self.token): if level == QWebEnginePage.JavaScriptConsoleMessageLevel.InfoMessageLevel and message.startswith(self.token):
msg = json.loads(message.partition(' ')[2]) msg = json.loads(message.partition(' ')[2])
t = msg.get('type') t = msg.get('type')
if t == 'print': if t == 'messages_available':
print(msg['text']) self.runjs('window.get_messages()', self.messages_dispatch.emit)
elif t == 'messages_available':
self.runjs('window.get_messages()', self.on_messages)
else: else:
print(f'{source_id}:{line_num}:{message}') print(f'{source_id}:{line_num}:{message}')
return return
@ -91,7 +105,6 @@ class Worker(QWebEnginePage):
def start_download(self, output_dir: str, req: Request, data: str) -> DownloadRequest: def start_download(self, output_dir: str, req: Request, data: str) -> DownloadRequest:
filename = os.path.basename(req['filename']) filename = os.path.basename(req['filename'])
# TODO: Implement POST requests with data # TODO: Implement POST requests with data
# TODO: Implement timeout
payload = json.dumps({'req': req, 'data': data}) payload = json.dumps({'req': req, 'data': data})
content = f'''<!DOCTYPE html> content = f'''<!DOCTYPE html>
<html><head></head></body><div id="payload">{html.escape(payload)}</div></body></html> <html><head></head></body><div id="payload">{html.escape(payload)}</div></body></html>
@ -100,11 +113,32 @@ class Worker(QWebEnginePage):
self.working_on_request = DownloadRequest(req['url'], os.path.join(output_dir, filename), req['timeout'], req['id'], self.parent()) self.working_on_request = DownloadRequest(req['url'], os.path.join(output_dir, filename), req['timeout'], req['id'], self.parent())
return self.working_on_request return self.working_on_request
def abort_on_timeout(self) -> None:
if self.working_on_request is not None:
self.working_on_request.aborted_on_timeout = True
self.runjs(f'window.abort_download({self.req_id})')
def on_messages(self, messages: list[dict]) -> None: def on_messages(self, messages: list[dict]) -> None:
if not messages:
return
if self.working_on_request is None:
print('Got messages without request:', messages)
return
self.working_on_request.last_activity_at = monotonic()
for m in messages: for m in messages:
if m['type'] == 'finished': t = m['type']
self.working_on_request.handle_response(m) if t == 'metadata_received':
self.working_on_request.metadata_received(m)
elif t == 'chunk_received':
self.working_on_request.chunk_received(m['chunk'])
elif t == 'finished':
result = self.working_on_request.as_result()
self.working_on_request = None self.working_on_request = None
self.result_received.emit(result)
elif t == 'error':
result = self.working_on_request.as_result(m)
self.working_on_request = None
self.result_received.emit(result)
class FetchBackend(QObject): class FetchBackend(QObject):
@ -126,7 +160,7 @@ class FetchBackend(QObject):
self.profile = profile self.profile = profile
super().__init__(parent) super().__init__(parent)
self.workers: list[Worker] = [] self.workers: list[Worker] = []
self.pending_requests: list[tuple[Request, str]] = [] self.pending_requests: deque[tuple[Request, str]] = deque()
sys.excepthook = self.excepthook sys.excepthook = self.excepthook
self.request_download.connect(self.download, type=Qt.ConnectionType.QueuedConnection) self.request_download.connect(self.download, type=Qt.ConnectionType.QueuedConnection)
self.set_cookies.connect(self._set_cookies, type=Qt.ConnectionType.QueuedConnection) self.set_cookies.connect(self._set_cookies, type=Qt.ConnectionType.QueuedConnection)
@ -148,17 +182,16 @@ class FetchBackend(QObject):
QApplication.instance().exit(1) QApplication.instance().exit(1)
def enforce_timeouts(self): def enforce_timeouts(self):
# TODO: Start timer on download and port this method
now = monotonic() now = monotonic()
timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now)) has_workers = False
for dr in timed_out: for w in self.workers:
if dr.webengine_download_request is None: if w.working_on_request is not None:
dr.cancel_on_start = True if w.working_on_request.too_slow_or_timed_out(now):
else: w.abort_on_timeout()
dr.webengine_download_request.cancel() else:
self.live_requests.discard(dr) has_workers = True
if self.live_requests: if not has_workers:
self.timeout_timer.start() self.timeout_timer.stop()
def download(self, req: Request) -> None: def download(self, req: Request) -> None:
qurl = QUrl(req['url']) qurl = QUrl(req['url'])
@ -177,19 +210,30 @@ class FetchBackend(QObject):
for w in self.workers: for w in self.workers:
if w.working_on_request is None: if w.working_on_request is None:
w.start_download(self.output_dir, req, data) w.start_download(self.output_dir, req, data)
self.timeout_timer.start()
return return
if len(self.workers) < 5: if len(self.workers) < 5:
self.workers.append(self.create_worker) self.workers.append(self.create_worker)
self.workers[-1].start_download(self.output_dir, req, data) self.workers[-1].start_download(self.output_dir, req, data)
self.timeout_timer.start()
return return
# TODO: Drain pending requests on finish
self.pending_requests.append((req, data)) self.pending_requests.append((req, data))
def create_worker(self) -> Worker: def create_worker(self) -> Worker:
ans = Worker(self.profile, self) ans = Worker(self.profile, self)
ans.token = self.token + ' ' ans.token = self.token + ' '
ans.result_received.connect(self.result_received)
return ans return ans
def result_received(self, result: dict) -> None:
self.send_response(result)
self.download_finished.emit(result)
if self.pending_requests:
w = self.sender()
req, data = self.pending_requests.popleft()
w.start_download(self.output_dir, req, data)
self.timeout_timer.start()
def send_response(self, r: dict[str, str]) -> None: def send_response(self, r: dict[str, str]) -> None:
with suppress(OSError): with suppress(OSError):
print(json.dumps(r), flush=True, file=sys.__stdout__) print(json.dumps(r), flush=True, file=sys.__stdout__)
@ -233,8 +277,10 @@ def worker(tdir: str, user_agent: str, verify_ssl_certificates: bool) -> None:
def develop(url: str) -> None: def develop(url: str) -> None:
from calibre.scraper.qt import WebEngineBrowser from calibre.scraper.qt import WebEngineBrowser
br = WebEngineBrowser() br = WebEngineBrowser()
raw = br.open(url).read() res = br.open(url)
print(len(raw)) print(f'{res.code} {res.reason}')
print(res.headers)
print(len(res.read()))
if __name__ == '__main__': if __name__ == '__main__':