More work on the Qt browser backend

This commit is contained in:
Kovid Goyal 2024-08-14 11:18:13 +05:30
parent ecebe2acef
commit 48b4faec95
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -8,8 +8,24 @@ import sys
from contextlib import suppress from contextlib import suppress
from threading import Thread from threading import Thread
from time import monotonic from time import monotonic
from typing import TypedDict
from qt.core import QApplication, QNetworkAccessManager, QNetworkCookie, QNetworkReply, QNetworkRequest, QObject, Qt, QTimer, QUrl, pyqtSignal, sip from qt.core import (
QApplication,
QNetworkAccessManager,
QNetworkCookie,
QNetworkCookieJar,
QNetworkReply,
QNetworkRequest,
QObject,
Qt,
QTimer,
QUrl,
pyqtSignal,
sip,
)
from calibre.utils.random_ua import random_common_chrome_user_agent
default_timeout: float = 60. # seconds default_timeout: float = 60. # seconds
@ -24,40 +40,81 @@ def qurl_to_key(url: QUrl | str) -> str:
Headers = list[tuple[str, str]] Headers = list[tuple[str, str]]
class Request(TypedDict):
    # One download command, parsed from a JSON line on stdin by
    # read_commands() and handed to FetchBackend.download().
    id: int           # caller-supplied identifier, echoed back in the result dict
    url: str          # URL to fetch
    headers: Headers  # extra raw (name, value) header pairs to set on the request
    data_path: str    # path to a file whose bytes form the request body ('' = no body)
    method: str       # HTTP verb, case-insensitive: get/post/put/head
    filename: str     # output path; only its basename is used, inside the backend's output_dir
    timeout: float    # per-request inactivity timeout in seconds
class DownloadRequest(QNetworkRequest):
class CookieJar(QNetworkCookieJar):
    '''
    A cookie jar that additionally keeps "global" request cookies: cookies
    supplied without an explicit domain are stored separately and attached to
    every outgoing request, normalized against that request's URL.
    '''

    def __init__(self, parent=None):
        super().__init__(parent)
        # Domain-less cookies that apply to every request URL.
        self.all_request_cookies = []

    def add_cookie(self, c: QNetworkCookie) -> None:
        # A cookie carrying a domain goes into the normal jar; one without a
        # domain is remembered as applying to all URLs.
        if not c.domain():
            self.all_request_cookies.append(c)
        else:
            self.insertCookie(c)

    def cookiesForUrl(self, url: QUrl) -> list[QNetworkCookie]:
        # Copy each global cookie and normalize it for this URL, then append
        # whatever the base jar would normally send.
        def for_url(template: QNetworkCookie) -> QNetworkCookie:
            cookie = QNetworkCookie(template)
            cookie.normalize(url)
            return cookie

        return [for_url(c) for c in self.all_request_cookies] + super().cookiesForUrl(url)
class DownloadRequest(QObject):
    '''
    Tracks one in-flight QNetworkReply: streams the response body to
    output_path as it arrives, records activity timestamps so the backend can
    enforce timeouts, and converts the reply's final state into a
    JSON-serializable result dict.
    '''

    worth_retry: bool = False
    reply: QNetworkReply

    def __init__(self, url: str, output_path: str, reply: QNetworkReply, timeout: float, req_id: int, parent: 'FetchBackend'):
        super().__init__(parent)
        self.url, self.filename = url, os.path.basename(output_path)
        self.output_path = output_path
        self.reply = reply
        self.req_id: int = req_id
        self.error_message = ''
        self.created_at = self.last_activity_at = monotonic()
        self.timeout = timeout
        # Bytes received so far; updated from the downloadProgress signal and
        # used by too_slow_or_timed_out() to compute the transfer rate.
        self.bytes_received = 0
        self.reply.downloadProgress.connect(self.on_download_progress)
        self.reply.uploadProgress.connect(self.on_upload_progress)
        self.reply.readyRead.connect(self.on_data_available)
        self.reply.sslErrors.connect(self.on_ssl_errors)

    def on_download_progress(self, bytes_received: int, bytes_total: int) -> None:
        self.bytes_received = bytes_received
        self.last_activity_at = monotonic()

    def on_upload_progress(self, bytes_received: int, bytes_total: int) -> None:
        self.last_activity_at = monotonic()

    def on_data_available(self) -> None:
        # Append each chunk as it arrives so large downloads are never fully
        # buffered in memory.
        with open(self.output_path, 'ab') as f:
            f.write(memoryview(self.reply.readAll()))

    def on_ssl_errors(self, err) -> None:
        # NOTE(review): SSL errors are neither reported nor explicitly
        # ignored via reply.ignoreSslErrors() here; the reply will still fail
        # on a bad certificate — confirm this is the intended behavior.
        pass

    def as_result(self) -> dict[str, str]:
        '''Return the 'finished' action dict sent back to the parent process.'''
        e = self.reply.error()
        result = {'action': 'finished', 'id': self.req_id, 'url': self.url, 'output': self.output_path,
            'final_url': qurl_to_string(self.reply.url())}
        if e != QNetworkReply.NetworkError.NoError:
            es = f'{e}: {self.reply.errorString()}'
            result['error'], result['worth_retry'] = es, self.worth_retry
        return result

    def too_slow_or_timed_out(self, now: float) -> bool:
        '''True if this request has been idle past its timeout, or is crawling.'''
        if self.timeout and self.last_activity_at + self.timeout < now:
            return True
        time_taken = now - self.created_at
        if time_taken > default_timeout:
            # Fix: this previously read self.webengine_download_request.receivedBytes(),
            # an attribute that no longer exists on this class after the move
            # away from QWebEngine, so long slow downloads raised AttributeError
            # instead of being cancelled. Use the byte count tracked by
            # on_download_progress() instead.
            rate = self.bytes_received / time_taken
            return rate < 10  # slower than 10 bytes/sec counts as stalled
        return False
@ -66,7 +123,7 @@ class DownloadRequest(QNetworkRequest):
class FetchBackend(QNetworkAccessManager): class FetchBackend(QNetworkAccessManager):
request_download = pyqtSignal(str, str, object, float, int) request_download = pyqtSignal(object)
input_finished = pyqtSignal(str) input_finished = pyqtSignal(str)
set_cookies = pyqtSignal(object) set_cookies = pyqtSignal(object)
set_user_agent_signal = pyqtSignal(str) set_user_agent_signal = pyqtSignal(str)
@ -74,6 +131,9 @@ class FetchBackend(QNetworkAccessManager):
def __init__(self, output_dir: str = '', cache_name: str = '', parent: QObject = None, user_agent: str = '') -> None: def __init__(self, output_dir: str = '', cache_name: str = '', parent: QObject = None, user_agent: str = '') -> None:
super().__init__(parent) super().__init__(parent)
self.cookie_jar = CookieJar(self)
self.setNetworkCookieJar(self.cookie_jar)
self.user_agent = user_agent or random_common_chrome_user_agent()
self.setTransferTimeout(default_timeout) self.setTransferTimeout(default_timeout)
self.output_dir = output_dir or os.getcwd() self.output_dir = output_dir or os.getcwd()
sys.excepthook = self.excepthook sys.excepthook = self.excepthook
@ -86,6 +146,7 @@ class FetchBackend(QNetworkAccessManager):
self.timeout_timer = t = QTimer(self) self.timeout_timer = t = QTimer(self)
t.setInterval(50) t.setInterval(50)
t.timeout.connect(self.enforce_timeouts) t.timeout.connect(self.enforce_timeouts)
self.finished.connect(self.on_reply_finished)
def excepthook(self, cls: type, exc: Exception, tb) -> None: def excepthook(self, cls: type, exc: Exception, tb) -> None:
if not isinstance(exc, KeyboardInterrupt): if not isinstance(exc, KeyboardInterrupt):
@ -101,107 +162,68 @@ class FetchBackend(QNetworkAccessManager):
now = monotonic() now = monotonic()
timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now)) timed_out = tuple(dr for dr in self.live_requests if dr.too_slow_or_timed_out(now))
for dr in timed_out: for dr in timed_out:
if dr.webengine_download_request is None: dr.reply.abort()
dr.cancel_on_start = True
else:
dr.webengine_download_request.cancel()
self.live_requests.discard(dr)
if self.live_requests: if self.live_requests:
self.timeout_timer.start() self.timeout_timer.start()
def download(self, url: str, filename: str, extra_headers: Headers | None = None, timeout: float = default_timeout, req_id: int = 0) -> None: def current_user_agent(self) -> str:
filename = os.path.basename(filename) return self.user_agent
qurl = QUrl(url)
dr = DownloadRequest(url, filename, extra_headers, timeout, req_id) def download(self, req: Request) -> None:
self.dr_identifier_count += 1 filename = os.path.basename(req['filename'])
self.pending_download_requests[self.dr_identifier_count] = dr qurl = QUrl(req['url'])
rq = QNetworkRequest(qurl)
timeout = req['timeout']
rq.setTransferTimeout(int(timeout * 1000))
rq.setRawHeader(b'User-Agent', self.current_user_agent().encode())
for (name, val) in req['headers']:
rq.setRawHeader(name.encode(), val.encode())
method = req['method'].lower()
data_path = req['data_path']
data = None
if data_path:
with open(data_path, 'rb') as f:
data = f.read()
if method == 'get':
reply = self.get(rq, data)
elif method == 'post':
reply = self.post(rq, data)
elif method == 'put':
reply = self.put(rq, data)
elif method == 'head':
reply = self.head(rq, data)
else:
raise TypeError(f'Unknown HTTP request type: {method}')
dr = DownloadRequest(req['url'], os.path.join(self.output_dir, filename), reply, timeout, req['id'], self)
self.live_requests.add(dr) self.live_requests.add(dr)
if not self.timeout_timer.isActive(): if not self.timeout_timer.isActive():
self.timeout_timer.start() self.timeout_timer.start()
cs = self.profile().cookieStore()
for c in self.all_request_cookies:
c = QNetworkCookie(c)
c.normalize(qurl)
cs.setCookie(c)
super().download(qurl, str(self.dr_identifier_count))
def _download_requested(self, wdr: QWebEngineDownloadRequest) -> None: def on_reply_finished(self, reply: QNetworkReply) -> None:
try: reply.deleteLater()
idc = int(wdr.suggestedFileName()) for x in tuple(self.live_requests):
dr: DownloadRequest = self.pending_download_requests.pop(idc) if x.reply is reply:
except Exception: self.live_requests.discard(x)
return self.report_finish(x)
try: x.reply = None
if dr.cancel_on_start: break
dr.error = 'Timed out trying to open URL' reply.deleteLater()
dr.worth_retry = True
self.send_response(dr.as_result())
return
dr.last_activity_at = monotonic()
if dr.filename:
wdr.setDownloadFileName(dr.filename)
dr.webengine_download_request = wdr
self.download_requests_by_id[wdr.id()] = dr
wdr.isFinishedChanged.connect(self._download_finished)
wdr.receivedBytesChanged.connect(self._bytes_received)
wdr.accept()
except Exception:
import traceback
traceback.print_exc()
self.report_finish(wdr, dr)
def _bytes_received(self) -> None: def report_finish(self, dr: DownloadRequest) -> None:
wdr: QWebEngineDownloadRequest = self.sender() result = dr.as_result()
if dr := self.download_requests_by_id.get(wdr.id()): self.download_finished.emit(result)
dr.last_activity_at = monotonic() self.send_response(result)
def _download_finished(self) -> None:
wdr: QWebEngineDownloadRequest = self.sender()
if dr := self.download_requests_by_id.get(wdr.id()):
self.report_finish(wdr, dr)
def report_finish(self, wdr: QWebEngineDownloadRequest, dr: DownloadRequest) -> None:
s = wdr.state()
dr.last_activity_at = monotonic()
self.live_requests.discard(dr)
has_result = False
if s == QWebEngineDownloadRequest.DownloadState.DownloadRequested:
dr.error = 'Open of URL failed'
has_result = True
elif s == QWebEngineDownloadRequest.DownloadState.DownloadCancelled:
dr.error = 'Timed out waiting for download'
dr.worth_retry = True
has_result = True
elif s == QWebEngineDownloadRequest.DownloadState.DownloadInterrupted:
dr.error = wdr.interruptReasonString()
dr.worth_retry = wdr.interruptReason() in (
QWebEngineDownloadRequest.DownloadInterruptReason.NetworkTimeout,
QWebEngineDownloadRequest.DownloadInterruptReason.NetworkFailed,
QWebEngineDownloadRequest.DownloadInterruptReason.NetworkDisconnected,
QWebEngineDownloadRequest.DownloadInterruptReason.NetworkServerDown,
QWebEngineDownloadRequest.DownloadInterruptReason.ServerUnreachable,
)
has_result = True
elif s == QWebEngineDownloadRequest.DownloadState.DownloadCompleted:
has_result = True
if has_result:
result = dr.as_result()
self.download_finished.emit(result)
self.send_response(result)
def send_response(self, r: dict[str, str]) -> None: def send_response(self, r: dict[str, str]) -> None:
with suppress(OSError): with suppress(OSError):
print(json.dumps(r), flush=True, file=sys.__stdout__) print(json.dumps(r), flush=True, file=sys.__stdout__)
def set_user_agent(self, new_val: str) -> None: def set_user_agent(self, new_val: str) -> None:
self.profile().setHttpUserAgent(new_val) self.user_agent = new_val
def _set_cookie_from_header(self, cookie_string: str) -> None: def _set_cookie_from_header(self, cookie_string: str) -> None:
cs = self.profile().cookieStore()
for c in QNetworkCookie.parseCookies(cookie_string.encode()): for c in QNetworkCookie.parseCookies(cookie_string.encode()):
cs.setCookie(c) self.cookie_jar.add_cookie(c)
def _set_cookies(self, cookies: list[dict[str, str]]) -> None: def _set_cookies(self, cookies: list[dict[str, str]]) -> None:
for c in cookies: for c in cookies:
@ -218,10 +240,7 @@ class FetchBackend(QNetworkAccessManager):
c.setDomain(domain) c.setDomain(domain)
if path is not None: if path is not None:
c.setPath(path) c.setPath(path)
if c.domain(): self.cookie_jar.add_cookie(c)
self.profile().cookieStore().setCookie(c)
else:
self.all_request_cookies.append(c)
def read_commands(backend: FetchBackend, tdir: str) -> None: def read_commands(backend: FetchBackend, tdir: str) -> None:
@ -236,7 +255,16 @@ def read_commands(backend: FetchBackend, tdir: str) -> None:
timeout = cmd.get('timeout') timeout = cmd.get('timeout')
if timeout is None: if timeout is None:
timeout = default_timeout timeout = default_timeout
backend.request_download.emit(cmd['url'], os.path.join(tdir, str(file_counter)), cmd.get('headers'), timeout, cmd.get('id', 0)) req: Request = {
'id': cmd.get('id') or 0,
'url': cmd['url'],
'headers': cmd.get('headers') or [],
'data_path': cmd.get('data_path') or '',
'method': cmd.get('method') or 'get',
'filename': os.path.join(tdir, str(file_counter)),
'timeout': timeout,
}
backend.request_download.emit(req)
elif ac == 'set_cookies': elif ac == 'set_cookies':
backend.set_cookies.emit(cmd['cookies']) backend.set_cookies.emit(cmd['cookies'])
elif ac == 'set_user_agent': elif ac == 'set_user_agent':