From 6f307021a0ba673451e59e864d1710948a8a5a41 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 8 Aug 2024 15:02:17 +0530
Subject: [PATCH] More work on the new fetch backend

---
Reviewer notes (this section sits after the three-dash line, so it is not
part of the commit message or of the diff):

* Layout: this patch was received with its line breaks and leading
  whitespace collapsed. Line breaks were restored from the +/- markers, and
  blank context lines were placed so that each hunk matches the counts in
  its hunk header. Indentation is inferred from Python syntax. Where more
  than one nesting is syntactically possible (for example the last three
  statements of download(), placed inside its else branch here, and the
  exit() call in on_input_finished(), placed outside the if), please
  confirm against the upstream commit.
* The From: header carries no e-mail address in the text as received; it
  has not been reconstructed.
* RequestInterceptor.interceptRequest() indexes fb.download_requests[key].
  download_requests is a plain dict, so an intercepted URL with no entry
  raises KeyError before the walrus test runs; .get(key) would fit the
  "if dr :=" form.
* FetchBackend._bytes_received() ends with a bare "wdr" expression that has
  no effect; it reads as a placeholder.
* develop(url) never uses its parameter: the loop over sys.argv[1:] rebinds
  the name "url".
* The download_finished signal is emitted with the result dict, while the
  slot defined in develop() annotates its argument as DownloadRequest.
* Nothing in the hunks shown removes entries from download_requests or
  download_requests_by_id, and download() does not look at dr.finished.

 src/calibre/scraper/fetch_backend.py | 193 ++++++++++++++++++++++++---
 1 file changed, 173 insertions(+), 20 deletions(-)

diff --git a/src/calibre/scraper/fetch_backend.py b/src/calibre/scraper/fetch_backend.py
index 2f68a299af..9c7f7909c2 100644
--- a/src/calibre/scraper/fetch_backend.py
+++ b/src/calibre/scraper/fetch_backend.py
@@ -2,51 +2,149 @@
 # License: GPLv3 Copyright: 2024, Kovid Goyal
 
+import json
+import os
+import sys
+from contextlib import suppress
+from threading import Thread
 from typing import Union
 
-from qt.core import QNetworkCookie, QObject, Qt, QUrl, pyqtSignal
+from qt.core import QApplication, QNetworkCookie, QObject, Qt, QUrl, pyqtSignal
 from qt.webengine import QWebEngineDownloadRequest, QWebEnginePage, QWebEngineUrlRequestInfo, QWebEngineUrlRequestInterceptor
 
-from .simple_backend import create_base_profile
+from calibre.scraper.simple_backend import create_base_profile
 
 
 class RequestInterceptor(QWebEngineUrlRequestInterceptor):
 
     def interceptRequest(self, req: QWebEngineUrlRequestInfo) -> None:
-        pass
+        fb: FetchBackend = self.parent()
+        if fb:
+            key = qurl_to_key(req.requestUrl())
+            if dr := fb.download_requests[key]:
+                for name, x in dr.headers.items():
+                    if isinstance(x, str):
+                        x = [x]
+                    for val in x:
+                        req.setHttpHeader(name.encode(), val.encode())
+
+
+def qurl_to_string(url: QUrl | str) -> str:
+    return bytes(QUrl(url).toEncoded()).decode()
+
+
+qurl_to_key = qurl_to_string
+Headers = dict[str, Union[str, list[str]]]
+
+
+class DownloadRequest:
+
+    def __init__(self, url: str, filename: str, headers: Headers | None = None):
+        self.url, self.filename = url, filename
+        self.url_key = qurl_to_key(url)
+        self.headers: Headers = headers or {}
+        self.num_of_responses_needed = 1
+        self.finished = False
+        self.error_message = ''
+        self.webengine_download_request: QWebEngineDownloadRequest | None = None
 
 
 class FetchBackend(QWebEnginePage):
 
-    request_download = pyqtSignal(object, str)
+    request_download = pyqtSignal(str, str)
+    input_finished = pyqtSignal(str)
+    download_finished = pyqtSignal(object)
 
-    def __init__(self, output_dir: str, cache_name: str = '', parent: QObject = None) -> None:
+    def __init__(self, output_dir: str = '', cache_name: str = '', parent: QObject = None) -> None:
         self.profile = create_base_profile(cache_name)
         self.profile.downloadRequested.connect(self._download_requested)
-        self.profile.setDownloadPath(output_dir)
+        self.output_dir = output_dir or os.getcwd()
+        self.profile.setDownloadPath(self.output_dir)
         super().__init__(self.profile, parent)
         self.interceptor = RequestInterceptor(self)
         self.profile.setUrlRequestInterceptor(self.interceptor)
         self.request_download.connect(self.download, type=Qt.ConnectionType.QueuedConnection)
+        self.input_finished.connect(self.on_input_finished, type=Qt.ConnectionType.QueuedConnection)
+        self.download_requests: dict[str, DownloadRequest] = {}
+        self.pending_download_requests: dict[int, DownloadRequest] = {}
+        self.download_requests_by_id: dict[int, DownloadRequest] = {}
+        self.dr_identifier_count = 0
 
-    def download(self, url: Union[str, QUrl], filename_or_path: str = '') -> str:
-        if isinstance(url, str):
-            url = QUrl(url)
-        super().download(url, filename_or_path)
-        return bytes(url.toEncoded()).decode()
+    def on_input_finished(self, error_msg: str) -> None:
+        if error_msg:
+            self.send_response({'action': 'input_error', 'error': error_msg})
+        QApplication.instance().exit(1)
 
-    def _download_requested(self, dr: QWebEngineDownloadRequest) -> None:
-        dr.accept()
-        dr.isFinishedChanged.connect(self._download_finished)
+    def download(self, url: str, filename: str, extra_headers: Headers | None = None) -> None:
+        filename = os.path.basename(filename)
+        qurl = QUrl(url)
+        key = qurl_to_key(qurl)
+        dr = self.download_requests.get(key)
+        if dr:
+            dr.num_of_responses_needed += 1
+        else:
+            self.download_requests[key] = dr = DownloadRequest(url, filename, extra_headers)
+            self.dr_identifier_count += 1
+            self.pending_download_requests[self.dr_identifier_count] = dr
+            super().download(qurl, str(self.dr_identifier_count))
+
+    def _download_requested(self, wdr: QWebEngineDownloadRequest) -> None:
+        try:
+            idc = int(wdr.suggestedFileName())
+            dr: DownloadRequest = self.pending_download_requests.pop(idc)
+        except Exception:
+            import traceback
+            traceback.print_exc()
+            return
+        try:
+            if dr.filename:
+                wdr.setDownloadFileName(dr.filename)
+            dr.webengine_download_request = wdr
+            self.download_requests_by_id[wdr.id()] = dr
+            wdr.isFinishedChanged.connect(self._download_finished)
+            wdr.receivedBytesChanged.connect(self._bytes_received)
+            wdr.accept()
+        except Exception:
+            import traceback
+            traceback.print_exc()
+            self.report_finish(wdr, dr)
+
+    def _bytes_received(self) -> None:
+        wdr: QWebEngineDownloadRequest = self.sender()
+        wdr
 
     def _download_finished(self) -> None:
-        dr: QWebEngineDownloadRequest = self.sender()
-        s = dr.state()
-        url = bytes(dr.url().toEncoded()).decode()
-        if s == QWebEngineDownloadRequest.DownloadState.DownloadInterrupted:
-            print(99999999, url, dr.interruptReasonString())
+        wdr: QWebEngineDownloadRequest = self.sender()
+        if dr := self.download_requests_by_id.get(wdr.id()):
+            self.report_finish(wdr, dr)
+
+    def report_finish(self, wdr: QWebEngineDownloadRequest, dr: DownloadRequest) -> None:
+        s = wdr.state()
+        output = os.path.join(wdr.downloadDirectory(), wdr.downloadFileName())
+        result: dict[str, str] = {}
+        if s == QWebEngineDownloadRequest.DownloadState.DownloadRequested:
+            # Open of URL failed
+            result = {'action': 'finished', 'error':'Open of URL failed', 'url': dr.url, 'output': output}
+            dr.finished = True
+        elif s == QWebEngineDownloadRequest.DownloadState.DownloadCancelled:
+            result = {'action': 'finished', 'error':'Timed out waiting for download', 'url': dr.url, 'output': output}
+            dr.finished = True
+        elif s == QWebEngineDownloadRequest.DownloadState.DownloadInterrupted:
+            result = {'action': 'finished', 'error':wdr.interruptReasonString(), 'url': dr.url, 'output': output}
+            dr.finished = True
         elif s == QWebEngineDownloadRequest.DownloadState.DownloadCompleted:
-            print(1111111, dr, url, dr.downloadFileName())
+            result = {'action': 'finished', 'url': dr.url, 'output': output, 'final_url': qurl_to_string(wdr.url())}
+            dr.finished = True
+
+        if result:
+            self.download_finished.emit(result)
+            while dr.num_of_responses_needed:
+                dr.num_of_responses_needed -= 1
+                self.send_response(result)
+
+    def send_response(self, r: dict[str, str]) -> None:
+        with suppress(OSError):
+            print(json.dumps(r), flush=True)
 
     def set_user_agent(self, new_val: str) -> None:
         self.profile.setHttpUserAgent(new_val)
 
@@ -56,3 +154,58 @@ class FetchBackend(QWebEnginePage):
         cookie_string = f'{name}={value}; Domain={domain}; Path={path}'
         for c in QNetworkCookie.parseCookies(cookie_string):
             cs.setCookie(c)
+
+
+def read_commands(backend: FetchBackend, tdir: str) -> None:
+    file_counter = 0
+    error_msg = ''
+    try:
+        for line in sys.stdin:
+            cmd = json.loads(line)
+            ac = cmd['action']
+            if ac == 'download':
+                file_counter += 1
+                backend.request_download.emit(cmd['url'], os.path.join(tdir, str(file_counter)))
+            elif ac == 'quit':
+                break
+    except Exception as err:
+        import traceback
+        traceback.print_exc()
+        error_msg = str(err)
+    backend.input_finished.emit(error_msg)
+
+
+def worker(tdir):
+    app = QApplication.instance()
+    backend = FetchBackend(output_dir=tdir, parent=app)
+    read_thread = Thread(target=read_commands, args=(backend, tdir), daemon=True)
+    read_thread.start()
+    app.exec()
+    del backend
+    del app
+
+
+def develop(url: str) -> None:
+    from calibre.gui2 import must_use_qt, setup_unix_signals
+    must_use_qt()
+    app = QApplication.instance()
+    app.signal_received = lambda : app.exit(1)
+    setup_unix_signals(app)
+    backend = FetchBackend()
+    num_left = 0
+
+    def download_finished(dr: DownloadRequest):
+        nonlocal num_left
+        num_left -= 1
+        if not num_left:
+            backend.input_finished.emit('')
+
+    backend.download_finished.connect(download_finished)
+    for i, url in enumerate(sys.argv[1:]):
+        backend.download(url, f'test-output-{i}')
+        num_left += 1
+    app.exec()
+
+
+if __name__ == '__main__':
+    develop(sys.argv[-1])