diff --git a/src/calibre/scraper/qt.py b/src/calibre/scraper/qt.py
new file mode 100644
index 0000000000..37ab85ad8a
--- /dev/null
+++ b/src/calibre/scraper/qt.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal
+
+import json
+import os
+import shutil
+import time
+from contextlib import suppress
+from io import BytesIO
+from queue import Queue
+from threading import RLock, Thread
+from urllib.error import URLError
+from urllib.parse import urlencode
+from urllib.request import Request
+
+from calibre.ptempfile import PersistentTemporaryDirectory
+
+
+class FakeResponse:
+
+    def __init__(self):
+        self.queue = Queue()
+        self.done = False
+        self.final_url = ''
+        self.data = BytesIO()
+
+    def _wait(self):
+        if self.done:
+            return
+        self.done = True
+        res = self.queue.get()
+        if res['action'] == 'input_error':
+            raise Exception(res['error'])
+        self.final_url = res['final_url']
+        if 'error' in res:
+            ex = URLError(res['error'])
+            ex.worth_retry = bool(res.get('worth_retry'))
+            raise ex
+        self.data = open(res['output'], 'rb')
+
+    def read(self, *a, **kw):
+        self._wait()
+        ans = self.data.read(*a, **kw)
+        return ans
+
+    def seek(self, *a, **kw):
+        self._wait()
+        return self.data.seek(*a, **kw)
+
+    def tell(self, *a, **kw):
+        return self.data.tell(*a, **kw)
+
+    def geturl(self):
+        self._wait()
+        return self.final_url
+
+    def close(self):
+        self.data.close()
+
+
+class Browser:
+
+    def __init__(self, user_agent: str = '', headers: tuple[tuple[str, str], ...] = (), start_worker: bool = False):
+        self.tdir = ''
+        self.worker = self.dispatcher = None
+        self.dispatch_map = {}
+        self.id_counter = 0
+        self.addheaders: list[tuple[str, str]] = list(headers)
+        self.user_agent = user_agent
+        self.lock = RLock()
+        self.shutting_down = False
+        if start_worker:
+            self._ensure_state()
+
+    def open(self, url_or_request: Request, data=None, timeout=None):
+        method = 'POST' if data else 'GET'
+        headers = []
+        if hasattr(url_or_request, 'get_method'):
+            r = url_or_request
+            method = r.get_method()
+            data = data or r.data
+            headers = r.header_items()
+            url = r.full_url
+        else:
+            url = url_or_request
+
+        def has_header(x: str) -> bool:
+            x = x.lower()
+            for (h, v) in headers:
+                if h.lower() == x:
+                    return True
+            return False
+
+        if isinstance(data, dict):
+            headers.append(('Content-Type', 'application/x-www-form-urlencoded'))
+            data = urlencode(data)
+        if isinstance(data, str):
+            data = data.encode('utf-8')
+            if not has_header('Content-Type'):
+                headers.append(('Content-Type', 'text/plain'))
+
+        with self.lock:
+            self._ensure_state()
+            self.id_counter += 1
+            cmd = {
+                'action': 'download', 'id': self.id_counter, 'url': url, 'method': method, 'timeout': timeout,
+                'headers': self.addheaders + headers}
+            if data:
+                with open(os.path.join(self.tdir, f'i{self.id_counter}'), 'wb') as f:
+                    if hasattr(data, 'read'):
+                        shutil.copyfileobj(data, f)
+                    else:
+                        f.write(data)
+                cmd['data_path'] = f.name
+            res = FakeResponse()
+            self.dispatch_map[self.id_counter] = res.queue
+            self._send_command(cmd)
+        return res
+
+    open_novisit = open
+
+    def set_simple_cookie(self, name: str, value: str, domain: str | None = None, path: str | None = '/'):
+        '''
+        Set a simple cookie using a name and value. If domain is specified, the cookie is only sent with requests
+        to matching domains; otherwise it is sent with all requests. The leading dot in domain is optional.
+        Similarly, by default all paths match; to restrict matching to a certain path, use the path parameter.
+        '''
+        c = {'name': name, 'value': value, 'domain': domain, 'path': path}
+        self._send_command({'action': 'set_cookies', 'cookies': [c]})
+
+    def set_user_agent(self, val: str = '') -> None:
+        self.user_agent = val
+        self._send_command({'action': 'set_user_agent', 'user_agent': val})
+
+    def clone_browser(self):
+        return self
+
+    def _send_command(self, cmd):
+        with self.lock:
+            self.worker.stdin.write(json.dumps(cmd).encode())
+            self.worker.stdin.write(b'\n')
+            self.worker.stdin.flush()
+
+    def _ensure_state(self):
+        with self.lock:
+            if not self.tdir:
+                self.tdir = PersistentTemporaryDirectory()
+                self.worker = run_worker(self.tdir, self.user_agent)
+                self.dispatcher = Thread(target=self._dispatch, daemon=True)
+                self.dispatcher.start()
+
+    def _dispatch(self):
+        try:
+            for line in self.worker.stdout:
+                cmd = json.loads(line)
+                if cmd.get('action') == 'finished':
+                    with self.lock:
+                        q = self.dispatch_map.pop(cmd['id'])
+                    q.put(cmd)
+                else:
+                    raise Exception(f'Unexpected response from backend fetch worker process: {cmd}')
+        except Exception:
+            if not self.shutting_down:
+                import traceback
+                traceback.print_exc()
+
+    def shutdown(self):
+        self.shutting_down = True
+        import shutil
+        if self.worker:
+            with suppress(OSError):
+                self.worker.stdin.close()
+            with suppress(OSError):
+                self.worker.stdout.close()
+            give_up_at = time.monotonic() + 1.5
+            while time.monotonic() < give_up_at and self.worker.poll() is None:
+                time.sleep(0.01)
+            if self.worker.poll() is None:
+                self.worker.kill()
+        if self.tdir:
+            with suppress(OSError):
+                shutil.rmtree(self.tdir)
+            self.tdir = ''
+        if self.dispatcher:
+            self.dispatcher.join()
+            self.dispatcher = None
+
+    def __del__(self):
+        self.shutdown()
+
+
+def run_worker(tdir: str, user_agent: str):
+    from calibre.utils.ipc.simple_worker import start_pipe_worker
+    return start_pipe_worker(f'from calibre.scraper.qt import worker; worker({tdir!r}, {user_agent!r})')
+
+
+def worker(*args):
+    from calibre.gui2 import must_use_qt
+    must_use_qt()
+    from .qt_backend import worker
+    worker(*args)
+
+
+def develop():
+    import sys
+    br = Browser()
+    try:
+        for url in sys.argv[1:]:
+            res = br.open(url)
+            print(url, len(res.read()))
+    finally:
+        del br
+
+
+if __name__ == '__main__':
+    develop()
diff --git a/src/calibre/scraper/qt_backend.py b/src/calibre/scraper/qt_backend.py
index a4d51d852f..f4ef73d221 100644
--- a/src/calibre/scraper/qt_backend.py
+++ b/src/calibre/scraper/qt_backend.py
@@ -95,7 +95,8 @@ class DownloadRequest(QObject):
 
     def on_data_available(self) -> None:
         with open(self.output_path, 'ab') as f:
-            f.write(memoryview(self.reply.readAll()))
+            ba = self.reply.readAll()
+            f.write(memoryview(ba))
 
     def on_ssl_errors(self, err) -> None:
         pass
@@ -106,6 +107,16 @@
             'final_url': qurl_to_string(self.reply.url())}
         if e != QNetworkReply.NetworkError.NoError:
+            if e in (
+                QNetworkReply.NetworkError.TimeoutError,
+                QNetworkReply.NetworkError.TemporaryNetworkFailureError,
+
+                QNetworkReply.NetworkError.ConnectionRefusedError,
+                QNetworkReply.NetworkError.RemoteHostClosedError,
+                QNetworkReply.NetworkError.OperationCanceledError,  # abort() called in overall timeout check
+                QNetworkReply.NetworkError.SslHandshakeFailedError,
+            ):
+                self.worth_retry = True
             es = f'{e}: {self.reply.errorString()}'
             result['error'], result['worth_retry'] = es, self.worth_retry
         return result
 
@@ -248,27 +259,25 @@ def request_from_cmd(cmd: dict[str, Any], filename: str) -> Request:
     if timeout is None:
         timeout = default_timeout
     req: Request = {
-        'id': cmd.get(id) or 0,
+        'id': int(cmd['id']),
         'url': cmd['url'],
         'headers': cmd.get('headers') or [],
         'data_path': cmd.get('data_path') or '',
         'method': cmd.get('method') or 'get',
         'filename': filename,
-        'timeout': timeout,
+        'timeout': float(timeout),
     }
     return req
 
 
 def read_commands(backend: FetchBackend, tdir: str) -> None:
-    file_counter = 0
     error_msg = ''
     try:
         for line in sys.stdin:
             cmd = json.loads(line)
             ac = cmd['action']
             if ac == 'download':
-                file_counter += 1
-                backend.request_download.emit(request_from_cmd(cmd, str(file_counter)))
+                backend.request_download.emit(request_from_cmd(cmd, f'o{cmd["id"]}'))
             elif ac == 'set_cookies':
                 backend.set_cookies.emit(cmd['cookies'])
             elif ac == 'set_user_agent':
@@ -285,7 +294,7 @@ def read_commands(backend: FetchBackend, tdir: str) -> None:
 def worker(tdir: str, user_agent: str) -> None:
     app = QApplication.instance()
     sys.stdout = sys.stderr
-    backend = FetchBackend(parent=app, user_agent=user_agent)
+    backend = FetchBackend(parent=app, user_agent=user_agent, output_dir=tdir)
     try:
         read_thread = Thread(target=read_commands, args=(backend, tdir), daemon=True)
         read_thread.start()
@@ -312,7 +321,7 @@ def develop(url: str) -> None:
 
     backend.download_finished.connect(download_finished)
     for i, url in enumerate(sys.argv[1:]):
-        backend.download(request_from_cmd({'url':url}, f'test-output-{i}'))
+        backend.download(request_from_cmd({'url':url, 'id': i}, f'test-output-{i}'))
         num_left += 1
     app.exec()
 
diff --git a/src/calibre/scraper/test_fetch_backend.py b/src/calibre/scraper/test_fetch_backend.py
index 7e00037989..d0933a6559 100644
--- a/src/calibre/scraper/test_fetch_backend.py
+++ b/src/calibre/scraper/test_fetch_backend.py
@@ -12,8 +12,8 @@ from lxml.html import fromstring, tostring
 
 from calibre.utils.resources import get_path as P
 
+from .qt import Browser
 from .simple import Overseer
-from .webengine_download import Browser
 
 skip = ''
 is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
@@ -158,7 +158,7 @@ class TestFetchBackend(unittest.TestCase):
             br.set_user_agent('man in black')
             r = get()
             self.ae(r['headers']['User-Agent'], ['man in black'])
-            self.ae(r['headers']['Cookie'], ['sc=1; cook=ie'])
+            self.ae(r['headers']['Cookie'], ['cook=ie; sc=1'])
         finally:
             br.shutdown()
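
Note: the new qt.py exposes a mechanize-style Browser (addheaders, open, open_novisit, clone_browser) backed by the Qt fetch worker process. A minimal usage sketch, not part of the patch, using only the API introduced above; the URL and user-agent string are placeholders:

    from calibre.scraper.qt import Browser

    # start_worker=True spawns the worker up front so set_simple_cookie()
    # can send its command before the first open()
    br = Browser(user_agent='my-agent/1.0', start_worker=True)
    try:
        # No domain given, so the cookie is sent with all requests; path defaults to '/'
        br.set_simple_cookie('sc', '1')
        res = br.open('https://example.com', timeout=60)
        # read() blocks until the worker process reports this download finished
        print(res.geturl(), len(res.read()))
    finally:
        br.shutdown()  # terminate the worker and remove the temporary directory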