Qt browser frontend is now functional with tests

This commit is contained in:
Kovid Goyal 2024-08-14 12:32:45 +05:30
parent 8c0bb15478
commit e28f9c3ed3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 235 additions and 10 deletions

216
src/calibre/scraper/qt.py Normal file
View File

@ -0,0 +1,216 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import json
import os
import shutil
import time
from contextlib import suppress
from io import BytesIO
from queue import Queue
from threading import RLock, Thread
from urllib.error import URLError
from urllib.parse import urlencode
from urllib.request import Request
from calibre.ptempfile import PersistentTemporaryDirectory
class FakeResponse:
    '''
    File-like stand-in for a urllib response. Reads block until the
    background worker delivers a result dict on self.queue; the payload is
    then served from the output file the worker wrote.
    '''

    def __init__(self):
        self.queue = Queue()   # receives exactly one result dict from the dispatcher
        self.done = False
        self.final_url = ''
        self.data = BytesIO()  # replaced by the real output file once available

    def _wait(self):
        # Block for the worker's result; runs its body at most once.
        if not self.done:
            self.done = True
            res = self.queue.get()
            if res['action'] == 'input_error':
                raise Exception(res['error'])
            self.final_url = res['final_url']
            if 'error' in res:
                err = URLError(res['error'])
                err.worth_retry = bool(res.get('worth_retry'))
                raise err
            self.data = open(res['output'], 'rb')

    def read(self, *a, **kw):
        self._wait()
        return self.data.read(*a, **kw)

    def seek(self, *a, **kw):
        self._wait()
        return self.data.seek(*a, **kw)

    def tell(self, *a, **kw):
        # Intentionally does not wait: reports position in whatever buffer we have.
        return self.data.tell(*a, **kw)

    def geturl(self):
        self._wait()
        return self.final_url

    def close(self):
        self.data.close()
class Browser:
def __init__(self, user_agent: str = '', headers: tuple[tuple[str, str], ...] = (), start_worker: bool = False):
self.tdir = ''
self.worker = self.dispatcher = None
self.dispatch_map = {}
self.id_counter = 0
self.addheaders: list[tuple[str, str]] = list(headers)
self.user_agent = user_agent
self.lock = RLock()
self.shutting_down = False
if start_worker:
self._ensure_state()
def open(self, url_or_request: Request, data=None, timeout=None):
method = 'POST' if data else 'GET'
headers = []
if hasattr(url_or_request, 'get_method'):
r = url_or_request
method = r.get_method()
data = data or r.data
headers = r.header_items()
url = r.full_url
else:
url = url_or_request
def has_header(x: str) -> bool:
x = x.lower()
for (h, v) in headers:
if h.lower() == x:
return True
return False
if isinstance(data, dict):
headers.append(('Content-Type', 'application/x-www-form-urlencoded'))
data = urlencode(data)
if isinstance(data, str):
data = data.encode('utf-8')
if not has_header('Content-Type'):
headers.append(('Content-Type', 'text/plain'))
with self.lock:
self._ensure_state()
self.id_counter += 1
cmd = {
'action': 'download', 'id': self.id_counter, 'url': url, 'method': method, 'timeout': timeout,
'headers': self.addheaders + headers}
if data:
with open(os.path.join(self.tdir, f'i{self.id_counter}'), 'wb') as f:
if hasattr(data, 'read'):
shutil.copyfileobj(data, f)
else:
f.write(data)
cmd['data_path'] = f.name
res = FakeResponse()
self.dispatch_map[self.id_counter] = res.queue
self._send_command(cmd)
return res
open_novisit = open
def set_simple_cookie(self, name: str, value: str, domain: str | None = None, path: str | None = '/'):
'''
Set a simple cookie using a name and value. If domain is specified, the cookie is only sent with requests
to matching domains, otherwise it is sent with all requests. The leading dot in domain is optional.
Similarly, by default all paths match, to restrict to certain path use the path parameter.
'''
c = {'name': name, 'value': value, 'domain': domain, 'path': path}
self._send_command({'action': 'set_cookies', 'cookies':[c]})
def set_user_agent(self, val: str = '') -> None:
self.user_agent = val
self._send_command({'action': 'set_user_agent', 'user_agent': val})
def clone_browser(self):
return self
def _send_command(self, cmd):
with self.lock:
self.worker.stdin.write(json.dumps(cmd).encode())
self.worker.stdin.write(b'\n')
self.worker.stdin.flush()
def _ensure_state(self):
with self.lock:
if not self.tdir:
self.tdir = PersistentTemporaryDirectory()
self.worker = run_worker(self.tdir, self.user_agent)
self.dispatcher = Thread(target=self._dispatch, daemon=True)
self.dispatcher.start()
def _dispatch(self):
try:
for line in self.worker.stdout:
cmd = json.loads(line)
if cmd.get('action') == 'finished':
with self.lock:
q = self.dispatch_map.pop(cmd['id'])
q.put(cmd)
else:
raise Exception(f'Unexpected response from backend fetch worker process: {cmd}')
except Exception:
if not self.shutting_down:
import traceback
traceback.print_exc()
def shutdown(self):
self.shutting_down = True
import shutil
if self.worker:
with suppress(OSError):
self.worker.stdin.close()
with suppress(OSError):
self.worker.stdout.close()
give_up_at = time.monotonic() + 1.5
while time.monotonic() < give_up_at and self.worker.poll() is None:
time.sleep(0.01)
if self.worker.poll() is None:
self.worker.kill()
if self.tdir:
with suppress(OSError):
shutil.rmtree(self.tdir)
self.tdir = ''
if self.dispatcher:
self.dispatcher.join()
self.dispatcher = None
def __del__(self):
self.shutdown()
def run_worker(tdir: str, user_agent: str):
    '''Spawn the background fetch worker process, wired up for pipe I/O.'''
    from calibre.utils.ipc.simple_worker import start_pipe_worker
    entry = f'from calibre.scraper.qt import worker; worker({tdir!r}, {user_agent!r})'
    return start_pipe_worker(entry)
def worker(*args):
    '''Entry point run inside the spawned process: bring up Qt, then hand off to the backend.'''
    from calibre.gui2 import must_use_qt
    must_use_qt()
    # Alias the import so it does not shadow this function's own name
    from .qt_backend import worker as backend_worker
    backend_worker(*args)
def develop():
    '''Manual smoke test: download every URL given on the command line and print its size.'''
    import sys
    browser = Browser()
    try:
        for target in sys.argv[1:]:
            response = browser.open(target)
            print(target, len(response.read()))
    finally:
        # Triggers Browser.__del__ -> shutdown(), tearing down the worker process
        del browser


if __name__ == '__main__':
    develop()

View File

@ -95,7 +95,8 @@ class DownloadRequest(QObject):
def on_data_available(self) -> None:
with open(self.output_path, 'ab') as f:
f.write(memoryview(self.reply.readAll()))
ba = self.reply.readAll()
f.write(memoryview(ba))
def on_ssl_errors(self, err) -> None:
pass
@ -106,6 +107,16 @@ class DownloadRequest(QObject):
'final_url': qurl_to_string(self.reply.url())}
if e != QNetworkReply.NetworkError.NoError:
if e in (
QNetworkReply.NetworkError.TimeoutError,
QNetworkReply.NetworkError.TemporaryNetworkFailureError,
QNetworkReply.NetworkError.ConnectionRefusedError,
QNetworkReply.NetworkError.RemoteHostClosedError,
QNetworkReply.NetworkError.OperationCanceledError, # abort() called in overall timeout check
QNetworkReply.NetworkError.SslHandshakeFailedError,
):
self.worth_retry = True
es = f'{e}: {self.reply.errorString()}'
result['error'], result['worth_retry'] = es, self.worth_retry
return result
@ -248,27 +259,25 @@ def request_from_cmd(cmd: dict[str, Any], filename: str) -> Request:
if timeout is None:
timeout = default_timeout
req: Request = {
'id': cmd.get(id) or 0,
'id': int(cmd['id']),
'url': cmd['url'],
'headers': cmd.get('headers') or [],
'data_path': cmd.get('data_path') or '',
'method': cmd.get('method') or 'get',
'filename': filename,
'timeout': timeout,
'timeout': float(timeout),
}
return req
def read_commands(backend: FetchBackend, tdir: str) -> None:
file_counter = 0
error_msg = ''
try:
for line in sys.stdin:
cmd = json.loads(line)
ac = cmd['action']
if ac == 'download':
file_counter += 1
backend.request_download.emit(request_from_cmd(cmd, str(file_counter)))
backend.request_download.emit(request_from_cmd(cmd, f'o{cmd["id"]}'))
elif ac == 'set_cookies':
backend.set_cookies.emit(cmd['cookies'])
elif ac == 'set_user_agent':
@ -285,7 +294,7 @@ def read_commands(backend: FetchBackend, tdir: str) -> None:
def worker(tdir: str, user_agent: str) -> None:
app = QApplication.instance()
sys.stdout = sys.stderr
backend = FetchBackend(parent=app, user_agent=user_agent)
backend = FetchBackend(parent=app, user_agent=user_agent, output_dir=tdir)
try:
read_thread = Thread(target=read_commands, args=(backend, tdir), daemon=True)
read_thread.start()
@ -312,7 +321,7 @@ def develop(url: str) -> None:
backend.download_finished.connect(download_finished)
for i, url in enumerate(sys.argv[1:]):
backend.download(request_from_cmd({'url':url}, f'test-output-{i}'))
backend.download(request_from_cmd({'url':url, 'id': i}, f'test-output-{i}'))
num_left += 1
app.exec()

View File

@ -12,8 +12,8 @@ from lxml.html import fromstring, tostring
from calibre.utils.resources import get_path as P
from .qt import Browser
from .simple import Overseer
from .webengine_download import Browser
skip = ''
is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
@ -158,7 +158,7 @@ class TestFetchBackend(unittest.TestCase):
br.set_user_agent('man in black')
r = get()
self.ae(r['headers']['User-Agent'], ['man in black'])
self.ae(r['headers']['Cookie'], ['sc=1; cook=ie'])
self.ae(r['headers']['Cookie'], ['cook=ie; sc=1'])
finally:
br.shutdown()