mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Qt browser frontend is not functional with tests
This commit is contained in:
parent
8c0bb15478
commit
e28f9c3ed3
216
src/calibre/scraper/qt.py
Normal file
216
src/calibre/scraper/qt.py
Normal file
@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from queue import Queue
|
||||
from threading import RLock, Thread
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urlencode
|
||||
from urllib.request import Request
|
||||
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
|
||||
class FakeResponse:
|
||||
|
||||
def __init__(self):
|
||||
self.queue = Queue()
|
||||
self.done = False
|
||||
self.final_url = ''
|
||||
self.data = BytesIO()
|
||||
|
||||
def _wait(self):
|
||||
if self.done:
|
||||
return
|
||||
self.done = True
|
||||
res = self.queue.get()
|
||||
if res['action'] == 'input_error':
|
||||
raise Exception(res['error'])
|
||||
self.final_url = res['final_url']
|
||||
if 'error' in res:
|
||||
ex = URLError(res['error'])
|
||||
ex.worth_retry = bool(res.get('worth_retry'))
|
||||
raise ex
|
||||
self.data = open(res['output'], 'rb')
|
||||
|
||||
def read(self, *a, **kw):
|
||||
self._wait()
|
||||
ans = self.data.read(*a, **kw)
|
||||
return ans
|
||||
|
||||
def seek(self, *a, **kw):
|
||||
self._wait()
|
||||
return self.data.seek(*a, **kw)
|
||||
|
||||
def tell(self, *a, **kw):
|
||||
return self.data.tell(*a, **kw)
|
||||
|
||||
def geturl(self):
|
||||
self._wait()
|
||||
return self.final_url
|
||||
|
||||
def close(self):
|
||||
self.data.close()
|
||||
|
||||
|
||||
class Browser:
|
||||
|
||||
def __init__(self, user_agent: str = '', headers: tuple[tuple[str, str], ...] = (), start_worker: bool = False):
|
||||
self.tdir = ''
|
||||
self.worker = self.dispatcher = None
|
||||
self.dispatch_map = {}
|
||||
self.id_counter = 0
|
||||
self.addheaders: list[tuple[str, str]] = list(headers)
|
||||
self.user_agent = user_agent
|
||||
self.lock = RLock()
|
||||
self.shutting_down = False
|
||||
if start_worker:
|
||||
self._ensure_state()
|
||||
|
||||
def open(self, url_or_request: Request, data=None, timeout=None):
|
||||
method = 'POST' if data else 'GET'
|
||||
headers = []
|
||||
if hasattr(url_or_request, 'get_method'):
|
||||
r = url_or_request
|
||||
method = r.get_method()
|
||||
data = data or r.data
|
||||
headers = r.header_items()
|
||||
url = r.full_url
|
||||
else:
|
||||
url = url_or_request
|
||||
|
||||
def has_header(x: str) -> bool:
|
||||
x = x.lower()
|
||||
for (h, v) in headers:
|
||||
if h.lower() == x:
|
||||
return True
|
||||
return False
|
||||
|
||||
if isinstance(data, dict):
|
||||
headers.append(('Content-Type', 'application/x-www-form-urlencoded'))
|
||||
data = urlencode(data)
|
||||
if isinstance(data, str):
|
||||
data = data.encode('utf-8')
|
||||
if not has_header('Content-Type'):
|
||||
headers.append(('Content-Type', 'text/plain'))
|
||||
|
||||
with self.lock:
|
||||
self._ensure_state()
|
||||
self.id_counter += 1
|
||||
cmd = {
|
||||
'action': 'download', 'id': self.id_counter, 'url': url, 'method': method, 'timeout': timeout,
|
||||
'headers': self.addheaders + headers}
|
||||
if data:
|
||||
with open(os.path.join(self.tdir, f'i{self.id_counter}'), 'wb') as f:
|
||||
if hasattr(data, 'read'):
|
||||
shutil.copyfileobj(data, f)
|
||||
else:
|
||||
f.write(data)
|
||||
cmd['data_path'] = f.name
|
||||
res = FakeResponse()
|
||||
self.dispatch_map[self.id_counter] = res.queue
|
||||
self._send_command(cmd)
|
||||
return res
|
||||
|
||||
open_novisit = open
|
||||
|
||||
def set_simple_cookie(self, name: str, value: str, domain: str | None = None, path: str | None = '/'):
|
||||
'''
|
||||
Set a simple cookie using a name and value. If domain is specified, the cookie is only sent with requests
|
||||
to matching domains, otherwise it is sent with all requests. The leading dot in domain is optional.
|
||||
Similarly, by default all paths match, to restrict to certain path use the path parameter.
|
||||
'''
|
||||
c = {'name': name, 'value': value, 'domain': domain, 'path': path}
|
||||
self._send_command({'action': 'set_cookies', 'cookies':[c]})
|
||||
|
||||
def set_user_agent(self, val: str = '') -> None:
|
||||
self.user_agent = val
|
||||
self._send_command({'action': 'set_user_agent', 'user_agent': val})
|
||||
|
||||
def clone_browser(self):
|
||||
return self
|
||||
|
||||
def _send_command(self, cmd):
|
||||
with self.lock:
|
||||
self.worker.stdin.write(json.dumps(cmd).encode())
|
||||
self.worker.stdin.write(b'\n')
|
||||
self.worker.stdin.flush()
|
||||
|
||||
def _ensure_state(self):
|
||||
with self.lock:
|
||||
if not self.tdir:
|
||||
self.tdir = PersistentTemporaryDirectory()
|
||||
self.worker = run_worker(self.tdir, self.user_agent)
|
||||
self.dispatcher = Thread(target=self._dispatch, daemon=True)
|
||||
self.dispatcher.start()
|
||||
|
||||
def _dispatch(self):
|
||||
try:
|
||||
for line in self.worker.stdout:
|
||||
cmd = json.loads(line)
|
||||
if cmd.get('action') == 'finished':
|
||||
with self.lock:
|
||||
q = self.dispatch_map.pop(cmd['id'])
|
||||
q.put(cmd)
|
||||
else:
|
||||
raise Exception(f'Unexpected response from backend fetch worker process: {cmd}')
|
||||
except Exception:
|
||||
if not self.shutting_down:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def shutdown(self):
|
||||
self.shutting_down = True
|
||||
import shutil
|
||||
if self.worker:
|
||||
with suppress(OSError):
|
||||
self.worker.stdin.close()
|
||||
with suppress(OSError):
|
||||
self.worker.stdout.close()
|
||||
give_up_at = time.monotonic() + 1.5
|
||||
while time.monotonic() < give_up_at and self.worker.poll() is None:
|
||||
time.sleep(0.01)
|
||||
if self.worker.poll() is None:
|
||||
self.worker.kill()
|
||||
if self.tdir:
|
||||
with suppress(OSError):
|
||||
shutil.rmtree(self.tdir)
|
||||
self.tdir = ''
|
||||
if self.dispatcher:
|
||||
self.dispatcher.join()
|
||||
self.dispatcher = None
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
|
||||
|
||||
def run_worker(tdir: str, user_agent: str):
|
||||
from calibre.utils.ipc.simple_worker import start_pipe_worker
|
||||
return start_pipe_worker(f'from calibre.scraper.qt import worker; worker({tdir!r}, {user_agent!r})')
|
||||
|
||||
|
||||
def worker(*args):
|
||||
from calibre.gui2 import must_use_qt
|
||||
must_use_qt()
|
||||
from .qt_backend import worker
|
||||
worker(*args)
|
||||
|
||||
|
||||
def develop():
|
||||
import sys
|
||||
br = Browser()
|
||||
try:
|
||||
for url in sys.argv[1:]:
|
||||
res = br.open(url)
|
||||
print(url, len(res.read()))
|
||||
finally:
|
||||
del br
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
develop()
|
@ -95,7 +95,8 @@ class DownloadRequest(QObject):
|
||||
|
||||
def on_data_available(self) -> None:
|
||||
with open(self.output_path, 'ab') as f:
|
||||
f.write(memoryview(self.reply.readAll()))
|
||||
ba = self.reply.readAll()
|
||||
f.write(memoryview(ba))
|
||||
|
||||
def on_ssl_errors(self, err) -> None:
|
||||
pass
|
||||
@ -106,6 +107,16 @@ class DownloadRequest(QObject):
|
||||
'final_url': qurl_to_string(self.reply.url())}
|
||||
|
||||
if e != QNetworkReply.NetworkError.NoError:
|
||||
if e in (
|
||||
QNetworkReply.NetworkError.TimeoutError,
|
||||
QNetworkReply.NetworkError.TemporaryNetworkFailureError,
|
||||
|
||||
QNetworkReply.NetworkError.ConnectionRefusedError,
|
||||
QNetworkReply.NetworkError.RemoteHostClosedError,
|
||||
QNetworkReply.NetworkError.OperationCanceledError, # abort() called in overall timeout check
|
||||
QNetworkReply.NetworkError.SslHandshakeFailedError,
|
||||
):
|
||||
self.worth_retry = True
|
||||
es = f'{e}: {self.reply.errorString()}'
|
||||
result['error'], result['worth_retry'] = es, self.worth_retry
|
||||
return result
|
||||
@ -248,27 +259,25 @@ def request_from_cmd(cmd: dict[str, Any], filename: str) -> Request:
|
||||
if timeout is None:
|
||||
timeout = default_timeout
|
||||
req: Request = {
|
||||
'id': cmd.get(id) or 0,
|
||||
'id': int(cmd['id']),
|
||||
'url': cmd['url'],
|
||||
'headers': cmd.get('headers') or [],
|
||||
'data_path': cmd.get('data_path') or '',
|
||||
'method': cmd.get('method') or 'get',
|
||||
'filename': filename,
|
||||
'timeout': timeout,
|
||||
'timeout': float(timeout),
|
||||
}
|
||||
return req
|
||||
|
||||
|
||||
def read_commands(backend: FetchBackend, tdir: str) -> None:
|
||||
file_counter = 0
|
||||
error_msg = ''
|
||||
try:
|
||||
for line in sys.stdin:
|
||||
cmd = json.loads(line)
|
||||
ac = cmd['action']
|
||||
if ac == 'download':
|
||||
file_counter += 1
|
||||
backend.request_download.emit(request_from_cmd(cmd, str(file_counter)))
|
||||
backend.request_download.emit(request_from_cmd(cmd, f'o{cmd["id"]}'))
|
||||
elif ac == 'set_cookies':
|
||||
backend.set_cookies.emit(cmd['cookies'])
|
||||
elif ac == 'set_user_agent':
|
||||
@ -285,7 +294,7 @@ def read_commands(backend: FetchBackend, tdir: str) -> None:
|
||||
def worker(tdir: str, user_agent: str) -> None:
|
||||
app = QApplication.instance()
|
||||
sys.stdout = sys.stderr
|
||||
backend = FetchBackend(parent=app, user_agent=user_agent)
|
||||
backend = FetchBackend(parent=app, user_agent=user_agent, output_dir=tdir)
|
||||
try:
|
||||
read_thread = Thread(target=read_commands, args=(backend, tdir), daemon=True)
|
||||
read_thread.start()
|
||||
@ -312,7 +321,7 @@ def develop(url: str) -> None:
|
||||
|
||||
backend.download_finished.connect(download_finished)
|
||||
for i, url in enumerate(sys.argv[1:]):
|
||||
backend.download(request_from_cmd({'url':url}, f'test-output-{i}'))
|
||||
backend.download(request_from_cmd({'url':url, 'id': i}, f'test-output-{i}'))
|
||||
num_left += 1
|
||||
app.exec()
|
||||
|
||||
|
@ -12,8 +12,8 @@ from lxml.html import fromstring, tostring
|
||||
|
||||
from calibre.utils.resources import get_path as P
|
||||
|
||||
from .qt import Browser
|
||||
from .simple import Overseer
|
||||
from .webengine_download import Browser
|
||||
|
||||
skip = ''
|
||||
is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
|
||||
@ -158,7 +158,7 @@ class TestFetchBackend(unittest.TestCase):
|
||||
br.set_user_agent('man in black')
|
||||
r = get()
|
||||
self.ae(r['headers']['User-Agent'], ['man in black'])
|
||||
self.ae(r['headers']['Cookie'], ['sc=1; cook=ie'])
|
||||
self.ae(r['headers']['Cookie'], ['cook=ie; sc=1'])
|
||||
finally:
|
||||
br.shutdown()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user