Get rid of the simple backend in favor of WebEngineBrowser

Kovid Goyal 2024-08-16 20:22:20 +05:30
parent 5b00e588b2
commit f82da06184
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 43 additions and 314 deletions

View File: src/calibre/scraper/simple.py

@@ -2,138 +2,41 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
-import json
-import os
 import sys
 import weakref
-from threading import Lock, Thread, get_ident
-from calibre.constants import iswindows
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.filenames import retry_on_fail
-from calibre.utils.ipc.simple_worker import start_pipe_worker
-def worker_main(source):
-    from qt.core import QUrl
-    from calibre.gui2 import must_use_qt
-    from calibre.gui_launch import setup_qt_logging
-    setup_qt_logging()
-    from .simple_backend import SimpleScraper
-    must_use_qt()
-    s = SimpleScraper(source)
-    for line in sys.stdin.buffer:
-        line = line.strip()
-        if source == 'test':
-            print(line.decode('utf-8'), file=sys.stderr)
-        try:
-            cmd, rest = line.split(b':', 1)
-        except Exception:
-            continue
-        if cmd == b'EXIT':
-            raise SystemExit(int(rest))
-        if cmd == b'FETCH':
-            try:
-                d = json.loads(rest)
-                html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
-            except Exception as e:
-                import traceback
-                result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
-            else:
-                with PersistentTemporaryFile(suffix='-scraper-result.html') as t:
-                    t.write(html.encode('utf-8'))
-                result = {'ok': True, 'html_file': t.name}
-            print(json.dumps(result), flush=True)
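# Recap of the removed worker's line protocol (inferred from the handler above):
# every stdin line is b'CMD:payload'. b'EXIT:<status>' ends the worker with that
# exit status; b'FETCH:<json>' expects {'url': ..., 'timeout': ...} and answers
# with one JSON line on stdout, either {'ok': True, 'html_file': ...} or
# {'ok': False, 'err': ..., 'tb': ...}.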
+from threading import Lock
 overseers = []
-class Overseer:
-    def __init__(self):
-        self.lock = Lock()
-        self.workers = {}
-        overseers.append(weakref.ref(self))
-    def safe_wait(self, w, timeout):
-        try:
-            return w.wait(timeout)
-        except Exception:
-            pass
-    def worker_for_source(self, source):
-        wname = f'{source}::{get_ident()}'
-        with self.lock:
-            ans = self.workers.get(wname)
-            if ans is None:
-                w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
-                ans = self.workers[wname] = w
-        return ans
-    def fetch_url(self, url_or_qurl, source='', timeout=60):
-        from qt.core import QUrl
-        w = self.worker_for_source(source)
-        if isinstance(url_or_qurl, str):
-            url_or_qurl = QUrl(url_or_qurl)
-        w.stdin.write(b'FETCH:')
-        w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
-        w.stdin.write(b'\n')
-        w.stdin.flush()
-        output = json.loads(w.stdout.readline())
-        if not output['ok']:
-            raise ValueError(output['err'])
-        with open(output['html_file'], 'rb') as f:
-            html = f.read().decode('utf-8')
-        retry_on_fail(os.remove, output['html_file'])
-        return html
-    def __del__(self):
-        with self.lock:
-            for w in self.workers.values():
-                w.stdin.write(b'EXIT:0\n')
-                w.stdin.flush()
-                w.stdin.close()
-                w.stdout.close()
-            for w in self.workers.values():
-                if self.safe_wait(w, 0.2) is None:
-                    w.terminate()
-                    if not iswindows:
-                        if self.safe_wait(w, 0.1) is None:
-                            w.kill()
-            self.workers.clear()
-    close = __del__
 def cleanup_overseers():
-    threads = []
-    for x in overseers:
-        o = x()
-        if o is not None:
-            t = Thread(target=o.close, name='CloseOverSeer')
-            t.start()
-            threads.append(t)
+    browsers = tuple(filter(None, (x() for x in overseers)))
     del overseers[:]
     def join_all():
-        for t in threads:
-            t.join()
+        for br in browsers:
+            br.shutdown()
     return join_all
 read_url_lock = Lock()
-def read_url(storage, url, timeout=60):
+def read_url(storage, url, timeout=60, as_html=True):
     with read_url_lock:
+        from calibre.scraper.qt import WebEngineBrowser
         if not storage:
-            storage.append(Overseer())
+            storage.append(WebEngineBrowser())
+            overseers.append(weakref.ref(storage[-1]))
         scraper = storage[0]
-    from calibre.ebooks.chardet import strip_encoding_declarations
-    return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
+    raw_bytes = scraper.open_novisit(url, timeout=timeout).read()
+    if not as_html:
+        return raw_bytes
+    from calibre.ebooks.chardet import xml_to_unicode
+    return xml_to_unicode(raw_bytes, strip_encoding_pats=True)[0]
 if __name__ == '__main__':
-    print(read_url([], sys.argv[-1]))
+    try:
+        print(read_url([], sys.argv[-1]))
+    finally:
+        cleanup_overseers()()
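A minimal caller-side sketch of the new API (the URL is illustrative; the storage list keeps the lazily created browser alive between calls):

from calibre.scraper.simple import cleanup_overseers, read_url

storage = []  # read_url() caches one WebEngineBrowser instance here
try:
    html = read_url(storage, 'https://example.com')                # decoded text
    raw = read_url(storage, 'https://example.com', as_html=False)  # raw bytes
finally:
    cleanup_overseers()()  # returns a join_all callable; calling it shuts the browsers down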

View File: src/calibre/scraper/simple_backend.py (deleted)

@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
-import json
-import secrets
-import sys
-import time
-from functools import lru_cache
-from qt.core import QApplication, QEventLoop, QUrl
-from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
-from calibre.utils.webengine import create_script, insert_scripts, setup_profile
-def canonicalize_qurl(qurl):
-    qurl = qurl.adjusted(
-        QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments | QUrl.UrlFormattingOption.RemoveFragment
-    )
-    if qurl.path() == '/':
-        qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath)
-    return qurl
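# Worked example of the canonicalization above (illustrative URLs):
#   canonicalize_qurl(QUrl('https://host/a/../b/#frag')) -> QUrl('https://host/b')
#   canonicalize_qurl(QUrl('https://host/'))             -> QUrl('https://host')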
-def create_base_profile(cache_name='', allow_js=False):
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    if cache_name:
-        ans = QWebEngineProfile(cache_name, QApplication.instance())
-    else:
-        ans = QWebEngineProfile(QApplication.instance())
-    setup_profile(ans)
-    ans.setHttpUserAgent(random_common_chrome_user_agent())
-    ans.setHttpCacheMaximumSize(0)  # managed by webengine
-    s = ans.settings()
-    a = s.setAttribute
-    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
-    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
-    # ensure javascript cannot read from local files
-    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
-    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
-    return ans
-@lru_cache(maxsize=None)
-def create_profile(cache_name='', allow_js=False):
-    ans = create_base_profile(cache_name, allow_js)
-    ans.token = secrets.token_hex()
-    js = '''
-(function() {
-    "use strict";
-    function send_msg(data) {
-        var token = 'TOKEN';
-        var msg = token + ' ' + JSON.stringify(data);
-        console.log(msg);
-    }
-    function debug() {
-        var args = Array.prototype.slice.call(arguments);
-        var text = args.join(' ');
-        send_msg({type: 'print', text: text});
-    }
-    if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
-        send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
-    }
-})();
-'''
-    js = js.replace('TOKEN', ans.token)
-    insert_scripts(ans, create_script('scraper.js', js))
-    return ans
-class SimpleScraper(QWebEnginePage):
-    def __init__(self, source='', parent=None):
-        profile = create_profile(source)
-        self.token = profile.token
-        self.is_being_tested = source == 'test'
-        super().__init__(profile, parent)
-        self.setAudioMuted(True)
-        self.loadStarted.connect(self.load_started)
-        self.loadFinished.connect(self.load_finished)
-        self.loadProgress.connect(self.load_progress)
-    def print(self, *a):
-        print(*a, file=sys.stderr)
-    def load_started(self):
-        if self.is_being_tested:
-            self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}')
-        if self.is_current_url:
-            self.current_fetch['load_started'] = True
-    def load_finished(self, ok):
-        if self.is_being_tested:
-            self.print(f'load_finished: {ok=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['load_finished'] = True
-            self.current_fetch['load_was_ok'] = ok
-        if not ok and self.is_current_url:
-            self.current_fetch['working'] = False
-    def load_progress(self, progress):
-        if self.is_being_tested:
-            self.print(f'load_progress: {progress=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']
-    def javaScriptAlert(self, url, msg):
-        pass
-    def javaScriptConfirm(self, url, msg):
-        return True
-    def javaScriptPrompt(self, url, msg, defval):
-        return True, defval
-    @property
-    def is_current_url(self):
-        if not hasattr(self, 'current_fetch'):
-            return False
-        return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url']
-    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
-        parts = message.split(maxsplit=1)
-        if len(parts) == 2 and parts[0] == self.token:
-            msg = json.loads(parts[1])
-            t = msg.get('type')
-            if t == 'print':
-                print(msg['text'], file=sys.stderr)
-            elif t == 'domready':
-                if self.is_being_tested:
-                    self.print(f'domready: {self.is_current_url=}')
-                if self.is_current_url:
-                    self.triggerAction(QWebEnginePage.WebAction.Stop)
-                    self.current_fetch['working'] = False
-                    if not msg.get('failed'):
-                        self.current_fetch['html'] = msg['html']
-    def fetch(self, url_or_qurl, timeout=60):
-        fetching_url = QUrl(url_or_qurl)
-        self.current_fetch = {
-            'timeout': timeout, 'end_time': time.monotonic() + timeout,
-            'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
-            'load_started': False
-        }
-        self.load(fetching_url)
-        try:
-            app = QApplication.instance()
-            while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
-                app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
-            ans = self.current_fetch.get('html')
-            if ans is None:
-                eurl = fetching_url.toString()
-                if self.current_fetch['working']:
-                    raise TimeoutError(f'Timed out loading HTML from: {eurl}')
-                raise ValueError(f'Failed to load HTML from: {eurl}')
-            return ans
-        finally:
-            del self.current_fetch
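For callers migrating off this deleted class, the equivalent call under the new backend (a sketch; only the methods visible in this diff are assumed):

# old: SimpleScraper(source).fetch(url, timeout=60)             -> unicode HTML
# new: WebEngineBrowser().open_novisit(url, timeout=60).read()  -> raw bytes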

View File: src/calibre/scraper/test_fetch_backend.py

@@ -4,16 +4,10 @@
 import http.server
 import json
 import os
-import re
 import unittest
 from threading import Event, Thread
-from lxml.html import fromstring, tostring
-from calibre.utils.resources import get_path as P
 from .qt import Browser, WebEngineBrowser
-from .simple import Overseer
 skip = ''
 is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
@@ -23,30 +17,6 @@ elif 'SKIP_QT_BUILD_TEST' in os.environ:
     skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
-@unittest.skipIf(skip, skip)
-class TestSimpleWebEngineScraper(unittest.TestCase):
-    def test_dom_load(self):
-        from qt.core import QUrl
-        overseer = Overseer()
-        for f in ('book', 'nav'):
-            path = P(f'templates/new_{f}.html', allow_user_override=False)
-            url = QUrl.fromLocalFile(path)
-            html = overseer.fetch_url(url, 'test')
-            def c(a):
-                ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
-                return re.sub(r'\s+', ' ', ans)
-            with open(path, 'rb') as f:
-                raw = f.read().decode('utf-8')
-            self.assertEqual(c(html), c(raw))
-        self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
-        w = overseer.workers
-        self.assertEqual(len(w), 1)
-        del overseer
-        self.assertFalse(w)
 class Handler(http.server.BaseHTTPRequestHandler):
     def __init__(self, test_obj, *a):
@@ -192,6 +162,4 @@ class TestFetchBackend(unittest.TestCase):
 def find_tests():
-    ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestSimpleWebEngineScraper)
-    ans.addTests(iter(unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)))
-    return ans
+    return unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)

View File: src/calibre/scraper/qt.py

@@ -13,13 +13,35 @@ from http import HTTPStatus
 from time import monotonic
 from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
-from qt.webengine import QWebEnginePage, QWebEngineScript
+from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineScript, QWebEngineSettings
 from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
 from calibre.scraper.qt_backend import worker as qt_worker
-from calibre.scraper.simple_backend import create_base_profile
 from calibre.utils.resources import get_path as P
-from calibre.utils.webengine import create_script, insert_scripts
+from calibre.utils.webengine import create_script, insert_scripts, setup_profile
+def create_base_profile(cache_name='', allow_js=False):
+    from calibre.utils.random_ua import random_common_chrome_user_agent
+    if cache_name:
+        ans = QWebEngineProfile(cache_name, QApplication.instance())
+    else:
+        ans = QWebEngineProfile(QApplication.instance())
+    setup_profile(ans)
+    ans.setHttpUserAgent(random_common_chrome_user_agent())
+    ans.setHttpCacheMaximumSize(0)  # managed by webengine
+    s = ans.settings()
+    a = s.setAttribute
+    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
+    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
+    # ensure javascript cannot read from local files
+    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
+    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
+    return ans
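A sketch of how the relocated helper might be used (the cache name is illustrative; constructing a QWebEnginePage from a profile is standard Qt WebEngine):

profile = create_base_profile(cache_name='scraper', allow_js=True)
page = QWebEnginePage(profile, None)  # pages on this profile inherit the hardened settings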
 class DownloadRequest(QObject):