mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get rid of the simple backend in favor of WebEngineBrowser
This commit is contained in:
parent
5b00e588b2
commit
f82da06184
@ -2,138 +2,41 @@
|
|||||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
import sys
|
||||||
import weakref
|
import weakref
|
||||||
from threading import Lock, Thread, get_ident
|
from threading import Lock
|
||||||
|
|
||||||
from calibre.constants import iswindows
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
from calibre.utils.filenames import retry_on_fail
|
|
||||||
from calibre.utils.ipc.simple_worker import start_pipe_worker
|
|
||||||
|
|
||||||
|
|
||||||
def worker_main(source):
|
|
||||||
from qt.core import QUrl
|
|
||||||
|
|
||||||
from calibre.gui2 import must_use_qt
|
|
||||||
from calibre.gui_launch import setup_qt_logging
|
|
||||||
setup_qt_logging()
|
|
||||||
|
|
||||||
from .simple_backend import SimpleScraper
|
|
||||||
must_use_qt()
|
|
||||||
s = SimpleScraper(source)
|
|
||||||
for line in sys.stdin.buffer:
|
|
||||||
line = line.strip()
|
|
||||||
if source == 'test':
|
|
||||||
print(line.decode('utf-8'), file=sys.stderr)
|
|
||||||
try:
|
|
||||||
cmd, rest = line.split(b':', 1)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
if cmd == b'EXIT':
|
|
||||||
raise SystemExit(int(rest))
|
|
||||||
if cmd == b'FETCH':
|
|
||||||
try:
|
|
||||||
d = json.loads(rest)
|
|
||||||
html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
|
|
||||||
except Exception as e:
|
|
||||||
import traceback
|
|
||||||
result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
|
|
||||||
else:
|
|
||||||
with PersistentTemporaryFile(suffix='-scraper-result.html') as t:
|
|
||||||
t.write(html.encode('utf-8'))
|
|
||||||
result = {'ok': True, 'html_file': t.name}
|
|
||||||
print(json.dumps(result), flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
overseers = []
|
overseers = []
|
||||||
|
|
||||||
|
|
||||||
class Overseer:
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.lock = Lock()
|
|
||||||
self.workers = {}
|
|
||||||
overseers.append(weakref.ref(self))
|
|
||||||
|
|
||||||
def safe_wait(self, w, timeout):
|
|
||||||
try:
|
|
||||||
return w.wait(timeout)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def worker_for_source(self, source):
|
|
||||||
wname = f'{source}::{get_ident()}'
|
|
||||||
with self.lock:
|
|
||||||
ans = self.workers.get(wname)
|
|
||||||
if ans is None:
|
|
||||||
w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
|
|
||||||
ans = self.workers[wname] = w
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def fetch_url(self, url_or_qurl, source='', timeout=60):
|
|
||||||
from qt.core import QUrl
|
|
||||||
w = self.worker_for_source(source)
|
|
||||||
if isinstance(url_or_qurl, str):
|
|
||||||
url_or_qurl = QUrl(url_or_qurl)
|
|
||||||
w.stdin.write(b'FETCH:')
|
|
||||||
w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
|
|
||||||
w.stdin.write(b'\n')
|
|
||||||
w.stdin.flush()
|
|
||||||
output = json.loads(w.stdout.readline())
|
|
||||||
if not output['ok']:
|
|
||||||
raise ValueError(output['err'])
|
|
||||||
with open(output['html_file'], 'rb') as f:
|
|
||||||
html = f.read().decode('utf-8')
|
|
||||||
retry_on_fail(os.remove, output['html_file'])
|
|
||||||
return html
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
with self.lock:
|
|
||||||
for w in self.workers.values():
|
|
||||||
w.stdin.write(b'EXIT:0\n')
|
|
||||||
w.stdin.flush()
|
|
||||||
w.stdin.close()
|
|
||||||
w.stdout.close()
|
|
||||||
for w in self.workers.values():
|
|
||||||
if self.safe_wait(w, 0.2) is None:
|
|
||||||
w.terminate()
|
|
||||||
if not iswindows:
|
|
||||||
if self.safe_wait(w, 0.1) is None:
|
|
||||||
w.kill()
|
|
||||||
self.workers.clear()
|
|
||||||
close = __del__
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_overseers():
|
def cleanup_overseers():
|
||||||
threads = []
|
browsers = tuple(filter(None, (x() for x in overseers)))
|
||||||
for x in overseers:
|
|
||||||
o = x()
|
|
||||||
if o is not None:
|
|
||||||
t = Thread(target=o.close, name='CloseOverSeer')
|
|
||||||
t.start()
|
|
||||||
threads.append(t)
|
|
||||||
del overseers[:]
|
del overseers[:]
|
||||||
|
|
||||||
def join_all():
|
def join_all():
|
||||||
for t in threads:
|
for br in browsers:
|
||||||
t.join()
|
br.shutdown()
|
||||||
return join_all
|
return join_all
|
||||||
|
|
||||||
|
|
||||||
read_url_lock = Lock()
|
read_url_lock = Lock()
|
||||||
|
|
||||||
|
|
||||||
def read_url(storage, url, timeout=60):
|
def read_url(storage, url, timeout=60, as_html=True):
|
||||||
with read_url_lock:
|
with read_url_lock:
|
||||||
|
from calibre.scraper.qt import WebEngineBrowser
|
||||||
if not storage:
|
if not storage:
|
||||||
storage.append(Overseer())
|
storage.append(WebEngineBrowser())
|
||||||
|
overseers.append(weakref.ref(storage[-1]))
|
||||||
scraper = storage[0]
|
scraper = storage[0]
|
||||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
raw_bytes = scraper.open_novisit(url, timeout=timeout).read()
|
||||||
return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
|
if not as_html:
|
||||||
|
return raw_bytes
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
return xml_to_unicode(raw_bytes, strip_encoding_pats=True)[0]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(read_url([], sys.argv[-1]))
|
try:
|
||||||
|
print(read_url([], sys.argv[-1]))
|
||||||
|
finally:
|
||||||
|
cleanup_overseers()()
|
||||||
|
@ -1,164 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
|
|
||||||
import json
|
|
||||||
import secrets
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from functools import lru_cache
|
|
||||||
|
|
||||||
from qt.core import QApplication, QEventLoop, QUrl
|
|
||||||
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
|
|
||||||
|
|
||||||
from calibre.utils.webengine import create_script, insert_scripts, setup_profile
|
|
||||||
|
|
||||||
|
|
||||||
def canonicalize_qurl(qurl):
|
|
||||||
qurl = qurl.adjusted(
|
|
||||||
QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments | QUrl.UrlFormattingOption.RemoveFragment
|
|
||||||
)
|
|
||||||
if qurl.path() == '/':
|
|
||||||
qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath)
|
|
||||||
return qurl
|
|
||||||
|
|
||||||
|
|
||||||
def create_base_profile(cache_name='', allow_js=False):
|
|
||||||
from calibre.utils.random_ua import random_common_chrome_user_agent
|
|
||||||
if cache_name:
|
|
||||||
ans = QWebEngineProfile(cache_name, QApplication.instance())
|
|
||||||
else:
|
|
||||||
ans = QWebEngineProfile(QApplication.instance())
|
|
||||||
setup_profile(ans)
|
|
||||||
ans.setHttpUserAgent(random_common_chrome_user_agent())
|
|
||||||
ans.setHttpCacheMaximumSize(0) # managed by webengine
|
|
||||||
s = ans.settings()
|
|
||||||
a = s.setAttribute
|
|
||||||
a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
|
|
||||||
a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
|
|
||||||
s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
|
|
||||||
a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
|
|
||||||
a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
|
|
||||||
# ensure javascript cannot read from local files
|
|
||||||
a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
|
|
||||||
a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
|
|
||||||
return ans
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def create_profile(cache_name='', allow_js=False):
|
|
||||||
ans = create_base_profile(cache_name, allow_js)
|
|
||||||
ans.token = secrets.token_hex()
|
|
||||||
js = '''
|
|
||||||
(function() {
|
|
||||||
"use strict";
|
|
||||||
|
|
||||||
function send_msg(data) {
|
|
||||||
var token = 'TOKEN';
|
|
||||||
var msg = token + ' ' + JSON.stringify(data);
|
|
||||||
console.log(msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
function debug() {
|
|
||||||
var args = Array.prototype.slice.call(arguments);
|
|
||||||
var text = args.join(' ');
|
|
||||||
send_msg({type: 'print', text: text});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
|
|
||||||
send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
|
|
||||||
}
|
|
||||||
})();
|
|
||||||
'''
|
|
||||||
js = js.replace('TOKEN', ans.token)
|
|
||||||
insert_scripts(ans, create_script('scraper.js', js))
|
|
||||||
return ans
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleScraper(QWebEnginePage):
|
|
||||||
|
|
||||||
def __init__(self, source='', parent=None):
|
|
||||||
profile = create_profile(source)
|
|
||||||
self.token = profile.token
|
|
||||||
self.is_being_tested = source == 'test'
|
|
||||||
super().__init__(profile, parent)
|
|
||||||
self.setAudioMuted(True)
|
|
||||||
self.loadStarted.connect(self.load_started)
|
|
||||||
self.loadFinished.connect(self.load_finished)
|
|
||||||
self.loadProgress.connect(self.load_progress)
|
|
||||||
|
|
||||||
def print(self, *a):
|
|
||||||
print(*a, file=sys.stderr)
|
|
||||||
|
|
||||||
def load_started(self):
|
|
||||||
if self.is_being_tested:
|
|
||||||
self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}')
|
|
||||||
if self.is_current_url:
|
|
||||||
self.current_fetch['load_started'] = True
|
|
||||||
|
|
||||||
def load_finished(self, ok):
|
|
||||||
if self.is_being_tested:
|
|
||||||
self.print(f'load_finished: {ok=} {self.is_current_url=}')
|
|
||||||
if self.is_current_url:
|
|
||||||
self.current_fetch['load_finished'] = True
|
|
||||||
self.current_fetch['load_was_ok'] = ok
|
|
||||||
if not ok and self.is_current_url:
|
|
||||||
self.current_fetch['working'] = False
|
|
||||||
|
|
||||||
def load_progress(self, progress):
|
|
||||||
if self.is_being_tested:
|
|
||||||
self.print(f'load_progress: {progress=} {self.is_current_url=}')
|
|
||||||
if self.is_current_url:
|
|
||||||
self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']
|
|
||||||
|
|
||||||
def javaScriptAlert(self, url, msg):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def javaScriptConfirm(self, url, msg):
|
|
||||||
return True
|
|
||||||
|
|
||||||
def javaScriptPrompt(self, url, msg, defval):
|
|
||||||
return True, defval
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_current_url(self):
|
|
||||||
if not hasattr(self, 'current_fetch'):
|
|
||||||
return False
|
|
||||||
return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url']
|
|
||||||
|
|
||||||
def javaScriptConsoleMessage(self, level, message, line_num, source_id):
|
|
||||||
parts = message.split(maxsplit=1)
|
|
||||||
if len(parts) == 2 and parts[0] == self.token:
|
|
||||||
msg = json.loads(parts[1])
|
|
||||||
t = msg.get('type')
|
|
||||||
if t == 'print':
|
|
||||||
print(msg['text'], file=sys.stderr)
|
|
||||||
elif t == 'domready':
|
|
||||||
if self.is_being_tested:
|
|
||||||
self.print(f'domready: {self.is_current_url=}')
|
|
||||||
if self.is_current_url:
|
|
||||||
self.triggerAction(QWebEnginePage.WebAction.Stop)
|
|
||||||
self.current_fetch['working'] = False
|
|
||||||
if not msg.get('failed'):
|
|
||||||
self.current_fetch['html'] = msg['html']
|
|
||||||
|
|
||||||
def fetch(self, url_or_qurl, timeout=60):
|
|
||||||
fetching_url = QUrl(url_or_qurl)
|
|
||||||
self.current_fetch = {
|
|
||||||
'timeout': timeout, 'end_time': time.monotonic() + timeout,
|
|
||||||
'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
|
|
||||||
'load_started': False
|
|
||||||
}
|
|
||||||
self.load(fetching_url)
|
|
||||||
try:
|
|
||||||
app = QApplication.instance()
|
|
||||||
while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
|
|
||||||
app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
|
|
||||||
ans = self.current_fetch.get('html')
|
|
||||||
if ans is None:
|
|
||||||
eurl = fetching_url.toString()
|
|
||||||
if self.current_fetch['working']:
|
|
||||||
raise TimeoutError(f'Timed out loading HTML from: {eurl}')
|
|
||||||
raise ValueError(f'Failed to load HTML from: {eurl}')
|
|
||||||
return ans
|
|
||||||
finally:
|
|
||||||
del self.current_fetch
|
|
@ -4,16 +4,10 @@
|
|||||||
import http.server
|
import http.server
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import unittest
|
import unittest
|
||||||
from threading import Event, Thread
|
from threading import Event, Thread
|
||||||
|
|
||||||
from lxml.html import fromstring, tostring
|
|
||||||
|
|
||||||
from calibre.utils.resources import get_path as P
|
|
||||||
|
|
||||||
from .qt import Browser, WebEngineBrowser
|
from .qt import Browser, WebEngineBrowser
|
||||||
from .simple import Overseer
|
|
||||||
|
|
||||||
skip = ''
|
skip = ''
|
||||||
is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
|
is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
|
||||||
@ -23,30 +17,6 @@ elif 'SKIP_QT_BUILD_TEST' in os.environ:
|
|||||||
skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
|
skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
|
||||||
|
|
||||||
|
|
||||||
@unittest.skipIf(skip, skip)
|
|
||||||
class TestSimpleWebEngineScraper(unittest.TestCase):
|
|
||||||
|
|
||||||
def test_dom_load(self):
|
|
||||||
from qt.core import QUrl
|
|
||||||
overseer = Overseer()
|
|
||||||
for f in ('book', 'nav'):
|
|
||||||
path = P(f'templates/new_{f}.html', allow_user_override=False)
|
|
||||||
url = QUrl.fromLocalFile(path)
|
|
||||||
html = overseer.fetch_url(url, 'test')
|
|
||||||
|
|
||||||
def c(a):
|
|
||||||
ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
|
|
||||||
return re.sub(r'\s+', ' ', ans)
|
|
||||||
with open(path, 'rb') as f:
|
|
||||||
raw = f.read().decode('utf-8')
|
|
||||||
self.assertEqual(c(html), c(raw))
|
|
||||||
self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
|
|
||||||
w = overseer.workers
|
|
||||||
self.assertEqual(len(w), 1)
|
|
||||||
del overseer
|
|
||||||
self.assertFalse(w)
|
|
||||||
|
|
||||||
|
|
||||||
class Handler(http.server.BaseHTTPRequestHandler):
|
class Handler(http.server.BaseHTTPRequestHandler):
|
||||||
|
|
||||||
def __init__(self, test_obj, *a):
|
def __init__(self, test_obj, *a):
|
||||||
@ -192,6 +162,4 @@ class TestFetchBackend(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
def find_tests():
|
def find_tests():
|
||||||
ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestSimpleWebEngineScraper)
|
return unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)
|
||||||
ans.addTests(iter(unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)))
|
|
||||||
return ans
|
|
||||||
|
@ -13,13 +13,35 @@ from http import HTTPStatus
|
|||||||
from time import monotonic
|
from time import monotonic
|
||||||
|
|
||||||
from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
|
from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
|
||||||
from qt.webengine import QWebEnginePage, QWebEngineScript
|
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineScript, QWebEngineSettings
|
||||||
|
|
||||||
from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
|
from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
|
||||||
from calibre.scraper.qt_backend import worker as qt_worker
|
from calibre.scraper.qt_backend import worker as qt_worker
|
||||||
from calibre.scraper.simple_backend import create_base_profile
|
|
||||||
from calibre.utils.resources import get_path as P
|
from calibre.utils.resources import get_path as P
|
||||||
from calibre.utils.webengine import create_script, insert_scripts
|
from calibre.utils.webengine import create_script, insert_scripts, setup_profile
|
||||||
|
|
||||||
|
|
||||||
|
def create_base_profile(cache_name='', allow_js=False):
|
||||||
|
from calibre.utils.random_ua import random_common_chrome_user_agent
|
||||||
|
if cache_name:
|
||||||
|
ans = QWebEngineProfile(cache_name, QApplication.instance())
|
||||||
|
else:
|
||||||
|
ans = QWebEngineProfile(QApplication.instance())
|
||||||
|
setup_profile(ans)
|
||||||
|
ans.setHttpUserAgent(random_common_chrome_user_agent())
|
||||||
|
ans.setHttpCacheMaximumSize(0) # managed by webengine
|
||||||
|
s = ans.settings()
|
||||||
|
a = s.setAttribute
|
||||||
|
a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
|
||||||
|
a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
|
||||||
|
s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
|
||||||
|
a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
|
||||||
|
a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
|
||||||
|
# ensure javascript cannot read from local files
|
||||||
|
a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
|
||||||
|
a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class DownloadRequest(QObject):
|
class DownloadRequest(QObject):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user