mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Get rid of the simple backend in favor of WebEngineBrowser
This commit is contained in:
parent
5b00e588b2
commit
f82da06184
@ -2,138 +2,41 @@
|
||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import weakref
|
||||
from threading import Lock, Thread, get_ident
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.utils.filenames import retry_on_fail
|
||||
from calibre.utils.ipc.simple_worker import start_pipe_worker
|
||||
|
||||
|
||||
def worker_main(source):
    """Serve scraper commands read line by line from stdin.

    Each command has the form ``NAME:payload``:

    * ``EXIT:<code>`` ends the process with the given integer exit status.
    * ``FETCH:<json>`` loads ``json['url']`` allowing ``json['timeout']``
      seconds and answers with exactly one JSON line on stdout, either
      ``{'ok': True, 'html_file': path}`` or
      ``{'ok': False, 'tb': traceback, 'err': message}``.

    Lines that contain no ``:`` separator are ignored. When *source* is
    ``'test'`` every received line is echoed to stderr.
    """
    from qt.core import QUrl

    from calibre.gui2 import must_use_qt
    from calibre.gui_launch import setup_qt_logging
    setup_qt_logging()

    from .simple_backend import SimpleScraper
    must_use_qt()
    scraper = SimpleScraper(source)
    for line in sys.stdin.buffer:
        line = line.strip()
        if source == 'test':
            print(line.decode('utf-8'), file=sys.stderr)
        cmd, sep, rest = line.partition(b':')
        if not sep:
            # Not a NAME:payload command, nothing to do
            continue
        if cmd == b'EXIT':
            raise SystemExit(int(rest))
        if cmd == b'FETCH':
            try:
                d = json.loads(rest)
                html = scraper.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
                # Writing the result is part of the request: a failure here
                # must be reported to the parent instead of killing the
                # worker and leaving the parent without a reply.
                with PersistentTemporaryFile(suffix='-scraper-result.html') as t:
                    t.write(html.encode('utf-8'))
                result = {'ok': True, 'html_file': t.name}
            except Exception as e:
                import traceback
                result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
            print(json.dumps(result), flush=True)
|
||||
|
||||
from threading import Lock
|
||||
|
||||
# Registry of weak references to the scraper objects created by this module.
# Entries are added when a scraper is created and drained by cleanup_overseers().
overseers = []
|
||||
|
||||
|
||||
class Overseer:
    """Own the worker processes used to fetch URLs with the simple backend.

    A worker is started lazily for every (source, calling thread) pair and is
    driven through line based commands written to its stdin; replies are read
    from its stdout. Every instance registers a weak reference to itself in
    the module level ``overseers`` list.
    """

    def __init__(self):
        self.lock = Lock()  # guards self.workers
        self.workers = {}  # maps '<source>::<thread id>' to the worker process
        overseers.append(weakref.ref(self))

    def safe_wait(self, w, timeout):
        """Return ``w.wait(timeout)``, or None if that call raises anything."""
        try:
            return w.wait(timeout)
        except Exception:
            # Deliberately best effort: callers treat None as "still running"
            return None

    def worker_for_source(self, source):
        """Return the worker for *source* and the current thread, starting it if needed."""
        wname = f'{source}::{get_ident()}'
        with self.lock:
            ans = self.workers.get(wname)
            if ans is None:
                w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
                ans = self.workers[wname] = w
        return ans

    def fetch_url(self, url_or_qurl, source='', timeout=60):
        """Fetch a URL through the worker for *source* and return its HTML.

        :param url_or_qurl: The URL, either as a string or a QUrl.
        :param source: Name selecting which worker to use.
        :param timeout: Timeout in seconds, forwarded to the worker.
        :raises ValueError: If the worker reports a failure or sends no reply.
        """
        from qt.core import QUrl
        w = self.worker_for_source(source)
        if isinstance(url_or_qurl, str):
            url_or_qurl = QUrl(url_or_qurl)
        request = json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout})
        w.stdin.write(b'FETCH:' + request.encode('utf-8') + b'\n')
        w.stdin.flush()
        reply = w.stdout.readline()
        if not reply:
            # An empty read means the pipe hit EOF, so the worker is gone.
            # Say so, rather than failing with an opaque JSON parse error.
            raise ValueError('The scraper worker exited without sending a reply')
        output = json.loads(reply)
        if not output['ok']:
            raise ValueError(output['err'])
        with open(output['html_file'], 'rb') as f:
            html = f.read().decode('utf-8')
        retry_on_fail(os.remove, output['html_file'])
        return html

    def __del__(self):
        """Ask all workers to exit, force any that do not, and forget them."""
        with self.lock:
            for w in self.workers.values():
                try:
                    w.stdin.write(b'EXIT:0\n')
                    w.stdin.flush()
                except (OSError, ValueError):
                    # The pipe is broken or already closed because the worker
                    # died. That must not stop the remaining workers from
                    # being shut down.
                    pass
                for pipe in (w.stdin, w.stdout):
                    try:
                        pipe.close()
                    except (OSError, ValueError):
                        pass
            for w in self.workers.values():
                if self.safe_wait(w, 0.2) is None:
                    w.terminate()
                    if not iswindows:
                        if self.safe_wait(w, 0.1) is None:
                            w.kill()
            self.workers.clear()
    close = __del__
|
||||
|
||||
|
||||
def cleanup_overseers():
    """Release every scraper object still registered in ``overseers``.

    The registry is emptied immediately. Two kinds of object are registered
    in this module and they are released differently, where previously both
    release paths were applied to every object:

    * objects exposing ``shutdown()`` (browsers) are shut down only when the
      returned callable is invoked;
    * all other objects have their ``close()`` started on a background thread
      right away, and the returned callable waits for those threads.

    :return: A no-argument function that completes the cleanup. The objects
        are kept alive by it until it has been called and discarded.
    """
    from threading import Thread

    live = tuple(filter(None, (ref() for ref in overseers)))
    del overseers[:]

    closers, browsers = [], []
    for obj in live:
        if hasattr(obj, 'shutdown'):
            browsers.append(obj)
        else:
            t = Thread(target=obj.close, name='CloseOverSeer')
            t.start()
            closers.append(t)

    def join_all():
        for t in closers:
            t.join()
        for br in browsers:
            br.shutdown()
    return join_all
|
||||
|
||||
|
||||
# Held by read_url() while it lazily creates the scraper kept in the caller's
# storage list, so concurrent callers sharing a list create only one scraper.
read_url_lock = Lock()
|
||||
|
||||
|
||||
def read_url(storage, url, timeout=60, as_html=True):
    """Download *url* using a browser cached in *storage*.

    :param storage: A list owned by the caller. If it is empty a new
        ``WebEngineBrowser`` is created, stored in it and registered in
        ``overseers``; otherwise ``storage[0]`` is reused.
    :param url: The URL to fetch.
    :param timeout: Timeout in seconds passed to the browser.
    :param as_html: When true (the default) return the page decoded to text
        with encoding declarations stripped, otherwise return the raw bytes.
    """
    with read_url_lock:
        from calibre.scraper.qt import WebEngineBrowser
        if not storage:
            storage.append(WebEngineBrowser())
            # Only a weak reference is registered, the caller's list owns the browser
            overseers.append(weakref.ref(storage[-1]))
        scraper = storage[0]
    raw_bytes = scraper.open_novisit(url, timeout=timeout).read()
    if not as_html:
        return raw_bytes
    from calibre.ebooks.chardet import xml_to_unicode
    return xml_to_unicode(raw_bytes, strip_encoding_pats=True)[0]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Fetch the URL given as the last command line argument exactly once and
    # always release the scraper afterwards, even when the fetch fails.
    try:
        print(read_url([], sys.argv[-1]))
    finally:
        cleanup_overseers()()
|
||||
|
@ -1,164 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import json
|
||||
import secrets
|
||||
import sys
|
||||
import time
|
||||
from functools import lru_cache
|
||||
|
||||
from qt.core import QApplication, QEventLoop, QUrl
|
||||
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
|
||||
|
||||
from calibre.utils.webengine import create_script, insert_scripts, setup_profile
|
||||
|
||||
|
||||
def canonicalize_qurl(qurl):
    """Return a copy of *qurl* normalized for comparison with other URLs.

    The trailing slash and the fragment are removed and path segments are
    normalized. If the remaining path is just ``/`` the path is removed too.
    """
    opts = QUrl.UrlFormattingOption
    normalized = qurl.adjusted(opts.StripTrailingSlash | opts.NormalizePathSegments | opts.RemoveFragment)
    if normalized.path() != '/':
        return normalized
    return normalized.adjusted(opts.RemovePath)
|
||||
|
||||
|
||||
def create_base_profile(cache_name='', allow_js=False):
    """Create a locked down QWebEngineProfile parented to the application.

    :param cache_name: Name for the profile; when empty the profile is
        created without a name.
    :param allow_js: Whether pages using the profile may run JavaScript.
    """
    from calibre.utils.random_ua import random_common_chrome_user_agent
    app = QApplication.instance()
    profile = QWebEngineProfile(cache_name, app) if cache_name else QWebEngineProfile(app)
    setup_profile(profile)
    profile.setHttpUserAgent(random_common_chrome_user_agent())
    profile.setHttpCacheMaximumSize(0)  # managed by webengine

    settings = profile.settings()
    attr = QWebEngineSettings.WebAttribute
    settings.setAttribute(attr.PluginsEnabled, False)
    settings.setAttribute(attr.JavascriptEnabled, allow_js)
    settings.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
    always_off = (
        attr.JavascriptCanOpenWindows,
        attr.JavascriptCanAccessClipboard,
        # ensure javascript cannot read from local files
        attr.LocalContentCanAccessFileUrls,
        attr.AllowWindowActivationFromJavaScript,
    )
    for which in always_off:
        settings.setAttribute(which, False)
    return profile
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def create_profile(cache_name='', allow_js=False):
    """Return the scraping profile for the given arguments, creating it once.

    The profile gets a random ``token`` attribute and a script named
    ``scraper.js``. For documents that are not ``chrome-error:`` or ``about:``
    pages the script logs a ``domready`` message with the document URL and
    its serialized markup to the console, prefixed with the token.
    """
    profile = create_base_profile(cache_name, allow_js)
    profile.token = secrets.token_hex()
    script_template = '''
(function() {
    "use strict";

    function send_msg(data) {
        var token = 'TOKEN';
        var msg = token + ' ' + JSON.stringify(data);
        console.log(msg);
    }

    function debug() {
        var args = Array.prototype.slice.call(arguments);
        var text = args.join(' ');
        send_msg({type: 'print', text: text});
    }

    if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
        send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
    }
})();
'''
    # Bake the secret into the script so its messages can be recognised
    script_source = script_template.replace('TOKEN', profile.token)
    insert_scripts(profile, create_script('scraper.js', script_source))
    return profile
|
||||
|
||||
|
||||
class SimpleScraper(QWebEnginePage):
    """Page that loads one URL at a time and returns the markup of its DOM.

    The markup arrives as a token prefixed console message emitted by the
    script installed on the profile from create_profile(). State for the
    fetch in progress lives in ``self.current_fetch``, which exists only
    while fetch() is running.
    """

    def __init__(self, source='', parent=None):
        profile = create_profile(source)
        self.token = profile.token
        self.is_being_tested = source == 'test'
        super().__init__(profile, parent)
        self.setAudioMuted(True)
        for signal, slot in (
            (self.loadStarted, self.load_started),
            (self.loadFinished, self.load_finished),
            (self.loadProgress, self.load_progress),
        ):
            signal.connect(slot)

    def print(self, *a):
        """Write *a* to stderr, as stdout carries the worker protocol."""
        print(*a, file=sys.stderr)

    def load_started(self):
        """Record that loading of the URL being fetched has begun."""
        if self.is_being_tested:
            self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}')
        if not self.is_current_url:
            return
        self.current_fetch['load_started'] = True

    def load_finished(self, ok):
        """Record the load result; a failed load ends the fetch."""
        if self.is_being_tested:
            self.print(f'load_finished: {ok=} {self.is_current_url=}')
        if not self.is_current_url:
            return
        self.current_fetch['load_finished'] = True
        self.current_fetch['load_was_ok'] = ok
        if not ok and self.is_current_url:
            self.current_fetch['working'] = False

    def load_progress(self, progress):
        """Push the deadline back by the full timeout on every progress report."""
        if self.is_being_tested:
            self.print(f'load_progress: {progress=} {self.is_current_url=}')
        if not self.is_current_url:
            return
        self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']

    # JavaScript dialogs are answered automatically

    def javaScriptAlert(self, url, msg):
        pass

    def javaScriptConfirm(self, url, msg):
        return True

    def javaScriptPrompt(self, url, msg, defval):
        return True, defval

    @property
    def is_current_url(self):
        """True if a fetch is running and the page's requested URL is its URL."""
        if hasattr(self, 'current_fetch'):
            return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url']
        return False

    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
        """Handle console messages that start with this page's token."""
        parts = message.split(maxsplit=1)
        if len(parts) != 2 or parts[0] != self.token:
            return
        msg = json.loads(parts[1])
        kind = msg.get('type')
        if kind == 'print':
            print(msg['text'], file=sys.stderr)
            return
        if kind != 'domready':
            return
        if self.is_being_tested:
            self.print(f'domready: {self.is_current_url=}')
        if not self.is_current_url:
            return
        # The markup has been captured, so stop loading the page
        self.triggerAction(QWebEnginePage.WebAction.Stop)
        self.current_fetch['working'] = False
        if not msg.get('failed'):
            self.current_fetch['html'] = msg['html']

    def fetch(self, url_or_qurl, timeout=60):
        """Load *url_or_qurl* and return the markup of the resulting document.

        Application events are processed until the markup arrives, the load
        fails, or the deadline passes.

        :raises TimeoutError: If the deadline passes with the fetch still running.
        :raises ValueError: If the fetch ended without producing any markup.
        """
        fetching_url = QUrl(url_or_qurl)
        self.current_fetch = {
            'timeout': timeout,
            'end_time': time.monotonic() + timeout,
            'fetching_url': canonicalize_qurl(fetching_url),
            'working': True,
            'load_started': False,
        }
        self.load(fetching_url)
        try:
            app = QApplication.instance()
            flags = QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents
            while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
                app.processEvents(flags)
            html = self.current_fetch.get('html')
            if html is not None:
                return html
            eurl = fetching_url.toString()
            if self.current_fetch['working']:
                raise TimeoutError(f'Timed out loading HTML from: {eurl}')
            raise ValueError(f'Failed to load HTML from: {eurl}')
        finally:
            del self.current_fetch
|
@ -4,16 +4,10 @@
|
||||
import http.server
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import unittest
|
||||
from threading import Event, Thread
|
||||
|
||||
from lxml.html import fromstring, tostring
|
||||
|
||||
from calibre.utils.resources import get_path as P
|
||||
|
||||
from .qt import Browser, WebEngineBrowser
|
||||
from .simple import Overseer
|
||||
|
||||
skip = ''
|
||||
is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
|
||||
@ -23,30 +17,6 @@ elif 'SKIP_QT_BUILD_TEST' in os.environ:
|
||||
skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
|
||||
|
||||
|
||||
@unittest.skipIf(skip, skip)
class TestSimpleWebEngineScraper(unittest.TestCase):
    """Tests for fetching pages through an Overseer and its worker process."""

    def test_dom_load(self):
        """Local templates round trip through the scraper and a missing file fails."""
        from qt.core import QUrl
        overseer = Overseer()

        def normalized(markup):
            # Compare markup after an lxml round trip with whitespace collapsed
            pretty = tostring(fromstring(markup.encode('utf-8')), pretty_print=True, encoding='unicode')
            return re.sub(r'\s+', ' ', pretty)

        for name in ('book', 'nav'):
            path = P(f'templates/new_{name}.html', allow_user_override=False)
            html = overseer.fetch_url(QUrl.fromLocalFile(path), 'test')
            with open(path, 'rb') as src:
                raw = src.read().decode('utf-8')
            self.assertEqual(normalized(html), normalized(raw))

        self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
        workers = overseer.workers
        self.assertEqual(len(workers), 1)
        # Dropping the last reference must shut down and forget the worker
        del overseer
        self.assertFalse(workers)
|
||||
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
|
||||
def __init__(self, test_obj, *a):
|
||||
@ -192,6 +162,4 @@ class TestFetchBackend(unittest.TestCase):
|
||||
|
||||
|
||||
def find_tests():
    """Return a suite with the tests from both scraper test cases."""
    load = unittest.defaultTestLoader.loadTestsFromTestCase
    ans = load(TestSimpleWebEngineScraper)
    ans.addTests(iter(load(TestFetchBackend)))
    return ans
|
||||
|
@ -13,13 +13,35 @@ from http import HTTPStatus
|
||||
from time import monotonic
|
||||
|
||||
from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
|
||||
from qt.webengine import QWebEnginePage, QWebEngineScript
|
||||
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineScript, QWebEngineSettings
|
||||
|
||||
from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
|
||||
from calibre.scraper.qt_backend import worker as qt_worker
|
||||
from calibre.scraper.simple_backend import create_base_profile
|
||||
from calibre.utils.resources import get_path as P
|
||||
from calibre.utils.webengine import create_script, insert_scripts
|
||||
from calibre.utils.webengine import create_script, insert_scripts, setup_profile
|
||||
|
||||
|
||||
def create_base_profile(cache_name='', allow_js=False):
    """Create a locked down QWebEngineProfile parented to the application.

    :param cache_name: Name for the profile; when empty the profile is
        created without a name.
    :param allow_js: Whether pages using the profile may run JavaScript.
    """
    from calibre.utils.random_ua import random_common_chrome_user_agent
    app = QApplication.instance()
    profile = QWebEngineProfile(cache_name, app) if cache_name else QWebEngineProfile(app)
    setup_profile(profile)
    profile.setHttpUserAgent(random_common_chrome_user_agent())
    profile.setHttpCacheMaximumSize(0)  # managed by webengine

    settings = profile.settings()
    attr = QWebEngineSettings.WebAttribute
    settings.setAttribute(attr.PluginsEnabled, False)
    settings.setAttribute(attr.JavascriptEnabled, allow_js)
    settings.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
    always_off = (
        attr.JavascriptCanOpenWindows,
        attr.JavascriptCanAccessClipboard,
        # ensure javascript cannot read from local files
        attr.LocalContentCanAccessFileUrls,
        attr.AllowWindowActivationFromJavaScript,
    )
    for which in always_off:
        settings.setAttribute(which, False)
    return profile
|
||||
|
||||
|
||||
|
||||
class DownloadRequest(QObject):
|
||||
|
Loading…
x
Reference in New Issue
Block a user