Get rid of the simple backend in favor of WebEngineBrowser

Kovid Goyal 2024-08-16 20:22:20 +05:30
parent 5b00e588b2
commit f82da06184
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 43 additions and 314 deletions

View File: src/calibre/scraper/simple.py

@@ -2,138 +2,41 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
-import json
-import os
 import sys
 import weakref
-from threading import Lock, Thread, get_ident
-from calibre.constants import iswindows
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.filenames import retry_on_fail
-from calibre.utils.ipc.simple_worker import start_pipe_worker
-def worker_main(source):
-    from qt.core import QUrl
-    from calibre.gui2 import must_use_qt
-    from calibre.gui_launch import setup_qt_logging
-    setup_qt_logging()
-    from .simple_backend import SimpleScraper
-    must_use_qt()
-    s = SimpleScraper(source)
-    for line in sys.stdin.buffer:
-        line = line.strip()
-        if source == 'test':
-            print(line.decode('utf-8'), file=sys.stderr)
-        try:
-            cmd, rest = line.split(b':', 1)
-        except Exception:
-            continue
-        if cmd == b'EXIT':
-            raise SystemExit(int(rest))
-        if cmd == b'FETCH':
-            try:
-                d = json.loads(rest)
-                html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
-            except Exception as e:
-                import traceback
-                result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
-            else:
-                with PersistentTemporaryFile(suffix='-scraper-result.html') as t:
-                    t.write(html.encode('utf-8'))
-                result = {'ok': True, 'html_file': t.name}
-            print(json.dumps(result), flush=True)
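# Recap of the removed worker's line protocol (inferred from the handler above):
# every stdin line is b'CMD:payload'. b'EXIT:<status>' ends the worker with that
# exit status; b'FETCH:<json>' expects {'url': ..., 'timeout': ...} and answers
# with one JSON line on stdout, either {'ok': True, 'html_file': ...} or
# {'ok': False, 'err': ..., 'tb': ...}.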
+from threading import Lock
 overseers = []
-class Overseer:
-    def __init__(self):
-        self.lock = Lock()
-        self.workers = {}
-        overseers.append(weakref.ref(self))
-    def safe_wait(self, w, timeout):
-        try:
-            return w.wait(timeout)
-        except Exception:
-            pass
-    def worker_for_source(self, source):
-        wname = f'{source}::{get_ident()}'
-        with self.lock:
-            ans = self.workers.get(wname)
-            if ans is None:
-                w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
-                ans = self.workers[wname] = w
-        return ans
-    def fetch_url(self, url_or_qurl, source='', timeout=60):
-        from qt.core import QUrl
-        w = self.worker_for_source(source)
-        if isinstance(url_or_qurl, str):
-            url_or_qurl = QUrl(url_or_qurl)
-        w.stdin.write(b'FETCH:')
-        w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
-        w.stdin.write(b'\n')
-        w.stdin.flush()
-        output = json.loads(w.stdout.readline())
-        if not output['ok']:
-            raise ValueError(output['err'])
-        with open(output['html_file'], 'rb') as f:
-            html = f.read().decode('utf-8')
-        retry_on_fail(os.remove, output['html_file'])
-        return html
-    def __del__(self):
-        with self.lock:
-            for w in self.workers.values():
-                w.stdin.write(b'EXIT:0\n')
-                w.stdin.flush()
-                w.stdin.close()
-                w.stdout.close()
-            for w in self.workers.values():
-                if self.safe_wait(w, 0.2) is None:
-                    w.terminate()
-                    if not iswindows:
-                        if self.safe_wait(w, 0.1) is None:
-                            w.kill()
-            self.workers.clear()
-    close = __del__
 def cleanup_overseers():
-    threads = []
-    for x in overseers:
-        o = x()
-        if o is not None:
-            t = Thread(target=o.close, name='CloseOverSeer')
-            t.start()
-            threads.append(t)
+    browsers = tuple(filter(None, (x() for x in overseers)))
     del overseers[:]
     def join_all():
-        for t in threads:
-            t.join()
+        for br in browsers:
+            br.shutdown()
     return join_all
 read_url_lock = Lock()
-def read_url(storage, url, timeout=60):
+def read_url(storage, url, timeout=60, as_html=True):
     with read_url_lock:
+        from calibre.scraper.qt import WebEngineBrowser
         if not storage:
-            storage.append(Overseer())
+            storage.append(WebEngineBrowser())
+            overseers.append(weakref.ref(storage[-1]))
         scraper = storage[0]
-    from calibre.ebooks.chardet import strip_encoding_declarations
-    return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
+    raw_bytes = scraper.open_novisit(url, timeout=timeout).read()
+    if not as_html:
+        return raw_bytes
+    from calibre.ebooks.chardet import xml_to_unicode
+    return xml_to_unicode(raw_bytes, strip_encoding_pats=True)[0]
 if __name__ == '__main__':
-    print(read_url([], sys.argv[-1]))
+    try:
+        print(read_url([], sys.argv[-1]))
+    finally:
+        cleanup_overseers()()
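A minimal caller-side sketch of the new API (the URL is illustrative; the storage list keeps the lazily created browser alive between calls):

from calibre.scraper.simple import cleanup_overseers, read_url

storage = []  # read_url() caches one WebEngineBrowser instance here
try:
    html = read_url(storage, 'https://example.com')                # decoded text
    raw = read_url(storage, 'https://example.com', as_html=False)  # raw bytes
finally:
    cleanup_overseers()()  # returns a join_all callable; calling it shuts the browsers down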

View File: src/calibre/scraper/simple_backend.py (deleted)

@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
-import json
-import secrets
-import sys
-import time
-from functools import lru_cache
-from qt.core import QApplication, QEventLoop, QUrl
-from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
-from calibre.utils.webengine import create_script, insert_scripts, setup_profile
-def canonicalize_qurl(qurl):
-    qurl = qurl.adjusted(
-        QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments | QUrl.UrlFormattingOption.RemoveFragment
-    )
-    if qurl.path() == '/':
-        qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath)
-    return qurl
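# Worked example of the canonicalization above (illustrative URLs):
#   canonicalize_qurl(QUrl('https://host/a/../b/#frag')) -> QUrl('https://host/b')
#   canonicalize_qurl(QUrl('https://host/'))             -> QUrl('https://host')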
-def create_base_profile(cache_name='', allow_js=False):
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    if cache_name:
-        ans = QWebEngineProfile(cache_name, QApplication.instance())
-    else:
-        ans = QWebEngineProfile(QApplication.instance())
-    setup_profile(ans)
-    ans.setHttpUserAgent(random_common_chrome_user_agent())
-    ans.setHttpCacheMaximumSize(0)  # managed by webengine
-    s = ans.settings()
-    a = s.setAttribute
-    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
-    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
-    # ensure javascript cannot read from local files
-    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
-    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
-    return ans
-@lru_cache(maxsize=None)
-def create_profile(cache_name='', allow_js=False):
-    ans = create_base_profile(cache_name, allow_js)
-    ans.token = secrets.token_hex()
-    js = '''
-(function() {
-    "use strict";
-    function send_msg(data) {
-        var token = 'TOKEN';
-        var msg = token + ' ' + JSON.stringify(data);
-        console.log(msg);
-    }
-    function debug() {
-        var args = Array.prototype.slice.call(arguments);
-        var text = args.join(' ');
-        send_msg({type: 'print', text: text});
-    }
-    if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
-        send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
-    }
-})();
-'''
-    js = js.replace('TOKEN', ans.token)
-    insert_scripts(ans, create_script('scraper.js', js))
-    return ans
-class SimpleScraper(QWebEnginePage):
-    def __init__(self, source='', parent=None):
-        profile = create_profile(source)
-        self.token = profile.token
-        self.is_being_tested = source == 'test'
-        super().__init__(profile, parent)
-        self.setAudioMuted(True)
-        self.loadStarted.connect(self.load_started)
-        self.loadFinished.connect(self.load_finished)
-        self.loadProgress.connect(self.load_progress)
-    def print(self, *a):
-        print(*a, file=sys.stderr)
-    def load_started(self):
-        if self.is_being_tested:
-            self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}')
-        if self.is_current_url:
-            self.current_fetch['load_started'] = True
-    def load_finished(self, ok):
-        if self.is_being_tested:
-            self.print(f'load_finished: {ok=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['load_finished'] = True
-            self.current_fetch['load_was_ok'] = ok
-        if not ok and self.is_current_url:
-            self.current_fetch['working'] = False
-    def load_progress(self, progress):
-        if self.is_being_tested:
-            self.print(f'load_progress: {progress=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']
-    def javaScriptAlert(self, url, msg):
-        pass
-    def javaScriptConfirm(self, url, msg):
-        return True
-    def javaScriptPrompt(self, url, msg, defval):
-        return True, defval
-    @property
-    def is_current_url(self):
-        if not hasattr(self, 'current_fetch'):
-            return False
-        return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url']
-    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
-        parts = message.split(maxsplit=1)
-        if len(parts) == 2 and parts[0] == self.token:
-            msg = json.loads(parts[1])
-            t = msg.get('type')
-            if t == 'print':
-                print(msg['text'], file=sys.stderr)
-            elif t == 'domready':
-                if self.is_being_tested:
-                    self.print(f'domready: {self.is_current_url=}')
-                if self.is_current_url:
-                    self.triggerAction(QWebEnginePage.WebAction.Stop)
-                    self.current_fetch['working'] = False
-                    if not msg.get('failed'):
-                        self.current_fetch['html'] = msg['html']
-    def fetch(self, url_or_qurl, timeout=60):
-        fetching_url = QUrl(url_or_qurl)
-        self.current_fetch = {
-            'timeout': timeout, 'end_time': time.monotonic() + timeout,
-            'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
-            'load_started': False
-        }
-        self.load(fetching_url)
-        try:
-            app = QApplication.instance()
-            while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
-                app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
-            ans = self.current_fetch.get('html')
-            if ans is None:
-                eurl = fetching_url.toString()
-                if self.current_fetch['working']:
-                    raise TimeoutError(f'Timed out loading HTML from: {eurl}')
-                raise ValueError(f'Failed to load HTML from: {eurl}')
-            return ans
-        finally:
-            del self.current_fetch
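For callers migrating off this deleted class, the equivalent call under the new backend (a sketch; only the methods visible in this diff are assumed):

# old: SimpleScraper(source).fetch(url, timeout=60)             -> unicode HTML
# new: WebEngineBrowser().open_novisit(url, timeout=60).read()  -> raw bytes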

View File: src/calibre/scraper/test_fetch_backend.py

@@ -4,16 +4,10 @@
 import http.server
 import json
 import os
-import re
 import unittest
 from threading import Event, Thread
-from lxml.html import fromstring, tostring
-from calibre.utils.resources import get_path as P
 from .qt import Browser, WebEngineBrowser
-from .simple import Overseer
 skip = ''
 is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
@@ -23,30 +17,6 @@ elif 'SKIP_QT_BUILD_TEST' in os.environ:
     skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
-@unittest.skipIf(skip, skip)
-class TestSimpleWebEngineScraper(unittest.TestCase):
-    def test_dom_load(self):
-        from qt.core import QUrl
-        overseer = Overseer()
-        for f in ('book', 'nav'):
-            path = P(f'templates/new_{f}.html', allow_user_override=False)
-            url = QUrl.fromLocalFile(path)
-            html = overseer.fetch_url(url, 'test')
-            def c(a):
-                ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
-                return re.sub(r'\s+', ' ', ans)
-            with open(path, 'rb') as f:
-                raw = f.read().decode('utf-8')
-            self.assertEqual(c(html), c(raw))
-        self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
-        w = overseer.workers
-        self.assertEqual(len(w), 1)
-        del overseer
-        self.assertFalse(w)
 class Handler(http.server.BaseHTTPRequestHandler):
     def __init__(self, test_obj, *a):
@@ -192,6 +162,4 @@ class TestFetchBackend(unittest.TestCase):
 def find_tests():
-    ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestSimpleWebEngineScraper)
-    ans.addTests(iter(unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)))
-    return ans
+    return unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)

View File: src/calibre/scraper/qt.py

@@ -13,13 +13,35 @@ from http import HTTPStatus
 from time import monotonic
 from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
-from qt.webengine import QWebEnginePage, QWebEngineScript
+from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineScript, QWebEngineSettings
 from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
 from calibre.scraper.qt_backend import worker as qt_worker
-from calibre.scraper.simple_backend import create_base_profile
 from calibre.utils.resources import get_path as P
-from calibre.utils.webengine import create_script, insert_scripts
+from calibre.utils.webengine import create_script, insert_scripts, setup_profile
+def create_base_profile(cache_name='', allow_js=False):
+    from calibre.utils.random_ua import random_common_chrome_user_agent
+    if cache_name:
+        ans = QWebEngineProfile(cache_name, QApplication.instance())
+    else:
+        ans = QWebEngineProfile(QApplication.instance())
+    setup_profile(ans)
+    ans.setHttpUserAgent(random_common_chrome_user_agent())
+    ans.setHttpCacheMaximumSize(0)  # managed by webengine
+    s = ans.settings()
+    a = s.setAttribute
+    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
+    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
+    # ensure javascript cannot read from local files
+    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
+    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
+    return ans
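A sketch of how the relocated helper might be used (the cache name is illustrative; constructing a QWebEnginePage from a profile is standard Qt WebEngine):

profile = create_base_profile(cache_name='scraper', allow_js=True)
page = QWebEnginePage(profile, None)  # pages on this profile inherit the hardened settings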
 class DownloadRequest(QObject):