diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py
index 35025b0011..1b936a4a1c 100644
--- a/src/calibre/scraper/simple.py
+++ b/src/calibre/scraper/simple.py
@@ -2,138 +2,41 @@
 # License: GPL v3 Copyright: 2022, Kovid Goyal
 
-import json
-import os
 import sys
 import weakref
-from threading import Lock, Thread, get_ident
-
-from calibre.constants import iswindows
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.filenames import retry_on_fail
-from calibre.utils.ipc.simple_worker import start_pipe_worker
-
-
-def worker_main(source):
-    from qt.core import QUrl
-
-    from calibre.gui2 import must_use_qt
-    from calibre.gui_launch import setup_qt_logging
-    setup_qt_logging()
-
-    from .simple_backend import SimpleScraper
-    must_use_qt()
-    s = SimpleScraper(source)
-    for line in sys.stdin.buffer:
-        line = line.strip()
-        if source == 'test':
-            print(line.decode('utf-8'), file=sys.stderr)
-        try:
-            cmd, rest = line.split(b':', 1)
-        except Exception:
-            continue
-        if cmd == b'EXIT':
-            raise SystemExit(int(rest))
-        if cmd == b'FETCH':
-            try:
-                d = json.loads(rest)
-                html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
-            except Exception as e:
-                import traceback
-                result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
-            else:
-                with PersistentTemporaryFile(suffix='-scraper-result.html') as t:
-                    t.write(html.encode('utf-8'))
-                result = {'ok': True, 'html_file': t.name}
-            print(json.dumps(result), flush=True)
-
+from threading import Lock
 
 overseers = []
 
-
-class Overseer:
-
-    def __init__(self):
-        self.lock = Lock()
-        self.workers = {}
-        overseers.append(weakref.ref(self))
-
-    def safe_wait(self, w, timeout):
-        try:
-            return w.wait(timeout)
-        except Exception:
-            pass
-
-    def worker_for_source(self, source):
-        wname = f'{source}::{get_ident()}'
-        with self.lock:
-            ans = self.workers.get(wname)
-            if ans is None:
-                w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
-                ans = self.workers[wname] = w
-        return ans
-
-    def fetch_url(self, url_or_qurl, source='', timeout=60):
-        from qt.core import QUrl
-        w = self.worker_for_source(source)
-        if isinstance(url_or_qurl, str):
-            url_or_qurl = QUrl(url_or_qurl)
-        w.stdin.write(b'FETCH:')
-        w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
-        w.stdin.write(b'\n')
-        w.stdin.flush()
-        output = json.loads(w.stdout.readline())
-        if not output['ok']:
-            raise ValueError(output['err'])
-        with open(output['html_file'], 'rb') as f:
-            html = f.read().decode('utf-8')
-        retry_on_fail(os.remove, output['html_file'])
-        return html
-
-    def __del__(self):
-        with self.lock:
-            for w in self.workers.values():
-                w.stdin.write(b'EXIT:0\n')
-                w.stdin.flush()
-                w.stdin.close()
-                w.stdout.close()
-            for w in self.workers.values():
-                if self.safe_wait(w, 0.2) is None:
-                    w.terminate()
-                    if not iswindows:
-                        if self.safe_wait(w, 0.1) is None:
-                            w.kill()
-            self.workers.clear()
-    close = __del__
-
 
 def cleanup_overseers():
-    threads = []
-    for x in overseers:
-        o = x()
-        if o is not None:
-            t = Thread(target=o.close, name='CloseOverSeer')
-            t.start()
-            threads.append(t)
+    browsers = tuple(filter(None, (x() for x in overseers)))
     del overseers[:]
 
     def join_all():
-        for t in threads:
-            t.join()
+        for br in browsers:
+            br.shutdown()
     return join_all
 
 
 read_url_lock = Lock()
 
 
-def read_url(storage, url, timeout=60):
+def read_url(storage, url, timeout=60, as_html=True):
     with read_url_lock:
+        from calibre.scraper.qt import WebEngineBrowser
         if not storage:
-            storage.append(Overseer())
+            storage.append(WebEngineBrowser())
+            overseers.append(weakref.ref(storage[-1]))
         scraper = storage[0]
-        from calibre.ebooks.chardet import strip_encoding_declarations
-        return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
+        raw_bytes = scraper.open_novisit(url, timeout=timeout).read()
+        if not as_html:
+            return raw_bytes
+        from calibre.ebooks.chardet import xml_to_unicode
+        return xml_to_unicode(raw_bytes, strip_encoding_pats=True)[0]
 
 
 if __name__ == '__main__':
-    print(read_url([], sys.argv[-1]))
+    try:
+        print(read_url([], sys.argv[-1]))
+    finally:
+        cleanup_overseers()()
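
Note: a minimal usage sketch of the reworked read_url() API above; the
storage-list argument and the cleanup_overseers()() idiom come straight
from this diff, while the example URL and surrounding scaffolding are
illustrative:

    from calibre.scraper.simple import cleanup_overseers, read_url

    storage = []  # reused across calls so the WebEngineBrowser is created once
    try:
        # decoded unicode text, with encoding declarations stripped
        html = read_url(storage, 'https://example.com')
        # undecoded bytes, via the new as_html flag
        raw = read_url(storage, 'https://example.com', as_html=False)
    finally:
        # cleanup_overseers() returns a join_all callable; calling the
        # result shuts down every browser registered in overseers
        cleanup_overseers()()
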
diff --git a/src/calibre/scraper/simple_backend.py b/src/calibre/scraper/simple_backend.py
deleted file mode 100644
index 623d18bc02..0000000000
--- a/src/calibre/scraper/simple_backend.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# License: GPL v3 Copyright: 2022, Kovid Goyal
-
-import json
-import secrets
-import sys
-import time
-from functools import lru_cache
-
-from qt.core import QApplication, QEventLoop, QUrl
-from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
-
-from calibre.utils.webengine import create_script, insert_scripts, setup_profile
-
-
-def canonicalize_qurl(qurl):
-    qurl = qurl.adjusted(
-        QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments | QUrl.UrlFormattingOption.RemoveFragment
-    )
-    if qurl.path() == '/':
-        qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath)
-    return qurl
-
-
-def create_base_profile(cache_name='', allow_js=False):
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    if cache_name:
-        ans = QWebEngineProfile(cache_name, QApplication.instance())
-    else:
-        ans = QWebEngineProfile(QApplication.instance())
-    setup_profile(ans)
-    ans.setHttpUserAgent(random_common_chrome_user_agent())
-    ans.setHttpCacheMaximumSize(0)  # managed by webengine
-    s = ans.settings()
-    a = s.setAttribute
-    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
-    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
-    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
-    # ensure javascript cannot read from local files
-    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
-    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
-    return ans
-
-
-@lru_cache(maxsize=None)
-def create_profile(cache_name='', allow_js=False):
-    ans = create_base_profile(cache_name, allow_js)
-    ans.token = secrets.token_hex()
-    js = '''
-(function() {
-    "use strict";
-
-    function send_msg(data) {
-        var token = 'TOKEN';
-        var msg = token + ' ' + JSON.stringify(data);
-        console.log(msg);
-    }
-
-    function debug() {
-        var args = Array.prototype.slice.call(arguments);
-        var text = args.join(' ');
-        send_msg({type: 'print', text: text});
-    }
-
-    if (document.location && document.location.href && !document.location.href.startsWith('chrome-error:') && !document.location.href.startsWith('about:')) {
-        send_msg({type: 'domready', url: document.location.href, html: new XMLSerializer().serializeToString(document)});
-    }
-})();
-'''
-    js = js.replace('TOKEN', ans.token)
-    insert_scripts(ans, create_script('scraper.js', js))
-    return ans
-
-
-class SimpleScraper(QWebEnginePage):
-
-    def __init__(self, source='', parent=None):
-        profile = create_profile(source)
-        self.token = profile.token
-        self.is_being_tested = source == 'test'
-        super().__init__(profile, parent)
-        self.setAudioMuted(True)
-        self.loadStarted.connect(self.load_started)
-        self.loadFinished.connect(self.load_finished)
-        self.loadProgress.connect(self.load_progress)
-
-    def print(self, *a):
-        print(*a, file=sys.stderr)
-
-    def load_started(self):
-        if self.is_being_tested:
-            self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}')
-        if self.is_current_url:
-            self.current_fetch['load_started'] = True
-
-    def load_finished(self, ok):
-        if self.is_being_tested:
-            self.print(f'load_finished: {ok=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['load_finished'] = True
-            self.current_fetch['load_was_ok'] = ok
-        if not ok and self.is_current_url:
-            self.current_fetch['working'] = False
-
-    def load_progress(self, progress):
-        if self.is_being_tested:
-            self.print(f'load_progress: {progress=} {self.is_current_url=}')
-        if self.is_current_url:
-            self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']
-
-    def javaScriptAlert(self, url, msg):
-        pass
-
-    def javaScriptConfirm(self, url, msg):
-        return True
-
-    def javaScriptPrompt(self, url, msg, defval):
-        return True, defval
-
-    @property
-    def is_current_url(self):
-        if not hasattr(self, 'current_fetch'):
-            return False
-        return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url']
-
-    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
-        parts = message.split(maxsplit=1)
-        if len(parts) == 2 and parts[0] == self.token:
-            msg = json.loads(parts[1])
-            t = msg.get('type')
-            if t == 'print':
-                print(msg['text'], file=sys.stderr)
-            elif t == 'domready':
-                if self.is_being_tested:
-                    self.print(f'domready: {self.is_current_url=}')
-                if self.is_current_url:
-                    self.triggerAction(QWebEnginePage.WebAction.Stop)
-                    self.current_fetch['working'] = False
-                    if not msg.get('failed'):
-                        self.current_fetch['html'] = msg['html']
-
-    def fetch(self, url_or_qurl, timeout=60):
-        fetching_url = QUrl(url_or_qurl)
-        self.current_fetch = {
-            'timeout': timeout, 'end_time': time.monotonic() + timeout,
-            'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
-            'load_started': False
-        }
-        self.load(fetching_url)
-        try:
-            app = QApplication.instance()
-            while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
-                app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
-            ans = self.current_fetch.get('html')
-            if ans is None:
-                eurl = fetching_url.toString()
-                if self.current_fetch['working']:
-                    raise TimeoutError(f'Timed out loading HTML from: {eurl}')
-                raise ValueError(f'Failed to load HTML from: {eurl}')
-            return ans
-        finally:
-            del self.current_fetch
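
Note: the deleted SimpleScraper.fetch() above blocked its caller by
pumping the Qt event loop until the page signalled completion or the
deadline passed. A stripped-down sketch of that pattern (pump_until and
done are hypothetical names, not calibre APIs):

    from time import monotonic

    from qt.core import QApplication, QEventLoop

    def pump_until(done, timeout=60):
        # Process pending Qt events (excluding user input) so page-load
        # signals keep firing while the caller waits synchronously.
        app = QApplication.instance()
        deadline = monotonic() + timeout
        while not done() and monotonic() < deadline:
            app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
        return done()  # False here means the wait timed out
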
diff --git a/src/calibre/scraper/test_fetch_backend.py b/src/calibre/scraper/test_fetch_backend.py
index c960c7a4b4..286d845a85 100644
--- a/src/calibre/scraper/test_fetch_backend.py
+++ b/src/calibre/scraper/test_fetch_backend.py
@@ -4,16 +4,10 @@
 import http.server
 import json
 import os
-import re
 import unittest
 from threading import Event, Thread
 
-from lxml.html import fromstring, tostring
-
-from calibre.utils.resources import get_path as P
-
 from .qt import Browser, WebEngineBrowser
-from .simple import Overseer
 
 skip = ''
 is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '')
@@ -23,30 +17,6 @@ elif 'SKIP_QT_BUILD_TEST' in os.environ:
     skip = 'Skipping Scraper tests as it causes crashes in macOS VM'
 
 
-@unittest.skipIf(skip, skip)
-class TestSimpleWebEngineScraper(unittest.TestCase):
-
-    def test_dom_load(self):
-        from qt.core import QUrl
-        overseer = Overseer()
-        for f in ('book', 'nav'):
-            path = P(f'templates/new_{f}.html', allow_user_override=False)
-            url = QUrl.fromLocalFile(path)
-            html = overseer.fetch_url(url, 'test')
-
-            def c(a):
-                ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
-                return re.sub(r'\s+', ' ', ans)
-            with open(path, 'rb') as f:
-                raw = f.read().decode('utf-8')
-            self.assertEqual(c(html), c(raw))
-        self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
-        w = overseer.workers
-        self.assertEqual(len(w), 1)
-        del overseer
-        self.assertFalse(w)
-
-
 class Handler(http.server.BaseHTTPRequestHandler):
 
     def __init__(self, test_obj, *a):
@@ -192,6 +162,4 @@ class TestFetchBackend(unittest.TestCase):
 
 
 def find_tests():
-    ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestSimpleWebEngineScraper)
-    ans.addTests(iter(unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)))
-    return ans
+    return unittest.defaultTestLoader.loadTestsFromTestCase(TestFetchBackend)
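
Note: with TestSimpleWebEngineScraper gone, find_tests() returns only
the TestFetchBackend cases; one way to run them directly (a hypothetical
invocation, not part of this diff):

    import unittest

    from calibre.scraper.test_fetch_backend import find_tests

    unittest.TextTestRunner(verbosity=2).run(find_tests())
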
diff --git a/src/calibre/scraper/webengine_backend.py b/src/calibre/scraper/webengine_backend.py
index fc594dcae9..9fbf11d9d6 100644
--- a/src/calibre/scraper/webengine_backend.py
+++ b/src/calibre/scraper/webengine_backend.py
@@ -13,13 +13,35 @@ from http import HTTPStatus
 from time import monotonic
 
 from qt.core import QApplication, QByteArray, QNetworkCookie, QObject, Qt, QTimer, QUrl, pyqtSignal, sip
-from qt.webengine import QWebEnginePage, QWebEngineScript
+from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineScript, QWebEngineSettings
 
 from calibre.scraper.qt_backend import Request, too_slow_or_timed_out
 from calibre.scraper.qt_backend import worker as qt_worker
-from calibre.scraper.simple_backend import create_base_profile
 from calibre.utils.resources import get_path as P
-from calibre.utils.webengine import create_script, insert_scripts
+from calibre.utils.webengine import create_script, insert_scripts, setup_profile
+
+
+def create_base_profile(cache_name='', allow_js=False):
+    from calibre.utils.random_ua import random_common_chrome_user_agent
+    if cache_name:
+        ans = QWebEngineProfile(cache_name, QApplication.instance())
+    else:
+        ans = QWebEngineProfile(QApplication.instance())
+    setup_profile(ans)
+    ans.setHttpUserAgent(random_common_chrome_user_agent())
+    ans.setHttpCacheMaximumSize(0)  # managed by webengine
+    s = ans.settings()
+    a = s.setAttribute
+    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
+    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
+    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
+    # ensure javascript cannot read from local files
+    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
+    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
+    return ans
+
 
 class DownloadRequest(QObject):
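
Note: a sketch of how the relocated create_base_profile() might be used
to back a page. It assumes a QApplication already exists, since the
profile is parented to it; the variable names are illustrative:

    from qt.webengine import QWebEnginePage

    from calibre.scraper.webengine_backend import create_base_profile

    # JavaScript stays disabled unless explicitly requested
    profile = create_base_profile(cache_name='scraper', allow_js=False)
    page = QWebEnginePage(profile, None)  # the profile must outlive the page
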