diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py index f8325fbd1e..066d43f8aa 100644 --- a/src/calibre/scraper/simple.py +++ b/src/calibre/scraper/simple.py @@ -5,139 +5,15 @@ import json import os -import secrets import sys -import time -from functools import lru_cache -from qt.core import QApplication, QEventLoop, QLoggingCategory, QUrl -from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings -from threading import Lock +import weakref +from qt.core import QLoggingCategory, QUrl +from threading import Lock, Thread -from calibre.constants import cache_dir, iswindows -from calibre.gui2.webengine import create_script, insert_scripts +from calibre.constants import iswindows from calibre.ptempfile import PersistentTemporaryFile -from calibre.utils.ipc.simple_worker import start_pipe_worker from calibre.utils.filenames import retry_on_fail - - -def canonicalize_qurl(qurl): - qurl = qurl.adjusted(QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments) - if qurl.path() == '/': - qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath) - return qurl - - -@lru_cache(maxsize=None) -def create_profile(cache_name='simple', allow_js=False): - from calibre.utils.random_ua import random_common_chrome_user_agent - ans = QWebEngineProfile(cache_name, QApplication.instance()) - ans.setHttpUserAgent(random_common_chrome_user_agent()) - ans.setHttpCacheMaximumSize(0) # managed by webengine - ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name)) - s = ans.settings() - a = s.setAttribute - a(QWebEngineSettings.WebAttribute.PluginsEnabled, False) - a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js) - s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes) - a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False) - a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False) - # ensure javascript cannot read from local files - a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False) - a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False) - js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8') - ans.token = secrets.token_hex() - js = js.replace('TOKEN', ans.token) - insert_scripts(ans, create_script('scraper.js', js)) - return ans - - -class SimpleScraper(QWebEnginePage): - - def __init__(self, source, parent=None): - profile = create_profile(source) - self.token = profile.token - self.is_being_tested = source == 'test' - super().__init__(profile, parent) - self.setAudioMuted(True) - self.loadStarted.connect(self.load_started) - self.loadFinished.connect(self.load_finished) - self.loadProgress.connect(self.load_progress) - - def print(self, *a): - print(*a, file=sys.stderr) - - def load_started(self): - if self.is_being_tested: - self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}') - if self.is_current_url: - self.current_fetch['load_started'] = True - - def load_finished(self, ok): - if self.is_being_tested: - self.print(f'load_finished: {ok=} {self.is_current_url=}') - if self.is_current_url: - self.current_fetch['load_finished'] = True - self.current_fetch['load_was_ok'] = ok - if not ok and self.is_current_url: - self.current_fetch['working'] = False - - def load_progress(self, progress): - if self.is_being_tested: - self.print(f'load_progress: {progress=} {self.is_current_url=}') - if self.is_current_url: - self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout'] - - def javaScriptAlert(self, url, msg): - pass - - def javaScriptConfirm(self, url, msg): - return True - - def javaScriptPrompt(self, url, msg, defval): - return True, defval - - @property - def is_current_url(self): - if not hasattr(self, 'current_fetch'): - return False - return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url'] - - def javaScriptConsoleMessage(self, level, message, line_num, source_id): - parts = message.split(maxsplit=1) - if len(parts) == 2 and parts[0] == self.token: - msg = json.loads(parts[1]) - t = msg.get('type') - if t == 'print': - print(msg['text'], file=sys.stderr) - elif t == 'domready': - if self.is_being_tested: - self.print(f'domready: {self.is_current_url=}') - if self.is_current_url: - self.current_fetch['working'] = False - if not msg.get('failed'): - self.current_fetch['html'] = msg['html'] - - def fetch(self, url_or_qurl, timeout=60): - fetching_url = QUrl(url_or_qurl) - self.current_fetch = { - 'timeout': timeout, 'end_time': time.monotonic() + timeout, - 'fetching_url': canonicalize_qurl(fetching_url), 'working': True, - 'load_started': False - } - self.load(fetching_url) - try: - app = QApplication.instance() - while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']: - app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents) - ans = self.current_fetch.get('html') - if ans is None: - eurl = fetching_url.toString() - if self.current_fetch['working']: - raise TimeoutError(f'Timed out loading HTML from: {eurl}') - raise ValueError(f'Failed to load HTML from: {eurl}') - return ans - finally: - del self.current_fetch +from calibre.utils.ipc.simple_worker import start_pipe_worker def worker_main(source): @@ -145,6 +21,8 @@ def worker_main(source): qt.webenginecontext.info=false ''') from calibre.gui2 import must_use_qt + + from .simple_backend import SimpleScraper must_use_qt() s = SimpleScraper(source) for line in sys.stdin.buffer: @@ -170,11 +48,15 @@ qt.webenginecontext.info=false print(json.dumps(result), flush=True) +overseers = [] + + class Overseer: def __init__(self): self.lock = Lock() self.workers = {} + overseers.append(weakref.ref(self)) def worker_for_source(self, source): with self.lock: @@ -212,12 +94,29 @@ class Overseer: if w.wait(0.1) is None: w.kill() self.workers.clear() + close = __del__ + + +def cleanup_overseers(): + threads = [] + for x in overseers: + o = x() + if o is not None: + t = Thread(target=o.close, name='CloseOverSeer') + t.start() + threads.append(t) + del overseers[:] + + def join_all(): + for t in threads: + t.join() + return join_all def find_tests(): + import re import unittest from lxml.html import fromstring, tostring - import re skip = '' is_sanitized = 'libasan' in os.environ.get('LD_PRELOAD', '') if is_sanitized: diff --git a/src/calibre/scraper/simple_backend.py b/src/calibre/scraper/simple_backend.py new file mode 100644 index 0000000000..b4d3312589 --- /dev/null +++ b/src/calibre/scraper/simple_backend.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +# License: GPL v3 Copyright: 2022, Kovid Goyal + +import json +import os +import secrets +import sys +import time +from functools import lru_cache +from qt.core import QApplication, QEventLoop, QUrl +from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings + +from calibre.constants import cache_dir +from calibre.gui2.webengine import create_script, insert_scripts + + +def canonicalize_qurl(qurl): + qurl = qurl.adjusted(QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments) + if qurl.path() == '/': + qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath) + return qurl + + +@lru_cache(maxsize=None) +def create_profile(cache_name='simple', allow_js=False): + from calibre.utils.random_ua import random_common_chrome_user_agent + ans = QWebEngineProfile(cache_name, QApplication.instance()) + ans.setHttpUserAgent(random_common_chrome_user_agent()) + ans.setHttpCacheMaximumSize(0) # managed by webengine + ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name)) + s = ans.settings() + a = s.setAttribute + a(QWebEngineSettings.WebAttribute.PluginsEnabled, False) + a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js) + s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes) + a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False) + a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False) + # ensure javascript cannot read from local files + a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False) + a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False) + js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8') + ans.token = secrets.token_hex() + js = js.replace('TOKEN', ans.token) + insert_scripts(ans, create_script('scraper.js', js)) + return ans + + +class SimpleScraper(QWebEnginePage): + + def __init__(self, source, parent=None): + profile = create_profile(source) + self.token = profile.token + self.is_being_tested = source == 'test' + super().__init__(profile, parent) + self.setAudioMuted(True) + self.loadStarted.connect(self.load_started) + self.loadFinished.connect(self.load_finished) + self.loadProgress.connect(self.load_progress) + + def print(self, *a): + print(*a, file=sys.stderr) + + def load_started(self): + if self.is_being_tested: + self.print(f'load_started: {self.is_current_url=} {self.requestedUrl()=}') + if self.is_current_url: + self.current_fetch['load_started'] = True + + def load_finished(self, ok): + if self.is_being_tested: + self.print(f'load_finished: {ok=} {self.is_current_url=}') + if self.is_current_url: + self.current_fetch['load_finished'] = True + self.current_fetch['load_was_ok'] = ok + if not ok and self.is_current_url: + self.current_fetch['working'] = False + + def load_progress(self, progress): + if self.is_being_tested: + self.print(f'load_progress: {progress=} {self.is_current_url=}') + if self.is_current_url: + self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout'] + + def javaScriptAlert(self, url, msg): + pass + + def javaScriptConfirm(self, url, msg): + return True + + def javaScriptPrompt(self, url, msg, defval): + return True, defval + + @property + def is_current_url(self): + if not hasattr(self, 'current_fetch'): + return False + return canonicalize_qurl(self.requestedUrl()) == self.current_fetch['fetching_url'] + + def javaScriptConsoleMessage(self, level, message, line_num, source_id): + parts = message.split(maxsplit=1) + if len(parts) == 2 and parts[0] == self.token: + msg = json.loads(parts[1]) + t = msg.get('type') + if t == 'print': + print(msg['text'], file=sys.stderr) + elif t == 'domready': + if self.is_being_tested: + self.print(f'domready: {self.is_current_url=}') + if self.is_current_url: + self.current_fetch['working'] = False + if not msg.get('failed'): + self.current_fetch['html'] = msg['html'] + + def fetch(self, url_or_qurl, timeout=60): + fetching_url = QUrl(url_or_qurl) + self.current_fetch = { + 'timeout': timeout, 'end_time': time.monotonic() + timeout, + 'fetching_url': canonicalize_qurl(fetching_url), 'working': True, + 'load_started': False + } + self.load(fetching_url) + try: + app = QApplication.instance() + while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']: + app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents) + ans = self.current_fetch.get('html') + if ans is None: + eurl = fetching_url.toString() + if self.current_fetch['working']: + raise TimeoutError(f'Timed out loading HTML from: {eurl}') + raise ValueError(f'Failed to load HTML from: {eurl}') + return ans + finally: + del self.current_fetch