diff --git a/resources/scraper.js b/resources/scraper.js new file mode 100644 index 0000000000..240d7e144c --- /dev/null +++ b/resources/scraper.js @@ -0,0 +1,29 @@ +/* vim:fileencoding=utf-8 + * + * Copyright (C) 2022 Kovid Goyal + * + * Distributed under terms of the GPLv3 license + */ + + +(function() { + "use strict"; + + function send_msg(data) { + var token = 'TOKEN'; + var msg = token + ' ' + JSON.stringify(data); + console.log(msg); + } + + function debug() { + var args = Array.prototype.slice.call(arguments); + var text = args.join(' '); + send_msg({type: 'print', text: text}); + } + + if (document.location.href.startsWith('chrome-error://')) { + send_msg({type: 'domready', 'failed': true}); + } else { + send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)}); + } +})(); diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py index f45ba8d89e..a599e91677 100644 --- a/src/calibre/scraper/simple.py +++ b/src/calibre/scraper/simple.py @@ -3,12 +3,17 @@ # License: GPL v3 Copyright: 2022, Kovid Goyal +import json import os +import secrets +import sys +import time from functools import lru_cache -from qt.core import QApplication +from qt.core import QApplication, QEventLoop, QUrl, pyqtSignal from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings from calibre.constants import cache_dir +from calibre.gui2.webengine import create_script, insert_scripts @lru_cache(maxsize=4) @@ -28,14 +33,24 @@ def create_profile(cache_name='simple', allow_js=False): # ensure javascript cannot read from local files a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False) a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False) - return s + js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8') + ans.token = secrets.token_hex() + js = js.replace('TOKEN', ans.token) + insert_scripts(ans, create_script('scraper.js', js)) + return ans class SimpleScraper(QWebEnginePage): + html_fetched = pyqtSignal(str) + def __init__(self, source, parent=None): - super().__init__(create_profile(source), parent=parent) + profile = create_profile(source) + self.token = profile.token + super().__init__(profile, parent) self.setAudioMuted(True) + self.fetching_url = QUrl('invalid://XXX') + self.last_fetched_html = '' def javaScriptAlert(self, url, msg): pass @@ -47,4 +62,41 @@ class SimpleScraper(QWebEnginePage): return True, defval def javaScriptConsoleMessage(self, level, message, line_num, source_id): - pass + parts = message.split(maxsplit=1) + if len(parts) == 2 and parts[0] == self.token: + msg = json.loads(parts[1]) + t = msg.get('type') + if t == 'print': + print(msg['text'], file=sys.stderr) + elif t == 'domready': + if self.url() == self.fetching_url: + if msg.get('failed'): + self.last_fetched_html = '!' + else: + self.last_fetched_html = msg['html'] + self.html_fetched.emit(self.last_fetched_html) + + def start_fetch(self, url_or_qurl): + self.fetching_url = QUrl(url_or_qurl) + self.load(self.fetching_url) + + def fetch(self, url_or_qurl, timeout=60): + self.last_fetched_html = '' + self.start_fetch(url_or_qurl) + app = QApplication.instance() + end = time.monotonic() + timeout + while not self.last_fetched_html and time.monotonic() < end: + app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents) + ans = self.last_fetched_html + self.last_fetched_html = '' + if ans == '!': + raise ValueError(f'Failed to load HTML from {url_or_qurl}') + return ans + + +if __name__ == '__main__': + app = QApplication([]) + s = SimpleScraper('test') + s.fetch('file:///t/raw.html') + del s + del app