mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on webengine scraper
This commit is contained in:
parent
3df6200ca7
commit
9005036dd0
29
resources/scraper.js
Normal file
29
resources/scraper.js
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
/* vim:fileencoding=utf-8
|
||||||
|
*
|
||||||
|
* Copyright (C) 2022 Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
*
|
||||||
|
* Distributed under terms of the GPLv3 license
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
(function() {
|
||||||
|
"use strict";
|
||||||
|
|
||||||
|
function send_msg(data) {
|
||||||
|
var token = 'TOKEN';
|
||||||
|
var msg = token + ' ' + JSON.stringify(data);
|
||||||
|
console.log(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
function debug() {
|
||||||
|
var args = Array.prototype.slice.call(arguments);
|
||||||
|
var text = args.join(' ');
|
||||||
|
send_msg({type: 'print', text: text});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (document.location.href.startsWith('chrome-error://')) {
|
||||||
|
send_msg({type: 'domready', 'failed': true});
|
||||||
|
} else {
|
||||||
|
send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)});
|
||||||
|
}
|
||||||
|
})();
|
@ -3,12 +3,17 @@
|
|||||||
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
import secrets
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from qt.core import QApplication
|
from qt.core import QApplication, QEventLoop, QUrl, pyqtSignal
|
||||||
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
|
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
|
||||||
|
|
||||||
from calibre.constants import cache_dir
|
from calibre.constants import cache_dir
|
||||||
|
from calibre.gui2.webengine import create_script, insert_scripts
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=4)
|
@lru_cache(maxsize=4)
|
||||||
@ -28,14 +33,24 @@ def create_profile(cache_name='simple', allow_js=False):
|
|||||||
# ensure javascript cannot read from local files
|
# ensure javascript cannot read from local files
|
||||||
a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
|
a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
|
||||||
a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
|
a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
|
||||||
return s
|
js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8')
|
||||||
|
ans.token = secrets.token_hex()
|
||||||
|
js = js.replace('TOKEN', ans.token)
|
||||||
|
insert_scripts(ans, create_script('scraper.js', js))
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
class SimpleScraper(QWebEnginePage):
|
class SimpleScraper(QWebEnginePage):
|
||||||
|
|
||||||
|
html_fetched = pyqtSignal(str)
|
||||||
|
|
||||||
def __init__(self, source, parent=None):
|
def __init__(self, source, parent=None):
|
||||||
super().__init__(create_profile(source), parent=parent)
|
profile = create_profile(source)
|
||||||
|
self.token = profile.token
|
||||||
|
super().__init__(profile, parent)
|
||||||
self.setAudioMuted(True)
|
self.setAudioMuted(True)
|
||||||
|
self.fetching_url = QUrl('invalid://XXX')
|
||||||
|
self.last_fetched_html = ''
|
||||||
|
|
||||||
def javaScriptAlert(self, url, msg):
|
def javaScriptAlert(self, url, msg):
|
||||||
pass
|
pass
|
||||||
@ -47,4 +62,41 @@ class SimpleScraper(QWebEnginePage):
|
|||||||
return True, defval
|
return True, defval
|
||||||
|
|
||||||
def javaScriptConsoleMessage(self, level, message, line_num, source_id):
|
def javaScriptConsoleMessage(self, level, message, line_num, source_id):
|
||||||
pass
|
parts = message.split(maxsplit=1)
|
||||||
|
if len(parts) == 2 and parts[0] == self.token:
|
||||||
|
msg = json.loads(parts[1])
|
||||||
|
t = msg.get('type')
|
||||||
|
if t == 'print':
|
||||||
|
print(msg['text'], file=sys.stderr)
|
||||||
|
elif t == 'domready':
|
||||||
|
if self.url() == self.fetching_url:
|
||||||
|
if msg.get('failed'):
|
||||||
|
self.last_fetched_html = '!'
|
||||||
|
else:
|
||||||
|
self.last_fetched_html = msg['html']
|
||||||
|
self.html_fetched.emit(self.last_fetched_html)
|
||||||
|
|
||||||
|
def start_fetch(self, url_or_qurl):
|
||||||
|
self.fetching_url = QUrl(url_or_qurl)
|
||||||
|
self.load(self.fetching_url)
|
||||||
|
|
||||||
|
def fetch(self, url_or_qurl, timeout=60):
|
||||||
|
self.last_fetched_html = ''
|
||||||
|
self.start_fetch(url_or_qurl)
|
||||||
|
app = QApplication.instance()
|
||||||
|
end = time.monotonic() + timeout
|
||||||
|
while not self.last_fetched_html and time.monotonic() < end:
|
||||||
|
app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
|
||||||
|
ans = self.last_fetched_html
|
||||||
|
self.last_fetched_html = ''
|
||||||
|
if ans == '!':
|
||||||
|
raise ValueError(f'Failed to load HTML from {url_or_qurl}')
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app = QApplication([])
|
||||||
|
s = SimpleScraper('test')
|
||||||
|
s.fetch('file:///t/raw.html')
|
||||||
|
del s
|
||||||
|
del app
|
||||||
|
Loading…
x
Reference in New Issue
Block a user