Get the simple scraper basically working

This commit is contained in:
Kovid Goyal 2022-03-29 10:39:03 +05:30
parent f6dbfb0bf4
commit b5e5f1a10b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 57 additions and 30 deletions

View File

@ -21,9 +21,7 @@
send_msg({type: 'print', text: text}); send_msg({type: 'print', text: text});
} }
if (document.location.href.startsWith('chrome-error://')) { if (!document.location.href.startsWith('chrome-error://')) {
send_msg({type: 'domready', 'failed': true});
} else {
send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)}); send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)});
} }
})(); })();

View File

@ -9,13 +9,20 @@ import secrets
import sys import sys
import time import time
from functools import lru_cache from functools import lru_cache
from qt.core import QApplication, QEventLoop, QUrl, pyqtSignal from qt.core import QApplication, QEventLoop, QUrl
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
from calibre.constants import cache_dir from calibre.constants import cache_dir
from calibre.gui2.webengine import create_script, insert_scripts from calibre.gui2.webengine import create_script, insert_scripts
def canonicalize_qurl(qurl):
    """Return *qurl* normalized for equality comparison.

    Path segments are resolved and any trailing slash stripped; a path
    that is just '/' is dropped entirely so ``http://x.com`` and
    ``http://x.com/`` compare equal.
    """
    opts = (QUrl.UrlFormattingOption.StripTrailingSlash |
            QUrl.UrlFormattingOption.NormalizePathSegments)
    ans = qurl.adjusted(opts)
    if ans.path() == '/':
        ans = ans.adjusted(QUrl.UrlFormattingOption.RemovePath)
    return ans
@lru_cache(maxsize=4) @lru_cache(maxsize=4)
def create_profile(cache_name='simple', allow_js=False): def create_profile(cache_name='simple', allow_js=False):
from calibre.utils.random_ua import random_common_chrome_user_agent from calibre.utils.random_ua import random_common_chrome_user_agent
@ -42,15 +49,29 @@ def create_profile(cache_name='simple', allow_js=False):
class SimpleScraper(QWebEnginePage): class SimpleScraper(QWebEnginePage):
html_fetched = pyqtSignal(str)
def __init__(self, source, parent=None): def __init__(self, source, parent=None):
profile = create_profile(source) profile = create_profile(source)
self.token = profile.token self.token = profile.token
super().__init__(profile, parent) super().__init__(profile, parent)
self.setAudioMuted(True) self.setAudioMuted(True)
self.fetching_url = QUrl('invalid://XXX') self.loadStarted.connect(self.load_started)
self.last_fetched_html = '' self.loadFinished.connect(self.load_finished)
self.loadProgress.connect(self.load_progress)
def load_started(self):
    # Record that the page load actually began for the fetch in
    # flight; a no-op when no fetch is active.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is not None:
        fetch['load_started'] = True
def load_finished(self, ok):
    # Note completion of the page load. A failed load of the URL we
    # are actively fetching ends the fetch's polling loop early.
    try:
        fetch = self.current_fetch
    except AttributeError:
        return
    fetch['load_finished'] = True
    fetch['load_was_ok'] = ok
    if not ok and self.is_current_url:
        fetch['working'] = False
def load_progress(self, progress):
    # Any sign of progress pushes the deadline out by the full timeout,
    # so a slow-but-alive page is not killed prematurely.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is None:
        return
    fetch['end_time'] = time.monotonic() + fetch['timeout']
def javaScriptAlert(self, url, msg): def javaScriptAlert(self, url, msg):
pass pass
@ -61,6 +82,12 @@ class SimpleScraper(QWebEnginePage):
def javaScriptPrompt(self, url, msg, defval): def javaScriptPrompt(self, url, msg, defval):
return True, defval return True, defval
@property
def is_current_url(self):
    # True only while a fetch is in flight AND the page's current URL
    # matches the URL that fetch requested, compared after
    # canonicalization so cosmetic differences do not matter.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is None:
        return False
    return canonicalize_qurl(self.url()) == fetch['fetching_url']
def javaScriptConsoleMessage(self, level, message, line_num, source_id): def javaScriptConsoleMessage(self, level, message, line_num, source_id):
parts = message.split(maxsplit=1) parts = message.split(maxsplit=1)
if len(parts) == 2 and parts[0] == self.token: if len(parts) == 2 and parts[0] == self.token:
@ -69,34 +96,36 @@ class SimpleScraper(QWebEnginePage):
if t == 'print': if t == 'print':
print(msg['text'], file=sys.stderr) print(msg['text'], file=sys.stderr)
elif t == 'domready': elif t == 'domready':
if self.url() == self.fetching_url: if self.is_current_url:
if msg.get('failed'): self.current_fetch['working'] = False
self.last_fetched_html = '!' if not msg.get('failed'):
else: self.current_fetch['html'] = msg['html']
self.last_fetched_html = msg['html']
self.html_fetched.emit(self.last_fetched_html)
def start_fetch(self, url_or_qurl):
self.fetching_url = QUrl(url_or_qurl)
self.load(self.fetching_url)
def fetch(self, url_or_qurl, timeout=60): def fetch(self, url_or_qurl, timeout=60):
self.last_fetched_html = '' fetching_url = QUrl(url_or_qurl)
self.start_fetch(url_or_qurl) self.current_fetch = {
app = QApplication.instance() 'timeout': timeout, 'end_time': time.monotonic() + timeout,
end = time.monotonic() + timeout 'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
while not self.last_fetched_html and time.monotonic() < end: 'load_started': False
app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents) }
ans = self.last_fetched_html self.load(fetching_url)
self.last_fetched_html = '' try:
if ans == '!': app = QApplication.instance()
raise ValueError(f'Failed to load HTML from {url_or_qurl}') while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
return ans app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
ans = self.current_fetch.get('html')
if ans is None:
if self.current_fetch['working']:
raise ValueError(f'Timed out loading HTML from {url_or_qurl}')
raise ValueError(f'Failed to load HTML from {url_or_qurl}')
return ans
finally:
del self.current_fetch
if __name__ == '__main__': if __name__ == '__main__':
app = QApplication([]) app = QApplication([])
s = SimpleScraper('test') s = SimpleScraper('test')
s.fetch('file:///t/raw.html') s.fetch('file:///t/raw.html', timeout=5)
del s del s
del app del app