Get the simple scraper basically working

This commit is contained in:
Kovid Goyal 2022-03-29 10:39:03 +05:30
parent f6dbfb0bf4
commit b5e5f1a10b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 57 additions and 30 deletions

View File

@ -21,9 +21,7 @@
send_msg({type: 'print', text: text}); send_msg({type: 'print', text: text});
} }
if (document.location.href.startsWith('chrome-error://')) { if (!document.location.href.startsWith('chrome-error://')) {
send_msg({type: 'domready', 'failed': true});
} else {
send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)}); send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)});
} }
})(); })();

View File

@ -9,13 +9,20 @@ import secrets
import sys import sys
import time import time
from functools import lru_cache from functools import lru_cache
from qt.core import QApplication, QEventLoop, QUrl, pyqtSignal from qt.core import QApplication, QEventLoop, QUrl
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
from calibre.constants import cache_dir from calibre.constants import cache_dir
from calibre.gui2.webengine import create_script, insert_scripts from calibre.gui2.webengine import create_script, insert_scripts
def canonicalize_qurl(qurl):
    """Return *qurl* normalized for equality comparison.

    Path segments are resolved and any trailing slash stripped; a path
    that is just '/' is dropped entirely so ``http://x.com`` and
    ``http://x.com/`` compare equal.
    """
    opts = (QUrl.UrlFormattingOption.StripTrailingSlash |
            QUrl.UrlFormattingOption.NormalizePathSegments)
    ans = qurl.adjusted(opts)
    if ans.path() == '/':
        ans = ans.adjusted(QUrl.UrlFormattingOption.RemovePath)
    return ans
@lru_cache(maxsize=4) @lru_cache(maxsize=4)
def create_profile(cache_name='simple', allow_js=False): def create_profile(cache_name='simple', allow_js=False):
from calibre.utils.random_ua import random_common_chrome_user_agent from calibre.utils.random_ua import random_common_chrome_user_agent
@ -42,15 +49,29 @@ def create_profile(cache_name='simple', allow_js=False):
class SimpleScraper(QWebEnginePage): class SimpleScraper(QWebEnginePage):
html_fetched = pyqtSignal(str)
def __init__(self, source, parent=None): def __init__(self, source, parent=None):
profile = create_profile(source) profile = create_profile(source)
self.token = profile.token self.token = profile.token
super().__init__(profile, parent) super().__init__(profile, parent)
self.setAudioMuted(True) self.setAudioMuted(True)
self.fetching_url = QUrl('invalid://XXX') self.loadStarted.connect(self.load_started)
self.last_fetched_html = '' self.loadFinished.connect(self.load_finished)
self.loadProgress.connect(self.load_progress)
def load_started(self):
    # Record that the page load actually began for the fetch in
    # flight; a no-op when no fetch is active.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is not None:
        fetch['load_started'] = True
def load_finished(self, ok):
    # Note completion of the page load. A failed load of the URL we
    # are actively fetching ends the fetch's polling loop early.
    try:
        fetch = self.current_fetch
    except AttributeError:
        return
    fetch['load_finished'] = True
    fetch['load_was_ok'] = ok
    if not ok and self.is_current_url:
        fetch['working'] = False
def load_progress(self, progress):
    # Any sign of progress pushes the deadline out by the full timeout,
    # so a slow-but-alive page is not killed prematurely.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is None:
        return
    fetch['end_time'] = time.monotonic() + fetch['timeout']
def javaScriptAlert(self, url, msg): def javaScriptAlert(self, url, msg):
pass pass
@ -61,6 +82,12 @@ class SimpleScraper(QWebEnginePage):
def javaScriptPrompt(self, url, msg, defval): def javaScriptPrompt(self, url, msg, defval):
return True, defval return True, defval
@property
def is_current_url(self):
    # True only while a fetch is in flight AND the page's current URL
    # matches the URL that fetch requested, compared after
    # canonicalization so cosmetic differences do not matter.
    fetch = getattr(self, 'current_fetch', None)
    if fetch is None:
        return False
    return canonicalize_qurl(self.url()) == fetch['fetching_url']
def javaScriptConsoleMessage(self, level, message, line_num, source_id): def javaScriptConsoleMessage(self, level, message, line_num, source_id):
parts = message.split(maxsplit=1) parts = message.split(maxsplit=1)
if len(parts) == 2 and parts[0] == self.token: if len(parts) == 2 and parts[0] == self.token:
@ -69,34 +96,36 @@ class SimpleScraper(QWebEnginePage):
if t == 'print': if t == 'print':
print(msg['text'], file=sys.stderr) print(msg['text'], file=sys.stderr)
elif t == 'domready': elif t == 'domready':
if self.url() == self.fetching_url: if self.is_current_url:
if msg.get('failed'): self.current_fetch['working'] = False
self.last_fetched_html = '!' if not msg.get('failed'):
else: self.current_fetch['html'] = msg['html']
self.last_fetched_html = msg['html']
self.html_fetched.emit(self.last_fetched_html)
def start_fetch(self, url_or_qurl):
self.fetching_url = QUrl(url_or_qurl)
self.load(self.fetching_url)
def fetch(self, url_or_qurl, timeout=60): def fetch(self, url_or_qurl, timeout=60):
self.last_fetched_html = '' fetching_url = QUrl(url_or_qurl)
self.start_fetch(url_or_qurl) self.current_fetch = {
app = QApplication.instance() 'timeout': timeout, 'end_time': time.monotonic() + timeout,
end = time.monotonic() + timeout 'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
while not self.last_fetched_html and time.monotonic() < end: 'load_started': False
app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents) }
ans = self.last_fetched_html self.load(fetching_url)
self.last_fetched_html = '' try:
if ans == '!': app = QApplication.instance()
raise ValueError(f'Failed to load HTML from {url_or_qurl}') while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
return ans app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
ans = self.current_fetch.get('html')
if ans is None:
if self.current_fetch['working']:
raise ValueError(f'Timed out loading HTML from {url_or_qurl}')
raise ValueError(f'Failed to load HTML from {url_or_qurl}')
return ans
finally:
del self.current_fetch
if __name__ == '__main__': if __name__ == '__main__':
app = QApplication([]) app = QApplication([])
s = SimpleScraper('test') s = SimpleScraper('test')
s.fetch('file:///t/raw.html') s.fetch('file:///t/raw.html', timeout=5)
del s del s
del app del app