Mirror of https://github.com/kovidgoyal/calibre.git
Get the simple scraper basically working

This commit is contained in:
parent f6dbfb0bf4
commit b5e5f1a10b
@@ -21,9 +21,7 @@
         send_msg({type: 'print', text: text});
     }
 
-    if (document.location.href.startsWith('chrome-error://')) {
-        send_msg({type: 'domready', 'failed': true});
-    } else {
+    if (!document.location.href.startsWith('chrome-error://')) {
         send_msg({type: 'domready', html: new XMLSerializer().serializeToString(document)});
     }
 })();
@@ -9,13 +9,20 @@ import secrets
 import sys
 import time
 from functools import lru_cache
-from qt.core import QApplication, QEventLoop, QUrl, pyqtSignal
+from qt.core import QApplication, QEventLoop, QUrl
 from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
 
 from calibre.constants import cache_dir
 from calibre.gui2.webengine import create_script, insert_scripts
 
 
+def canonicalize_qurl(qurl):
+    qurl = qurl.adjusted(QUrl.UrlFormattingOption.StripTrailingSlash | QUrl.UrlFormattingOption.NormalizePathSegments)
+    if qurl.path() == '/':
+        qurl = qurl.adjusted(QUrl.UrlFormattingOption.RemovePath)
+    return qurl
+
+
 @lru_cache(maxsize=4)
 def create_profile(cache_name='simple', allow_js=False):
     from calibre.utils.random_ua import random_common_chrome_user_agent
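Not part of the commit, but for context: a minimal sketch of what the new canonicalize_qurl() helper is for, assuming the function added above is in scope. The host and paths are made up. It normalises different spellings of the same URL so that the page URL and the requested URL compare equal later, in is_current_url.

    from qt.core import QUrl

    # Both spellings should reduce to https://example.com/b:
    # NormalizePathSegments resolves the '..', StripTrailingSlash drops the final '/'.
    a = canonicalize_qurl(QUrl('https://example.com/a/../b/'))
    b = canonicalize_qurl(QUrl('https://example.com/b'))
    print(a == b)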
@@ -42,15 +49,29 @@ def create_profile(cache_name='simple', allow_js=False):
 
 class SimpleScraper(QWebEnginePage):
 
-    html_fetched = pyqtSignal(str)
-
     def __init__(self, source, parent=None):
         profile = create_profile(source)
         self.token = profile.token
         super().__init__(profile, parent)
         self.setAudioMuted(True)
-        self.fetching_url = QUrl('invalid://XXX')
-        self.last_fetched_html = ''
+        self.loadStarted.connect(self.load_started)
+        self.loadFinished.connect(self.load_finished)
+        self.loadProgress.connect(self.load_progress)
+
+    def load_started(self):
+        if hasattr(self, 'current_fetch'):
+            self.current_fetch['load_started'] = True
+
+    def load_finished(self, ok):
+        if hasattr(self, 'current_fetch'):
+            self.current_fetch['load_finished'] = True
+            self.current_fetch['load_was_ok'] = ok
+            if not ok and self.is_current_url:
+                self.current_fetch['working'] = False
+
+    def load_progress(self, progress):
+        if hasattr(self, 'current_fetch'):
+            self.current_fetch['end_time'] = time.monotonic() + self.current_fetch['timeout']
 
     def javaScriptAlert(self, url, msg):
         pass
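Not part of the commit: a note on the load_progress handler above. Every progress signal pushes current_fetch['end_time'] a full timeout into the future, so the deadline that fetch() checks further down behaves as an inactivity timeout rather than a hard wall-clock limit. Roughly, with illustrative values:

    import time

    timeout = 60
    end_time = time.monotonic() + timeout  # set once when the fetch starts
    # ...the page keeps loading and reports progress 45 seconds later...
    end_time = time.monotonic() + timeout  # deadline moves another 60s into the future
    # fetch() only gives up after a full `timeout` seconds with no progress at all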
@@ -61,6 +82,12 @@ class SimpleScraper(QWebEnginePage):
     def javaScriptPrompt(self, url, msg, defval):
         return True, defval
 
+    @property
+    def is_current_url(self):
+        if not hasattr(self, 'current_fetch'):
+            return False
+        return canonicalize_qurl(self.url()) == self.current_fetch['fetching_url']
+
     def javaScriptConsoleMessage(self, level, message, line_num, source_id):
         parts = message.split(maxsplit=1)
         if len(parts) == 2 and parts[0] == self.token:
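Not part of the commit: a sketch of the console-message protocol being parsed here. The injected script's send_msg() appears to print the profile token, a space, and a serialised payload (presumably JSON, given the msg['text']/msg['html'] lookups below); the token check keeps ordinary page console output from being mistaken for a scraper message. The token value is made up:

    import json

    token = 'SECRET_TOKEN'  # stands in for profile.token
    message = token + ' ' + json.dumps({'type': 'domready', 'html': '<html>...</html>'})

    parts = message.split(maxsplit=1)
    if len(parts) == 2 and parts[0] == token:
        msg = json.loads(parts[1])
        print(msg['type'])  # -> domready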
@@ -69,34 +96,36 @@ class SimpleScraper(QWebEnginePage):
             if t == 'print':
                 print(msg['text'], file=sys.stderr)
             elif t == 'domready':
-                if self.url() == self.fetching_url:
-                    if msg.get('failed'):
-                        self.last_fetched_html = '!'
-                    else:
-                        self.last_fetched_html = msg['html']
-                    self.html_fetched.emit(self.last_fetched_html)
-
-    def start_fetch(self, url_or_qurl):
-        self.fetching_url = QUrl(url_or_qurl)
-        self.load(self.fetching_url)
+                if self.is_current_url:
+                    self.current_fetch['working'] = False
+                    if not msg.get('failed'):
+                        self.current_fetch['html'] = msg['html']
 
     def fetch(self, url_or_qurl, timeout=60):
-        self.last_fetched_html = ''
-        self.start_fetch(url_or_qurl)
-        app = QApplication.instance()
-        end = time.monotonic() + timeout
-        while not self.last_fetched_html and time.monotonic() < end:
-            app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
-        ans = self.last_fetched_html
-        self.last_fetched_html = ''
-        if ans == '!':
-            raise ValueError(f'Failed to load HTML from {url_or_qurl}')
-        return ans
+        fetching_url = QUrl(url_or_qurl)
+        self.current_fetch = {
+            'timeout': timeout, 'end_time': time.monotonic() + timeout,
+            'fetching_url': canonicalize_qurl(fetching_url), 'working': True,
+            'load_started': False
+        }
+        self.load(fetching_url)
+        try:
+            app = QApplication.instance()
+            while self.current_fetch['working'] and time.monotonic() < self.current_fetch['end_time']:
+                app.processEvents(QEventLoop.ProcessEventsFlag.ExcludeUserInputEvents)
+            ans = self.current_fetch.get('html')
+            if ans is None:
+                if self.current_fetch['working']:
+                    raise ValueError(f'Timed out loading HTML from {url_or_qurl}')
+                raise ValueError(f'Failed to load HTML from {url_or_qurl}')
+            return ans
+        finally:
+            del self.current_fetch
 
 
 if __name__ == '__main__':
     app = QApplication([])
     s = SimpleScraper('test')
-    s.fetch('file:///t/raw.html')
+    s.fetch('file:///t/raw.html', timeout=5)
     del s
     del app
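Not part of the commit: a hedged usage sketch of the reworked fetch(), mirroring the __main__ block above. The source name and URL are made up; the two ValueError messages correspond to the timeout and failed-load paths added in this commit.

    app = QApplication([])
    scraper = SimpleScraper('example-source')  # cache_name passed to create_profile()
    try:
        html = scraper.fetch('https://example.com/some/page', timeout=30)
    except ValueError as err:
        print('fetch failed:', err)  # "Timed out loading ..." or "Failed to load ..."
    else:
        print(len(html), 'characters of serialised DOM')
    del scraper, app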