diff --git a/src/calibre/scraper/fetch_backend.py b/src/calibre/scraper/fetch_backend.py
new file mode 100644
index 0000000000..2f68a299af
--- /dev/null
+++ b/src/calibre/scraper/fetch_backend.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal
+
+
+from typing import Union
+
+from qt.core import QNetworkCookie, QObject, Qt, QUrl, pyqtSignal
+from qt.webengine import QWebEngineDownloadRequest, QWebEnginePage, QWebEngineUrlRequestInfo, QWebEngineUrlRequestInterceptor
+
+from .simple_backend import create_base_profile
+
+
+class RequestInterceptor(QWebEngineUrlRequestInterceptor):
+
+    def interceptRequest(self, req: QWebEngineUrlRequestInfo) -> None:
+        pass
+
+
+class FetchBackend(QWebEnginePage):
+
+    request_download = pyqtSignal(object, str)
+
+    def __init__(self, output_dir: str, cache_name: str = '', parent: QObject = None) -> None:
+        self.profile = create_base_profile(cache_name)
+        self.profile.downloadRequested.connect(self._download_requested)
+        self.profile.setDownloadPath(output_dir)
+        super().__init__(self.profile, parent)
+        self.interceptor = RequestInterceptor(self)
+        self.profile.setUrlRequestInterceptor(self.interceptor)
+        self.request_download.connect(self.download, type=Qt.ConnectionType.QueuedConnection)
+
+    def download(self, url: Union[str, QUrl], filename_or_path: str = '') -> str:
+        if isinstance(url, str):
+            url = QUrl(url)
+        super().download(url, filename_or_path)
+        return bytes(url.toEncoded()).decode()
+
+    def _download_requested(self, dr: QWebEngineDownloadRequest) -> None:
+        dr.accept()
+        dr.isFinishedChanged.connect(self._download_finished)
+
+    def _download_finished(self) -> None:
+        dr: QWebEngineDownloadRequest = self.sender()
+        s = dr.state()
+        url = bytes(dr.url().toEncoded()).decode()
+        if s == QWebEngineDownloadRequest.DownloadState.DownloadInterrupted:
+            print(99999999, url, dr.interruptReasonString())
+        elif s == QWebEngineDownloadRequest.DownloadState.DownloadCompleted:
+            print(1111111, dr, url, dr.downloadFileName())
+
+    def set_user_agent(self, new_val: str) -> None:
+        self.profile.setHttpUserAgent(new_val)
+
+    def set_simple_cookie(self, name, value, domain, path='/'):
+        cs = self.profile.cookieStore()
+        cookie_string = f'{name}={value}; Domain={domain}; Path={path}'
+        for c in QNetworkCookie.parseCookies(cookie_string.encode()):
+            cs.setCookie(c)
diff --git a/src/calibre/scraper/simple_backend.py b/src/calibre/scraper/simple_backend.py
index 46570d78be..a894fcddf5 100644
--- a/src/calibre/scraper/simple_backend.py
+++ b/src/calibre/scraper/simple_backend.py
@@ -23,8 +23,7 @@ def canonicalize_qurl(qurl):
     return qurl
 
 
-@lru_cache(maxsize=None)
-def create_profile(cache_name='', allow_js=False):
+def create_base_profile(cache_name='', allow_js=False):
     from calibre.utils.random_ua import random_common_chrome_user_agent
     if cache_name:
         ans = QWebEngineProfile(cache_name, QApplication.instance())
@@ -43,6 +42,12 @@ def create_profile(cache_name='', allow_js=False):
     # ensure javascript cannot read from local files
     a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
     a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
+    return ans
+
+
+@lru_cache(maxsize=None)
+def create_profile(cache_name='', allow_js=False):
+    ans = create_base_profile(cache_name, allow_js)
     js = P('scraper.js', allow_user_override=False, data=True).decode('utf-8')
     ans.token = secrets.token_hex()
     js = js.replace('TOKEN', ans.token)
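Not part of the commit above: a minimal sketch of how the new FetchBackend might be driven, assuming calibre's qt.core wrapper re-exports QApplication and QTimer, and using a placeholder URL, a temporary output directory and an arbitrary timeout.

#!/usr/bin/env python
# Hypothetical driver for FetchBackend; not part of the diff above.
import sys
import tempfile

from qt.core import QApplication, QTimer

from calibre.scraper.fetch_backend import FetchBackend

app = QApplication(sys.argv)
backend = FetchBackend(output_dir=tempfile.mkdtemp())

# Emit the queued request_download signal so the actual
# QWebEnginePage.download() call runs on the Qt event loop.
backend.request_download.emit('https://example.com/book.epub', '')

# FetchBackend currently only prints download state from _download_finished(),
# so this sketch simply quits after a fixed 30 second timeout.
QTimer.singleShot(30000, app.quit)
app.exec()

The queued connection on request_download is what makes emitting the signal safe from another thread: the download() call is marshalled onto the thread that owns the page rather than running in the caller's thread.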