From e46982f7df1fcdcb35c439cde49bcf69e204ea4c Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 2 Apr 2022 14:22:56 +0530
Subject: [PATCH] Allow specifying timeout to read_url()

---
 src/calibre/gui2/store/stores/kobo_plugin.py |  8 ++++----
 src/calibre/scraper/simple.py                | 11 ++++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/calibre/gui2/store/stores/kobo_plugin.py b/src/calibre/gui2/store/stores/kobo_plugin.py
index 3e0829d014..4294943d91 100644
--- a/src/calibre/gui2/store/stores/kobo_plugin.py
+++ b/src/calibre/gui2/store/stores/kobo_plugin.py
@@ -23,11 +23,11 @@ from calibre.gui2.store.search_result import SearchResult
 from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
 
-def read_url(url):
+def read_url(url, timeout=60):
     # Kobo uses Akamai which has some bot detection that uses network/tls
     # protocol data. So use the Chromium network stack to make the request
     from calibre.scraper.simple import read_url as ru
-    return ru(read_url.storage, url)
+    return ru(read_url.storage, url, timeout=timeout)
 
 
 read_url.storage = []
@@ -36,7 +36,7 @@ read_url.storage = []
 def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
     from css_selectors import Select
     url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
-    raw = read_url(url)
+    raw = read_url(url, timeout=timeout)
     if write_html_to is not None:
         with open(write_html_to, 'w') as f:
             f.write(raw)
@@ -119,7 +119,7 @@ class KoboStore(BasicStoreConfig, StorePlugin):
             yield result
 
     def get_details(self, search_result, timeout):
-        raw = read_url(search_result.detail_item)
+        raw = read_url(search_result.detail_item, timeout=timeout)
         idata = html.fromstring(raw)
         if idata.xpath('boolean(//div[@class="bookitem-secondary-metadata"]//li[contains(text(), "Download options")])'):
             if idata.xpath('boolean(//div[@class="bookitem-secondary-metadata"]//li[contains(text(), "DRM-Free")])'):
diff --git a/src/calibre/scraper/simple.py b/src/calibre/scraper/simple.py
index f9c97e681e..ec070cc514 100644
--- a/src/calibre/scraper/simple.py
+++ b/src/calibre/scraper/simple.py
@@ -37,7 +37,8 @@ qt.webenginecontext.info=false
             raise SystemExit(int(rest))
         if cmd == b'FETCH':
             try:
-                html = s.fetch(QUrl.fromEncoded(json.loads(rest).encode('utf-8')))
+                d = json.loads(rest)
+                html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
             except Exception as e:
                 import traceback
                 result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
@@ -67,12 +68,12 @@ class Overseer:
             ans = self.workers[wname] = w
         return ans
 
-    def fetch_url(self, url_or_qurl, source=''):
+    def fetch_url(self, url_or_qurl, source='', timeout=60):
         w = self.worker_for_source(source)
         if isinstance(url_or_qurl, str):
             url_or_qurl = QUrl(url_or_qurl)
         w.stdin.write(b'FETCH:')
-        w.stdin.write(json.dumps(bytes(url_or_qurl.toEncoded()).decode('utf-8')).encode('utf-8'))
+        w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
         w.stdin.write(b'\n')
         w.stdin.flush()
         output = json.loads(w.stdout.readline())
@@ -117,13 +118,13 @@ def cleanup_overseers():
 read_url_lock = Lock()
 
 
-def read_url(storage, url):
+def read_url(storage, url, timeout=60):
     with read_url_lock:
         if not storage:
             storage.append(Overseer())
         scraper = storage[0]
     from calibre.ebooks.chardet import strip_encoding_declarations
-    return strip_encoding_declarations(scraper.fetch_url(url))
+    return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
 
 
 def find_tests():
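
Note: a minimal usage sketch follows (not part of the patch itself). It assumes a working calibre environment with Qt WebEngine available; the URL and the 15-second value are illustrative. As the patched code shows, read_url() lazily creates a shared Overseer in the caller-supplied storage list, and the timeout travels inside the JSON FETCH message so it is enforced by the worker process that owns the Chromium network stack:

    # Usage sketch, assuming calibre + Qt WebEngine are installed.
    from calibre.scraper.simple import read_url

    storage = []  # caches the single shared Overseer across calls
    # Bound the Chromium-backed fetch to 15 seconds instead of the default 60:
    html = read_url(storage, 'https://www.kobobooks.com/', timeout=15)
    print(html[:200])

Carrying the timeout in the FETCH payload, rather than as worker state, lets each request specify its own limit while the worker stays reusable for later fetches.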