Allow specifying timeout to read_url()

Kovid Goyal 2022-04-02 14:22:56 +05:30
parent dc92c8f9bb
commit e46982f7df
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 10 additions and 9 deletions

View File

@@ -23,11 +23,11 @@ from calibre.gui2.store.search_result import SearchResult
 from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
 
-def read_url(url):
+def read_url(url, timeout=60):
     # Kobo uses Akamai which has some bot detection that uses network/tls
     # protocol data. So use the Chromium network stack to make the request
     from calibre.scraper.simple import read_url as ru
-    return ru(read_url.storage, url)
+    return ru(read_url.storage, url, timeout=timeout)
 
 
 read_url.storage = []
@@ -36,7 +36,7 @@ read_url.storage = []
 def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
     from css_selectors import Select
     url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
-    raw = read_url(url)
+    raw = read_url(url, timeout=timeout)
     if write_html_to is not None:
         with open(write_html_to, 'w') as f:
             f.write(raw)
@@ -119,7 +119,7 @@ class KoboStore(BasicStoreConfig, StorePlugin):
             yield result
 
     def get_details(self, search_result, timeout):
-        raw = read_url(search_result.detail_item)
+        raw = read_url(search_result.detail_item, timeout=timeout)
         idata = html.fromstring(raw)
         if idata.xpath('boolean(//div[@class="bookitem-secondary-metadata"]//li[contains(text(), "Download options")])'):
             if idata.xpath('boolean(//div[@class="bookitem-secondary-metadata"]//li[contains(text(), "DRM-Free")])'):

View File

@@ -37,7 +37,8 @@ qt.webenginecontext.info=false
             raise SystemExit(int(rest))
         if cmd == b'FETCH':
             try:
-                html = s.fetch(QUrl.fromEncoded(json.loads(rest).encode('utf-8')))
+                d = json.loads(rest)
+                html = s.fetch(QUrl.fromEncoded(d['url'].encode('utf-8')), timeout=float(d['timeout']))
             except Exception as e:
                 import traceback
                 result = {'ok': False, 'tb': traceback.format_exc(), 'err': str(e)}
@@ -67,12 +68,12 @@ class Overseer:
                 ans = self.workers[wname] = w
         return ans
 
-    def fetch_url(self, url_or_qurl, source=''):
+    def fetch_url(self, url_or_qurl, source='', timeout=60):
         w = self.worker_for_source(source)
         if isinstance(url_or_qurl, str):
             url_or_qurl = QUrl(url_or_qurl)
         w.stdin.write(b'FETCH:')
-        w.stdin.write(json.dumps(bytes(url_or_qurl.toEncoded()).decode('utf-8')).encode('utf-8'))
+        w.stdin.write(json.dumps({'url': bytes(url_or_qurl.toEncoded()).decode('utf-8'), 'timeout': timeout}).encode('utf-8'))
         w.stdin.write(b'\n')
         w.stdin.flush()
         output = json.loads(w.stdout.readline())
@@ -117,13 +118,13 @@ def cleanup_overseers():
 read_url_lock = Lock()
 
 
-def read_url(storage, url):
+def read_url(storage, url, timeout=60):
     with read_url_lock:
         if not storage:
             storage.append(Overseer())
         scraper = storage[0]
     from calibre.ebooks.chardet import strip_encoding_declarations
-    return strip_encoding_declarations(scraper.fetch_url(url))
+    return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))
 
 
 def find_tests():
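
For reference, a minimal usage sketch of the changed scraper API, based only on the signatures visible in the diff above. It assumes a working calibre environment; the 30-second value and the query string are illustrative, not part of the commit.

# The worker now receives a JSON object on stdin instead of a bare URL string:
#   FETCH:{"url": "https://...", "timeout": 30}
from calibre.scraper.simple import read_url

storage = []  # caller-owned list; caches the Overseer so the Chromium worker is reused across calls
html = read_url(storage, 'https://www.kobobooks.com/search/search.html?q=calibre', timeout=30)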