Allow per thread scrapers

This commit is contained in:
Kovid Goyal 2022-04-02 12:07:49 +05:30
parent a1096d303d
commit 212ad2a135
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 14 additions and 10 deletions

View File

@ -8,7 +8,7 @@ import os
import sys import sys
import weakref import weakref
from qt.core import QLoggingCategory, QUrl from qt.core import QLoggingCategory, QUrl
from threading import Lock, Thread from threading import Lock, Thread, get_ident
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
@ -59,14 +59,15 @@ class Overseer:
overseers.append(weakref.ref(self)) overseers.append(weakref.ref(self))
def worker_for_source(self, source): def worker_for_source(self, source):
wname = f'{source}-{get_ident()}'
with self.lock: with self.lock:
ans = self.workers.get(source) ans = self.workers.get(wname)
if ans is None: if ans is None:
w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})') w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
ans = self.workers[source] = w ans = self.workers[wname] = w
return ans return ans
def fetch_url(self, source, url_or_qurl): def fetch_url(self, url_or_qurl, source=''):
w = self.worker_for_source(source) w = self.worker_for_source(source)
if isinstance(url_or_qurl, str): if isinstance(url_or_qurl, str):
url_or_qurl = QUrl(url_or_qurl) url_or_qurl = QUrl(url_or_qurl)
@ -132,7 +133,7 @@ def find_tests():
for f in ('book', 'nav'): for f in ('book', 'nav'):
path = P(f'templates/new_{f}.html', allow_user_override=False) path = P(f'templates/new_{f}.html', allow_user_override=False)
url = QUrl.fromLocalFile(path) url = QUrl.fromLocalFile(path)
html = overseer.fetch_url('test', url) html = overseer.fetch_url(url, 'test')
def c(a): def c(a):
ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode') ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
@ -140,7 +141,7 @@ def find_tests():
with open(path, 'rb') as f: with open(path, 'rb') as f:
raw = f.read().decode('utf-8') raw = f.read().decode('utf-8')
self.assertEqual(c(html), c(raw)) self.assertEqual(c(html), c(raw))
self.assertRaises(ValueError, overseer.fetch_url, 'test', 'file:///does-not-exist.html') self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
w = overseer.workers w = overseer.workers
self.assertEqual(len(w), 1) self.assertEqual(len(w), 1)
del overseer del overseer

View File

@ -23,12 +23,15 @@ def canonicalize_qurl(qurl):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def create_profile(cache_name='simple', allow_js=False): def create_profile(cache_name='', allow_js=False):
from calibre.utils.random_ua import random_common_chrome_user_agent from calibre.utils.random_ua import random_common_chrome_user_agent
ans = QWebEngineProfile(cache_name, QApplication.instance()) if cache_name:
ans = QWebEngineProfile(cache_name, QApplication.instance())
ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
else:
ans = QWebEngineProfile(QApplication.instance())
ans.setHttpUserAgent(random_common_chrome_user_agent()) ans.setHttpUserAgent(random_common_chrome_user_agent())
ans.setHttpCacheMaximumSize(0) # managed by webengine ans.setHttpCacheMaximumSize(0) # managed by webengine
ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
s = ans.settings() s = ans.settings()
a = s.setAttribute a = s.setAttribute
a(QWebEngineSettings.WebAttribute.PluginsEnabled, False) a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
@ -48,7 +51,7 @@ def create_profile(cache_name='simple', allow_js=False):
class SimpleScraper(QWebEnginePage): class SimpleScraper(QWebEnginePage):
def __init__(self, source, parent=None): def __init__(self, source='', parent=None):
profile = create_profile(source) profile = create_profile(source)
self.token = profile.token self.token = profile.token
self.is_being_tested = source == 'test' self.is_being_tested = source == 'test'