mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Allow per thread scrapers
This commit is contained in:
parent
a1096d303d
commit
212ad2a135
@ -8,7 +8,7 @@ import os
|
||||
import sys
|
||||
import weakref
|
||||
from qt.core import QLoggingCategory, QUrl
|
||||
from threading import Lock, Thread
|
||||
from threading import Lock, Thread, get_ident
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
@ -59,14 +59,15 @@ class Overseer:
|
||||
overseers.append(weakref.ref(self))
|
||||
|
||||
def worker_for_source(self, source):
|
||||
wname = f'{source}-{get_ident()}'
|
||||
with self.lock:
|
||||
ans = self.workers.get(source)
|
||||
ans = self.workers.get(wname)
|
||||
if ans is None:
|
||||
w = start_pipe_worker(f'from calibre.scraper.simple import worker_main; worker_main({source!r})')
|
||||
ans = self.workers[source] = w
|
||||
ans = self.workers[wname] = w
|
||||
return ans
|
||||
|
||||
def fetch_url(self, source, url_or_qurl):
|
||||
def fetch_url(self, url_or_qurl, source=''):
|
||||
w = self.worker_for_source(source)
|
||||
if isinstance(url_or_qurl, str):
|
||||
url_or_qurl = QUrl(url_or_qurl)
|
||||
@ -132,7 +133,7 @@ def find_tests():
|
||||
for f in ('book', 'nav'):
|
||||
path = P(f'templates/new_{f}.html', allow_user_override=False)
|
||||
url = QUrl.fromLocalFile(path)
|
||||
html = overseer.fetch_url('test', url)
|
||||
html = overseer.fetch_url(url, 'test')
|
||||
|
||||
def c(a):
|
||||
ans = tostring(fromstring(a.encode('utf-8')), pretty_print=True, encoding='unicode')
|
||||
@ -140,7 +141,7 @@ def find_tests():
|
||||
with open(path, 'rb') as f:
|
||||
raw = f.read().decode('utf-8')
|
||||
self.assertEqual(c(html), c(raw))
|
||||
self.assertRaises(ValueError, overseer.fetch_url, 'test', 'file:///does-not-exist.html')
|
||||
self.assertRaises(ValueError, overseer.fetch_url, 'file:///does-not-exist.html', 'test')
|
||||
w = overseer.workers
|
||||
self.assertEqual(len(w), 1)
|
||||
del overseer
|
||||
|
@ -23,12 +23,15 @@ def canonicalize_qurl(qurl):
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def create_profile(cache_name='simple', allow_js=False):
|
||||
def create_profile(cache_name='', allow_js=False):
|
||||
from calibre.utils.random_ua import random_common_chrome_user_agent
|
||||
if cache_name:
|
||||
ans = QWebEngineProfile(cache_name, QApplication.instance())
|
||||
ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
|
||||
else:
|
||||
ans = QWebEngineProfile(QApplication.instance())
|
||||
ans.setHttpUserAgent(random_common_chrome_user_agent())
|
||||
ans.setHttpCacheMaximumSize(0) # managed by webengine
|
||||
ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
|
||||
s = ans.settings()
|
||||
a = s.setAttribute
|
||||
a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
|
||||
@ -48,7 +51,7 @@ def create_profile(cache_name='simple', allow_js=False):
|
||||
|
||||
class SimpleScraper(QWebEnginePage):
|
||||
|
||||
def __init__(self, source, parent=None):
|
||||
def __init__(self, source='', parent=None):
|
||||
profile = create_profile(source)
|
||||
self.token = profile.token
|
||||
self.is_being_tested = source == 'test'
|
||||
|
Loading…
x
Reference in New Issue
Block a user