Make rate limiting of search engine queries inter-process rather than per process

Kovid Goyal 2022-07-28 22:45:34 +05:30
parent 8b01843531
commit 761f5ffe8c


@@ -3,12 +3,12 @@
 # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
 from __future__ import absolute_import, division, print_function, unicode_literals
 import json
+import os
 import re
 import time
-from threading import Lock
-from collections import defaultdict, namedtuple
+from collections import namedtuple
+from contextlib import contextmanager
 try:
     from urllib.parse import parse_qs, quote_plus, unquote, urlencode
@@ -19,18 +19,37 @@ except ImportError:
 from lxml import etree
 from calibre import browser as _browser, prints, random_user_agent
+from calibre.constants import cache_dir
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.utils.monotonic import monotonic
+from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
-current_version = (1, 0, 16)
+current_version = (1, 0, 17)
 minimum_calibre_version = (2, 80, 0)
-last_visited = defaultdict(lambda: 0)
 Result = namedtuple('Result', 'url title cached_url')
+@contextmanager
+def rate_limit(name='test', time_between_visits=1, max_wait_seconds=5 * 60, sleep_time=0.2):
+    lock_file = os.path.join(cache_dir(), 'search-engines.' + name + '.lock')
+    with ExclusiveFile(lock_file, timeout=max_wait_seconds, sleep_time=sleep_time) as f:
+        try:
+            lv = float(f.read().decode('utf-8').strip())
+        except Exception:
+            lv = 0
+        delta = time.time() - lv
+        if delta < time_between_visits:
+            time.sleep(time_between_visits - delta)
+        try:
+            yield
+        finally:
+            f.seek(0)
+            f.truncate()
+            f.write(repr(time.time()).encode('utf-8'))
 def tostring(elem):
     return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)
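The new rate_limit() context manager records the time of the last visit in a lock file under cache_dir() and holds an exclusive lock on that file while it waits, so the minimum gap between visits to a given search engine is enforced across every calibre process on the machine, not just across threads of one process. A minimal usage sketch (the 'google' key and the print body are illustrative placeholders, not part of this commit):

    # Any number of processes entering rate_limit('google') contend on the
    # same lock file in cache_dir(), so their bodies run at least
    # time_between_visits seconds apart machine-wide.
    with rate_limit('google', time_between_visits=2):
        print('visiting search engine')  # placeholder for the actual HTTP fetch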
@@ -63,24 +82,13 @@ def parse_html(raw):
     return parse(raw)
-last_visited_lock = Lock()
 def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None, simple_scraper=None):
-    with last_visited_lock:
-        lv = last_visited[key]
-    delta = monotonic() - lv
-    if delta < limit and delta > 0:
-        time.sleep(delta)
-    try:
+    with rate_limit(key):
         if simple_scraper is None:
             raw = br.open_novisit(url, timeout=timeout).read()
             raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
         else:
             raw = simple_scraper(url, timeout=timeout)
-    finally:
-        with last_visited_lock:
-            last_visited[key] = monotonic()
     if dump_raw is not None:
         with open(dump_raw, 'w') as f:
             f.write(raw)
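For context, the same inter-process pattern can be reproduced outside calibre with only the standard library: keep the last-visit timestamp in a small file and hold an exclusive lock on it while sleeping and updating it. The sketch below is an illustration of the idea only, not the code in this commit; it assumes POSIX (fcntl.flock), and unlike calibre's ExclusiveFile it handles neither Windows nor a lock-acquisition timeout.

    import fcntl
    import os
    import time
    from contextlib import contextmanager


    @contextmanager
    def file_rate_limit(path, min_interval=1.0):
        # Open (or create) the timestamp file and take an exclusive flock on it,
        # so only one process at a time may read and update the last-visit time.
        fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o644)
        try:
            fcntl.flock(fd, fcntl.LOCK_EX)
            try:
                last = float(os.read(fd, 64).decode('utf-8').strip() or '0')
            except Exception:
                last = 0.0
            wait = min_interval - (time.time() - last)
            if wait > 0:
                time.sleep(wait)
            try:
                yield
            finally:
                # Record this visit's time for whichever process comes next.
                os.lseek(fd, 0, os.SEEK_SET)
                os.ftruncate(fd, 0)
                os.write(fd, repr(time.time()).encode('utf-8'))
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
            os.close(fd)


    if __name__ == '__main__':
        # Two copies of this script started together print roughly 2 seconds apart.
        with file_rate_limit('/tmp/file-rate-limit-demo.lock', min_interval=2):
            print('visited at', time.time())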