searxng/searx/engines/public_domain_image_archive.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""

from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps

from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException

THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):

- https://the-public-domain-review.imgix.net
  /shop/nov-2023-prints-00043.jpg
  ?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
  /collections/the-history-of-four-footed-beasts-and-serpents-1658/
  8616383182_5740fa7851_o.jpg
  ?fit=max&h=360&w=360

Example full image urls (from html):

- https://the-public-domain-review.imgix.net/shop/
  nov-2023-prints-00043.jpg
  ?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
  the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
  ?fit=clip&w=310&h=800&auto=format,compress

The thumbnail url from the request is cleaned to build the full image link.
The cleaned thumbnail url then has THUMBNAIL_SUFFIX appended to it, matching
the parameters of the original thumbnail urls.
"""

# about
about = {
    "website": 'https://pdimagearchive.org',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

pdia_base_url = 'https://pdimagearchive.org'
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"
categories = ['images']
page_size = 20
paging = True

# the Algolia API url is scraped from the site's JS bundle and cached here
__CACHED_API_URL = None


def _clean_url(url):
    # drop the ixid and s query parameters (imgix tracking / signature values)
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
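
# Illustration of how response() below rebuilds the image urls from a result's
# thumbnail url (the file name is a hypothetical example):
#   thumbnail returned by the API:
#     https://the-public-domain-review.imgix.net/shop/example.jpg?fit=max&h=360&w=360
#   img_src (query string stripped):
#     https://the-public-domain-review.imgix.net/shop/example.jpg
#   thumbnail_src (THUMBNAIL_SUFFIX appended again):
#     https://the-public-domain-review.imgix.net/shop/example.jpg?fit=max&h=360&w=360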


def _get_algolia_api_url():
    global __CACHED_API_URL  # pylint:disable=global-statement

    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake search request, only used to locate the JS config bundle that embeds the API url
    resp = get(f"{pdia_base_url}/search/?q=", timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")
    pdia_config_filepart = extr(resp.text, pdia_config_start, pdia_config_end)
    pdia_config_url = pdia_base_url + pdia_config_start + pdia_config_filepart + pdia_config_end

    # fetch the config bundle and extract the Algolia API url from it
    resp = get(pdia_config_url, timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to obtain the Algolia API url for PDImageArchive")
    api_url = extr(resp.text, 'const r="', '"', default=None)
    if api_url is None:
        raise LookupError("Couldn't obtain the Algolia API url for PDImageArchive")

    __CACHED_API_URL = api_url
    return api_url


def _clear_cached_api_url():
    global __CACHED_API_URL  # pylint:disable=global-statement
    __CACHED_API_URL = None


def request(query, params):
    params['url'] = _get_algolia_api_url()
    params['method'] = 'POST'

    request_data = {
        'page': params['pageno'] - 1,
        'query': query,
        'hitsPerPage': page_size,
        'indexName': 'prod_all-images',
    }
    params['headers'] = {'Content-Type': 'application/json'}
    params['data'] = dumps(request_data)

    # http errors are handled manually to be able to reset the api url
    params['raise_for_httperror'] = False
    return params
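
# With the defaults above, the serialized request body for the first page looks
# roughly like this (the query string is a hypothetical example):
#   {"page": 0, "query": "rembrandt", "hitsPerPage": 20, "indexName": "prod_all-images"}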


def response(resp):
    results = []
    json_data = resp.json()

    if resp.status_code == 403:
        # the cached API url may be stale, drop it so the next request scrapes a fresh one
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()

    if resp.status_code != 200:
        raise SearxEngineException()

    if 'results' not in json_data:
        return []

    for result in json_data['results'][0]['hits']:
        content = []
        if result.get("themes"):
            content.append("Themes: " + result['themes'])
        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])

        base_image_url = result['thumbnail'].split("?")[0]
        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )

    return results