mirror of
https://github.com/searxng/searxng.git
synced 2025-11-21 22:13:18 -05:00
The requests changed here all run outside of the network context timeout, thereby preventing the engine's timeout from being applied (the engine's timeout can become longer than it was configured). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
147 lines
4.4 KiB
Python
147 lines
4.4 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Public domain image archive"""
|
|
|
|
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
|
|
from json import dumps
|
|
|
|
from searx.network import get
|
|
from searx.utils import extr
|
|
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException
|
|
|
|
# Query string appended to a bare image URL to request a thumbnail rendition.
THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
  /shop/nov-2023-prints-00043.jpg
  ?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
  /collections/the-history-of-four-footed-beasts-and-serpents-1658/
  8616383182_5740fa7851_o.jpg
  ?fit=max&h=360&w=360

Example full image urls (from html)
- https://the-public-domain-review.imgix.net/shop/
  nov-2023-prints-00043.jpg
  ?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
  the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
  ?fit=clip&w=310&h=800&auto=format,compress

The thumbnail url from the request will be cleaned for the full image link
The cleaned thumbnail url will have THUMBNAIL_SUFFIX added to them, based on the original thumbnail parameters
"""
|
|
|
|
# about: descriptive metadata for this engine (site, API usage, result format)
about = dict(
    website='https://pdimagearchive.org',
    use_official_api=False,
    require_api_key=False,
    results='JSON',
)
|
|
|
|
# Site root; the search page and the script holding the API URL live below it.
pdia_base_url = 'https://pdimagearchive.org'

# Markers surrounding the variable part of the config script's path as it
# appears in the search page's HTML.
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"

# Engine settings: image category, paged results, 20 hits per page.
categories = ['images']
page_size = 20
paging = True

# API URL discovered at runtime; None until the first successful lookup and
# again after the cache has been cleared.
__CACHED_API_URL = None
|
|
|
|
|
|
def _clean_url(url):
|
|
parsed = urlparse(url)
|
|
query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
|
|
|
|
return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
|
|
|
|
|
|
def _get_algolia_api_url():
    """Return the search API URL of PDImageArchive, discovering it if needed.

    The URL is not hard-coded: the search page is fetched to find the path of
    the site's ``InfiniteSearch.*.js`` script, and the API URL is then
    extracted from that script.  A successful lookup is stored in the module
    level cache so later calls return without any network traffic.

    Returns:
        The API URL as a string.

    Raises:
        LookupError: if either page cannot be fetched (status other than 200)
            or one of the expected markers is missing from the fetched text.
    """
    global __CACHED_API_URL  # pylint:disable=global-statement

    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake request to extract api url
    resp = get(f"{pdia_base_url}/search/?q=", timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")

    # Fail right here when the script reference is missing instead of building
    # (and requesting) a bogus config URL.
    pdia_config_filepart = extr(resp.text, pdia_config_start, pdia_config_end, default=None)
    if pdia_config_filepart is None:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")
    pdia_config_url = pdia_base_url + pdia_config_start + pdia_config_filepart + pdia_config_end

    # Same explicit timeout as the first lookup request, so this request
    # cannot wait indefinitely either.
    resp = get(pdia_config_url, timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to obtain AWS api url for PDImageArchive")

    api_url = extr(resp.text, 'const r="', '"', default=None)

    # An empty match is as useless as no match: it must be neither cached nor
    # handed to the caller as a request URL.
    if not api_url:
        raise LookupError("Couldn't obtain AWS api url for PDImageArchive")

    __CACHED_API_URL = api_url
    return api_url
|
|
|
|
|
|
def _clear_cached_api_url():
|
|
global __CACHED_API_URL # pylint:disable=global-statement
|
|
|
|
__CACHED_API_URL = None
|
|
|
|
|
|
def request(query, params):
    """Fill *params* with a JSON POST request against the search API.

    The request body carries the zero-based page number, the query string,
    the page size and the index name.  ``raise_for_httperror`` is switched
    off so that HTTP errors are handled manually, which makes it possible to
    reset the cached API URL.

    Returns the same *params* mapping that was passed in.
    """
    api_url = _get_algolia_api_url()

    payload = dumps(
        {
            'page': params['pageno'] - 1,
            'query': query,
            'hitsPerPage': page_size,
            'indexName': 'prod_all-images',
        }
    )

    params.update(
        {
            'url': api_url,
            'method': 'POST',
            'headers': {'Content-Type': 'application/json'},
            'data': payload,
            # http errors are handled manually to be able to reset the api url
            'raise_for_httperror': False,
        }
    )
    return params
|
|
|
|
|
|
def response(resp):
    """Convert the API response into a list of image results.

    Args:
        resp: the HTTP response of the request built by ``request()``; only
            ``status_code`` and ``json()`` are used.

    Returns:
        A list of dicts for the ``images.html`` template, one per hit of the
        first result set.  The list is empty when the payload has no
        ``results`` entry or that entry is empty.

    Raises:
        SearxEngineAccessDeniedException: on HTTP 403; the cached API URL is
            cleared first so the next request discovers it again.
        SearxEngineException: on any other status different from 200.
    """
    # Look at the status before touching the body: the body of an error
    # response may not be JSON, and a decode error raised here would skip the
    # cache reset below.
    if resp.status_code == 403:
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()

    if resp.status_code != 200:
        raise SearxEngineException()

    json_data = resp.json()

    # Covers both a missing 'results' key and an empty 'results' list, which
    # would otherwise fail on the [0] lookup.
    result_sets = json_data.get('results')
    if not result_sets:
        return []

    results = []
    for result in result_sets[0]['hits']:
        content = []

        if result.get("themes"):
            content.append("Themes: " + result['themes'])

        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])

        # Drop the query string delivered with the thumbnail; the bare URL is
        # the full image and THUMBNAIL_SUFFIX turns it into the thumbnail.
        base_image_url = result['thumbnail'].split("?")[0]

        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )

    return results
|