mirror of
https://github.com/searxng/searxng.git
synced 2025-11-19 04:53:10 -05:00
[feat] sourcehut engine: implement as custom module, fix user agent
SourceHut uses a FOSS bot-protection tool called `go-away` (which I can recommend, BTW). It blocks common crawler user agents, including the standard Firefox user agent. Hence, we now send our custom SearXNG user agent to make clear that we are not a crawler. Closes: https://github.com/searxng/searxng/issues/5270 Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
3763b4bff4
commit
1be19f8b58
8
docs/dev/engines/online/sourcehut.rst
Normal file
8
docs/dev/engines/online/sourcehut.rst
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
.. _sourcehut engine:
|
||||||
|
|
||||||
|
=========
|
||||||
|
Sourcehut
|
||||||
|
=========
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.sourcehut
|
||||||
|
:members:
|
||||||
90
searx/engines/sourcehut.py
Normal file
90
searx/engines/sourcehut.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Engine to search in the collaborative software platform SourceHut_.
|
||||||
|
|
||||||
|
.. _SourceHut: https://sourcehut.org/
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
You can configure the following setting:
|
||||||
|
|
||||||
|
- :py:obj:`sourcehut_sort_order`
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: sourcehut
|
||||||
|
shortcut: srht
|
||||||
|
engine: sourcehut
|
||||||
|
# sourcehut_sort_order: longest-active
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import typing as t
|
||||||
|
|
||||||
|
from urllib.parse import urlencode, urljoin
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
from searx.utils import eval_xpath, eval_xpath_list, extract_text, searxng_useragent
|
||||||
|
from searx.result_types import EngineResults
|
||||||
|
|
||||||
|
if t.TYPE_CHECKING:
|
||||||
|
from searx.extended_types import SXNG_Response
|
||||||
|
from searx.search.processors import OnlineParams
|
||||||
|
|
||||||
|
|
||||||
|
# Engine metadata: upstream project site, Wikidata entity and API docs.  The
# engine does not use the official API (``use_official_api`` is False); the
# declared result format is HTML.
about = {
    "website": "https://sourcehut.org",
    "wikidata_id": "Q78514485",
    "official_api_documentation": "https://man.sr.ht/",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Categories this engine is listed under.
categories = ["it", "repos"]
# request() forwards params["pageno"] as the ``page`` query argument.
paging = True

# Listing page that request() queries; response() also uses it to build the
# result URLs.
base_url: str = "https://sr.ht/projects"
"""Browse public projects."""


# Passed verbatim as the ``sort`` query argument by request(); the value is
# not validated in this module.
sourcehut_sort_order: str = "recently-updated"
"""The sort order of the results. Possible values:

- ``recently-updated``
- ``longest-active``
"""
|
||||||
|
|
||||||
|
|
||||||
|
def request(query: str, params: "OnlineParams") -> None:
    """Prepare the HTTP request for SourceHut's public project listing.

    The search term, the requested page number (``params["pageno"]``) and
    :py:obj:`sourcehut_sort_order` are URL-encoded and appended to
    :py:obj:`base_url`; the result is stored in ``params["url"]``.  The
    ``User-Agent`` header is replaced by the SearXNG user agent.
    """
    query_string = urlencode(
        {
            "search": query,
            "page": params["pageno"],
            "sort": sourcehut_sort_order,
        }
    )
    params["url"] = base_url + "?" + query_string

    # 'go-away', a FOSS bot detection tool, blocks standard user agents, so
    # identify ourselves with the SearXNG user agent instead.
    headers = params["headers"]
    headers["User-Agent"] = searxng_useragent()
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp: "SXNG_Response") -> EngineResults:
    """Parse SourceHut's project listing page into package results.

    Every ``event`` entry of the first ``event-list`` container becomes one
    ``packages.html`` result:

    - ``./h4/a[1]`` is the owner; a leading ``~`` is stripped for the
      ``maintainer`` field.
    - ``./h4/a[2]`` is the project; its text is the ``package_name`` and its
      ``href`` is the result URL.
    - ``./p`` is the description (``content``).
    - The links inside the ``tags`` container become ``tags``, with a leading
      ``#`` stripped.

    :param resp: HTTP response of the listing page; only ``resp.text`` is read.
    :return: The collected results (empty if no entry matched).
    """
    res = EngineResults()
    doc = html.fromstring(resp.text)

    for item in eval_xpath_list(doc, "(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
        # Resolve the link against the listing URL instead of appending it to
        # base_url: a root-relative href ("/~owner/project") must not end up
        # below "/projects", and an absolute href must be kept as is.  An
        # empty href still yields base_url.
        href = extract_text(eval_xpath(item, "./h4/a[2]/@href")) or ""
        res.add(
            res.types.LegacyResult(
                template="packages.html",
                url=urljoin(base_url, href),
                title=extract_text(eval_xpath(item, "./h4")),
                package_name=extract_text(eval_xpath(item, "./h4/a[2]")),
                content=extract_text(eval_xpath(item, "./p")),
                maintainer=(extract_text(eval_xpath(item, "./h4/a[1]")) or "").removeprefix("~"),
                tags=[
                    tag.removeprefix("#") for tag in eval_xpath_list(item, "./div[contains(@class, 'tags')]/a/text()")
                ],
            )
        )
    return res
|
||||||
@ -2677,23 +2677,10 @@ engines:
|
|||||||
|
|
||||||
- name: sourcehut
|
- name: sourcehut
|
||||||
shortcut: srht
|
shortcut: srht
|
||||||
engine: xpath
|
engine: sourcehut
|
||||||
paging: true
|
# https://docs.searxng.org/dev/engines/online/sourcehut.html
|
||||||
search_url: https://sr.ht/projects?page={pageno}&search={query}
|
# sourcehut_sort_order: longest-active
|
||||||
results_xpath: (//div[@class="event-list"])[1]/div[@class="event"]
|
|
||||||
url_xpath: ./h4/a[2]/@href
|
|
||||||
title_xpath: ./h4/a[2]
|
|
||||||
content_xpath: ./p
|
|
||||||
first_page_num: 1
|
|
||||||
categories: [it, repos]
|
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
|
||||||
website: https://sr.ht
|
|
||||||
wikidata_id: Q78514485
|
|
||||||
official_api_documentation: https://man.sr.ht/
|
|
||||||
use_official_api: false
|
|
||||||
require_api_key: false
|
|
||||||
results: HTML
|
|
||||||
|
|
||||||
- name: goo
|
- name: goo
|
||||||
shortcut: goo
|
shortcut: goo
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user