mirror of
https://github.com/searxng/searxng.git
synced 2025-11-19 04:53:10 -05:00
[feat] sourcehut engine: implement as custom module, fix user agent
SourceHut uses a FOSS bot protection tool called `go-away` (which I can recommend, by the way). It blocks common crawler user agents, such as the standard Firefox user agent. Hence, we're now using our custom SearXNG user agent to clarify we're not a crawler. Closes: https://github.com/searxng/searxng/issues/5270 Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
3763b4bff4
commit
1be19f8b58
8
docs/dev/engines/online/sourcehut.rst
Normal file
8
docs/dev/engines/online/sourcehut.rst
Normal file
@ -0,0 +1,8 @@
|
||||
.. _sourcehut engine:
|
||||
|
||||
=========
|
||||
Sourcehut
|
||||
=========
|
||||
|
||||
.. automodule:: searx.engines.sourcehut
|
||||
:members:
|
||||
90
searx/engines/sourcehut.py
Normal file
90
searx/engines/sourcehut.py
Normal file
@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Engine to search in the collaborative software platform SourceHut_.
|
||||
|
||||
.. _SourceHut: https://sourcehut.org/
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
You can configure the following setting:
|
||||
|
||||
- :py:obj:`sourcehut_sort_order`
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: sourcehut
|
||||
shortcut: srht
|
||||
engine: sourcehut
|
||||
# sourcehut_sort_order: longest-active
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
import typing as t
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import eval_xpath, eval_xpath_list, extract_text, searxng_useragent
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from searx.extended_types import SXNG_Response
|
||||
from searx.search.processors import OnlineParams
|
||||
|
||||
|
||||
about = {
    "website": "https://sourcehut.org",
    "wikidata_id": "Q78514485",
    "official_api_documentation": "https://man.sr.ht/",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

categories = ["it", "repos"]
paging = True

base_url: str = "https://sr.ht/projects"
"""URL of the public project listing that is queried by this engine."""

sourcehut_sort_order: str = "recently-updated"
"""Value sent as the ``sort`` URL argument.  Possible values:

- ``recently-updated``
- ``longest-active``
"""
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams") -> None:
    """Prepare the HTTP request for a SourceHut project search.

    Writes the request URL into ``params["url"]``; it consists of
    :py:obj:`base_url` followed by the URL-encoded arguments ``search``
    (the *query*), ``page`` (``params["pageno"]``) and ``sort``
    (:py:obj:`sourcehut_sort_order`).  The ``User-Agent`` request header is
    replaced by the value returned from ``searxng_useragent()``.
    """
    query_string = urlencode(
        {
            "search": query,
            "page": params["pageno"],
            "sort": sourcehut_sort_order,
        }
    )
    params["url"] = base_url + "?" + query_string

    # standard user agents are blocked by 'go-away', a foss bot detection tool
    params["headers"]["User-Agent"] = searxng_useragent()
|
||||
|
||||
|
||||
def response(resp: "SXNG_Response") -> EngineResults:
    """Parse the SourceHut project listing in ``resp.text``.

    Every ``event`` element inside the first ``event-list`` container is
    turned into one ``packages.html`` result.  The two links in the ``<h4>``
    heading supply the maintainer (first link, leading ``~`` stripped) and the
    package name plus target URL (second link); the ``<p>`` element supplies
    the content and the links in the ``tags`` container supply the tags
    (leading ``#`` stripped).

    :param resp: HTTP response of the request prepared by :py:obj:`request`.
    :return: the collected results.
    """
    # Function-scope import: only ``urlencode`` is imported at module level.
    from urllib.parse import urljoin

    res = EngineResults()
    doc = html.fromstring(resp.text)

    for item in eval_xpath_list(doc, "(//div[@class='event-list'])[1]/div[contains(@class, 'event')]"):
        href = extract_text(eval_xpath(item, "./h4/a[2]/@href")) or ""
        res.add(
            res.types.LegacyResult(
                template="packages.html",
                # Resolve the href against the listing URL instead of
                # appending it: a root-relative href ("/~owner/name") must
                # replace the "/projects" path of base_url, not extend it.
                # NOTE(review): assumes the listing emits root-relative or
                # absolute hrefs -- confirm against the live page.
                url=urljoin(base_url, href),
                title=extract_text(eval_xpath(item, "./h4")),
                package_name=extract_text(eval_xpath(item, "./h4/a[2]")),
                content=extract_text(eval_xpath(item, "./p")),
                maintainer=(extract_text(eval_xpath(item, "./h4/a[1]")) or "").removeprefix("~"),
                tags=[
                    tag.removeprefix("#") for tag in eval_xpath_list(item, "./div[contains(@class, 'tags')]/a/text()")
                ],
            )
        )
    return res
|
||||
@ -2677,23 +2677,10 @@ engines:
|
||||
|
||||
- name: sourcehut
|
||||
shortcut: srht
|
||||
engine: xpath
|
||||
paging: true
|
||||
search_url: https://sr.ht/projects?page={pageno}&search={query}
|
||||
results_xpath: (//div[@class="event-list"])[1]/div[@class="event"]
|
||||
url_xpath: ./h4/a[2]/@href
|
||||
title_xpath: ./h4/a[2]
|
||||
content_xpath: ./p
|
||||
first_page_num: 1
|
||||
categories: [it, repos]
|
||||
engine: sourcehut
|
||||
# https://docs.searxng.org/dev/engines/online/sourcehut.html
|
||||
# sourcehut_sort_order: longest-active
|
||||
disabled: true
|
||||
about:
|
||||
website: https://sr.ht
|
||||
wikidata_id: Q78514485
|
||||
official_api_documentation: https://man.sr.ht/
|
||||
use_official_api: false
|
||||
require_api_key: false
|
||||
results: HTML
|
||||
|
||||
- name: goo
|
||||
shortcut: goo
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user