[mod] Z-Library engine: revision of the engine (Paper result)

Revise the engine: use the result type Paper and add further type
annotations.

The engine has been set to inactive because no working service is currently
available, or at least none is known in the SearXNG community [1].

[1] https://github.com/searxng/searxng/issues/3610

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2025-09-10 17:17:58 +02:00 committed by Markus Heiser
parent 4c42704c80
commit f8f7adce6b
2 changed files with 45 additions and 38 deletions

View File

@ -39,15 +39,16 @@ from urllib.parse import quote
from lxml import html from lxml import html
from flask_babel import gettext # pyright: ignore[reportUnknownVariableType] from flask_babel import gettext # pyright: ignore[reportUnknownVariableType]
from searx.utils import extract_text, eval_xpath, eval_xpath_list from searx.utils import extract_text, eval_xpath, eval_xpath_list, ElementType
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS from searx.data import ENGINE_TRAITS
from searx.exceptions import SearxException from searx.exceptions import SearxException
from searx.result_types import EngineResults
if t.TYPE_CHECKING: if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# about
about: dict[str, t.Any] = { about: dict[str, t.Any] = {
"website": "https://zlibrary-global.se", "website": "https://zlibrary-global.se",
"wikidata_id": "Q104863992", "wikidata_id": "Q104863992",
@ -57,7 +58,7 @@ about: dict[str, t.Any] = {
"results": "HTML", "results": "HTML",
} }
categories: list[str] = ["files"] categories: list[str] = ["files", "books"]
paging: bool = True paging: bool = True
base_url: str = "https://zlibrary-global.se" base_url: str = "https://zlibrary-global.se"
@ -74,8 +75,12 @@ zlib_ext: str = ""
``PDF`` and ``EPUB``. ``PDF`` and ``EPUB``.
""" """
i18n_language = gettext("Language")
i18n_book_rating = gettext("Book rating")
i18n_file_quality = gettext("File quality")
def init(engine_settings: dict[str, t.Any] | None = None) -> None: # pylint: disable=unused-argument
def setup(engine_settings: dict[str, t.Any]) -> bool: # pylint: disable=unused-argument
"""Check of engine's settings.""" """Check of engine's settings."""
traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"]) traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])
@ -85,10 +90,11 @@ def init(engine_settings: dict[str, t.Any] | None = None) -> None: # pylint: di
raise ValueError(f"invalid setting year_from: {zlib_year_from}") raise ValueError(f"invalid setting year_from: {zlib_year_from}")
if zlib_year_to and zlib_year_to not in traits.custom["year_to"]: if zlib_year_to and zlib_year_to not in traits.custom["year_to"]:
raise ValueError(f"invalid setting year_to: {zlib_year_to}") raise ValueError(f"invalid setting year_to: {zlib_year_to}")
return True
def request(query: str, params: dict[str, t.Any]) -> dict[str, t.Any]: def request(query: str, params: "OnlineParams") -> None:
lang: str = traits.get_language(params["language"], traits.all_locale) # type: ignore lang: str | None = traits.get_language(params["searxng_locale"], traits.all_locale)
search_url: str = ( search_url: str = (
base_url base_url
+ "/s/{search_query}/?page={pageno}" + "/s/{search_query}/?page={pageno}"
@ -106,41 +112,35 @@ def request(query: str, params: dict[str, t.Any]) -> dict[str, t.Any]:
zlib_ext=zlib_ext, zlib_ext=zlib_ext,
) )
params["verify"] = False params["verify"] = False
return params
def domain_is_seized(dom): def response(resp: "SXNG_Response") -> EngineResults:
return bool(dom.xpath('//title') and "seized" in dom.xpath('//title')[0].text.lower()) res = EngineResults()
def response(resp: "SXNG_Response") -> list[dict[str, t.Any]]:
results: list[dict[str, t.Any]] = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
if domain_is_seized(dom): if domain_is_seized(dom):
raise SearxException(f"zlibrary domain is seized: {base_url}") raise SearxException(f"zlibrary domain is seized: {base_url}")
for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'): for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
results.append(_parse_result(item)) kwargs = _parse_result(item)
res.add(res.types.Paper(**kwargs))
return results return res
def _text(item, selector: str) -> str | None: def domain_is_seized(dom: ElementType):
return bool(dom.xpath('//title') and "seized" in dom.xpath('//title')[0].text.lower())
def _text(item: ElementType, selector: str) -> str | None:
return extract_text(eval_xpath(item, selector)) return extract_text(eval_xpath(item, selector))
i18n_language = gettext("Language") def _parse_result(item: ElementType) -> dict[str, t.Any]:
i18n_book_rating = gettext("Book rating")
i18n_file_quality = gettext("File quality")
def _parse_result(item) -> dict[str, t.Any]:
author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]') author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]')
result = { result = {
"template": "paper.html",
"url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0], "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0],
"title": _text(item, './/*[@itemprop="name"]'), "title": _text(item, './/*[@itemprop="name"]'),
"authors": [extract_text(author) for author in author_elements], "authors": [extract_text(author) for author in author_elements],
@ -148,15 +148,15 @@ def _parse_result(item) -> dict[str, t.Any]:
"type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'), "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'),
} }
thumbnail: str = _text(item, './/img[contains(@class, "cover")]/@data-src') thumbnail = _text(item, './/img[contains(@class, "cover")]/@data-src')
if not thumbnail.startswith('/'): if thumbnail and not thumbnail.startswith('/'):
result["thumbnail"] = thumbnail result["thumbnail"] = thumbnail
year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]') year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]')
if year: if year:
result["publishedDate"] = datetime.strptime(year, '%Y') result["publishedDate"] = datetime.strptime(year, '%Y')
content = [] content: list[str] = []
language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]') language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]')
if language: if language:
content.append(f"{i18n_language}: {language.capitalize()}") content.append(f"{i18n_language}: {language.capitalize()}")
@ -173,9 +173,10 @@ def _parse_result(item) -> dict[str, t.Any]:
def fetch_traits(engine_traits: EngineTraits) -> None: def fetch_traits(engine_traits: EngineTraits) -> None:
"""Fetch languages and other search arguments from zlibrary's search form.""" """Fetch languages and other search arguments from zlibrary's search form."""
# pylint: disable=import-outside-toplevel, too-many-branches # pylint: disable=import-outside-toplevel, too-many-branches, too-many-statements
import babel import babel
import babel.core
import httpx import httpx
from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.network import get # see https://github.com/searxng/searxng/issues/762
@ -197,7 +198,7 @@ def fetch_traits(engine_traits: EngineTraits) -> None:
if not resp.ok: if not resp.ok:
raise RuntimeError("Response from zlibrary's search page is not OK.") raise RuntimeError("Response from zlibrary's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore dom = html.fromstring(resp.text)
if domain_is_seized(dom): if domain_is_seized(dom):
print(f"ERROR: zlibrary domain is seized: {base_url}") print(f"ERROR: zlibrary domain is seized: {base_url}")
@ -206,25 +207,30 @@ def fetch_traits(engine_traits: EngineTraits) -> None:
engine_traits.all_locale = "" engine_traits.all_locale = ""
engine_traits.custom["ext"] = [] engine_traits.custom["ext"] = []
engine_traits.custom["year_from"] = []
engine_traits.custom["year_to"] = []
l: list[str]
# years_from
l = []
for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"): for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"):
engine_traits.custom["year_from"].append(year.get("value")) l.append(year.get("value") or "")
engine_traits.custom["year_from"] = l
# years_to
l = []
for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"): for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"):
engine_traits.custom["year_to"].append(year.get("value")) l.append(year.get("value") or "")
engine_traits.custom["year_to"] = l
# ext (file extensions)
l = []
for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"): for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"):
value: str | None = ext.get("value") l.append(ext.get("value") or "")
if value is None: engine_traits.custom["ext"] = l
value = ""
engine_traits.custom["ext"].append(value)
# Handle languages # Handle languages
# Z-library uses English names for languages, so we need to map them to their respective locales # Z-library uses English names for languages, so we need to map them to their respective locales
language_name_locale_map: dict[str, babel.Locale] = {} language_name_locale_map: dict[str, babel.Locale] = {}
for locale in babel.core.localedata.locale_identifiers(): # type: ignore for locale in babel.core.localedata.locale_identifiers():
# Create a Locale object for the current locale # Create a Locale object for the current locale
loc = babel.Locale.parse(locale) loc = babel.Locale.parse(locale)
if loc.english_name is None: if loc.english_name is None:

View File

@ -1254,9 +1254,10 @@ engines:
- name: z-library - name: z-library
engine: zlibrary engine: zlibrary
shortcut: zlib shortcut: zlib
categories: files
timeout: 7.0 timeout: 7.0
disabled: true disabled: true
# https://github.com/searxng/searxng/issues/3610
inactive: true
- name: library of congress - name: library of congress
engine: loc engine: loc