mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	Merge pull request #91 from return42/xpath-misc
[doc] add documentation about the XPath engine
This commit is contained in:
		
						commit
						703f8c4a8b
					
				| @ -100,6 +100,8 @@ example code | ||||
|    paging = True | ||||
| 
 | ||||
| 
 | ||||
| .. _engine request: | ||||
| 
 | ||||
| making a request | ||||
| ================ | ||||
| 
 | ||||
| @ -198,6 +200,8 @@ example code | ||||
|        return params | ||||
| 
 | ||||
| 
 | ||||
| .. _engine results: | ||||
| 
 | ||||
| returned results | ||||
| ================ | ||||
| 
 | ||||
|  | ||||
| @ -9,6 +9,7 @@ Developer documentation | ||||
|    quickstart | ||||
|    contribution_guide | ||||
|    engine_overview | ||||
|    xpath_engine | ||||
|    search_api | ||||
|    plugins | ||||
|    translation | ||||
|  | ||||
							
								
								
									
										9
									
								
								docs/dev/xpath_engine.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								docs/dev/xpath_engine.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | ||||
| .. _xpath_engine: | ||||
| 
 | ||||
| ================ | ||||
| The XPath engine | ||||
| ================ | ||||
| 
 | ||||
| .. automodule:: searx.engines.xpath | ||||
|   :members: | ||||
| 
 | ||||
| @ -4,7 +4,8 @@ Welcome to searxng | ||||
| 
 | ||||
|     *Search without being tracked.* | ||||
| 
 | ||||
| .. warning:: | ||||
| .. hint:: | ||||
| 
 | ||||
|    This is not searx, but searxng. | ||||
| 
 | ||||
| Searxng is a free internet metasearch engine which aggregates results from more | ||||
|  | ||||
| @ -1,51 +1,106 @@ | ||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| # lint: pylint | ||||
| # pylint: disable=missing-function-docstring | ||||
| """The XPath engine is a *generic* engine with which it is possible to configure | ||||
| engines in the settings. | ||||
| 
 | ||||
| Here is a simple example of a XPath engine configured in the | ||||
| :ref:`settings engine` section, further read :ref:`engines-dev`. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|   - name : bitbucket | ||||
|     engine : xpath | ||||
|     paging : True | ||||
|     search_url : https://bitbucket.org/repo/all/{pageno}?name={query} | ||||
|     url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href | ||||
|     title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"] | ||||
|     content_xpath : //article[@class="repo-summary"]/p | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| from urllib.parse import urlencode | ||||
| 
 | ||||
| from lxml import html | ||||
| from urllib.parse import urlencode | ||||
| from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list | ||||
| from searx import logger | ||||
| 
 | ||||
| logger = logger.getChild('XPath engine') | ||||
| 
 | ||||
| search_url = None | ||||
| url_xpath = None | ||||
| content_xpath = None | ||||
| title_xpath = None | ||||
| thumbnail_xpath = False | ||||
| paging = False | ||||
| suggestion_xpath = '' | ||||
| """ | ||||
| Search URL of the engine, replacements are: | ||||
| 
 | ||||
| ``{query}``: | ||||
|   Search terms from user. | ||||
| 
 | ||||
| ``{pageno}``: | ||||
|   Page number if engine supports paging :py:obj:`paging` | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| soft_max_redirects = 0 | ||||
| '''Maximum redirects, soft limit. Record an error but don't stop the engine''' | ||||
| 
 | ||||
| results_xpath = '' | ||||
| '''XPath selector for the list of result items''' | ||||
| 
 | ||||
| url_xpath = None | ||||
| '''XPath selector of result's ``url``.''' | ||||
| 
 | ||||
| content_xpath = None | ||||
| '''XPath selector of result's ``content``.''' | ||||
| 
 | ||||
| title_xpath = None | ||||
| '''XPath selector of result's ``title``.''' | ||||
| 
 | ||||
| thumbnail_xpath = False | ||||
| '''XPath selector of result's ``img_src``.''' | ||||
| 
 | ||||
| suggestion_xpath = '' | ||||
| '''XPath selector of result's ``suggestion``.''' | ||||
| 
 | ||||
| cached_xpath = '' | ||||
| cached_url = '' | ||||
| soft_max_redirects = 0 | ||||
| 
 | ||||
| # parameters for engines with paging support | ||||
| # | ||||
| # number of results on each page | ||||
| # (only needed if the site requires not a page number, but an offset) | ||||
| paging = False | ||||
| '''Engine supports paging [True or False].''' | ||||
| 
 | ||||
| page_size = 1 | ||||
| # number of the first page (usually 0 or 1) | ||||
| first_page_num = 1 | ||||
| '''Number of results on each page.  Only needed if the site requires not a page | ||||
| number, but an offset.''' | ||||
| 
 | ||||
| first_page_num = 1 | ||||
| '''Number of the first page (usually 0 or 1).''' | ||||
| 
 | ||||
| def request(query, params): | ||||
|     '''Build request parameters (see :ref:`engine request`). | ||||
| 
 | ||||
|     ''' | ||||
|     query = urlencode({'q': query})[2:] | ||||
| 
 | ||||
|     fp = {'query': query} | ||||
|     fargs = {'query': query} | ||||
|     if paging and search_url.find('{pageno}') >= 0: | ||||
|         fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num | ||||
|         fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num | ||||
| 
 | ||||
|     params['url'] = search_url.format(**fp) | ||||
|     params['url'] = search_url.format(**fargs) | ||||
|     params['query'] = query | ||||
|     params['soft_max_redirects'] = soft_max_redirects | ||||
|     logger.debug("query_url --> %s", params['url']) | ||||
| 
 | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def response(resp): | ||||
|     '''Scrape *results* from the response (see :ref:`engine results`). | ||||
| 
 | ||||
|     ''' | ||||
|     results = [] | ||||
|     dom = html.fromstring(resp.text) | ||||
|     is_onion = True if 'onions' in categories else False  # pylint: disable=undefined-variable | ||||
|     is_onion = 'onions' in categories  # pylint: disable=undefined-variable | ||||
| 
 | ||||
|     if results_xpath: | ||||
|         for result in eval_xpath_list(dom, results_xpath): | ||||
| 
 | ||||
|             url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) | ||||
|             title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) | ||||
|             content = extract_text(eval_xpath_list(result, content_xpath, min_len=1)) | ||||
| @ -59,13 +114,16 @@ def response(resp): | ||||
| 
 | ||||
|             # add alternative cached url if available | ||||
|             if cached_xpath: | ||||
|                 tmp_result['cached_url'] = cached_url\ | ||||
|                 tmp_result['cached_url'] = ( | ||||
|                     cached_url | ||||
|                     + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) | ||||
|                 ) | ||||
| 
 | ||||
|             if is_onion: | ||||
|                 tmp_result['is_onion'] = True | ||||
| 
 | ||||
|             results.append(tmp_result) | ||||
| 
 | ||||
|     else: | ||||
|         if cached_xpath: | ||||
|             for url, title, content, cached in zip( | ||||
| @ -75,8 +133,12 @@ def response(resp): | ||||
|                 map(extract_text, eval_xpath_list(dom, content_xpath)), | ||||
|                 map(extract_text, eval_xpath_list(dom, cached_xpath)) | ||||
|             ): | ||||
|                 results.append({'url': url, 'title': title, 'content': content, | ||||
|                                 'cached_url': cached_url + cached, 'is_onion': is_onion}) | ||||
|                 results.append({ | ||||
|                     'url': url, | ||||
|                     'title': title, | ||||
|                     'content': content, | ||||
|                     'cached_url': cached_url + cached, 'is_onion': is_onion | ||||
|                 }) | ||||
|         else: | ||||
|             for url, title, content in zip( | ||||
|                 (extract_url(x, search_url) for | ||||
| @ -84,10 +146,16 @@ def response(resp): | ||||
|                 map(extract_text, eval_xpath_list(dom, title_xpath)), | ||||
|                 map(extract_text, eval_xpath_list(dom, content_xpath)) | ||||
|             ): | ||||
|                 results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) | ||||
|                 results.append({ | ||||
|                     'url': url, | ||||
|                     'title': title, | ||||
|                     'content': content, | ||||
|                     'is_onion': is_onion | ||||
|                 }) | ||||
| 
 | ||||
|     if not suggestion_xpath: | ||||
|         return results | ||||
|     if suggestion_xpath: | ||||
|         for suggestion in eval_xpath(dom, suggestion_xpath): | ||||
|             results.append({'suggestion': extract_text(suggestion)}) | ||||
| 
 | ||||
|     logger.debug("found %s results", len(results)) | ||||
|     return results | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user