mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	Merge pull request #91 from return42/xpath-misc
[doc] add documentation about the XPath engine
This commit is contained in:
		
						commit
						703f8c4a8b
					
				| @ -43,7 +43,7 @@ argument                type        information | |||||||
| categories              list        pages, in which the engine is working | categories              list        pages, in which the engine is working | ||||||
| paging                  boolean     support multiple pages | paging                  boolean     support multiple pages | ||||||
| time_range_support      boolean     support search time range | time_range_support      boolean     support search time range | ||||||
| engine_type             str         ``online`` by default, other possible values are  | engine_type             str         ``online`` by default, other possible values are | ||||||
|                                     ``offline``, ``online_dictionnary``, ``online_currency`` |                                     ``offline``, ``online_dictionnary``, ``online_currency`` | ||||||
| ======================= =========== ======================================================== | ======================= =========== ======================================================== | ||||||
| 
 | 
 | ||||||
| @ -100,6 +100,8 @@ example code | |||||||
|    paging = True |    paging = True | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | .. _engine request: | ||||||
|  | 
 | ||||||
| making a request | making a request | ||||||
| ================ | ================ | ||||||
| 
 | 
 | ||||||
| @ -198,6 +200,8 @@ example code | |||||||
|        return params |        return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | .. _engine results: | ||||||
|  | 
 | ||||||
| returned results | returned results | ||||||
| ================ | ================ | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -9,6 +9,7 @@ Developer documentation | |||||||
|    quickstart |    quickstart | ||||||
|    contribution_guide |    contribution_guide | ||||||
|    engine_overview |    engine_overview | ||||||
|  |    xpath_engine | ||||||
|    search_api |    search_api | ||||||
|    plugins |    plugins | ||||||
|    translation |    translation | ||||||
|  | |||||||
							
								
								
									
										9
									
								
								docs/dev/xpath_engine.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								docs/dev/xpath_engine.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | |||||||
|  | .. _xpath_engine: | ||||||
|  | 
 | ||||||
|  | ================ | ||||||
|  | The XPath engine | ||||||
|  | ================ | ||||||
|  | 
 | ||||||
|  | .. automodule:: searx.engines.xpath | ||||||
|  |   :members: | ||||||
|  | 
 | ||||||
| @ -4,7 +4,8 @@ Welcome to searxng | |||||||
| 
 | 
 | ||||||
|     *Search without being tracked.* |     *Search without being tracked.* | ||||||
| 
 | 
 | ||||||
| .. warning:: | .. hint:: | ||||||
|  | 
 | ||||||
|    This is not searx, but searxng. |    This is not searx, but searxng. | ||||||
| 
 | 
 | ||||||
| Searxng is a free internet metasearch engine which aggregates results from more | Searxng is a free internet metasearch engine which aggregates results from more | ||||||
|  | |||||||
| @ -1,51 +1,106 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
|  | # pylint: disable=missing-function-docstring | ||||||
|  | """The XPath engine is a *generic* engine with which it is possible to configure | ||||||
|  | engines in the settings. | ||||||
|  | 
 | ||||||
|  | Here is a simple example of an XPath engine configured in the | ||||||
|  | :ref:`settings engine` section, further read :ref:`engines-dev`. | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |   - name : bitbucket | ||||||
|  |     engine : xpath | ||||||
|  |     paging : True | ||||||
|  |     search_url : https://bitbucket.org/repo/all/{pageno}?name={query} | ||||||
|  |     url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href | ||||||
|  |     title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"] | ||||||
|  |     content_xpath : //article[@class="repo-summary"]/p | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from urllib.parse import urlencode | ||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| from urllib.parse import urlencode |  | ||||||
| from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list | from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list | ||||||
|  | from searx import logger | ||||||
|  | 
 | ||||||
|  | logger = logger.getChild('XPath engine') | ||||||
| 
 | 
 | ||||||
| search_url = None | search_url = None | ||||||
| url_xpath = None | """ | ||||||
| content_xpath = None | Search URL of the engine, replacements are: | ||||||
| title_xpath = None | 
 | ||||||
| thumbnail_xpath = False | ``{query}``: | ||||||
| paging = False |   Search terms from user. | ||||||
| suggestion_xpath = '' | 
 | ||||||
|  | ``{pageno}``: | ||||||
|  |   Page number if engine supports paging :py:obj:`paging` | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | soft_max_redirects = 0 | ||||||
|  | '''Maximum redirects, soft limit. Record an error but don't stop the engine''' | ||||||
|  | 
 | ||||||
| results_xpath = '' | results_xpath = '' | ||||||
|  | '''XPath selector for the list of result items''' | ||||||
|  | 
 | ||||||
|  | url_xpath = None | ||||||
|  | '''XPath selector of result's ``url``.''' | ||||||
|  | 
 | ||||||
|  | content_xpath = None | ||||||
|  | '''XPath selector of result's ``content``.''' | ||||||
|  | 
 | ||||||
|  | title_xpath = None | ||||||
|  | '''XPath selector of result's ``title``.''' | ||||||
|  | 
 | ||||||
|  | thumbnail_xpath = False | ||||||
|  | '''XPath selector of result's ``img_src``.''' | ||||||
|  | 
 | ||||||
|  | suggestion_xpath = '' | ||||||
|  | '''XPath selector of result's ``suggestion``.''' | ||||||
|  | 
 | ||||||
| cached_xpath = '' | cached_xpath = '' | ||||||
| cached_url = '' | cached_url = '' | ||||||
| soft_max_redirects = 0 |  | ||||||
| 
 | 
 | ||||||
| # parameters for engines with paging support | paging = False | ||||||
| # | '''Engine supports paging [True or False].''' | ||||||
| # number of results on each page | 
 | ||||||
| # (only needed if the site requires not a page number, but an offset) |  | ||||||
| page_size = 1 | page_size = 1 | ||||||
| # number of the first page (usually 0 or 1) | '''Number of results on each page.  Only needed if the site requires an offset | ||||||
| first_page_num = 1 | rather than a page number.''' | ||||||
| 
 | 
 | ||||||
|  | first_page_num = 1 | ||||||
|  | '''Number of the first page (usually 0 or 1).''' | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|  |     '''Build request parameters (see :ref:`engine request`). | ||||||
|  | 
 | ||||||
|  |     ''' | ||||||
|     query = urlencode({'q': query})[2:] |     query = urlencode({'q': query})[2:] | ||||||
| 
 | 
 | ||||||
|     fp = {'query': query} |     fargs = {'query': query} | ||||||
|     if paging and search_url.find('{pageno}') >= 0: |     if paging and search_url.find('{pageno}') >= 0: | ||||||
|         fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num |         fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num | ||||||
| 
 | 
 | ||||||
|     params['url'] = search_url.format(**fp) |     params['url'] = search_url.format(**fargs) | ||||||
|     params['query'] = query |     params['query'] = query | ||||||
|     params['soft_max_redirects'] = soft_max_redirects |     params['soft_max_redirects'] = soft_max_redirects | ||||||
|  |     logger.debug("query_url --> %s", params['url']) | ||||||
| 
 | 
 | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def response(resp): | def response(resp): | ||||||
|  |     '''Scrape *results* from the response (see :ref:`engine results`). | ||||||
|  | 
 | ||||||
|  |     ''' | ||||||
|     results = [] |     results = [] | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|     is_onion = True if 'onions' in categories else False  # pylint: disable=undefined-variable |     is_onion = 'onions' in categories  # pylint: disable=undefined-variable | ||||||
| 
 | 
 | ||||||
|     if results_xpath: |     if results_xpath: | ||||||
|         for result in eval_xpath_list(dom, results_xpath): |         for result in eval_xpath_list(dom, results_xpath): | ||||||
|  | 
 | ||||||
|             url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) |             url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) | ||||||
|             title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) |             title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) | ||||||
|             content = extract_text(eval_xpath_list(result, content_xpath, min_len=1)) |             content = extract_text(eval_xpath_list(result, content_xpath, min_len=1)) | ||||||
| @ -59,13 +114,16 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|             # add alternative cached url if available |             # add alternative cached url if available | ||||||
|             if cached_xpath: |             if cached_xpath: | ||||||
|                 tmp_result['cached_url'] = cached_url\ |                 tmp_result['cached_url'] = ( | ||||||
|  |                     cached_url | ||||||
|                     + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) |                     + extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) | ||||||
|  |                 ) | ||||||
| 
 | 
 | ||||||
|             if is_onion: |             if is_onion: | ||||||
|                 tmp_result['is_onion'] = True |                 tmp_result['is_onion'] = True | ||||||
| 
 | 
 | ||||||
|             results.append(tmp_result) |             results.append(tmp_result) | ||||||
|  | 
 | ||||||
|     else: |     else: | ||||||
|         if cached_xpath: |         if cached_xpath: | ||||||
|             for url, title, content, cached in zip( |             for url, title, content, cached in zip( | ||||||
| @ -75,8 +133,12 @@ def response(resp): | |||||||
|                 map(extract_text, eval_xpath_list(dom, content_xpath)), |                 map(extract_text, eval_xpath_list(dom, content_xpath)), | ||||||
|                 map(extract_text, eval_xpath_list(dom, cached_xpath)) |                 map(extract_text, eval_xpath_list(dom, cached_xpath)) | ||||||
|             ): |             ): | ||||||
|                 results.append({'url': url, 'title': title, 'content': content, |                 results.append({ | ||||||
|                                 'cached_url': cached_url + cached, 'is_onion': is_onion}) |                     'url': url, | ||||||
|  |                     'title': title, | ||||||
|  |                     'content': content, | ||||||
|  |                     'cached_url': cached_url + cached, 'is_onion': is_onion | ||||||
|  |                 }) | ||||||
|         else: |         else: | ||||||
|             for url, title, content in zip( |             for url, title, content in zip( | ||||||
|                 (extract_url(x, search_url) for |                 (extract_url(x, search_url) for | ||||||
| @ -84,10 +146,16 @@ def response(resp): | |||||||
|                 map(extract_text, eval_xpath_list(dom, title_xpath)), |                 map(extract_text, eval_xpath_list(dom, title_xpath)), | ||||||
|                 map(extract_text, eval_xpath_list(dom, content_xpath)) |                 map(extract_text, eval_xpath_list(dom, content_xpath)) | ||||||
|             ): |             ): | ||||||
|                 results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) |                 results.append({ | ||||||
|  |                     'url': url, | ||||||
|  |                     'title': title, | ||||||
|  |                     'content': content, | ||||||
|  |                     'is_onion': is_onion | ||||||
|  |                 }) | ||||||
| 
 | 
 | ||||||
|     if not suggestion_xpath: |     if suggestion_xpath: | ||||||
|         return results |         for suggestion in eval_xpath(dom, suggestion_xpath): | ||||||
|     for suggestion in eval_xpath(dom, suggestion_xpath): |             results.append({'suggestion': extract_text(suggestion)}) | ||||||
|         results.append({'suggestion': extract_text(suggestion)}) | 
 | ||||||
|  |     logger.debug("found %s results", len(results)) | ||||||
|     return results |     return results | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user