mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-30 18:22:31 -04:00 
			
		
		
		
	Merge pull request #1969 from ArtikusHG/master
Add language autodetect plugin
This commit is contained in:
		
						commit
						966e9c3c5d
					
				| @ -36,6 +36,7 @@ RUN apk add --no-cache -t build-dependencies \ | ||||
|     su-exec \ | ||||
|     python3 \ | ||||
|     py3-pip \ | ||||
|     py3-numpy \ | ||||
|     libxml2 \ | ||||
|     libxslt \ | ||||
|     openssl \ | ||||
| @ -43,6 +44,8 @@ RUN apk add --no-cache -t build-dependencies \ | ||||
|     uwsgi \ | ||||
|     uwsgi-python3 \ | ||||
|     brotli \ | ||||
|  && pip3 install --no-cache setuptools wheel \ | ||||
|  && sed -i s/fasttext-wheel/fasttext/ requirements.txt \ | ||||
|  && pip3 install --no-cache -r requirements.txt \ | ||||
|  && apk del build-dependencies \ | ||||
|  && rm -rf /root/.cache | ||||
|  | ||||
							
								
								
									
										8
									
								
								docs/src/searx.plugins.autodetect_search_language.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/src/searx.plugins.autodetect_search_language.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| .. _autodetect search language: | ||||
| 
 | ||||
| ====================== | ||||
| Search language plugin | ||||
| ====================== | ||||
| 
 | ||||
| .. automodule:: searx.plugins.autodetect_search_language | ||||
|   :members: | ||||
| @ -16,3 +16,4 @@ setproctitle==1.3.2 | ||||
| redis==4.4.0 | ||||
| markdown-it-py==2.1.0 | ||||
| typing_extensions==4.4.0 | ||||
| fasttext-wheel==0.9.2 | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								searx/data/lid.176.ftz
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								searx/data/lid.176.ftz
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										115
									
								
								searx/plugins/autodetect_search_language.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								searx/plugins/autodetect_search_language.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,115 @@ | ||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| # lint: pylint | ||||
| """Plugin to detect the search language from the search query. | ||||
| 
 | ||||
| The language detection is done by using the fastText_ library (`python | ||||
| fasttext`_). fastText_ distributes the `language identification model`_, for | ||||
| reference: | ||||
| 
 | ||||
| - `FastText.zip: Compressing text classification models`_ | ||||
| - `Bag of Tricks for Efficient Text Classification`_ | ||||
| 
 | ||||
| The `language identification model`_ supports the language codes (ISO-639-3):: | ||||
| 
 | ||||
|    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr | ||||
|    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa | ||||
|    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io | ||||
|    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv | ||||
|    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn | ||||
|    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd | ||||
|    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep | ||||
|    vi vls vo wa war wuu xal xmf yi yo yue zh | ||||
| 
 | ||||
| The `language identification model`_ is harmonized with SearXNG's language | ||||
| (locale) model.  General conditions of SearXNG's locale model are: | ||||
| 
 | ||||
| a. SearXNG's locale of a query is passed to the | ||||
|    :py:obj:`searx.locales.get_engine_locale` to get a language and/or region | ||||
|    code that is used by an engine. | ||||
| 
 | ||||
| b. SearXNG and most of the engines do not support all the languages from the | ||||
|    language model and there might also be a discrepancy in the ISO-639-3 and | ||||
|    ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). | ||||
|    Furthermore, in SearXNG locales like ``zh-TW`` (``zh-CN``) are mapped to | ||||
|    ``zh_Hant`` (``zh_Hans``). | ||||
| 
 | ||||
| Conclusion: This plugin only auto-detects the languages a user can select in | ||||
| the language menu (:py:obj:`supported_langs`). | ||||
| 
 | ||||
| SearXNG's locale of a query comes from (*highest wins*): | ||||
| 
 | ||||
| 1. The ``Accept-Language`` header from user's HTTP client. | ||||
| 2. The user selects a locale in the preferences. | ||||
| 3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``). | ||||
| 4. This plugin is activated in the preferences and the locale (only the language | ||||
|    code, no region code) comes from fastText's language detection. | ||||
| 
 | ||||
| Conclusion: There is a conflict between the language selected by the user and | ||||
| the language from language detection of this plugin.  For example, the user | ||||
| explicitly selects the German locale via the search syntax to search for a term | ||||
| that is identified as an English term (try ``:de-DE thermomix``, for example). | ||||
| 
 | ||||
| .. hint:: | ||||
| 
 | ||||
|    To SearXNG maintainers; please take into account: under some circumstances | ||||
|    the auto-detection of the language of this plugin could be detrimental to | ||||
|    users' expectations.  It is not recommended to activate this plugin by | ||||
|    default. It should always be the user's decision whether to activate this | ||||
|    plugin or not. | ||||
| 
 | ||||
| .. _fastText: https://fasttext.cc/ | ||||
| .. _python fasttext: https://pypi.org/project/fasttext/ | ||||
| .. _language identification model: https://fasttext.cc/docs/en/language-identification.html | ||||
| .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 | ||||
| .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| from flask_babel import gettext | ||||
| import fasttext | ||||
| import babel | ||||
| 
 | ||||
| from searx.data import data_dir | ||||
| from searx.languages import language_codes | ||||
| 
 | ||||
| # Monkey patch: prevent fasttext from showing a (useless) warning when loading a | ||||
| # model. | ||||
| fasttext.FastText.eprint = lambda x: None | ||||
| 
 | ||||
| name = gettext('Autodetect search language') | ||||
| description = gettext('Automatically detect the query search language and switch to it.') | ||||
| preference_section = 'general' | ||||
| default_on = False | ||||
| 
 | ||||
| lang_model: fasttext.FastText._FastText = None | ||||
| """fasttext model to predict language of a search term""" | ||||
| 
 | ||||
| supported_langs = set() | ||||
| """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" | ||||
| 
 | ||||
| 
 | ||||
def get_model():
    """Return the fastText language identification model.

    The model file ``lid.176.ftz`` from the data folder is loaded on the first
    call only and then kept in the module level :py:obj:`lang_model`, so the
    memory is not spent until a language detection is actually requested.
    """
    global lang_model  # pylint: disable=global-statement

    # already loaded by a previous call
    if lang_model is not None:
        return lang_model

    model_file = data_dir / 'lid.176.ftz'
    lang_model = fasttext.load_model(str(model_file))
    return lang_model
| 
 | ||||
| 
 | ||||
def pre_search(request, search):  # pylint: disable=unused-argument
    """Switch the language of the search query to the language detected in it.

    The fastText model is asked for its single best guess (``k=1``) with a
    probability of at least ``0.3``.  If that guess is one of the
    :py:obj:`supported_langs`, ``search.search_query.lang`` is set to the
    detected language code and ``search.search_query.locale`` to the matching
    babel locale.  Without a (supported) guess the search query is left
    untouched.

    :param request: not used by this plugin
    :param search: object whose ``search_query`` attributes ``query`` (read),
        ``lang`` and ``locale`` (written) are used
    :return: always ``True``
    """
    # fastText's predict() processes one line at a time and raises a ValueError
    # for text containing a newline
    text = search.search_query.query.replace('\n', ' ')

    # predict() returns a (labels, probabilities) pair.  The pair itself is
    # always truthy, so the labels have to be tested: they are empty when no
    # language reaches the threshold.
    labels, _probabilities = get_model().predict(text, k=1, threshold=0.3)
    if len(labels) == 0:
        return True

    # labels look like '__label__en'
    lang = labels[0].split('__label__')[1]
    if lang not in supported_langs:
        return True

    search.search_query.lang = lang
    try:
        search.search_query.locale = babel.Locale.parse(lang)
    except babel.core.UnknownLocaleError:
        # best effort: keep the previous locale if babel does not know the language
        pass
    return True
| 
 | ||||
| 
 | ||||
def init(app, settings):  # pylint: disable=unused-argument
    """Fill :py:obj:`supported_langs` from :py:obj:`searx.languages.language_codes`.

    From the first item of each entry only the part before the first ``-`` is
    taken (presumably the language code of a locale tag like ``en-US``).

    :param app: not used by this plugin
    :param settings: not used by this plugin
    :return: always ``True``
    """
    supported_langs.update(entry[0].split('-')[0] for entry in language_codes)
    return True
| @ -196,6 +196,10 @@ outgoing: | ||||
| #   - 'Open Access DOI rewrite' | ||||
| #   - 'Vim-like hotkeys' | ||||
| #   - 'Tor check plugin' | ||||
| #   # Read the docs before activating: auto-detection of the language could be | ||||
| #   # detrimental to users' expectations / users can activate the plugin in the | ||||
| #   # preferences if they want. | ||||
| #   - 'Autodetect search language' | ||||
| 
 | ||||
| # Configuration of the "Hostname replace" plugin: | ||||
| # | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user