mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 10:37:06 -04:00 
			
		
		
		
	Merge pull request #1969 from ArtikusHG/master
Add language autodetect plugin
This commit is contained in:
		
						commit
						966e9c3c5d
					
				| @ -36,6 +36,7 @@ RUN apk add --no-cache -t build-dependencies \ | |||||||
|     su-exec \ |     su-exec \ | ||||||
|     python3 \ |     python3 \ | ||||||
|     py3-pip \ |     py3-pip \ | ||||||
|  |     py3-numpy \ | ||||||
|     libxml2 \ |     libxml2 \ | ||||||
|     libxslt \ |     libxslt \ | ||||||
|     openssl \ |     openssl \ | ||||||
| @ -43,6 +44,8 @@ RUN apk add --no-cache -t build-dependencies \ | |||||||
|     uwsgi \ |     uwsgi \ | ||||||
|     uwsgi-python3 \ |     uwsgi-python3 \ | ||||||
|     brotli \ |     brotli \ | ||||||
|  |  && pip3 install --no-cache setuptools wheel \ | ||||||
|  |  && sed -i s/fasttext-wheel/fasttext/ requirements.txt \ | ||||||
|  && pip3 install --no-cache -r requirements.txt \ |  && pip3 install --no-cache -r requirements.txt \ | ||||||
|  && apk del build-dependencies \ |  && apk del build-dependencies \ | ||||||
|  && rm -rf /root/.cache |  && rm -rf /root/.cache | ||||||
|  | |||||||
							
								
								
									
										8
									
								
								docs/src/searx.plugins.autodetect_search_language.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/src/searx.plugins.autodetect_search_language.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | |||||||
|  | .. _autodetect search language: | ||||||
|  | 
 | ||||||
|  | ====================== | ||||||
|  | Search language plugin | ||||||
|  | ====================== | ||||||
|  | 
 | ||||||
|  | .. automodule:: searx.plugins.autodetect_search_language | ||||||
|  |   :members: | ||||||
| @ -16,3 +16,4 @@ setproctitle==1.3.2 | |||||||
| redis==4.4.0 | redis==4.4.0 | ||||||
| markdown-it-py==2.1.0 | markdown-it-py==2.1.0 | ||||||
| typing_extensions==4.4.0 | typing_extensions==4.4.0 | ||||||
|  | fasttext-wheel==0.9.2 | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								searx/data/lid.176.ftz
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								searx/data/lid.176.ftz
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										115
									
								
								searx/plugins/autodetect_search_language.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								searx/plugins/autodetect_search_language.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,115 @@ | |||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
|  | """Plugin to detect the search language from the search query. | ||||||
|  | 
 | ||||||
|  | The language detection is done by using the fastText_ library (`python | ||||||
|  | fasttext`_). fastText_ distributes the `language identification model`_, for | ||||||
|  | reference: | ||||||
|  | 
 | ||||||
|  | - `FastText.zip: Compressing text classification models`_ | ||||||
|  | - `Bag of Tricks for Efficient Text Classification`_ | ||||||
|  | 
 | ||||||
|  | The `language identification model`_ supports the language codes (ISO-639-3):: | ||||||
|  | 
 | ||||||
|  |    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr | ||||||
|  |    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa | ||||||
|  |    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io | ||||||
|  |    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv | ||||||
|  |    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn | ||||||
|  |    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd | ||||||
|  |    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep | ||||||
|  |    vi vls vo wa war wuu xal xmf yi yo yue zh | ||||||
|  | 
 | ||||||
|  | The `language identification model`_ is harmonized with SearXNG's language | ||||||
|  | (locale) model.  General conditions of SearXNG's locale model are: | ||||||
|  | 
 | ||||||
|  | a. SearXNG's locale of a query is passed to the | ||||||
|  |    :py:obj:`searx.locales.get_engine_locale` to get a language and/or region | ||||||
|  |    code that is used by an engine. | ||||||
|  | 
 | ||||||
|  | b. SearXNG and most of the engines do not support all the languages from | ||||||
|  |    the language model and there might also be a discrepancy in the ISO-639-3 | ||||||
|  |    and ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). | ||||||
|  |    Furthermore, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to | ||||||
|  |    ``zh_Hant`` (``zh_Hans``). | ||||||
|  | 
 | ||||||
|  | Conclusion: This plugin only auto-detects the languages a user can select in | ||||||
|  | the language menu (:py:obj:`supported_langs`). | ||||||
|  | 
 | ||||||
|  | SearXNG's locale of a query comes from (*highest wins*): | ||||||
|  | 
 | ||||||
|  | 1. The ``Accept-Language`` header from the user's HTTP client. | ||||||
|  | 2. The user selects a locale in the preferences. | ||||||
|  | 3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``) | ||||||
|  | 4. This plugin is activated in the preferences and the locale (only the language | ||||||
|  |    code / no region code) comes from fastText's language detection. | ||||||
|  | 
 | ||||||
|  | Conclusion: There is a conflict between the language selected by the user and | ||||||
|  | the language from language detection of this plugin.  For example, the user | ||||||
|  | explicitly selects the German locale via the search syntax to search for a term | ||||||
|  | that is identified as an English term (try ``:de-DE thermomix``, for example). | ||||||
|  | 
 | ||||||
|  | .. hint:: | ||||||
|  | 
 | ||||||
|  |    To SearXNG maintainers; please take into account: under some circumstances | ||||||
|  |    the auto-detection of the language of this plugin could be detrimental to | ||||||
|  |    users' expectations.  It's not recommended to activate this plugin by | ||||||
|  |    default. It should always be the user's decision whether to activate this | ||||||
|  |    plugin or not. | ||||||
|  | 
 | ||||||
|  | .. _fastText: https://fasttext.cc/ | ||||||
|  | .. _python fasttext: https://pypi.org/project/fasttext/ | ||||||
|  | .. _language identification model: https://fasttext.cc/docs/en/language-identification.html | ||||||
|  | .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 | ||||||
|  | .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from flask_babel import gettext | ||||||
|  | import fasttext | ||||||
|  | import babel | ||||||
|  | 
 | ||||||
|  | from searx.data import data_dir | ||||||
|  | from searx.languages import language_codes | ||||||
|  | 
 | ||||||
|  | # Monkey patch: prevent fasttext from showing a (useless) warning when loading a | ||||||
|  | # model. | ||||||
|  | fasttext.FastText.eprint = lambda x: None | ||||||
|  | 
 | ||||||
|  | name = gettext('Autodetect search language') | ||||||
|  | description = gettext('Automatically detect the query search language and switch to it.') | ||||||
|  | preference_section = 'general' | ||||||
|  | default_on = False | ||||||
|  | 
 | ||||||
|  | lang_model: fasttext.FastText._FastText = None | ||||||
|  | """fasttext model to predict language of a search term""" | ||||||
|  | 
 | ||||||
|  | supported_langs = set() | ||||||
|  | """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def get_model():
    """Return the fastText language identification model.

    The model file ``lid.176.ftz`` from :py:obj:`searx.data.data_dir` is loaded
    on the first call and kept in the module-level :py:obj:`lang_model`; later
    calls return that same object.
    """
    global lang_model  # pylint: disable=global-statement
    if lang_model is not None:
        return lang_model
    # Deferred loading: the model only occupies memory once it is first needed.
    model_path = data_dir / 'lid.176.ftz'
    lang_model = fasttext.load_model(str(model_path))
    return lang_model
|  | 
 | ||||||
|  | 
 | ||||||
def pre_search(request, search):  # pylint: disable=unused-argument
    """Detect the language of the query and apply it to the search query.

    The query text is passed to the fastText model (:py:func:`get_model`).  If
    a language is predicted with a probability of at least ``0.3`` and that
    language is in :py:obj:`supported_langs`, ``search.search_query.lang`` is
    set to the language code and ``search.search_query.locale`` to the matching
    babel locale (the locale is left untouched if babel does not know it).

    :param request: the incoming request (unused).
    :param search: search object whose ``search_query`` is read and updated.
    :return: always ``True``.
    """
    # fastText's predict() handles one line at a time and raises ValueError if
    # the text contains a newline, so flatten multi-line queries first.
    query = search.search_query.query.replace('\n', ' ')

    # predict() returns a tuple ``(labels, probabilities)``.  ``labels`` is
    # empty when no language reaches the threshold; testing the outer tuple
    # for truthiness would always pass and indexing would then fail.
    labels, _probabilities = get_model().predict(query, k=1, threshold=0.3)
    if not labels:
        return True

    lang = labels[0].split('__label__')[1]
    if lang not in supported_langs:
        return True

    search.search_query.lang = lang
    try:
        search.search_query.locale = babel.Locale.parse(lang)
    except babel.core.UnknownLocaleError:
        # Keep the locale that was already set on the query.
        pass
    return True
|  | 
 | ||||||
|  | 
 | ||||||
def init(app, settings):  # pylint: disable=unused-argument
    """Fill :py:obj:`supported_langs` with the language part of each locale.

    For every entry of :py:obj:`searx.languages.language_codes` the first item
    (the locale tag, e.g. ``de-DE``) is cut at the first ``-`` and the leading
    part is added to the set.

    :param app: unused.
    :param settings: unused.
    :return: always ``True``.
    """
    supported_langs.update(entry[0].split('-')[0] for entry in language_codes)
    return True
| @ -196,6 +196,10 @@ outgoing: | |||||||
| #   - 'Open Access DOI rewrite' | #   - 'Open Access DOI rewrite' | ||||||
| #   - 'Vim-like hotkeys' | #   - 'Vim-like hotkeys' | ||||||
| #   - 'Tor check plugin' | #   - 'Tor check plugin' | ||||||
|  | #   # Read the docs before activating: auto-detection of the language could be | ||||||
|  | #   # detrimental to users' expectations / users can activate the plugin in the | ||||||
|  | #   # preferences if they want. | ||||||
|  | #   - 'Autodetect search language' | ||||||
| 
 | 
 | ||||||
| # Configuration of the "Hostname replace" plugin: | # Configuration of the "Hostname replace" plugin: | ||||||
| # | # | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user