mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-02 18:47:05 -05:00 
			
		
		
		
	[doc] improved docs of implementations for automatic speech recognition
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									6748e8e2d5
								
							
						
					
					
						commit
						0b1444b61e
					
				@ -641,38 +641,56 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
 | 
					def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
 | 
				
			||||||
    """Detect the language of the text parameter
 | 
					    """Detect the language of the ``text`` parameter.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Args:
 | 
					    :param str text: The string whose language is to be detected.
 | 
				
			||||||
        * text (str): the string whose language is to be detected.
 | 
					 | 
				
			||||||
        * threshold (float): threshold filters the returned labels by a threshold on probability.
 | 
					 | 
				
			||||||
          A choice of 0.3 will return labels with at least 0.3 probability.
 | 
					 | 
				
			||||||
        * only_search_languages (bool): if True, returns only supported SearXNG search languages.
 | 
					 | 
				
			||||||
          see :py:obj:`searx.languages`
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    :param float threshold: Threshold filters the returned labels by a threshold
 | 
				
			||||||
 | 
					        on probability.  A choice of 0.3 will return labels with at least 0.3
 | 
				
			||||||
 | 
					        probability.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Raises:
 | 
					    :param bool only_search_languages: If ``True``, returns only supported
 | 
				
			||||||
        * ValueError: if text is not a string
 | 
					        SearXNG search languages.  See :py:obj:`searx.languages`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Returns:
 | 
					    :rtype: str, None
 | 
				
			||||||
        * result (str, None): the detected language code or None. See below.
 | 
					    :returns:
 | 
				
			||||||
 | 
					        The detected language code or ``None``. See below.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    The language detection is done by using `a fork`_ of the fastText_ library (`python
 | 
					    :raises ValueError: If ``text`` is not a string.
 | 
				
			||||||
    fasttext`_). fastText_ distributes the `language identification model`_, for
 | 
					
 | 
				
			||||||
    reference:
 | 
					    The language detection is done by using `a fork`_ of the fastText_ library
 | 
				
			||||||
 | 
					    (`python fasttext`_). fastText_ distributes the `language identification
 | 
				
			||||||
 | 
					    model`_, for reference:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    - `FastText.zip: Compressing text classification models`_
 | 
					    - `FastText.zip: Compressing text classification models`_
 | 
				
			||||||
    - `Bag of Tricks for Efficient Text Classification`_
 | 
					    - `Bag of Tricks for Efficient Text Classification`_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    The `language identification model`_ support the language codes (ISO-639-3)::
 | 
					    The `language identification model`_ supports the language codes
 | 
				
			||||||
    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
 | 
					    (ISO-639-3)::
 | 
				
			||||||
    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
 | 
					
 | 
				
			||||||
    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
 | 
					        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
 | 
				
			||||||
    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
 | 
					        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
 | 
				
			||||||
    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
 | 
					        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
 | 
				
			||||||
    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
 | 
					        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
 | 
				
			||||||
    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
 | 
					        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
 | 
				
			||||||
    vi vls vo wa war wuu xal xmf yi yo yue zh
 | 
					        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
 | 
				
			||||||
 | 
					        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
 | 
				
			||||||
 | 
					        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    By using ``only_search_languages=True`` the `language identification model`_
 | 
				
			||||||
 | 
					    is harmonized with SearXNG's language (locale) model.  General
 | 
				
			||||||
 | 
					    conditions of SearXNG's locale model are:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    a. SearXNG's locale of a query is passed to the
 | 
				
			||||||
 | 
					       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
 | 
				
			||||||
 | 
					       code that is used by an engine.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    b. Most of SearXNG's engines do not support all the languages from `language
 | 
				
			||||||
 | 
					       identification model`_ and there is also a discrepancy in the ISO-639-3
 | 
				
			||||||
 | 
					       (fastText) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
 | 
				
			||||||
 | 
					       locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
 | 
				
			||||||
 | 
					       (``zh_Hans``) while the `language identification model`_ reduces both to
 | 
				
			||||||
 | 
					       ``zh``.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    .. _a fork: https://github.com/searxng/fasttext-predict
 | 
					    .. _a fork: https://github.com/searxng/fasttext-predict
 | 
				
			||||||
    .. _fastText: https://fasttext.cc/
 | 
					    .. _fastText: https://fasttext.cc/
 | 
				
			||||||
@ -680,6 +698,7 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
 | 
				
			|||||||
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
 | 
					    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
 | 
				
			||||||
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
 | 
					    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
 | 
				
			||||||
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
 | 
					    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    if not isinstance(text, str):
 | 
					    if not isinstance(text, str):
 | 
				
			||||||
        raise ValueError('text must a str')
 | 
					        raise ValueError('text must a str')
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user