mirror of
				https://github.com/searxng/searxng.git
				synced 2025-11-03 19:17:07 -05:00 
			
		
		
		
	Merge pull request #2019 from ArtikusHG/fasttext
Replace langdetect with fasttext (followup of #1969)
This commit is contained in:
		
						commit
						b927482195
					
				@ -11,7 +11,6 @@ httpx[http2]==0.21.2
 | 
			
		||||
Brotli==1.0.9
 | 
			
		||||
uvloop==0.17.0
 | 
			
		||||
httpx-socks[asyncio]==0.7.2
 | 
			
		||||
langdetect==1.0.9
 | 
			
		||||
setproctitle==1.3.2
 | 
			
		||||
redis==4.4.0
 | 
			
		||||
markdown-it-py==2.1.0
 | 
			
		||||
 | 
			
		||||
@ -66,46 +66,28 @@ that is identified as an English term (try ``:de-DE thermomix``, for example).
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
from flask_babel import gettext
 | 
			
		||||
import fasttext
 | 
			
		||||
import babel
 | 
			
		||||
 | 
			
		||||
from searx.data import data_dir
 | 
			
		||||
from searx.utils import detect_language
 | 
			
		||||
from searx.languages import language_codes
 | 
			
		||||
 | 
			
		||||
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
 | 
			
		||||
# model.
 | 
			
		||||
fasttext.FastText.eprint = lambda x: None
 | 
			
		||||
 | 
			
		||||
name = gettext('Autodetect search language')
 | 
			
		||||
description = gettext('Automatically detect the query search language and switch to it.')
 | 
			
		||||
preference_section = 'general'
 | 
			
		||||
default_on = False
 | 
			
		||||
 | 
			
		||||
lang_model: fasttext.FastText._FastText = None
 | 
			
		||||
"""fasttext model to predict laguage of a search term"""
 | 
			
		||||
 | 
			
		||||
supported_langs = set()
 | 
			
		||||
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_model():
    """Return the fasttext language-identification model, loading it on first
    use (``lid.176.ftz`` from :py:obj:`searx.data.data_dir`)."""
    # lazy load, in order to save memory
    global lang_model  # pylint: disable=global-statement
    if lang_model is None:
        lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return lang_model
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def pre_search(request, search):  # pylint: disable=unused-argument
    """Plugin hook: detect the language of the search query and switch to it.

    The language is predicted by :py:obj:`searx.utils.detect_language`; when
    the predicted language is one of :py:obj:`supported_langs`, both
    ``search_query.lang`` and ``search_query.locale`` are updated.

    Always returns ``True`` so the search continues.
    """
    # NOTE: the former inline fasttext prediction (get_model().predict) was
    # superseded by detect_language and its result was immediately
    # overwritten — it has been removed.
    lang = detect_language(search.search_query.query, min_probability=0)
    if lang in supported_langs:
        search.search_query.lang = lang
        try:
            # babel may not know the locale even though engines support the lang
            search.search_query.locale = babel.Locale.parse(lang)
        except babel.core.UnknownLocaleError:
            pass
    return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -10,12 +10,10 @@ from timeit import default_timer
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
 | 
			
		||||
import re
 | 
			
		||||
from langdetect import detect_langs
 | 
			
		||||
from langdetect.lang_detect_exception import LangDetectException
 | 
			
		||||
import httpx
 | 
			
		||||
 | 
			
		||||
from searx import network, logger
 | 
			
		||||
from searx.utils import gen_useragent
 | 
			
		||||
from searx.utils import gen_useragent, detect_language
 | 
			
		||||
from searx.results import ResultContainer
 | 
			
		||||
from searx.search.models import SearchQuery, EngineRef
 | 
			
		||||
from searx.search.processors import EngineProcessor
 | 
			
		||||
@ -208,14 +206,10 @@ class ResultContainerTests:
 | 
			
		||||
        self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
 | 
			
		||||
 | 
			
		||||
    def _add_language(self, text: str) -> typing.Optional[str]:
        """Detect the language of *text* and record it in the test results.

        Always returns ``None``; when a language is detected it is added to
        ``self.languages`` and to ``self.test_results``.
        """
        # NOTE: the former langdetect-based pass (detect_langs) duplicated the
        # detect_language pass below and double-recorded languages — removed
        # as part of the langdetect -> fasttext migration.
        langStr = detect_language(text)
        if langStr:
            self.languages.add(langStr)
            self.test_results.add_language(langStr)
        return None
 | 
			
		||||
 | 
			
		||||
    def _check_result(self, result):
 | 
			
		||||
 | 
			
		||||
@ -15,6 +15,7 @@ from os.path import splitext, join
 | 
			
		||||
from random import choice
 | 
			
		||||
from html.parser import HTMLParser
 | 
			
		||||
from urllib.parse import urljoin, urlparse
 | 
			
		||||
import fasttext
 | 
			
		||||
 | 
			
		||||
from lxml import html
 | 
			
		||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
 | 
			
		||||
@ -22,7 +23,7 @@ from babel.core import get_global
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from searx import settings
 | 
			
		||||
from searx.data import USER_AGENTS
 | 
			
		||||
from searx.data import USER_AGENTS, data_dir
 | 
			
		||||
from searx.version import VERSION_TAG
 | 
			
		||||
from searx.languages import language_codes
 | 
			
		||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
 | 
			
		||||
@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 | 
			
		||||
_XPATH_CACHE: Dict[str, XPath] = {}
 | 
			
		||||
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 | 
			
		||||
 | 
			
		||||
_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
 | 
			
		||||
"""fasttext model to predict laguage of a search term"""
 | 
			
		||||
 | 
			
		||||
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
 | 
			
		||||
fasttext.FastText.eprint = lambda x: None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class _NotSetClass:  # pylint: disable=too-few-public-methods
 | 
			
		||||
    """Internal class for this module, do not create instance of this class.
 | 
			
		||||
@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
 | 
			
		||||
        # to record xpath_spec
 | 
			
		||||
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
 | 
			
		||||
    return default
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_fasttext_model() -> fasttext.FastText._FastText:
    """Return the fasttext language-identification model, loading it lazily
    (``lid.176.ftz`` from :py:obj:`searx.data.data_dir`)."""
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # loaded on first use to keep import time and memory low
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
    """Detect the language of *text* using fasttext.

    See https://fasttext.cc/docs/en/language-identification.html

    :param text: string to classify; newlines are replaced by spaces because
        ``fasttext.predict`` does not accept them.
    :param threshold: fasttext prediction threshold (passed through).
    :param min_probability: minimum probability the best (``k=1``) prediction
        must exceed, otherwise ``None`` is returned.
    :return: language code (e.g. ``'en'``) or ``None`` when no prediction
        passes the thresholds.
    :raises ValueError: when *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError('text must be a str')
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
        # fasttext labels look like '__label__en'
        return r[0][0].split('__label__')[1]
    return None
 | 
			
		||||
 | 
			
		||||
@ -17,14 +17,11 @@ from os.path import join
 | 
			
		||||
 | 
			
		||||
from lxml.html import fromstring
 | 
			
		||||
 | 
			
		||||
from langdetect import detect_langs
 | 
			
		||||
from langdetect.lang_detect_exception import LangDetectException
 | 
			
		||||
 | 
			
		||||
from searx.engines import wikidata, set_loggers
 | 
			
		||||
from searx.utils import extract_text, match_language
 | 
			
		||||
from searx.locales import LOCALE_NAMES, locales_initialize
 | 
			
		||||
from searx import searx_dir
 | 
			
		||||
from searx.utils import gen_useragent
 | 
			
		||||
from searx.utils import gen_useragent, detect_language
 | 
			
		||||
import searx.search
 | 
			
		||||
import searx.network
 | 
			
		||||
 | 
			
		||||
@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def detect_language(text):
    """Return the language of *text* when langdetect is more than 95%% sure,
    otherwise ``None`` (also on detection failure)."""
    try:
        results = detect_langs(str(text))  # pylint: disable=E1101
    except LangDetectException:
        return None
    if results and results[0].prob > 0.95:
        return results[0].lang
    return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_website_description(url, lang1, lang2=None):
 | 
			
		||||
    headers = {
 | 
			
		||||
        'User-Agent': gen_useragent(),
 | 
			
		||||
 | 
			
		||||
@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
 | 
			
		||||
        with self.assertRaises(SearxEngineXPathException) as context:
 | 
			
		||||
            utils.eval_xpath_getindex(doc, 'count(//i)', 1)
 | 
			
		||||
        self.assertEqual(context.exception.message, 'the result is not a list')
 | 
			
		||||
 | 
			
		||||
    def test_detect_language(self):
        """Exercise utils.detect_language on several scripts plus edge cases."""
        # fasttext.predict('') does not accept new lines — make sure they
        # are handled by detect_language
        detected = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
        self.assertEqual(detected, 'en')

        detected = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
        self.assertEqual(detected, 'ja')

        detected = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
        self.assertEqual(detected, 'tr')

        # empty input yields no detection
        detected = utils.detect_language('')
        self.assertIsNone(detected)

        # mix languages --> None
        detected = utils.detect_language('The いろはにほへと Pijamalı')
        self.assertIsNone(detected)

        # non-str input is rejected
        with self.assertRaises(ValueError):
            utils.detect_language(None)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user