mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-26 00:02:31 -04:00 
			
		
		
		
	[fix] engine - yahoo: rewrite and fix issues
Languages are supported by mapping the language to a domain. If the domain is not found in :py:obj:`lang2domain`, the URL ``<lang>.search.yahoo.com`` is used. BTW: this also fixes the issue reported at https://github.com/searx/searx/issues/3020 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									38a157b56f
								
							
						
					
					
						commit
						f63ffbb22b
					
				
							
								
								
									
										8
									
								
								docs/src/searx.engines.yahoo.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/src/searx.engines.yahoo.rst
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | |||||||
|  | .. _yahoo engine: | ||||||
|  | 
 | ||||||
|  | ============ | ||||||
|  | Yahoo Engine | ||||||
|  | ============ | ||||||
|  | 
 | ||||||
|  | .. automodule:: searx.engines.yahoo | ||||||
|  |   :members: | ||||||
| @ -28418,7 +28418,7 @@ | |||||||
|     "sv", |     "sv", | ||||||
|     "th", |     "th", | ||||||
|     "tr", |     "tr", | ||||||
|     "zh-CHS", |     "zh_chs", | ||||||
|     "zh-CHT" |     "zh_cht" | ||||||
|   ] |   ] | ||||||
| } | } | ||||||
| @ -1,12 +1,24 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| # lint: pylint | # lint: pylint | ||||||
| """Yahoo (Web) | """Yahoo Search (Web) | ||||||
|  | 
 | ||||||
|  | Languages are supported by mapping the language to a domain.  If domain is not | ||||||
|  | found in :py:obj:`lang2domain` URL ``<lang>.search.yahoo.com`` is used. | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from urllib.parse import unquote, urlencode | from urllib.parse import ( | ||||||
|  |     unquote, | ||||||
|  |     urlencode, | ||||||
|  | ) | ||||||
| from lxml import html | from lxml import html | ||||||
| from searx.utils import extract_text, extract_url, match_language, eval_xpath | 
 | ||||||
|  | from searx.utils import ( | ||||||
|  |     eval_xpath_getindex, | ||||||
|  |     eval_xpath_list, | ||||||
|  |     extract_text, | ||||||
|  |     match_language, | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -22,35 +34,78 @@ about = { | |||||||
| categories = ['general'] | categories = ['general'] | ||||||
| paging = True | paging = True | ||||||
| time_range_support = True | time_range_support = True | ||||||
| 
 | supported_languages_url = 'https://search.yahoo.com/preferences/languages' | ||||||
| # search-url | """Supported languages are read from Yahoo preference page.""" | ||||||
| base_url = 'https://search.yahoo.com/' |  | ||||||
| search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' |  | ||||||
| search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' |  | ||||||
| 
 |  | ||||||
| supported_languages_url = 'https://search.yahoo.com/web/advanced' |  | ||||||
| 
 |  | ||||||
| # specific xpath variables |  | ||||||
| results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" |  | ||||||
| url_xpath = './/h3/a/@href' |  | ||||||
| title_xpath = './/h3/a' |  | ||||||
| content_xpath = './/div[contains(@class, "compText")]' |  | ||||||
| suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" |  | ||||||
| 
 | 
 | ||||||
| time_range_dict = { | time_range_dict = { | ||||||
|     'day': ['1d', 'd'], |     'day': ('1d', 'd'), | ||||||
|     'week': ['1w', 'w'], |     'week': ('1w', 'w'), | ||||||
|     'month': ['1m', 'm'] |     'month': ('1m', 'm'), | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| language_aliases = { | language_aliases = { | ||||||
|     'zh-CN': 'zh-CHS', |     'zh-HK': 'zh_chs', | ||||||
|     'zh-TW': 'zh-CHT', |     'zh-CN': 'zh_chs',  # dead since 2015 / routed to hk.search.yahoo.com | ||||||
|     'zh-HK': 'zh-CHT' |     'zh-TW': 'zh_cht', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # remove yahoo-specific tracking-url | lang2domain = { | ||||||
|  |     'zh_chs' : 'hk.search.yahoo.com', | ||||||
|  |     'zh_cht' : 'tw.search.yahoo.com', | ||||||
|  |     'en'     : 'search.yahoo.com', | ||||||
|  | 
 | ||||||
|  |     'bg'     : 'search.yahoo.com', | ||||||
|  |     'cs'     : 'search.yahoo.com', | ||||||
|  |     'da'     : 'search.yahoo.com', | ||||||
|  |     'el'     : 'search.yahoo.com', | ||||||
|  |     'et'     : 'search.yahoo.com', | ||||||
|  |     'he'     : 'search.yahoo.com', | ||||||
|  |     'hr'     : 'search.yahoo.com', | ||||||
|  |     'ja'     : 'search.yahoo.com', | ||||||
|  |     'ko'     : 'search.yahoo.com', | ||||||
|  |     'sk'     : 'search.yahoo.com', | ||||||
|  |     'sl'     : 'search.yahoo.com', | ||||||
|  | 
 | ||||||
|  | } | ||||||
|  | """Map language to domain""" | ||||||
|  | 
 | ||||||
|  | def _get_language(params): | ||||||
|  | 
 | ||||||
|  |     lang = language_aliases.get(params['language']) | ||||||
|  |     if lang is None: | ||||||
|  |         lang = match_language( | ||||||
|  |             params['language'], supported_languages, language_aliases | ||||||
|  |         ) | ||||||
|  |     lang = lang.split('-')[0] | ||||||
|  |     logger.debug("params['language']: %s --> %s" , params['language'], lang) | ||||||
|  |     return lang | ||||||
|  | 
 | ||||||
|  | def request(query, params): | ||||||
|  |     """build request""" | ||||||
|  |     offset = (params['pageno'] - 1) * 7 + 1 | ||||||
|  |     lang  =  _get_language(params) | ||||||
|  |     age, btf = time_range_dict.get( | ||||||
|  |         params['time_range'], ('', '')) | ||||||
|  | 
 | ||||||
|  |     args = urlencode({ | ||||||
|  |         'p' : query, | ||||||
|  |         'ei' : 'UTF-8', | ||||||
|  |         'fl' : 1, | ||||||
|  |         'vl' : 'lang_' + lang, | ||||||
|  |         'btf' : btf, | ||||||
|  |         'fr2' : 'time', | ||||||
|  |         'age' : age, | ||||||
|  |         'b' : offset, | ||||||
|  |         'xargs' :0 | ||||||
|  |     }) | ||||||
|  | 
 | ||||||
|  |     domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) | ||||||
|  |     params['url'] = 'https://%s/search?%s' % (domain, args) | ||||||
|  |     return params | ||||||
|  | 
 | ||||||
| def parse_url(url_string): | def parse_url(url_string): | ||||||
|  |     """remove yahoo-specific tracking-url""" | ||||||
|  | 
 | ||||||
|     endings = ['/RS', '/RK'] |     endings = ['/RS', '/RK'] | ||||||
|     endpositions = [] |     endpositions = [] | ||||||
|     start = url_string.find('http', url_string.find('/RU=') + 1) |     start = url_string.find('http', url_string.find('/RU=') + 1) | ||||||
| @ -66,73 +121,30 @@ def parse_url(url_string): | |||||||
|     end = min(endpositions) |     end = min(endpositions) | ||||||
|     return unquote(url_string[start:end]) |     return unquote(url_string[start:end]) | ||||||
| 
 | 
 | ||||||
| def _get_url(query, offset, language, time_range): |  | ||||||
|     if time_range in time_range_dict: |  | ||||||
|         return base_url + search_url_with_time.format( |  | ||||||
|             offset = offset, |  | ||||||
|             query = urlencode({'p': query}), |  | ||||||
|             lang = language, |  | ||||||
|             age = time_range_dict[time_range][0], |  | ||||||
|             btf = time_range_dict[time_range][1] |  | ||||||
|         ) |  | ||||||
|     return base_url + search_url.format( |  | ||||||
|         offset=offset, |  | ||||||
|         query=urlencode({'p': query}), |  | ||||||
|         lang=language |  | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
| def _get_language(params): |  | ||||||
|     if params['language'] == 'all': |  | ||||||
|         return 'en' |  | ||||||
| 
 |  | ||||||
|     language = match_language(params['language'], supported_languages, language_aliases) |  | ||||||
|     if language not in language_aliases.values(): |  | ||||||
|         language = language.split('-')[0] |  | ||||||
|     language = language.replace('-', '_').lower() |  | ||||||
| 
 |  | ||||||
|     return language |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # do search-request |  | ||||||
| def request(query, params): |  | ||||||
|     if params['time_range'] and params['time_range'] not in time_range_dict: |  | ||||||
|         return params |  | ||||||
| 
 |  | ||||||
|     offset = (params['pageno'] - 1) * 10 + 1 |  | ||||||
|     language = _get_language(params) |  | ||||||
| 
 |  | ||||||
|     params['url'] = _get_url(query, offset, language, params['time_range']) |  | ||||||
| 
 |  | ||||||
|     return params |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # get response from search-request |  | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     """parse response""" | ||||||
| 
 | 
 | ||||||
|  |     results = [] | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     try: |  | ||||||
|         results_num = int( |  | ||||||
|             eval_xpath( |  | ||||||
|                 dom, |  | ||||||
|                 '//div[@class="compPagination"]/span[last()]/text()' |  | ||||||
|             )[0].split()[0].replace(',', '') |  | ||||||
|         ) |  | ||||||
|         results.append({'number_of_results': results_num}) |  | ||||||
|     except:  # pylint: disable=bare-except |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     # parse results |     # parse results | ||||||
|     for result in eval_xpath(dom, results_xpath): |     for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): | ||||||
|         try: |         url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) | ||||||
|             url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url)) |         if url is None: | ||||||
|             title = extract_text(eval_xpath(result, title_xpath)[0]) |  | ||||||
| 
 |  | ||||||
|         except:  # pylint: disable=bare-except |  | ||||||
|             continue |             continue | ||||||
|  |         url = parse_url(url) | ||||||
| 
 | 
 | ||||||
|         content = extract_text(eval_xpath(result, content_xpath)[0]) |         title = eval_xpath_getindex(result, './/h3/a', 0, default=None) | ||||||
|  |         if title is None: | ||||||
|  |             continue | ||||||
|  |         offset = len(extract_text(title.xpath('span'))) | ||||||
|  |         title = extract_text(title)[offset:] | ||||||
|  | 
 | ||||||
|  |         content = eval_xpath_getindex( | ||||||
|  |             result, './/div[contains(@class, "compText")]', 0, default='' | ||||||
|  |         ) | ||||||
|  |         if content: | ||||||
|  |             content = extract_text(content) | ||||||
| 
 | 
 | ||||||
|         # append result |         # append result | ||||||
|         results.append({ |         results.append({ | ||||||
| @ -141,17 +153,10 @@ def response(resp): | |||||||
|             'content': content |             'content': content | ||||||
|         }) |         }) | ||||||
| 
 | 
 | ||||||
|     # if no suggestion found, return results |     for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'): | ||||||
|     suggestions = eval_xpath(dom, suggestion_xpath) |  | ||||||
|     if not suggestions: |  | ||||||
|         return results |  | ||||||
| 
 |  | ||||||
|     # parse suggestion |  | ||||||
|     for suggestion in suggestions: |  | ||||||
|         # append suggestion |         # append suggestion | ||||||
|         results.append({'suggestion': extract_text(suggestion)}) |         results.append({'suggestion': extract_text(suggestion)}) | ||||||
| 
 | 
 | ||||||
|     # return results |  | ||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -159,13 +164,9 @@ def response(resp): | |||||||
| def _fetch_supported_languages(resp): | def _fetch_supported_languages(resp): | ||||||
|     supported_languages = [] |     supported_languages = [] | ||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
|     options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input') |     offset = len('lang_') | ||||||
|     for option in options: | 
 | ||||||
|         code_parts = eval_xpath(option, './@value')[0][5:].split('_') |     for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): | ||||||
|         if len(code_parts) == 2: |         supported_languages.append( val[offset:] ) | ||||||
|             code = code_parts[0] + '-' + code_parts[1].upper() |  | ||||||
|         else: |  | ||||||
|             code = code_parts[0] |  | ||||||
|         supported_languages.append(code) |  | ||||||
| 
 | 
 | ||||||
|     return supported_languages |     return supported_languages | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user