mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 02:27:06 -04:00 
			
		
		
		
	Merge pull request #1061 from a01200356/bing
[fix] Language support for Bing Images and Videos
This commit is contained in:
		
						commit
						c8a66a090a
					
				| @ -13,7 +13,7 @@ python: | ||||
| before_install: | ||||
|   - "export DISPLAY=:99.0" | ||||
|   - "sh -e /etc/init.d/xvfb start" | ||||
|   - npm install less less-plugin-clean-css grunt-cli | ||||
|   - npm install less@2.7 less-plugin-clean-css grunt-cli | ||||
|   - export PATH=`pwd`/node_modules/.bin:$PATH | ||||
|   - ./manage.sh install_geckodriver ~/drivers | ||||
|   - export PATH=~/drivers:$PATH | ||||
|  | ||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| @ -18,7 +18,6 @@ | ||||
| from lxml import html | ||||
| from json import loads | ||||
| import re | ||||
| from searx.engines.bing import _fetch_supported_languages, supported_languages_url | ||||
| from searx.url_utils import urlencode | ||||
| 
 | ||||
| # engine dependent config | ||||
| @ -26,6 +25,8 @@ categories = ['images'] | ||||
| paging = True | ||||
| safesearch = True | ||||
| time_range_support = True | ||||
| language_support = True | ||||
| supported_languages_url = 'https://www.bing.com/account/general' | ||||
| 
 | ||||
| # search-url | ||||
| base_url = 'https://www.bing.com/' | ||||
| @ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT', | ||||
| _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U) | ||||
| 
 | ||||
| 
 | ||||
| # get supported region code | ||||
def get_region_code(lang, lang_list=None):
    """Map a searx language code to the market code sent to Bing.

    Args:
        lang: Language code from the request, e.g. ``'fr'``, ``'fr-FR'``
            or ``'all'``.
        lang_list: Optional list of supported codes to match against. When
            omitted or empty, the module-level ``supported_languages`` is
            used instead.

    Returns:
        The lower-cased market code. An exact match wins, otherwise the first
        supported code sharing the same language prefix is used. ``'en-us'``
        is returned when nothing matches.
    """
    # _fetch_supported_languages stores Bing's 'nb-NO' as 'no-NO', so the
    # Norwegian alias has to be resolved before the membership test; otherwise
    # 'no-NO' is found in the list and 'no-no' would be sent to Bing.
    if lang.startswith('no'):
        return 'nb-no'

    candidates = lang_list or supported_languages

    if lang in candidates:
        region = lang
    else:
        # fall back to the first supported region for the bare language
        base = lang.split('-')[0]
        region = next((code for code in candidates
                       if code.split('-')[0] == base), None)

    return region.lower() if region else 'en-us'
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
| def request(query, params): | ||||
|     offset = (params['pageno'] - 1) * 10 + 1 | ||||
| 
 | ||||
|     # required for cookie | ||||
|     if params['language'] == 'all': | ||||
|         language = 'en-US' | ||||
|     else: | ||||
|         language = params['language'] | ||||
| 
 | ||||
|     search_path = search_string.format( | ||||
|         query=urlencode({'q': query}), | ||||
|         offset=offset) | ||||
| 
 | ||||
|     language = get_region_code(params['language']) | ||||
| 
 | ||||
|     params['cookies']['SRCHHPGUSR'] = \ | ||||
|         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\ | ||||
|         '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') | ||||
|         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') | ||||
| 
 | ||||
|     params['cookies']['_EDGE_S'] = 'mkt=' + language +\ | ||||
|         '&ui=' + language + '&F=1' | ||||
| 
 | ||||
|     params['url'] = base_url + search_path | ||||
|     if params['time_range'] in time_range_dict: | ||||
| @ -106,3 +125,22 @@ def response(resp): | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
| 
 | ||||
| 
 | ||||
| # get supported languages from their site | ||||
def _fetch_supported_languages(resp):
    """Collect the market codes offered on Bing's region settings page.

    Args:
        resp: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        List of codes taken from the ``setmkt`` parameter of every region
        link, with ``nb-NO`` reported as ``no-NO``. Links that carry no
        ``setmkt`` parameter are skipped.
    """
    supported_languages = []
    dom = html.fromstring(resp.text)

    regions_xpath = '//div[@id="region-section-content"]' \
                    + '//ul[@class="b_vList"]/li/a/@href'

    for region in dom.xpath(regions_xpath):
        # raw string: '\&' in a plain literal is an invalid escape sequence
        match = re.search(r'setmkt=([^&]+)', region)
        if match is None:
            # link without a market parameter; nothing to record
            continue

        code = match.group(1)
        if code == 'nb-NO':
            code = 'no-NO'

        supported_languages.append(code)

    return supported_languages
|  | ||||
| @ -12,6 +12,7 @@ | ||||
| 
 | ||||
| from json import loads | ||||
| from lxml import html | ||||
| from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code | ||||
| from searx.engines.xpath import extract_text | ||||
| from searx.url_utils import urlencode | ||||
| 
 | ||||
| @ -21,6 +22,7 @@ paging = True | ||||
| safesearch = True | ||||
| time_range_support = True | ||||
| number_of_results = 10 | ||||
| language_support = True | ||||
| 
 | ||||
| search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\ | ||||
|              'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5' | ||||
| @ -45,7 +47,8 @@ def request(query, params): | ||||
|         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') | ||||
| 
 | ||||
|     # language cookie | ||||
|     params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1' | ||||
|     region = get_region_code(params['language'], lang_list=supported_languages) | ||||
|     params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1' | ||||
| 
 | ||||
|     # query and paging | ||||
|     params['url'] = search_url.format(query=urlencode({'q': query}), | ||||
|  | ||||
| @ -134,4 +134,4 @@ def _fetch_supported_languages(resp): | ||||
|     regions_json = loads(response_page) | ||||
|     supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) | ||||
| 
 | ||||
|     return supported_languages | ||||
|     return list(supported_languages) | ||||
|  | ||||
| @ -118,7 +118,7 @@ def _fetch_supported_languages(resp): | ||||
|     dom = fromstring(resp.text) | ||||
|     options = dom.xpath('//div[@id="regions-popup"]//ul/li/a') | ||||
|     for option in options: | ||||
|         code = option.xpath('./@data-val')[0] | ||||
|         code = option.xpath('./@data-search-language')[0] | ||||
|         if code.startswith('nb-'): | ||||
|             code = code.replace('nb', 'no', 1) | ||||
|         supported_languages.append(code) | ||||
|  | ||||
| @ -5,6 +5,11 @@ | ||||
| language_codes = ( | ||||
|     (u"ar-SA", u"العربية", u"", u"Arabic"), | ||||
|     (u"bg-BG", u"Български", u"", u"Bulgarian"), | ||||
|     (u"ca", u"Català", u"", u"Catalan"), | ||||
|     (u"ca-AD", u"Català", u"Andorra", u"Catalan"), | ||||
|     (u"ca-CT", u"Català", u"", u"Catalan"), | ||||
|     (u"ca-ES", u"Català", u"Espanya", u"Catalan"), | ||||
|     (u"ca-FR", u"Català", u"França", u"Catalan"), | ||||
|     (u"cs-CZ", u"Čeština", u"", u"Czech"), | ||||
|     (u"da-DK", u"Dansk", u"", u"Danish"), | ||||
|     (u"de", u"Deutsch", u"", u"German"), | ||||
| @ -15,9 +20,7 @@ language_codes = ( | ||||
|     (u"en", u"English", u"", u"English"), | ||||
|     (u"en-AU", u"English", u"Australia", u"English"), | ||||
|     (u"en-CA", u"English", u"Canada", u"English"), | ||||
|     (u"en-CY", u"English", u"Cyprus", u"English"), | ||||
|     (u"en-GB", u"English", u"United Kingdom", u"English"), | ||||
|     (u"en-GD", u"English", u"Grenada", u"English"), | ||||
|     (u"en-ID", u"English", u"Indonesia", u"English"), | ||||
|     (u"en-IE", u"English", u"Ireland", u"English"), | ||||
|     (u"en-IN", u"English", u"India", u"English"), | ||||
| @ -28,6 +31,7 @@ language_codes = ( | ||||
|     (u"en-US", u"English", u"United States", u"English"), | ||||
|     (u"en-ZA", u"English", u"South Africa", u"English"), | ||||
|     (u"es", u"Español", u"", u"Spanish"), | ||||
|     (u"es-AD", u"Español", u"Andorra", u"Spanish"), | ||||
|     (u"es-AR", u"Español", u"Argentina", u"Spanish"), | ||||
|     (u"es-CL", u"Español", u"Chile", u"Spanish"), | ||||
|     (u"es-CO", u"Español", u"Colombia", u"Spanish"), | ||||
| @ -38,38 +42,32 @@ language_codes = ( | ||||
|     (u"et-EE", u"Eesti", u"", u"Estonian"), | ||||
|     (u"fi-FI", u"Suomi", u"", u"Finnish"), | ||||
|     (u"fr", u"Français", u"", u"French"), | ||||
|     (u"fr-AD", u"Français", u"Andorre", u"French"), | ||||
|     (u"fr-BE", u"Français", u"Belgique", u"French"), | ||||
|     (u"fr-CA", u"Français", u"Canada", u"French"), | ||||
|     (u"fr-CH", u"Français", u"Suisse", u"French"), | ||||
|     (u"fr-FR", u"Français", u"France", u"French"), | ||||
|     (u"he-IL", u"עברית", u"", u"Hebrew"), | ||||
|     (u"hr-HR", u"Hrvatski", u"", u"Croatian"), | ||||
|     (u"hu-HU", u"Magyar", u"", u"Hungarian"), | ||||
|     (u"id-ID", u"Bahasa Indonesia", u"", u"Indonesian"), | ||||
|     (u"it", u"Italiano", u"", u"Italian"), | ||||
|     (u"it-CH", u"Italiano", u"Svizzera", u"Italian"), | ||||
|     (u"it-IT", u"Italiano", u"Italia", u"Italian"), | ||||
|     (u"ja-JP", u"日本語", u"", u"Japanese"), | ||||
|     (u"ko-KR", u"한국어", u"", u"Korean"), | ||||
|     (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"), | ||||
|     (u"lv-LV", u"Latviešu", u"", u"Latvian"), | ||||
|     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"), | ||||
|     (u"nl", u"Nederlands", u"", u"Dutch"), | ||||
|     (u"nl-BE", u"Nederlands", u"België", u"Dutch"), | ||||
|     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"), | ||||
|     (u"no-NO", u"Norsk", u"", u"Norwegian"), | ||||
|     (u"pl-PL", u"Polski", u"", u"Polish"), | ||||
|     (u"pt", u"Português", u"", u"Portuguese"), | ||||
|     (u"pt-AD", u"Português", u"Andorra", u"Portuguese"), | ||||
|     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"), | ||||
|     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"), | ||||
|     (u"ro-RO", u"Română", u"", u"Romanian"), | ||||
|     (u"ru-RU", u"Русский", u"", u"Russian"), | ||||
|     (u"sk-SK", u"Slovenčina", u"", u"Slovak"), | ||||
|     (u"sl", u"Slovenščina", u"", u"Slovenian"), | ||||
|     (u"sv-SE", u"Svenska", u"", u"Swedish"), | ||||
|     (u"th-TH", u"ไทย", u"", u"Thai"), | ||||
|     (u"tr-TR", u"Türkçe", u"", u"Turkish"), | ||||
|     (u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"), | ||||
|     (u"zh", u"中文", u"", u"Chinese"), | ||||
|     (u"zh-CN", u"中文", u"中国", u"Chinese"), | ||||
|     (u"zh-HK", u"中文", u"香港", u"Chinese"), | ||||
|  | ||||
| @ -8,10 +8,12 @@ from searx.testing import SearxTestCase | ||||
| class TestBingImagesEngine(SearxTestCase): | ||||
| 
 | ||||
|     def test_request(self): | ||||
|         bing_images.supported_languages = ['fr-FR', 'en-US'] | ||||
| 
 | ||||
|         query = 'test_query' | ||||
|         dicto = defaultdict(dict) | ||||
|         dicto['pageno'] = 1 | ||||
|         dicto['language'] = 'fr_FR' | ||||
|         dicto['language'] = 'fr-FR' | ||||
|         dicto['safesearch'] = 1 | ||||
|         dicto['time_range'] = '' | ||||
|         params = bing_images.request(query, dicto) | ||||
| @ -19,12 +21,19 @@ class TestBingImagesEngine(SearxTestCase): | ||||
|         self.assertTrue(query in params['url']) | ||||
|         self.assertTrue('bing.com' in params['url']) | ||||
|         self.assertTrue('SRCHHPGUSR' in params['cookies']) | ||||
|         self.assertTrue('fr' in params['cookies']['SRCHHPGUSR']) | ||||
|         self.assertTrue('DEMOTE' in params['cookies']['SRCHHPGUSR']) | ||||
|         self.assertTrue('_EDGE_S' in params['cookies']) | ||||
|         self.assertTrue('fr-fr' in params['cookies']['_EDGE_S']) | ||||
| 
 | ||||
|         dicto['language'] = 'fr' | ||||
|         params = bing_images.request(query, dicto) | ||||
|         self.assertTrue('_EDGE_S' in params['cookies']) | ||||
|         self.assertTrue('fr-fr' in params['cookies']['_EDGE_S']) | ||||
| 
 | ||||
|         dicto['language'] = 'all' | ||||
|         params = bing_images.request(query, dicto) | ||||
|         self.assertIn('SRCHHPGUSR', params['cookies']) | ||||
|         self.assertIn('en', params['cookies']['SRCHHPGUSR']) | ||||
|         self.assertTrue('_EDGE_S' in params['cookies']) | ||||
|         self.assertTrue('en-us' in params['cookies']['_EDGE_S']) | ||||
| 
 | ||||
|     def test_response(self): | ||||
|         self.assertRaises(AttributeError, bing_images.response, None) | ||||
| @ -82,3 +91,28 @@ class TestBingImagesEngine(SearxTestCase): | ||||
|         self.assertEqual(results[0]['content'], '') | ||||
|         self.assertEqual(results[0]['thumbnail_src'], 'thumb_url') | ||||
|         self.assertEqual(results[0]['img_src'], 'img_url') | ||||
| 
 | ||||
|     def test_fetch_supported_languages(self): | ||||
|         html = """ | ||||
|         <div> | ||||
|             <div id="region-section-content"> | ||||
|                 <ul class="b_vList"> | ||||
|                     <li> | ||||
|                         <a href="https://bing...&setmkt=de-DE&s...">Germany</a> | ||||
|                         <a href="https://bing...&setmkt=nb-NO&s...">Norway</a> | ||||
|                     </li> | ||||
|                 </ul> | ||||
|                 <ul class="b_vList"> | ||||
|                     <li> | ||||
|                         <a href="https://bing...&setmkt=es-AR&s...">Argentina</a> | ||||
|                     </li> | ||||
|                 </ul> | ||||
|             </div> | ||||
|         </div> | ||||
|         """ | ||||
|         response = mock.Mock(text=html) | ||||
|         languages = list(bing_images._fetch_supported_languages(response)) | ||||
|         self.assertEqual(len(languages), 3) | ||||
|         self.assertIn('de-DE', languages) | ||||
|         self.assertIn('no-NO', languages) | ||||
|         self.assertIn('es-AR', languages) | ||||
|  | ||||
| @ -8,6 +8,8 @@ from searx.testing import SearxTestCase | ||||
| class TestBingVideosEngine(SearxTestCase): | ||||
| 
 | ||||
|     def test_request(self): | ||||
|         bing_videos.supported_languages = ['fr-FR', 'en-US'] | ||||
| 
 | ||||
|         query = 'test_query' | ||||
|         dicto = defaultdict(dict) | ||||
|         dicto['pageno'] = 1 | ||||
|  | ||||
| @ -139,9 +139,9 @@ class TestSwisscowsEngine(SearxTestCase): | ||||
|             <div id="regions-popup"> | ||||
|                 <div> | ||||
|                     <ul> | ||||
|                         <li><a data-val="browser"></a></li> | ||||
|                         <li><a data-val="de-CH"></a></li> | ||||
|                         <li><a data-val="fr-CH"></a></li> | ||||
|                         <li><a data-search-language="browser"></a></li> | ||||
|                         <li><a data-search-language="de-CH"></a></li> | ||||
|                         <li><a data-search-language="fr-CH"></a></li> | ||||
|                     </ul> | ||||
|                 </div> | ||||
|             </div> | ||||
|  | ||||
| @ -8,13 +8,13 @@ | ||||
| # are written in current directory to avoid overwriting in case something goes wrong. | ||||
| 
 | ||||
| from requests import get | ||||
| from urllib import urlencode | ||||
| from lxml.html import fromstring | ||||
| from json import loads, dumps | ||||
| from json import loads, dump | ||||
| import io | ||||
| from sys import path | ||||
| path.append('../searx')  # noqa | ||||
| from searx import settings | ||||
| from searx.url_utils import urlencode | ||||
| from searx.engines import initialize_engines, engines | ||||
| 
 | ||||
| # Geonames API for country names. | ||||
| @ -70,7 +70,7 @@ def get_country_name(locale): | ||||
|     json = loads(response.text) | ||||
|     content = json.get('geonames', None) | ||||
|     if content is None or len(content) != 1: | ||||
|         print "No country name found for " + locale[0] + "-" + locale[1] | ||||
|         print("No country name found for " + locale[0] + "-" + locale[1]) | ||||
|         return '' | ||||
| 
 | ||||
|     return content[0].get('countryName', '') | ||||
| @ -84,11 +84,11 @@ def fetch_supported_languages(): | ||||
|             try: | ||||
|                 engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() | ||||
|             except Exception as e: | ||||
|                 print e | ||||
|                 print(e) | ||||
| 
 | ||||
|     # write json file | ||||
|     with io.open(engines_languages_file, "w", encoding="utf-8") as f: | ||||
|         f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8"))) | ||||
|         dump(engines_languages, f, ensure_ascii=False) | ||||
| 
 | ||||
| 
 | ||||
| # Join all language lists. | ||||
| @ -97,7 +97,7 @@ def join_language_lists(): | ||||
|     global languages | ||||
|     # include wikipedia first for more accurate language names | ||||
|     languages = {code: lang for code, lang | ||||
|                  in engines_languages['wikipedia'].iteritems() | ||||
|                  in engines_languages['wikipedia'].items() | ||||
|                  if valid_code(code)} | ||||
| 
 | ||||
|     for engine_name in engines_languages: | ||||
| @ -121,7 +121,7 @@ def join_language_lists(): | ||||
|     # filter list to include only languages supported by most engines | ||||
|     min_supported_engines = int(0.70 * len(engines_languages)) | ||||
|     languages = {code: lang for code, lang | ||||
|                  in languages.iteritems() | ||||
|                  in languages.items() | ||||
|                  if len(lang.get('counter', [])) >= min_supported_engines or | ||||
|                  len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines} | ||||
| 
 | ||||
| @ -165,7 +165,7 @@ def filter_single_country_languages(): | ||||
| 
 | ||||
| # Write languages.py. | ||||
| def write_languages_file(): | ||||
|     new_file = open(languages_file, 'w') | ||||
|     new_file = open(languages_file, 'wb') | ||||
|     file_content = '# -*- coding: utf-8 -*-\n'\ | ||||
|                    + '# list of language codes\n'\ | ||||
|                    + '# this file is generated automatically by utils/update_search_languages.py\n'\ | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user