diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 92da30e23..63b007729 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -44,6 +44,29 @@ time_range_dict = { 'month': ('1m', 'm'), } +region2domain = { + "CO": "co.search.yahoo.com", # Colombia + "TH": "th.search.yahoo.com", # Thailand + "VE": "ve.search.yahoo.com", # Venezuela + "CL": "cl.search.yahoo.com", # Chile + "HK": "hk.search.yahoo.com", # Hong Kong + "PE": "pe.search.yahoo.com", # Peru + "CA": "ca.search.yahoo.com", # Canada + "DE": "de.search.yahoo.com", # Germany + "FR": "fr.search.yahoo.com", # France + "TW": "tw.search.yahoo.com", # Taiwan + "GB": "uk.search.yahoo.com", # United Kingdom + "UK": "uk.search.yahoo.com", + "BR": "br.search.yahoo.com", # Brazil + "IN": "in.search.yahoo.com", # India + "ES": "espanol.search.yahoo.com", # Espanol + "PH": "ph.search.yahoo.com", # Philippines + "AR": "ar.search.yahoo.com", # Argentina + "MX": "mx.search.yahoo.com", # Mexico + "SG": "sg.search.yahoo.com", # Singapore +} +"""Map regions to domain""" + lang2domain = { 'zh_chs': 'hk.search.yahoo.com', 'zh_cht': 'tw.search.yahoo.com', @@ -65,40 +88,40 @@ lang2domain = { yahoo_languages = { "all": "any", - "ar": "ar", - "bg": "bg", - "cs": "cs", - "da": "da", - "de": "de", - "el": "el", - "en": "en", - "es": "es", - "et": "et", - "fi": "fi", - "fr": "fr", - "he": "he", - "hr": "hr", - "hu": "hu", - "it": "it", - "ja": "ja", - "ko": "ko", - "lt": "lt", - "lv": "lv", - "nl": "nl", - "no": "no", - "pl": "pl", - "pt": "pt", - "ro": "ro", - "ru": "ru", - "sk": "sk", - "sl": "sl", - "sv": "sv", - "th": "th", - "tr": "tr", - "zh": "zh_chs", + "ar": "ar", # Arabic + "bg": "bg", # Bulgarian + "cs": "cs", # Czech + "da": "da", # Danish + "de": "de", # German + "el": "el", # Greek + "en": "en", # English + "es": "es", # Spanish + "et": "et", # Estonian + "fi": "fi", # Finnish + "fr": "fr", # French + "he": "he", # Hebrew + "hr": "hr", # Croatian + "hu": "hu", # Hungarian + "it": "it", # Italian + "ja": "ja", # Japanese + "ko": "ko", # Korean + "lt": "lt", # Lithuanian + "lv": "lv", # Latvian + "nl": "nl", # Dutch + "no": "no", # Norwegian + "pl": "pl", # Polish + "pt": "pt", # Portuguese + "ro": "ro", # Romanian + "ru": "ru", # Russian + "sk": "sk", # Slovak + "sl": "sl", # Slovenian + "sv": "sv", # Swedish + "th": "th", # Thai + "tr": "tr", # Turkish + "zh": "zh_chs", # ChineseĀ (Simplified) "zh_Hans": "zh_chs", 'zh-CN': "zh_chs", - "zh_Hant": "zh_cht", + "zh_Hant": "zh_cht", # ChineseĀ (Traditional) "zh-HK": "zh_cht", 'zh-TW': "zh_cht", } @@ -107,7 +130,7 @@ yahoo_languages = { def request(query, params): """build request""" - lang = params["language"].split("-")[0] + lang, region = (params["language"].split("-") + [None])[:2] lang = yahoo_languages.get(lang, "any") offset = (params['pageno'] - 1) * 7 + 1 @@ -127,9 +150,11 @@ def request(query, params): } ) - domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) + domain = region2domain.get(region) + if not domain: + domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) params['url'] = 'https://%s/search?%s' % (domain, args) - return params + params['domain'] = domain def parse_url(url_string): @@ -157,14 +182,22 @@ def response(resp): results = [] dom = html.fromstring(resp.text) + url_xpath = './/div[contains(@class,"compTitle")]/h3/a/@href' + title_xpath = './/h3//a/@aria-label' + + domain = resp.search_params['domain'] + if domain == "search.yahoo.com": + url_xpath = './/div[contains(@class,"compTitle")]/a/@href' + title_xpath = './/div[contains(@class,"compTitle")]/a/h3/span' + # parse results for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): - url = eval_xpath_getindex(result, './/div[contains(@class,"compTitle")]/a/@href', 0, default=None) + url = eval_xpath_getindex(result, url_xpath, 0, default=None) if url is None: continue url = parse_url(url) - title = eval_xpath_getindex(result, './/div[contains(@class,"compTitle")]/a/h3/span', 0, default='') + title = eval_xpath_getindex(result, title_xpath, 0, default='') title: str = extract_text(title) content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='') content: str = extract_text(content, allow_none=True)