mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-25 15:52:31 -04:00 
			
		
		
		
	[fix] rewrite the google engine since Google Web Search API is about to expire
This commit is contained in:
		
							parent
							
								
									5d1610d87a
								
							
						
					
					
						commit
						5dc3eb3399
					
				| @ -1,15 +1,17 @@ | ||||
| ## Google (Web) | ||||
| #  | ||||
| # @website     https://www.google.com | ||||
| # @provide-api yes (https://developers.google.com/web-search/docs/), deprecated! | ||||
| # @provide-api yes (https://developers.google.com/custom-search/) | ||||
| #  | ||||
| # @using-api   yes | ||||
| # @results     JSON | ||||
| # @stable      yes (but deprecated) | ||||
| # @parse       url, title, content | ||||
| # @using-api   no | ||||
| # @results     HTML | ||||
| # @stable      no (HTML can change) | ||||
| # @parse       url, title, content, suggestion | ||||
| 
 | ||||
| from urllib import urlencode | ||||
| from json import loads | ||||
| from urlparse import unquote,urlparse,parse_qsl | ||||
| from lxml import html | ||||
| from searx.engines.xpath import extract_text, extract_url | ||||
| 
 | ||||
| # engine dependent config | ||||
| categories = ['general'] | ||||
| @ -17,21 +19,45 @@ paging = True | ||||
| language_support = True | ||||
| 
 | ||||
# search-url
google_hostname = 'www.google.com'
search_path = '/search'
redirect_path = '/url'
images_path = '/images'
# {query} receives an url-encoded 'q=...' pair and {offset} the index of the
# first result of the requested page (both are filled in by request()).
search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1'

# specific xpath variables
# results_xpath and suggestion_xpath are evaluated on the whole document;
# url/title/content are evaluated relative to one result node.
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
suggestion_xpath = '//p[@class="_Bmc"]'

# images_xpath is evaluated relative to one result node, the two image_*
# expressions relative to each node matched by images_xpath.
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'
| 
 | ||||
| # remove google-specific tracking-url | ||||
def parse_url(url_string):
    """Return the target of a Google redirect link, or the URL unchanged.

    A URL counts as a redirect when its path equals ``redirect_path`` and
    its host is either ``google_hostname`` or empty (a relative link).
    For such URLs the value of the ``q`` query parameter is returned;
    every other URL is handed back as-is.

    Raises:
        KeyError: if a redirect URL carries no ``q`` parameter.
    """
    parts = urlparse(url_string)
    is_redirect = (parts.path == redirect_path
                   and parts.netloc in [google_hostname, ''])
    if not is_redirect:
        return url_string

    # the real destination is carried in the 'q' query parameter
    return dict(parse_qsl(parts.query))['q']
| 
 | ||||
| # do search-request | ||||
def request(query, params):
    """Fill *params* with the URL and headers for one Google result page.

    Reads ``params['pageno']`` and ``params['language']``; writes
    ``params['url']`` and ``params['headers']['Accept-Language']``.

    Returns:
        The same *params* dict that was passed in.
    """
    # pages are addressed by the offset of their first result, in steps of 10
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language = 'en'
    else:
        # e.g. 'en_US' -> 'en-us'
        language = params['language'].replace('_', '-').lower()

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}))

    # the language is sent as a header; the search URL has no language field
    params['headers']['Accept-Language'] = language

    return params
| 
 | ||||
| @ -40,18 +66,50 @@ def request(query, params): | ||||
def response(resp):
    """Parse a Google HTML result page into a list of result dicts.

    ``resp.text`` is parsed as HTML. Every node matched by
    ``results_xpath`` yields either a normal result
    (``url``/``title``/``content``), the image results returned by
    ``parse_images``, or nothing when it links back to Google's own
    search page. Suggestions matched by ``suggestion_xpath`` are appended
    as ``{'suggestion': ...}`` entries after the results.

    A result node that cannot be parsed is skipped; it never aborts the
    parsing of the remaining nodes.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            # the title lookup lives inside the try as well, so that a node
            # without a heading is skipped like any other malformed result
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)

            if parsed_url.netloc == google_hostname:
                if parsed_url.path == search_path:
                    # remove the link to google news
                    continue

                if parsed_url.path == images_path:
                    # images result
                    results.extend(parse_images(result))
                    continue

            # normal result
            content = extract_text(result.xpath(content_xpath)[0])

            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # best effort: the HTML layout can change, drop what does not fit
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
| 
 | ||||
def parse_images(result):
    """Build one image result per node matched by ``images_xpath``.

    For every matched node the link target (``image_url_xpath``, passed
    through ``parse_url``) and the image source (``image_img_src_xpath``)
    are read. Title and content are left empty and the entry is marked
    for the ``images.html`` template.

    Raises:
        IndexError: if a matched node lacks the link or the image source.
    """
    image_results = []

    for anchor in result.xpath(images_xpath):
        link = parse_url(extract_text(anchor.xpath(image_url_xpath)[0]))

        entry = {'url': link,
                 'title': '',
                 'content': '',
                 'template': 'images.html'}
        entry['img_src'] = extract_text(anchor.xpath(image_img_src_xpath)[0])

        # append result
        image_results.append(entry)

    return image_results
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user