mirror of
				https://github.com/searxng/searxng.git
				synced 2025-10-31 18:47:07 -04:00 
			
		
		
		
	Merge pull request #99 from dalf/master
[enh] stick results from the same category and template and [fix] rewrite the google engine
This commit is contained in:
		
						commit
						090254feca
					
				| @ -1,15 +1,17 @@ | ||||
## Google (Web)
#
# @website     https://www.google.com
# @provide-api yes (https://developers.google.com/custom-search/)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content, suggestion
 | ||||
| from urllib import urlencode | ||||
| from json import loads | ||||
| from urlparse import unquote,urlparse,parse_qsl | ||||
| from lxml import html | ||||
| from searx.engines.xpath import extract_text, extract_url | ||||
| 
 | ||||
# engine dependent config
categories = ['general']
paging = True
language_support = True
 | ||||
# search-url pieces; the engine scrapes google's basic-HTML interface
# (gbv=1 forces the no-javascript version, which is stable to parse)
google_hostname = 'www.google.com'
search_path = '/search'
redirect_path = '/url'      # google's click-tracking redirect endpoint
images_path = '/images'
search_url = ('https://' + google_hostname + search_path +
              '?{query}&start={offset}&gbv=1')
| 
 | ||||
# specific xpath variables for the regular web results
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
suggestion_xpath = '//p[@class="_Bmc"]'

# xpaths for image sub-results embedded inside a regular result
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'
 | ||||
# remove google-specific tracking-url
def parse_url(url_string):
    """Strip google's ``/url`` redirect wrapper from *url_string*.

    Result links on google pages often point at google's own redirect
    endpoint; the real destination is carried in the ``q`` query
    parameter.  Any other URL is returned unchanged.
    """
    parsed_url = urlparse(url_string)
    # relative hrefs have an empty netloc, hence the '' alternative
    if parsed_url.netloc in (google_hostname, '') and parsed_url.path == redirect_path:
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    return url_string
 | ||||
# do search-request
def request(query, params):
    """Fill in *params* for a google web search.

    Reads ``params['pageno']`` and ``params['language']``; sets
    ``params['url']`` and the ``Accept-Language`` request header.
    Returns the updated *params* dict (searx engine contract).
    """
    # the HTML interface serves 10 results per page
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].replace('_', '-').lower()

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}))

    # language is selected via the Accept-Language header, not a URL param
    params['headers']['Accept-Language'] = language

    return params
 | ||||
# get response from search-request
def response(resp):
    """Parse a google results page into a list of searx result dicts.

    Regular results yield ``{url, title, content}``; embedded image
    groups are delegated to :func:`parse_images`; query suggestions are
    appended as ``{'suggestion': ...}`` entries.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            # title inside the try too: a single malformed result must
            # not abort parsing of the whole page
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
                # remove the link to google news
                continue

            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
                # images result
                results.extend(parse_images(result))
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                results.append({'url': url,
                                'title': title,
                                'content': content})
        except Exception:
            # best-effort scraping: skip results we cannot parse,
            # but never swallow SystemExit/KeyboardInterrupt
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
 | ||||
def parse_images(result):
    """Extract the image sub-results embedded in *result*.

    Returns a list of result dicts rendered with the images template.
    """
    image_results = []
    for image in result.xpath(images_xpath):
        target_url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
        source = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        image_results.append({'url': target_url,
                              'title': '',
                              'content': '',
                              'img_src': source,
                              'template': 'images.html'})

    return image_results
|  | ||||
| @ -49,7 +49,8 @@ def score_results(results): | ||||
|     flat_len = len(flat_res) | ||||
|     engines_len = len(results) | ||||
|     results = [] | ||||
|     # deduplication + scoring | ||||
| 
 | ||||
|     # pass 1: deduplication + scoring | ||||
|     for i, res in enumerate(flat_res): | ||||
| 
 | ||||
|         res['parsed_url'] = urlparse(res['url']) | ||||
| @ -90,7 +91,42 @@ def score_results(results): | ||||
|         else: | ||||
|             res['score'] = score | ||||
|             results.append(res) | ||||
|     return sorted(results, key=itemgetter('score'), reverse=True) | ||||
|     results = sorted(results, key=itemgetter('score'), reverse=True) | ||||
| 
 | ||||
|     # pass 2 : group results by category and template | ||||
|     gresults = [] | ||||
|     categoryPositions = {} | ||||
| 
 | ||||
|     for i, res in enumerate(results): | ||||
|         # FIXME : handle more than one category per engine | ||||
|         category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']  | ||||
| 
 | ||||
|         current = None if category not in categoryPositions else categoryPositions[category] | ||||
| 
 | ||||
|         # group with previous results using the same category if the group can accept more result and is not too far from the current position | ||||
|         if current != None and (current['count'] > 0) and (len(gresults) - current['index'] < 20): | ||||
|             # group with the previous results using the same category with this one | ||||
|             index = current['index'] | ||||
|             gresults.insert(index, res) | ||||
| 
 | ||||
|             # update every index after the current one (including the current one) | ||||
|             for k in categoryPositions: | ||||
|                 v = categoryPositions[k]['index'] | ||||
|                 if v >= index: | ||||
|                     categoryPositions[k]['index'] = v+1 | ||||
| 
 | ||||
|             # update this category | ||||
|             current['count'] -= 1 | ||||
| 
 | ||||
|         else: | ||||
|             # same category | ||||
|             gresults.append(res) | ||||
| 
 | ||||
|             # update categoryIndex | ||||
|             categoryPositions[category] = { 'index' : len(gresults), 'count' : 8 } | ||||
| 
 | ||||
|     # return gresults | ||||
|     return gresults | ||||
| 
 | ||||
| 
 | ||||
| class Search(object): | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user